1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <sys/ioctl.h>
10 #include <sys/epoll.h>
#include <sys/eventfd.h>
11 #include <linux/virtio_net.h>
12 #include <stdbool.h>
13
14 #include <rte_malloc.h>
15 #include <rte_memory.h>
16 #include <rte_bus_pci.h>
17 #include <rte_vhost.h>
18 #include <rte_vdpa.h>
19 #include <rte_vfio.h>
20 #include <rte_spinlock.h>
21 #include <rte_log.h>
22 #include <rte_kvargs.h>
23 #include <rte_devargs.h>
24
25 #include "base/ifcvf.h"
26
27 #define DRV_LOG(level, fmt, args...) \
28         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
29                 "IFCVF %s(): " fmt "\n", __func__, ##args)
30
31 #ifndef PAGE_SIZE
32 #define PAGE_SIZE 4096
33 #endif
34
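/* Used ring length: 'size' used elements plus the flags, idx and avail_event fields. */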
35 #define IFCVF_USED_RING_LEN(size) \
36         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
37
38 #define IFCVF_VDPA_MODE         "vdpa"
39 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
40
41 static const char * const ifcvf_valid_arguments[] = {
42         IFCVF_VDPA_MODE,
43         IFCVF_SW_FALLBACK_LM,
44         NULL
45 };
46
47 static int ifcvf_vdpa_logtype;
48
49 struct ifcvf_internal {
50         struct rte_vdpa_dev_addr dev_addr;
51         struct rte_pci_device *pdev;
52         struct ifcvf_hw hw;
53         int vfio_container_fd;
54         int vfio_group_fd;
55         int vfio_dev_fd;
56         pthread_t tid;  /* thread for notify relay */
57         int epfd;
58         int vid;
59         int did;
60         uint16_t max_queues;
61         uint64_t features;
62         rte_atomic32_t started;
63         rte_atomic32_t dev_attached;
64         rte_atomic32_t running;
65         rte_spinlock_t lock;
66         bool sw_lm;
67         bool sw_fallback_running;
68         /* mediated vring for sw fallback */
69         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
70         /* eventfd for used ring interrupt */
71         int intr_fd[IFCVF_MAX_QUEUES * 2];
72 };
73
74 struct internal_list {
75         TAILQ_ENTRY(internal_list) next;
76         struct ifcvf_internal *internal;
77 };
78
79 TAILQ_HEAD(internal_list_head, internal_list);
80 static struct internal_list_head internal_list =
81         TAILQ_HEAD_INITIALIZER(internal_list);
82
83 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
84
85 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
86
87 static struct internal_list *
88 find_internal_resource_by_did(int did)
89 {
90         int found = 0;
91         struct internal_list *list;
92
93         pthread_mutex_lock(&internal_list_lock);
94
95         TAILQ_FOREACH(list, &internal_list, next) {
96                 if (did == list->internal->did) {
97                         found = 1;
98                         break;
99                 }
100         }
101
102         pthread_mutex_unlock(&internal_list_lock);
103
104         if (!found)
105                 return NULL;
106
107         return list;
108 }
109
110 static struct internal_list *
111 find_internal_resource_by_dev(struct rte_pci_device *pdev)
112 {
113         int found = 0;
114         struct internal_list *list;
115
116         pthread_mutex_lock(&internal_list_lock);
117
118         TAILQ_FOREACH(list, &internal_list, next) {
119                 if (pdev == list->internal->pdev) {
120                         found = 1;
121                         break;
122                 }
123         }
124
125         pthread_mutex_unlock(&internal_list_lock);
126
127         if (!found)
128                 return NULL;
129
130         return list;
131 }
132
133 static int
134 ifcvf_vfio_setup(struct ifcvf_internal *internal)
135 {
136         struct rte_pci_device *dev = internal->pdev;
137         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
138         int iommu_group_num;
139         int i;
140
141         internal->vfio_dev_fd = -1;
142         internal->vfio_group_fd = -1;
143         internal->vfio_container_fd = -1;
144
145         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
146         rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
147                         &iommu_group_num);
148
149         internal->vfio_container_fd = rte_vfio_container_create();
150         if (internal->vfio_container_fd < 0)
151                 return -1;
152
153         internal->vfio_group_fd = rte_vfio_container_group_bind(
154                         internal->vfio_container_fd, iommu_group_num);
155         if (internal->vfio_group_fd < 0)
156                 goto err;
157
158         if (rte_pci_map_device(dev))
159                 goto err;
160
161         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
162
163         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
164                         i++) {
165                 internal->hw.mem_resource[i].addr =
166                         internal->pdev->mem_resource[i].addr;
167                 internal->hw.mem_resource[i].phys_addr =
168                         internal->pdev->mem_resource[i].phys_addr;
169                 internal->hw.mem_resource[i].len =
170                         internal->pdev->mem_resource[i].len;
171         }
172
173         return 0;
174
175 err:
176         rte_vfio_container_destroy(internal->vfio_container_fd);
177         return -1;
178 }
179
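/*
 * DMA-map (do_map == 1) or unmap (do_map == 0) every guest memory region
 * into the device's VFIO container, so the VF can address guest memory
 * with guest physical addresses.
 */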
180 static int
181 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
182 {
183         uint32_t i;
184         int ret;
185         struct rte_vhost_memory *mem = NULL;
186         int vfio_container_fd;
187
188         ret = rte_vhost_get_mem_table(internal->vid, &mem);
189         if (ret < 0) {
190                 DRV_LOG(ERR, "failed to get VM memory layout.");
191                 goto exit;
192         }
193
194         vfio_container_fd = internal->vfio_container_fd;
195
196         for (i = 0; i < mem->nregions; i++) {
197                 struct rte_vhost_mem_region *reg;
198
199                 reg = &mem->regions[i];
200                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
201                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
202                         do_map ? "DMA map" : "DMA unmap", i,
203                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
204
205                 if (do_map) {
206                         ret = rte_vfio_container_dma_map(vfio_container_fd,
207                                 reg->host_user_addr, reg->guest_phys_addr,
208                                 reg->size);
209                         if (ret < 0) {
210                                 DRV_LOG(ERR, "DMA map failed.");
211                                 goto exit;
212                         }
213                 } else {
214                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
215                                 reg->host_user_addr, reg->guest_phys_addr,
216                                 reg->size);
217                         if (ret < 0) {
218                                 DRV_LOG(ERR, "DMA unmap failed.");
219                                 goto exit;
220                         }
221                 }
222         }
223
224 exit:
225         if (mem)
226                 free(mem);
227         return ret;
228 }
229
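/*
 * Translate a host virtual address to a guest physical address by walking
 * the vhost memory table; returns 0 if no region contains the address.
 */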
230 static uint64_t
231 hva_to_gpa(int vid, uint64_t hva)
232 {
233         struct rte_vhost_memory *mem = NULL;
234         struct rte_vhost_mem_region *reg;
235         uint32_t i;
236         uint64_t gpa = 0;
237
238         if (rte_vhost_get_mem_table(vid, &mem) < 0)
239                 goto exit;
240
241         for (i = 0; i < mem->nregions; i++) {
242                 reg = &mem->regions[i];
243
244                 if (hva >= reg->host_user_addr &&
245                                 hva < reg->host_user_addr + reg->size) {
246                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
247                         break;
248                 }
249         }
250
251 exit:
252         if (mem)
253                 free(mem);
254         return gpa;
255 }
256
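/*
 * Program the VF with the guest physical addresses of each vring
 * (desc/avail/used), restore the last avail/used indexes, then start
 * the hardware datapath.
 */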
257 static int
258 vdpa_ifcvf_start(struct ifcvf_internal *internal)
259 {
260         struct ifcvf_hw *hw = &internal->hw;
261         int i, nr_vring;
262         int vid;
263         struct rte_vhost_vring vq;
264         uint64_t gpa;
265
266         vid = internal->vid;
267         nr_vring = rte_vhost_get_vring_num(vid);
268         rte_vhost_get_negotiated_features(vid, &hw->req_features);
269
270         for (i = 0; i < nr_vring; i++) {
271                 rte_vhost_get_vhost_vring(vid, i, &vq);
272                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
273                 if (gpa == 0) {
274                 DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
275                         return -1;
276                 }
277                 hw->vring[i].desc = gpa;
278
279                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
280                 if (gpa == 0) {
281                 DRV_LOG(ERR, "Failed to get GPA for available ring.");
282                         return -1;
283                 }
284                 hw->vring[i].avail = gpa;
285
286                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
287                 if (gpa == 0) {
288                 DRV_LOG(ERR, "Failed to get GPA for used ring.");
289                         return -1;
290                 }
291                 hw->vring[i].used = gpa;
292
293                 hw->vring[i].size = vq.size;
294                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
295                                 &hw->vring[i].last_used_idx);
296         }
297         hw->nr_vring = i;
298
299         return ifcvf_start_hw(&internal->hw);
300 }
301
302 static void
303 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
304 {
305         struct ifcvf_hw *hw = &internal->hw;
306         uint32_t i;
307         int vid;
308         uint64_t features = 0;
309         uint64_t log_base = 0, log_size = 0;
310         uint64_t len;
311
312         vid = internal->vid;
313         ifcvf_stop_hw(hw);
314
315         for (i = 0; i < hw->nr_vring; i++)
316                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
317                                 hw->vring[i].last_used_idx);
318
319         if (internal->sw_lm)
320                 return;
321
322         rte_vhost_get_negotiated_features(vid, &features);
323         if (RTE_VHOST_NEED_LOG(features)) {
324                 ifcvf_disable_logging(hw);
325                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
326                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
327                                 log_base, IFCVF_LOG_BASE, log_size);
328                 /*
329                  * IFCVF marks dirty pages only for packet buffers; software
330                  * marks the used rings dirty after the device stops.
331                  */
332                 for (i = 0; i < hw->nr_vring; i++) {
333                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
334                         rte_vhost_log_used_vring(vid, i, 0, len);
335                 }
336         }
337 }
338
339 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
340                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
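/*
 * Bind the device's MSI-X vectors to eventfds through VFIO_DEVICE_SET_IRQS:
 * vector 0 to the PCI interrupt handle, one vector per vring to its callfd.
 * With m_rx set, RX vrings get a driver-created eventfd instead so the
 * relay thread can intercept used-ring interrupts.
 */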
341 static int
342 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
343 {
344         int ret;
345         uint32_t i, nr_vring;
346         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
347         struct vfio_irq_set *irq_set;
348         int *fd_ptr;
349         struct rte_vhost_vring vring;
350         int fd;
351
352         vring.callfd = -1;
353
354         nr_vring = rte_vhost_get_vring_num(internal->vid);
355
356         irq_set = (struct vfio_irq_set *)irq_set_buf;
357         irq_set->argsz = sizeof(irq_set_buf);
358         irq_set->count = nr_vring + 1;
359         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
360                          VFIO_IRQ_SET_ACTION_TRIGGER;
361         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
362         irq_set->start = 0;
363         fd_ptr = (int *)&irq_set->data;
364         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
365
366         for (i = 0; i < nr_vring; i++)
367                 internal->intr_fd[i] = -1;
368
369         for (i = 0; i < nr_vring; i++) {
370                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
371                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
372                 if ((i & 1) == 0 && m_rx == true) {
373                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
374                         if (fd < 0) {
375                                 DRV_LOG(ERR, "can't setup eventfd: %s",
376                                         strerror(errno));
377                                 return -1;
378                         }
379                         internal->intr_fd[i] = fd;
380                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
381                 }
382         }
383
384         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
385         if (ret) {
386                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
387                                 strerror(errno));
388                 return -1;
389         }
390
391         return 0;
392 }
393
394 static int
395 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
396 {
397         int ret;
398         uint32_t i, nr_vring;
399         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
400         struct vfio_irq_set *irq_set;
401
402         irq_set = (struct vfio_irq_set *)irq_set_buf;
403         irq_set->argsz = sizeof(irq_set_buf);
404         irq_set->count = 0;
405         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
406         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
407         irq_set->start = 0;
408
409         nr_vring = rte_vhost_get_vring_num(internal->vid);
410         for (i = 0; i < nr_vring; i++) {
411                 if (internal->intr_fd[i] >= 0)
412                         close(internal->intr_fd[i]);
413                 internal->intr_fd[i] = -1;
414         }
415
416         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
417         if (ret) {
418                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
419                                 strerror(errno));
420                 return -1;
421         }
422
423         return 0;
424 }
425
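/*
 * Notify relay thread: epoll-wait on every vring kickfd and forward each
 * guest kick to the VF's queue notify register.
 */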
426 static void *
427 notify_relay(void *arg)
428 {
429         int i, kickfd, epfd, nfds = 0;
430         uint32_t qid, q_num;
431         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
432         struct epoll_event ev;
433         uint64_t buf;
434         int nbytes;
435         struct rte_vhost_vring vring;
436         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
437         struct ifcvf_hw *hw = &internal->hw;
438
439         q_num = rte_vhost_get_vring_num(internal->vid);
440
441         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
442         if (epfd < 0) {
443                 DRV_LOG(ERR, "failed to create epoll instance.");
444                 return NULL;
445         }
446         internal->epfd = epfd;
447
448         vring.kickfd = -1;
449         for (qid = 0; qid < q_num; qid++) {
450                 ev.events = EPOLLIN | EPOLLPRI;
451                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
452                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
453                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
454                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
455                         return NULL;
456                 }
457         }
458
459         for (;;) {
460                 nfds = epoll_wait(epfd, events, q_num, -1);
461                 if (nfds < 0) {
462                         if (errno == EINTR)
463                                 continue;
464                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
465                         return NULL;
466                 }
467
468                 for (i = 0; i < nfds; i++) {
469                         qid = events[i].data.u32;
470                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
471                         do {
472                                 nbytes = read(kickfd, &buf, 8);
473                                 if (nbytes < 0) {
474                                         if (errno == EINTR ||
475                                             errno == EWOULDBLOCK ||
476                                             errno == EAGAIN)
477                                                 continue;
478                                         DRV_LOG(INFO, "Error reading "
479                                                 "kickfd: %s",
480                                                 strerror(errno));
481                                 }
482                                 break;
483                         } while (1);
484
485                         ifcvf_notify_queue(hw, qid);
486                 }
487         }
488
489         return NULL;
490 }
491
492 static int
493 setup_notify_relay(struct ifcvf_internal *internal)
494 {
495         int ret;
496
497         ret = pthread_create(&internal->tid, NULL, notify_relay,
498                         (void *)internal);
499         if (ret) {
500                 DRV_LOG(ERR, "failed to create notify relay pthread.");
501                 return -1;
502         }
503         return 0;
504 }
505
506 static int
507 unset_notify_relay(struct ifcvf_internal *internal)
508 {
509         void *status;
510
511         if (internal->tid) {
512                 pthread_cancel(internal->tid);
513                 pthread_join(internal->tid, &status);
514         }
515         internal->tid = 0;
516
517         if (internal->epfd >= 0)
518                 close(internal->epfd);
519         internal->epfd = -1;
520
521         return 0;
522 }
523
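/*
 * Reconcile the datapath state: bring it up (DMA map, interrupts, VF start,
 * notify relay) once the device is both started and attached, and tear it
 * down again when either condition goes away.
 */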
524 static int
525 update_datapath(struct ifcvf_internal *internal)
526 {
527         int ret;
528
529         rte_spinlock_lock(&internal->lock);
530
531         if (!rte_atomic32_read(&internal->running) &&
532             (rte_atomic32_read(&internal->started) &&
533              rte_atomic32_read(&internal->dev_attached))) {
534                 ret = ifcvf_dma_map(internal, 1);
535                 if (ret)
536                         goto err;
537
538                 ret = vdpa_enable_vfio_intr(internal, 0);
539                 if (ret)
540                         goto err;
541
542                 ret = vdpa_ifcvf_start(internal);
543                 if (ret)
544                         goto err;
545
546                 ret = setup_notify_relay(internal);
547                 if (ret)
548                         goto err;
549
550                 rte_atomic32_set(&internal->running, 1);
551         } else if (rte_atomic32_read(&internal->running) &&
552                    (!rte_atomic32_read(&internal->started) ||
553                     !rte_atomic32_read(&internal->dev_attached))) {
554                 ret = unset_notify_relay(internal);
555                 if (ret)
556                         goto err;
557
558                 vdpa_ifcvf_stop(internal);
559
560                 ret = vdpa_disable_vfio_intr(internal);
561                 if (ret)
562                         goto err;
563
564                 ret = ifcvf_dma_map(internal, 0);
565                 if (ret)
566                         goto err;
567
568                 rte_atomic32_set(&internal->running, 0);
569         }
570
571         rte_spinlock_unlock(&internal->lock);
572         return 0;
573 err:
574         rte_spinlock_unlock(&internal->lock);
575         return ret;
576 }
577
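/*
 * Start the VF for software-assisted live migration: allocate mediated
 * vrings and DMA-map them at IFCVF_MEDIATED_VRING. TX queues keep the
 * guest used ring, while RX queues use the mediated used ring so the
 * relay thread can copy entries back to the guest and log dirty pages.
 */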
578 static int
579 m_ifcvf_start(struct ifcvf_internal *internal)
580 {
581         struct ifcvf_hw *hw = &internal->hw;
582         uint32_t i, nr_vring;
583         int vid, ret;
584         struct rte_vhost_vring vq;
585         void *vring_buf;
586         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
587         uint64_t size;
588         uint64_t gpa;
589
590         memset(&vq, 0, sizeof(vq));
591         vid = internal->vid;
592         nr_vring = rte_vhost_get_vring_num(vid);
593         rte_vhost_get_negotiated_features(vid, &hw->req_features);
594
595         for (i = 0; i < nr_vring; i++) {
596                 rte_vhost_get_vhost_vring(vid, i, &vq);
597
598                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
599                                 PAGE_SIZE);
600                 vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                if (vring_buf == NULL) {
                        DRV_LOG(ERR, "Failed to allocate mediated vring.");
                        goto error;
                }
601                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
602                                 PAGE_SIZE);
603
604                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
605                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
606                 if (ret < 0) {
607                         DRV_LOG(ERR, "mediated vring DMA map failed.");
608                         goto error;
609                 }
610
611                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
612                 if (gpa == 0) {
613                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
614                         return -1;
615                 }
616                 hw->vring[i].desc = gpa;
617
618                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
619                 if (gpa == 0) {
620                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
621                         return -1;
622                 }
623                 hw->vring[i].avail = gpa;
624
625                 /* Direct I/O for Tx queue, relay for Rx queue */
626                 if (i & 1) {
627                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
628                         if (gpa == 0) {
629                                 DRV_LOG(ERR, "Failed to get GPA for used ring.");
630                                 return -1;
631                         }
632                         hw->vring[i].used = gpa;
633                 } else {
634                         hw->vring[i].used = m_vring_iova +
635                                 (char *)internal->m_vring[i].used -
636                                 (char *)internal->m_vring[i].desc;
637                 }
638
639                 hw->vring[i].size = vq.size;
640
641                 rte_vhost_get_vring_base(vid, i,
642                                 &internal->m_vring[i].avail->idx,
643                                 &internal->m_vring[i].used->idx);
644
645                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
646                                 &hw->vring[i].last_used_idx);
647
648                 m_vring_iova += size;
649         }
650         hw->nr_vring = nr_vring;
651
652         return ifcvf_start_hw(&internal->hw);
653
654 error:
655         for (i = 0; i < nr_vring; i++)
656                 if (internal->m_vring[i].desc)
657                         rte_free(internal->m_vring[i].desc);
658
659         return -1;
660 }
661
662 static int
663 m_ifcvf_stop(struct ifcvf_internal *internal)
664 {
665         int vid;
666         uint32_t i;
667         struct rte_vhost_vring vq;
668         struct ifcvf_hw *hw = &internal->hw;
669         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
670         uint64_t size, len;
671
672         vid = internal->vid;
673         ifcvf_stop_hw(hw);
674
675         for (i = 0; i < hw->nr_vring; i++) {
676                 /* synchronize remaining new used entries if any */
677                 if ((i & 1) == 0)
678                         update_used_ring(internal, i);
679
680                 rte_vhost_get_vhost_vring(vid, i, &vq);
681                 len = IFCVF_USED_RING_LEN(vq.size);
682                 rte_vhost_log_used_vring(vid, i, 0, len);
683
684                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
685                                 PAGE_SIZE);
686                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
687                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
688                         m_vring_iova, size);
689
690                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
691                                 hw->vring[i].last_used_idx);
692                 rte_free(internal->m_vring[i].desc);
693                 m_vring_iova += size;
694         }
695
696         return 0;
697 }
698
699 static void
700 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
701 {
702         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
703         rte_vhost_vring_call(internal->vid, qid);
704 }
705
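/*
 * Vring relay thread for the SW fallback: forward guest kicks to the VF
 * and, on RX used-ring interrupts, relay the mediated used ring back to
 * the guest and trigger the vring call.
 */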
706 static void *
707 vring_relay(void *arg)
708 {
709         int i, vid, epfd, fd, nfds;
710         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
711         struct rte_vhost_vring vring;
712         uint16_t qid, q_num;
713         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
714         struct epoll_event ev;
715         int nbytes;
716         uint64_t buf;
717
718         vid = internal->vid;
719         q_num = rte_vhost_get_vring_num(vid);
720
721         /* add notify fd and interrupt fd to epoll */
722         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
723         if (epfd < 0) {
724                 DRV_LOG(ERR, "failed to create epoll instance.");
725                 return NULL;
726         }
727         internal->epfd = epfd;
728
729         vring.kickfd = -1;
730         for (qid = 0; qid < q_num; qid++) {
731                 ev.events = EPOLLIN | EPOLLPRI;
732                 rte_vhost_get_vhost_vring(vid, qid, &vring);
733                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
734                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
735                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
736                         return NULL;
737                 }
738         }
739
740         for (qid = 0; qid < q_num; qid += 2) {
741                 ev.events = EPOLLIN | EPOLLPRI;
742                 /* set bit 0 to flag this entry as an interrupt eventfd */
743                 ev.data.u64 = 1 | qid << 1 |
744                         (uint64_t)internal->intr_fd[qid] << 32;
745                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
746                                 < 0) {
747                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
748                         return NULL;
749                 }
750                 update_used_ring(internal, qid);
751         }
752
753         /* start relay with a first kick */
754         for (qid = 0; qid < q_num; qid++)
755                 ifcvf_notify_queue(&internal->hw, qid);
756
757         /* listen to the events and react accordingly */
758         for (;;) {
759                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
760                 if (nfds < 0) {
761                         if (errno == EINTR)
762                                 continue;
763                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
764                         return NULL;
765                 }
766
767                 for (i = 0; i < nfds; i++) {
768                         fd = (uint32_t)(events[i].data.u64 >> 32);
769                         do {
770                                 nbytes = read(fd, &buf, 8);
771                                 if (nbytes < 0) {
772                                         if (errno == EINTR ||
773                                             errno == EWOULDBLOCK ||
774                                             errno == EAGAIN)
775                                                 continue;
776                                         DRV_LOG(INFO, "Error reading "
777                                                 "kickfd: %s",
778                                                 strerror(errno));
779                                 }
780                                 break;
781                         } while (1);
782
783                         qid = events[i].data.u32 >> 1;
784
785                         if (events[i].data.u32 & 1)
786                                 update_used_ring(internal, qid);
787                         else
788                                 ifcvf_notify_queue(&internal->hw, qid);
789                 }
790         }
791
792         return NULL;
793 }
794
795 static int
796 setup_vring_relay(struct ifcvf_internal *internal)
797 {
798         int ret;
799
800         ret = pthread_create(&internal->tid, NULL, vring_relay,
801                         (void *)internal);
802         if (ret) {
803                 DRV_LOG(ERR, "failed to create ring relay pthread.");
804                 return -1;
805         }
806         return 0;
807 }
808
809 static int
810 unset_vring_relay(struct ifcvf_internal *internal)
811 {
812         void *status;
813
814         if (internal->tid) {
815                 pthread_cancel(internal->tid);
816                 pthread_join(internal->tid, &status);
817         }
818         internal->tid = 0;
819
820         if (internal->epfd >= 0)
821                 close(internal->epfd);
822         internal->epfd = -1;
823
824         return 0;
825 }
826
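/*
 * Switch a running device from the direct I/O datapath to the mediated
 * relay datapath so dirty pages can be tracked in software during live
 * migration.
 */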
827 static int
828 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
829 {
830         int ret;
831         int vid = internal->vid;
832
833         /* stop the direct IO data path */
834         unset_notify_relay(internal);
835         vdpa_ifcvf_stop(internal);
836         vdpa_disable_vfio_intr(internal);
837
838         ret = rte_vhost_host_notifier_ctrl(vid, false);
839         if (ret && ret != -ENOTSUP)
840                 goto error;
841
842         /* set up interrupt for interrupt relay */
843         ret = vdpa_enable_vfio_intr(internal, 1);
844         if (ret)
845                 goto unmap;
846
847         /* config the VF */
848         ret = m_ifcvf_start(internal);
849         if (ret)
850                 goto unset_intr;
851
852         /* set up vring relay thread */
853         ret = setup_vring_relay(internal);
854         if (ret)
855                 goto stop_vf;
856
857         rte_vhost_host_notifier_ctrl(vid, true);
858
859         internal->sw_fallback_running = true;
860
861         return 0;
862
863 stop_vf:
864         m_ifcvf_stop(internal);
865 unset_intr:
866         vdpa_disable_vfio_intr(internal);
867 unmap:
868         ifcvf_dma_map(internal, 0);
869 error:
870         return -1;
871 }
872
873 static int
874 ifcvf_dev_config(int vid)
875 {
876         int did;
877         struct internal_list *list;
878         struct ifcvf_internal *internal;
879
880         did = rte_vhost_get_vdpa_device_id(vid);
881         list = find_internal_resource_by_did(did);
882         if (list == NULL) {
883                 DRV_LOG(ERR, "Invalid device id: %d", did);
884                 return -1;
885         }
886
887         internal = list->internal;
888         internal->vid = vid;
889         rte_atomic32_set(&internal->dev_attached, 1);
890         update_datapath(internal);
891
892         if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
893                 DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);
894
895         return 0;
896 }
897
898 static int
899 ifcvf_dev_close(int vid)
900 {
901         int did;
902         struct internal_list *list;
903         struct ifcvf_internal *internal;
904
905         did = rte_vhost_get_vdpa_device_id(vid);
906         list = find_internal_resource_by_did(did);
907         if (list == NULL) {
908                 DRV_LOG(ERR, "Invalid device id: %d", did);
909                 return -1;
910         }
911
912         internal = list->internal;
913
914         if (internal->sw_fallback_running) {
915                 /* unset ring relay */
916                 unset_vring_relay(internal);
917
918                 /* reset VF */
919                 m_ifcvf_stop(internal);
920
921                 /* remove interrupt setting */
922                 vdpa_disable_vfio_intr(internal);
923
924                 /* unset DMA map for guest memory */
925                 ifcvf_dma_map(internal, 0);
926
927                 internal->sw_fallback_running = false;
928         } else {
929                 rte_atomic32_set(&internal->dev_attached, 0);
930                 update_datapath(internal);
931         }
932
933         return 0;
934 }
935
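/*
 * Feature negotiation hook: when VHOST_F_LOG_ALL is requested, either
 * switch to the software relay datapath (sw-live-migration mode) or map
 * the log buffer and enable hardware dirty-page logging.
 */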
936 static int
937 ifcvf_set_features(int vid)
938 {
939         uint64_t features = 0;
940         int did;
941         struct internal_list *list;
942         struct ifcvf_internal *internal;
943         uint64_t log_base = 0, log_size = 0;
944
945         did = rte_vhost_get_vdpa_device_id(vid);
946         list = find_internal_resource_by_did(did);
947         if (list == NULL) {
948                 DRV_LOG(ERR, "Invalid device id: %d", did);
949                 return -1;
950         }
951
952         internal = list->internal;
953         rte_vhost_get_negotiated_features(vid, &features);
954
955         if (!RTE_VHOST_NEED_LOG(features))
956                 return 0;
957
958         if (internal->sw_lm) {
959                 ifcvf_sw_fallback_switchover(internal);
960         } else {
961                 rte_vhost_get_log_base(vid, &log_base, &log_size);
962                 rte_vfio_container_dma_map(internal->vfio_container_fd,
963                                 log_base, IFCVF_LOG_BASE, log_size);
964                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
965         }
966
967         return 0;
968 }
969
970 static int
971 ifcvf_get_vfio_group_fd(int vid)
972 {
973         int did;
974         struct internal_list *list;
975
976         did = rte_vhost_get_vdpa_device_id(vid);
977         list = find_internal_resource_by_did(did);
978         if (list == NULL) {
979                 DRV_LOG(ERR, "Invalid device id: %d", did);
980                 return -1;
981         }
982
983         return list->internal->vfio_group_fd;
984 }
985
986 static int
987 ifcvf_get_vfio_device_fd(int vid)
988 {
989         int did;
990         struct internal_list *list;
991
992         did = rte_vhost_get_vdpa_device_id(vid);
993         list = find_internal_resource_by_did(did);
994         if (list == NULL) {
995                 DRV_LOG(ERR, "Invalid device id: %d", did);
996                 return -1;
997         }
998
999         return list->internal->vfio_dev_fd;
1000 }
1001
1002 static int
1003 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1004 {
1005         int did;
1006         struct internal_list *list;
1007         struct ifcvf_internal *internal;
1008         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1009         int ret;
1010
1011         did = rte_vhost_get_vdpa_device_id(vid);
1012         list = find_internal_resource_by_did(did);
1013         if (list == NULL) {
1014                 DRV_LOG(ERR, "Invalid device id: %d", did);
1015                 return -1;
1016         }
1017
1018         internal = list->internal;
1019
1020         reg.index = ifcvf_get_notify_region(&internal->hw);
1021         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1022         if (ret) {
1023                 DRV_LOG(ERR, "Can not get device region info: %s",
1024                                 strerror(errno));
1025                 return -1;
1026         }
1027
1028         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1029         *size = 0x1000;
1030
1031         return 0;
1032 }
1033
1034 static int
1035 ifcvf_get_queue_num(int did, uint32_t *queue_num)
1036 {
1037         struct internal_list *list;
1038
1039         list = find_internal_resource_by_did(did);
1040         if (list == NULL) {
1041                 DRV_LOG(ERR, "Invalid device id: %d", did);
1042                 return -1;
1043         }
1044
1045         *queue_num = list->internal->max_queues;
1046
1047         return 0;
1048 }
1049
1050 static int
1051 ifcvf_get_vdpa_features(int did, uint64_t *features)
1052 {
1053         struct internal_list *list;
1054
1055         list = find_internal_resource_by_did(did);
1056         if (list == NULL) {
1057                 DRV_LOG(ERR, "Invalid device id: %d", did);
1058                 return -1;
1059         }
1060
1061         *features = list->internal->features;
1062
1063         return 0;
1064 }
1065
1066 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1067                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1068                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1069                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1070                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1071                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
1072 static int
1073 ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
1074 {
1075         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1076         return 0;
1077 }
1078
1079 static struct rte_vdpa_dev_ops ifcvf_ops = {
1080         .get_queue_num = ifcvf_get_queue_num,
1081         .get_features = ifcvf_get_vdpa_features,
1082         .get_protocol_features = ifcvf_get_protocol_features,
1083         .dev_conf = ifcvf_dev_config,
1084         .dev_close = ifcvf_dev_close,
1085         .set_vring_state = NULL,
1086         .set_features = ifcvf_set_features,
1087         .migration_done = NULL,
1088         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1089         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1090         .get_notify_area = ifcvf_get_notify_area,
1091 };
1092
1093 static inline int
1094 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1095 {
1096         uint16_t *n = extra_args;
1097
1098         if (value == NULL || extra_args == NULL)
1099                 return -EINVAL;
1100
1101         *n = (uint16_t)strtoul(value, NULL, 0);
1102         if (*n == USHRT_MAX && errno == ERANGE)
1103                 return -1;
1104
1105         return 0;
1106 }
1107
1108 static int
1109 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1110                 struct rte_pci_device *pci_dev)
1111 {
1112         uint64_t features;
1113         struct ifcvf_internal *internal = NULL;
1114         struct internal_list *list = NULL;
1115         int vdpa_mode = 0;
1116         int sw_fallback_lm = 0;
1117         struct rte_kvargs *kvlist = NULL;
1118         int ret = 0;
1119
1120         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1121                 return 0;
1122
1123         if (!pci_dev->device.devargs)
1124                 return 1;
1125
1126         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1127                         ifcvf_valid_arguments);
1128         if (kvlist == NULL)
1129                 return 1;
1130
1131         /* probe only when vdpa mode is specified */
1132         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1133                 rte_kvargs_free(kvlist);
1134                 return 1;
1135         }
1136
1137         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1138                         &vdpa_mode);
1139         if (ret < 0 || vdpa_mode == 0) {
1140                 rte_kvargs_free(kvlist);
1141                 return 1;
1142         }
1143
1144         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1145         if (list == NULL)
1146                 goto error;
1147
1148         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1149         if (internal == NULL)
1150                 goto error;
1151
1152         internal->pdev = pci_dev;
1153         rte_spinlock_init(&internal->lock);
1154
1155         if (ifcvf_vfio_setup(internal) < 0) {
1156                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1157                 goto error;
1158         }
1159
1160         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1161                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1162                 goto error;
1163         }
1164
1165         internal->max_queues = IFCVF_MAX_QUEUES;
1166         features = ifcvf_get_features(&internal->hw);
1167         internal->features = (features &
1168                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1169                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1170                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1171                 (1ULL << VIRTIO_NET_F_STATUS) |
1172                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1173                 (1ULL << VHOST_F_LOG_ALL);
1174
1175         internal->dev_addr.pci_addr = pci_dev->addr;
1176         internal->dev_addr.type = PCI_ADDR;
1177         list->internal = internal;
1178
1179         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1180                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1181                                 &open_int, &sw_fallback_lm);
1182                 if (ret < 0)
1183                         goto error;
1184         }
1185         internal->sw_lm = sw_fallback_lm;
1186
1187         internal->did = rte_vdpa_register_device(&internal->dev_addr,
1188                                 &ifcvf_ops);
1189         if (internal->did < 0) {
1190                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1191                 goto error;
1192         }
1193
1194         pthread_mutex_lock(&internal_list_lock);
1195         TAILQ_INSERT_TAIL(&internal_list, list, next);
1196         pthread_mutex_unlock(&internal_list_lock);
1197
1198         rte_atomic32_set(&internal->started, 1);
1199         update_datapath(internal);
1200
1201         rte_kvargs_free(kvlist);
1202         return 0;
1203
1204 error:
1205         rte_kvargs_free(kvlist);
1206         rte_free(list);
1207         rte_free(internal);
1208         return -1;
1209 }
1210
1211 static int
1212 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1213 {
1214         struct ifcvf_internal *internal;
1215         struct internal_list *list;
1216
1217         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1218                 return 0;
1219
1220         list = find_internal_resource_by_dev(pci_dev);
1221         if (list == NULL) {
1222                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1223                 return -1;
1224         }
1225
1226         internal = list->internal;
1227         rte_atomic32_set(&internal->started, 0);
1228         update_datapath(internal);
1229
1230         rte_pci_unmap_device(internal->pdev);
1231         rte_vfio_container_destroy(internal->vfio_container_fd);
1232         rte_vdpa_unregister_device(internal->did);
1233
1234         pthread_mutex_lock(&internal_list_lock);
1235         TAILQ_REMOVE(&internal_list, list, next);
1236         pthread_mutex_unlock(&internal_list_lock);
1237
1238         rte_free(list);
1239         rte_free(internal);
1240
1241         return 0;
1242 }
1243
1244 /*
1245  * IFCVF has the same vendor ID and device ID as virtio net PCI
1246  * device, with its specific subsystem vendor ID and device ID.
1247  */
1248 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1249         { .class_id = RTE_CLASS_ANY_ID,
1250           .vendor_id = IFCVF_VENDOR_ID,
1251           .device_id = IFCVF_DEVICE_ID,
1252           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1253           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1254         },
1255
1256         { .vendor_id = 0, /* sentinel */
1257         },
1258 };
1259
1260 static struct rte_pci_driver rte_ifcvf_vdpa = {
1261         .id_table = pci_id_ifcvf_map,
1262         .drv_flags = 0,
1263         .probe = ifcvf_pci_probe,
1264         .remove = ifcvf_pci_remove,
1265 };
1266
1267 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1268 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1269 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
1270
1271 RTE_INIT(ifcvf_vdpa_init_log)
1272 {
1273         ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
1274         if (ifcvf_vdpa_logtype >= 0)
1275                 rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
1276 }