net/ifc: do not relay for Tx queue
[dpdk.git] / drivers / net / ifc / ifcvf_vdpa.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>
#include <stdbool.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
                "IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_USED_RING_LEN(size) \
        ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)

#define IFCVF_VDPA_MODE         "vdpa"
#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"

static const char * const ifcvf_valid_arguments[] = {
        IFCVF_VDPA_MODE,
        IFCVF_SW_FALLBACK_LM,
        NULL
};

static int ifcvf_vdpa_logtype;
struct ifcvf_internal {
        struct rte_vdpa_dev_addr dev_addr;
        struct rte_pci_device *pdev;
        struct ifcvf_hw hw;
        int vfio_container_fd;
        int vfio_group_fd;
        int vfio_dev_fd;
        pthread_t tid;  /* thread for notify relay */
        int epfd;
        int vid;
        int did;
        uint16_t max_queues;
        uint64_t features;
        rte_atomic32_t started;
        rte_atomic32_t dev_attached;
        rte_atomic32_t running;
        rte_spinlock_t lock;
        bool sw_lm;
        bool sw_fallback_running;
        /* mediated vring for sw fallback */
        struct vring m_vring[IFCVF_MAX_QUEUES * 2];
        /* eventfd for used ring interrupt */
        int intr_fd[IFCVF_MAX_QUEUES * 2];
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct internal_list *
find_internal_resource_by_did(int did)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (did == list->internal->did) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (pdev == list->internal->pdev) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

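/* Create a VFIO container for the VF, bind its IOMMU group, map the PCI
 * device and cache the BAR addresses for later register access.
 */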
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
        struct rte_pci_device *dev = internal->pdev;
        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
        int iommu_group_num;
        int i;

        internal->vfio_dev_fd = -1;
        internal->vfio_group_fd = -1;
        internal->vfio_container_fd = -1;

        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
        rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
                        &iommu_group_num);

        internal->vfio_container_fd = rte_vfio_container_create();
        if (internal->vfio_container_fd < 0)
                return -1;

        internal->vfio_group_fd = rte_vfio_container_group_bind(
                        internal->vfio_container_fd, iommu_group_num);
        if (internal->vfio_group_fd < 0)
                goto err;

        if (rte_pci_map_device(dev))
                goto err;

        internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
                        i++) {
                internal->hw.mem_resource[i].addr =
                        internal->pdev->mem_resource[i].addr;
                internal->hw.mem_resource[i].phys_addr =
                        internal->pdev->mem_resource[i].phys_addr;
                internal->hw.mem_resource[i].len =
                        internal->pdev->mem_resource[i].len;
        }

        return 0;

err:
        rte_vfio_container_destroy(internal->vfio_container_fd);
        return -1;
}

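/* Map (do_map = 1) or unmap (do_map = 0) every guest memory region in the
 * VFIO container so the VF can DMA directly to and from guest buffers.
 */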
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
        uint32_t i;
        int ret;
        struct rte_vhost_memory *mem = NULL;
        int vfio_container_fd;

        ret = rte_vhost_get_mem_table(internal->vid, &mem);
        if (ret < 0) {
                DRV_LOG(ERR, "failed to get VM memory layout.");
                goto exit;
        }

        vfio_container_fd = internal->vfio_container_fd;

        for (i = 0; i < mem->nregions; i++) {
                struct rte_vhost_mem_region *reg;

                reg = &mem->regions[i];
                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
                        do_map ? "DMA map" : "DMA unmap", i,
                        reg->host_user_addr, reg->guest_phys_addr, reg->size);

                if (do_map) {
                        ret = rte_vfio_container_dma_map(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA map failed.");
                                goto exit;
                        }
                } else {
                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA unmap failed.");
                                goto exit;
                        }
                }
        }

exit:
        if (mem)
                free(mem);
        return ret;
}

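/* Translate a host virtual address into a guest physical address using the
 * vhost memory table; returns 0 when no region contains the address.
 */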
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
        struct rte_vhost_memory *mem = NULL;
        struct rte_vhost_mem_region *reg;
        uint32_t i;
        uint64_t gpa = 0;

        if (rte_vhost_get_mem_table(vid, &mem) < 0)
                goto exit;

        for (i = 0; i < mem->nregions; i++) {
                reg = &mem->regions[i];

                if (hva >= reg->host_user_addr &&
                                hva < reg->host_user_addr + reg->size) {
                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
                        break;
                }
        }

exit:
        if (mem)
                free(mem);
        return gpa;
}

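/* Program the guest physical addresses, sizes and last indexes of all vrings
 * into the VF and start the hardware datapath.
 */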
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        int i, nr_vring;
        int vid;
        struct rte_vhost_vring vq;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for used ring.");
                        return -1;
                }
                hw->vring[i].used = gpa;

                hw->vring[i].size = vq.size;
                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);
        }
        hw->nr_vring = i;

        return ifcvf_start_hw(&internal->hw);
}

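/* Stop the hardware datapath and write the vring indexes back to vhost; when
 * hardware dirty-page logging was enabled, disable it and mark the used rings
 * dirty, as the VF only logs packet buffers.
 */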
static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i;
        int vid;
        uint64_t features;
        uint64_t log_base, log_size;
        uint64_t len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++)
                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);

        if (internal->sw_lm)
                return;

        rte_vhost_get_negotiated_features(vid, &features);
        if (RTE_VHOST_NEED_LOG(features)) {
                ifcvf_disable_logging(hw);
                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                /*
                 * IFCVF marks dirty memory pages for only packet buffer,
                 * SW helps to mark the used ring as dirty after device stops.
                 */
                for (i = 0; i < hw->nr_vring; i++) {
                        len = IFCVF_USED_RING_LEN(hw->vring[i].size);
                        rte_vhost_log_used_vring(vid, i, 0, len);
                }
        }
}

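/* Enable MSI-X: vector 0 carries the device interrupt, the following vectors
 * carry the vring interrupts. When m_rx is true (used-ring relay for the sw
 * fallback), Rx queues get a local eventfd instead of the guest's callfd so
 * the relay thread can mediate their used-ring updates.
 */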
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;
        struct rte_vhost_vring vring;
        int fd;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = nr_vring + 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *)&irq_set->data;
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

        for (i = 0; i < nr_vring; i++)
                internal->intr_fd[i] = -1;

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
                if ((i & 1) == 0 && m_rx == true) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                DRV_LOG(ERR, "can't setup eventfd: %s",
                                        strerror(errno));
                                return -1;
                        }
                        internal->intr_fd[i] = fd;
                        fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
                }
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        nr_vring = rte_vhost_get_vring_num(internal->vid);
        for (i = 0; i < nr_vring; i++) {
                if (internal->intr_fd[i] >= 0)
                        close(internal->intr_fd[i]);
                internal->intr_fd[i] = -1;
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

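/* Notify relay thread for the direct datapath: wait on every vring's kickfd
 * with epoll and forward each guest kick to the VF's notify register.
 */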
static void *
notify_relay(void *arg)
{
        int i, kickfd, epfd, nfds = 0;
        uint32_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
        struct epoll_event ev;
        uint64_t buf;
        int nbytes;
        struct rte_vhost_vring vring;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct ifcvf_hw *hw = &internal->hw;

        q_num = rte_vhost_get_vring_num(internal->vid);

        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (;;) {
                nfds = epoll_wait(epfd, events, q_num, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait returned an error.");
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        qid = events[i].data.u32;
                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(kickfd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        ifcvf_notify_queue(hw, qid);
                }
        }

        return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, notify_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create notify relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

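/* Reconcile the datapath with the current state: when the port is started and
 * a vhost device is attached but the datapath is not running yet, map guest
 * memory, enable interrupts, start the VF and spawn the notify relay; on the
 * reverse transition, tear all of that down again.
 */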
static int
update_datapath(struct ifcvf_internal *internal)
{
        int ret;

        rte_spinlock_lock(&internal->lock);

        if (!rte_atomic32_read(&internal->running) &&
            (rte_atomic32_read(&internal->started) &&
             rte_atomic32_read(&internal->dev_attached))) {
                ret = ifcvf_dma_map(internal, 1);
                if (ret)
                        goto err;

                ret = vdpa_enable_vfio_intr(internal, 0);
                if (ret)
                        goto err;

                ret = vdpa_ifcvf_start(internal);
                if (ret)
                        goto err;

                ret = setup_notify_relay(internal);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 1);
        } else if (rte_atomic32_read(&internal->running) &&
                   (!rte_atomic32_read(&internal->started) ||
                    !rte_atomic32_read(&internal->dev_attached))) {
                ret = unset_notify_relay(internal);
                if (ret)
                        goto err;

                vdpa_ifcvf_stop(internal);

                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = ifcvf_dma_map(internal, 0);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 0);
        }

        rte_spinlock_unlock(&internal->lock);
        return 0;
err:
        rte_spinlock_unlock(&internal->lock);
        return ret;
}

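/* Start the VF for the sw live-migration fallback: a mediated vring is
 * allocated and DMA-mapped per queue. Tx queues keep using the guest's used
 * ring directly (no relay), while Rx used rings point into the mediated vring
 * so the relay thread can mediate their updates.
 */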
static int
m_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i, nr_vring;
        int vid, ret;
        struct rte_vhost_vring vq;
        void *vring_buf;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
                                PAGE_SIZE);
                vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                vring_init(&internal->m_vring[i], vq.size, vring_buf,
                                PAGE_SIZE);

                ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
                if (ret < 0) {
                        DRV_LOG(ERR, "mediated vring DMA map failed.");
                        goto error;
                }

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Fail to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                /* Direct I/O for Tx queue, relay for Rx queue */
                if (i & 1) {
                        gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                        if (gpa == 0) {
                                DRV_LOG(ERR, "Fail to get GPA for used ring.");
                                return -1;
                        }
                        hw->vring[i].used = gpa;
                } else {
                        hw->vring[i].used = m_vring_iova +
                                (char *)internal->m_vring[i].used -
                                (char *)internal->m_vring[i].desc;
                }

                hw->vring[i].size = vq.size;

                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);

                m_vring_iova += size;
        }
        hw->nr_vring = nr_vring;

        return ifcvf_start_hw(&internal->hw);

error:
        for (i = 0; i < nr_vring; i++)
                if (internal->m_vring[i].desc)
                        rte_free(internal->m_vring[i].desc);

        return -1;
}

static int
m_ifcvf_stop(struct ifcvf_internal *internal)
{
        int vid;
        uint32_t i;
        struct rte_vhost_vring vq;
        struct ifcvf_hw *hw = &internal->hw;
        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
        uint64_t size, len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                len = IFCVF_USED_RING_LEN(vq.size);
                rte_vhost_log_used_vring(vid, i, 0, len);

                size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
                                PAGE_SIZE);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                        (uint64_t)(uintptr_t)internal->m_vring[i].desc,
                        m_vring_iova, size);

                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);
                rte_free(internal->m_vring[i].desc);
                m_vring_iova += size;
        }

        return 0;
}

static void
update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
{
        rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
        rte_vhost_vring_call(internal->vid, qid);
}

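/* Vring relay thread for the sw fallback: forwards guest kicks to the VF and,
 * when the VF raises an Rx interrupt, relays used-ring entries from the
 * mediated vring to the guest's ring and interrupts the guest.
 */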
static void *
vring_relay(void *arg)
{
        int i, vid, epfd, fd, nfds;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct rte_vhost_vring vring;
        uint16_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 4];
        struct epoll_event ev;
        int nbytes;
        uint64_t buf;

        vid = internal->vid;
        q_num = rte_vhost_get_vring_num(vid);
        /* prepare the mediated vring */
        for (qid = 0; qid < q_num; qid++)
                rte_vhost_get_vring_base(vid, qid,
                                &internal->m_vring[qid].avail->idx,
                                &internal->m_vring[qid].used->idx);

        /* add notify fd and interrupt fd to epoll */
        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(vid, qid, &vring);
                ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (qid = 0; qid < q_num; qid += 2) {
                ev.events = EPOLLIN | EPOLLPRI;
                /* leave a flag to mark it's for interrupt */
                ev.data.u64 = 1 | qid << 1 |
                        (uint64_t)internal->intr_fd[qid] << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
                                < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        /* start relay with a first kick */
        for (qid = 0; qid < q_num; qid++)
                ifcvf_notify_queue(&internal->hw, qid);

        /* listen to the events and react accordingly */
        for (;;) {
                nfds = epoll_wait(epfd, events, q_num * 2, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait returned an error.");
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        fd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(fd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        qid = events[i].data.u32 >> 1;

                        if (events[i].data.u32 & 1)
                                update_used_ring(internal, qid);
                        else
                                ifcvf_notify_queue(&internal->hw, qid);
                }
        }

        return NULL;
}

static int
setup_vring_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, vring_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create ring relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_vring_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

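/* Switch from the pure hardware datapath to the sw-assisted one when live
 * migration starts: stop direct I/O, re-arm the interrupts for relay, restart
 * the VF with mediated Rx used rings and spawn the vring relay thread.
 */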
static int
ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
{
        int ret;
        int vid = internal->vid;

        /* stop the direct IO data path */
        unset_notify_relay(internal);
        vdpa_ifcvf_stop(internal);
        vdpa_disable_vfio_intr(internal);

        ret = rte_vhost_host_notifier_ctrl(vid, false);
        if (ret && ret != -ENOTSUP)
                goto error;

        /* set up interrupt for interrupt relay */
        ret = vdpa_enable_vfio_intr(internal, 1);
        if (ret)
                goto unmap;

        /* config the VF */
        ret = m_ifcvf_start(internal);
        if (ret)
                goto unset_intr;

        /* set up vring relay thread */
        ret = setup_vring_relay(internal);
        if (ret)
                goto stop_vf;

        rte_vhost_host_notifier_ctrl(vid, true);

        internal->sw_fallback_running = true;

        return 0;

stop_vf:
        m_ifcvf_stop(internal);
unset_intr:
        vdpa_disable_vfio_intr(internal);
unmap:
        ifcvf_dma_map(internal, 0);
error:
        return -1;
}

static int
ifcvf_dev_config(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        internal->vid = vid;
        rte_atomic32_set(&internal->dev_attached, 1);
        update_datapath(internal);

        if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
                DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

        return 0;
}

static int
ifcvf_dev_close(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;

        if (internal->sw_fallback_running) {
                /* unset ring relay */
                unset_vring_relay(internal);

                /* reset VF */
                m_ifcvf_stop(internal);

                /* remove interrupt setting */
                vdpa_disable_vfio_intr(internal);

                /* unset DMA map for guest memory */
                ifcvf_dma_map(internal, 0);

                internal->sw_fallback_running = false;
        } else {
                rte_atomic32_set(&internal->dev_attached, 0);
                update_datapath(internal);
        }

        return 0;
}

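/* Called when features are (re)negotiated. If dirty-page logging is requested
 * (live migration), either switch over to the sw fallback datapath or map the
 * log buffer and enable hardware logging.
 */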
static int
ifcvf_set_features(int vid)
{
        uint64_t features;
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        uint64_t log_base, log_size;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        rte_vhost_get_negotiated_features(vid, &features);

        if (!RTE_VHOST_NEED_LOG(features))
                return 0;

        if (internal->sw_lm) {
                ifcvf_sw_fallback_switchover(internal);
        } else {
                rte_vhost_get_log_base(vid, &log_base, &log_size);
                rte_vfio_container_dma_map(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
        }

        return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_dev_fd;
}

static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        int ret;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;

        reg.index = ifcvf_get_notify_region(&internal->hw);
        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
        if (ret) {
                DRV_LOG(ERR, "Can not get device region info: %s",
                                strerror(errno));
                return -1;
        }

        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
        *size = 0x1000;

        return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *queue_num = list->internal->max_queues;

        return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *features = list->internal->features;

        return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
        return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
        .get_queue_num = ifcvf_get_queue_num,
        .get_features = ifcvf_get_vdpa_features,
        .get_protocol_features = ifcvf_get_protocol_features,
        .dev_conf = ifcvf_dev_config,
        .dev_close = ifcvf_dev_close,
        .set_vring_state = NULL,
        .set_features = ifcvf_set_features,
        .migration_done = NULL,
        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
        .get_notify_area = ifcvf_get_notify_area,
};

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                struct rte_pci_device *pci_dev)
{
        uint64_t features;
        struct ifcvf_internal *internal = NULL;
        struct internal_list *list = NULL;
        int vdpa_mode = 0;
        int sw_fallback_lm = 0;
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
                        ifcvf_valid_arguments);
        if (kvlist == NULL)
                return 1;

        /* probe only when vdpa mode is specified */
        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
                        &vdpa_mode);
        if (ret < 0 || vdpa_mode == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
        if (list == NULL)
                goto error;

        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
        if (internal == NULL)
                goto error;

        internal->pdev = pci_dev;
        rte_spinlock_init(&internal->lock);

        if (ifcvf_vfio_setup(internal) < 0) {
                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
                goto error;
        }

        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
                goto error;
        }

        internal->max_queues = IFCVF_MAX_QUEUES;
        features = ifcvf_get_features(&internal->hw);
        internal->features = (features &
                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
                (1ULL << VIRTIO_NET_F_STATUS) |
                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
                (1ULL << VHOST_F_LOG_ALL);

        internal->dev_addr.pci_addr = pci_dev->addr;
        internal->dev_addr.type = PCI_ADDR;
        list->internal = internal;

        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
                                &open_int, &sw_fallback_lm);
                if (ret < 0)
                        goto error;
        }
        internal->sw_lm = sw_fallback_lm;

        internal->did = rte_vdpa_register_device(&internal->dev_addr,
                                &ifcvf_ops);
        if (internal->did < 0) {
                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
                goto error;
        }

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_atomic32_set(&internal->started, 1);
        update_datapath(internal);

        rte_kvargs_free(kvlist);
        return 0;

error:
        rte_kvargs_free(kvlist);
        rte_free(list);
        rte_free(internal);
        return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
        struct ifcvf_internal *internal;
        struct internal_list *list;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        list = find_internal_resource_by_dev(pci_dev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->started, 0);
        update_datapath(internal);

        rte_pci_unmap_device(internal->pdev);
        rte_vfio_container_destroy(internal->vfio_container_fd);
        rte_vdpa_unregister_device(internal->did);

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_free(list);
        rte_free(internal);

        return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as virtio net PCI
 * device, with its specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
        { .class_id = RTE_CLASS_ANY_ID,
          .vendor_id = IFCVF_VENDOR_ID,
          .device_id = IFCVF_DEVICE_ID,
          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
        },

        { .vendor_id = 0, /* sentinel */
        },
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
        .id_table = pci_id_ifcvf_map,
        .drv_flags = 0,
        .probe = ifcvf_pci_probe,
        .remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
        ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
        if (ifcvf_vdpa_logtype >= 0)
                rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}