net/ifc: add devargs pointer check
dpdk.git: drivers/net/ifc/ifcvf_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <sys/ioctl.h>
9 #include <sys/epoll.h>
10 #include <linux/virtio_net.h>
11 #include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
12
13 #include <rte_malloc.h>
14 #include <rte_memory.h>
15 #include <rte_bus_pci.h>
16 #include <rte_vhost.h>
17 #include <rte_vdpa.h>
18 #include <rte_vfio.h>
19 #include <rte_spinlock.h>
20 #include <rte_log.h>
21 #include <rte_kvargs.h>
22 #include <rte_devargs.h>
23
24 #include "base/ifcvf.h"
25
26 #define DRV_LOG(level, fmt, args...) \
27         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
28                 "IFCVF %s(): " fmt "\n", __func__, ##args)
29
30 #ifndef PAGE_SIZE
31 #define PAGE_SIZE 4096
32 #endif
33
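/*
 * Length in bytes of a used ring with @size entries: the used elements
 * plus the flags, idx and avail event fields (3 x uint16_t).
 */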
34 #define IFCVF_USED_RING_LEN(size) \
35         ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
36
37 #define IFCVF_VDPA_MODE         "vdpa"
38 #define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
39
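/*
 * Devargs accepted by this driver: "vdpa=1" requests that the device be
 * probed as a vDPA device, "sw-live-migration=1" selects the software
 * relay fallback for live migration instead of HW dirty page logging.
 */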
40 static const char * const ifcvf_valid_arguments[] = {
41         IFCVF_VDPA_MODE,
42         IFCVF_SW_FALLBACK_LM,
43         NULL
44 };
45
46 static int ifcvf_vdpa_logtype;
47
48 struct ifcvf_internal {
49         struct rte_vdpa_dev_addr dev_addr;
50         struct rte_pci_device *pdev;
51         struct ifcvf_hw hw;
52         int vfio_container_fd;
53         int vfio_group_fd;
54         int vfio_dev_fd;
55         pthread_t tid;  /* thread for notify relay */
56         int epfd;
57         int vid;
58         int did;
59         uint16_t max_queues;
60         uint64_t features;
61         rte_atomic32_t started;
62         rte_atomic32_t dev_attached;
63         rte_atomic32_t running;
64         rte_spinlock_t lock;
65         bool sw_lm;
66         bool sw_fallback_running;
67         /* mediated vring for sw fallback */
68         struct vring m_vring[IFCVF_MAX_QUEUES * 2];
69         /* eventfd for used ring interrupt */
70         int intr_fd[IFCVF_MAX_QUEUES * 2];
71 };
72
73 struct internal_list {
74         TAILQ_ENTRY(internal_list) next;
75         struct ifcvf_internal *internal;
76 };
77
78 TAILQ_HEAD(internal_list_head, internal_list);
79 static struct internal_list_head internal_list =
80         TAILQ_HEAD_INITIALIZER(internal_list);
81
82 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
83
84 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
85
86 static struct internal_list *
87 find_internal_resource_by_did(int did)
88 {
89         int found = 0;
90         struct internal_list *list;
91
92         pthread_mutex_lock(&internal_list_lock);
93
94         TAILQ_FOREACH(list, &internal_list, next) {
95                 if (did == list->internal->did) {
96                         found = 1;
97                         break;
98                 }
99         }
100
101         pthread_mutex_unlock(&internal_list_lock);
102
103         if (!found)
104                 return NULL;
105
106         return list;
107 }
108
109 static struct internal_list *
110 find_internal_resource_by_dev(struct rte_pci_device *pdev)
111 {
112         int found = 0;
113         struct internal_list *list;
114
115         pthread_mutex_lock(&internal_list_lock);
116
117         TAILQ_FOREACH(list, &internal_list, next) {
118                 if (pdev == list->internal->pdev) {
119                         found = 1;
120                         break;
121                 }
122         }
123
124         pthread_mutex_unlock(&internal_list_lock);
125
126         if (!found)
127                 return NULL;
128
129         return list;
130 }
131
132 static int
133 ifcvf_vfio_setup(struct ifcvf_internal *internal)
134 {
135         struct rte_pci_device *dev = internal->pdev;
136         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
137         int iommu_group_num;
138         int i;
139
140         internal->vfio_dev_fd = -1;
141         internal->vfio_group_fd = -1;
142         internal->vfio_container_fd = -1;
143
144         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
145         rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
146                         &iommu_group_num);
147
148         internal->vfio_container_fd = rte_vfio_container_create();
149         if (internal->vfio_container_fd < 0)
150                 return -1;
151
152         internal->vfio_group_fd = rte_vfio_container_group_bind(
153                         internal->vfio_container_fd, iommu_group_num);
154         if (internal->vfio_group_fd < 0)
155                 goto err;
156
157         if (rte_pci_map_device(dev))
158                 goto err;
159
160         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
161
162         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
163                         i++) {
164                 internal->hw.mem_resource[i].addr =
165                         internal->pdev->mem_resource[i].addr;
166                 internal->hw.mem_resource[i].phys_addr =
167                         internal->pdev->mem_resource[i].phys_addr;
168                 internal->hw.mem_resource[i].len =
169                         internal->pdev->mem_resource[i].len;
170         }
171
172         return 0;
173
174 err:
175         rte_vfio_container_destroy(internal->vfio_container_fd);
176         return -1;
177 }
178
179 static int
180 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
181 {
182         uint32_t i;
183         int ret;
184         struct rte_vhost_memory *mem = NULL;
185         int vfio_container_fd;
186
187         ret = rte_vhost_get_mem_table(internal->vid, &mem);
188         if (ret < 0) {
189                 DRV_LOG(ERR, "failed to get VM memory layout.");
190                 goto exit;
191         }
192
193         vfio_container_fd = internal->vfio_container_fd;
194
195         for (i = 0; i < mem->nregions; i++) {
196                 struct rte_vhost_mem_region *reg;
197
198                 reg = &mem->regions[i];
199                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
200                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
201                         do_map ? "DMA map" : "DMA unmap", i,
202                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
203
204                 if (do_map) {
205                         ret = rte_vfio_container_dma_map(vfio_container_fd,
206                                 reg->host_user_addr, reg->guest_phys_addr,
207                                 reg->size);
208                         if (ret < 0) {
209                                 DRV_LOG(ERR, "DMA map failed.");
210                                 goto exit;
211                         }
212                 } else {
213                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
214                                 reg->host_user_addr, reg->guest_phys_addr,
215                                 reg->size);
216                         if (ret < 0) {
217                                 DRV_LOG(ERR, "DMA unmap failed.");
218                                 goto exit;
219                         }
220                 }
221         }
222
223 exit:
224         if (mem)
225                 free(mem);
226         return ret;
227 }
228
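/*
 * Translate a host virtual address to a guest physical address by walking
 * the vhost memory regions; returns 0 if the address is not covered.
 */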
229 static uint64_t
230 hva_to_gpa(int vid, uint64_t hva)
231 {
232         struct rte_vhost_memory *mem = NULL;
233         struct rte_vhost_mem_region *reg;
234         uint32_t i;
235         uint64_t gpa = 0;
236
237         if (rte_vhost_get_mem_table(vid, &mem) < 0)
238                 goto exit;
239
240         for (i = 0; i < mem->nregions; i++) {
241                 reg = &mem->regions[i];
242
243                 if (hva >= reg->host_user_addr &&
244                                 hva < reg->host_user_addr + reg->size) {
245                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
246                         break;
247                 }
248         }
249
250 exit:
251         if (mem)
252                 free(mem);
253         return gpa;
254 }
255
256 static int
257 vdpa_ifcvf_start(struct ifcvf_internal *internal)
258 {
259         struct ifcvf_hw *hw = &internal->hw;
260         int i, nr_vring;
261         int vid;
262         struct rte_vhost_vring vq;
263         uint64_t gpa;
264
265         vid = internal->vid;
266         nr_vring = rte_vhost_get_vring_num(vid);
267         rte_vhost_get_negotiated_features(vid, &hw->req_features);
268
269         for (i = 0; i < nr_vring; i++) {
270                 rte_vhost_get_vhost_vring(vid, i, &vq);
271                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
272                 if (gpa == 0) {
273                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
274                         return -1;
275                 }
276                 hw->vring[i].desc = gpa;
277
278                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
279                 if (gpa == 0) {
280                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
281                         return -1;
282                 }
283                 hw->vring[i].avail = gpa;
284
285                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
286                 if (gpa == 0) {
287                         DRV_LOG(ERR, "Failed to get GPA for used ring.");
288                         return -1;
289                 }
290                 hw->vring[i].used = gpa;
291
292                 hw->vring[i].size = vq.size;
293                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
294                                 &hw->vring[i].last_used_idx);
295         }
296         hw->nr_vring = i;
297
298         return ifcvf_start_hw(&internal->hw);
299 }
300
301 static void
302 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
303 {
304         struct ifcvf_hw *hw = &internal->hw;
305         uint32_t i;
306         int vid;
307         uint64_t features;
308         uint64_t log_base, log_size;
309         uint64_t len;
310
311         vid = internal->vid;
312         ifcvf_stop_hw(hw);
313
314         for (i = 0; i < hw->nr_vring; i++)
315                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
316                                 hw->vring[i].last_used_idx);
317
318         if (internal->sw_lm)
319                 return;
320
321         rte_vhost_get_negotiated_features(vid, &features);
322         if (RTE_VHOST_NEED_LOG(features)) {
323                 ifcvf_disable_logging(hw);
324                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
325                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
326                                 log_base, IFCVF_LOG_BASE, log_size);
327                 /*
328                  * IFCVF only logs dirty pages touched by packet buffers;
329                  * software marks the used rings dirty after the device stops.
330                  */
331                 for (i = 0; i < hw->nr_vring; i++) {
332                         len = IFCVF_USED_RING_LEN(hw->vring[i].size);
333                         rte_vhost_log_used_vring(vid, i, 0, len);
334                 }
335         }
336 }
337
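/* irq_set buffer: one eventfd per vring plus one for the config/misc vector */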
338 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
339                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
340 static int
341 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
342 {
343         int ret;
344         uint32_t i, nr_vring;
345         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
346         struct vfio_irq_set *irq_set;
347         int *fd_ptr;
348         struct rte_vhost_vring vring;
349         int fd;
350
351         nr_vring = rte_vhost_get_vring_num(internal->vid);
352
353         irq_set = (struct vfio_irq_set *)irq_set_buf;
354         irq_set->argsz = sizeof(irq_set_buf);
355         irq_set->count = nr_vring + 1;
356         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
357                          VFIO_IRQ_SET_ACTION_TRIGGER;
358         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
359         irq_set->start = 0;
360         fd_ptr = (int *)&irq_set->data;
361         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
362
363         for (i = 0; i < nr_vring; i++)
364                 internal->intr_fd[i] = -1;
365
366         for (i = 0; i < nr_vring; i++) {
367                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
368                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
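                /*
                 * When relaying used rings (m_rx), route the interrupt of
                 * each Rx queue (even index) to a local eventfd instead of
                 * the guest's callfd, so the vring relay thread can sync
                 * the mediated used ring before the guest is notified.
                 */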
369                 if ((i & 1) == 0 && m_rx == true) {
370                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
371                         if (fd < 0) {
372                                 DRV_LOG(ERR, "can't setup eventfd: %s",
373                                         strerror(errno));
374                                 return -1;
375                         }
376                         internal->intr_fd[i] = fd;
377                         fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
378                 }
379         }
380
381         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
382         if (ret) {
383                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
384                                 strerror(errno));
385                 return -1;
386         }
387
388         return 0;
389 }
390
391 static int
392 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
393 {
394         int ret;
395         uint32_t i, nr_vring;
396         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
397         struct vfio_irq_set *irq_set;
398
399         irq_set = (struct vfio_irq_set *)irq_set_buf;
400         irq_set->argsz = sizeof(irq_set_buf);
401         irq_set->count = 0;
402         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
403         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
404         irq_set->start = 0;
405
406         nr_vring = rte_vhost_get_vring_num(internal->vid);
407         for (i = 0; i < nr_vring; i++) {
408                 if (internal->intr_fd[i] >= 0)
409                         close(internal->intr_fd[i]);
410                 internal->intr_fd[i] = -1;
411         }
412
413         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
414         if (ret) {
415                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
416                                 strerror(errno));
417                 return -1;
418         }
419
420         return 0;
421 }
422
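/*
 * Notify relay thread for the HW datapath: waits on every vring's kickfd
 * and forwards guest kicks to the VF's queue notify registers.
 */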
423 static void *
424 notify_relay(void *arg)
425 {
426         int i, kickfd, epfd, nfds = 0;
427         uint32_t qid, q_num;
428         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
429         struct epoll_event ev;
430         uint64_t buf;
431         int nbytes;
432         struct rte_vhost_vring vring;
433         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
434         struct ifcvf_hw *hw = &internal->hw;
435
436         q_num = rte_vhost_get_vring_num(internal->vid);
437
438         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
439         if (epfd < 0) {
440                 DRV_LOG(ERR, "failed to create epoll instance.");
441                 return NULL;
442         }
443         internal->epfd = epfd;
444
445         for (qid = 0; qid < q_num; qid++) {
446                 ev.events = EPOLLIN | EPOLLPRI;
447                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
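                /* low 32 bits carry the queue id, high 32 bits the kickfd */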
448                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
449                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
450                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
451                         return NULL;
452                 }
453         }
454
455         for (;;) {
456                 nfds = epoll_wait(epfd, events, q_num, -1);
457                 if (nfds < 0) {
458                         if (errno == EINTR)
459                                 continue;
460                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
461                         return NULL;
462                 }
463
464                 for (i = 0; i < nfds; i++) {
465                         qid = events[i].data.u32;
466                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
467                         do {
468                                 nbytes = read(kickfd, &buf, 8);
469                                 if (nbytes < 0) {
470                                         if (errno == EINTR ||
471                                             errno == EWOULDBLOCK ||
472                                             errno == EAGAIN)
473                                                 continue;
474                                         DRV_LOG(INFO, "Error reading "
475                                                 "kickfd: %s",
476                                                 strerror(errno));
477                                 }
478                                 break;
479                         } while (1);
480
481                         ifcvf_notify_queue(hw, qid);
482                 }
483         }
484
485         return NULL;
486 }
487
488 static int
489 setup_notify_relay(struct ifcvf_internal *internal)
490 {
491         int ret;
492
493         ret = pthread_create(&internal->tid, NULL, notify_relay,
494                         (void *)internal);
495         if (ret) {
496                 DRV_LOG(ERR, "failed to create notify relay pthread.");
497                 return -1;
498         }
499         return 0;
500 }
501
502 static int
503 unset_notify_relay(struct ifcvf_internal *internal)
504 {
505         void *status;
506
507         if (internal->tid) {
508                 pthread_cancel(internal->tid);
509                 pthread_join(internal->tid, &status);
510         }
511         internal->tid = 0;
512
513         if (internal->epfd >= 0)
514                 close(internal->epfd);
515         internal->epfd = -1;
516
517         return 0;
518 }
519
520 static int
521 update_datapath(struct ifcvf_internal *internal)
522 {
523         int ret;
524
525         rte_spinlock_lock(&internal->lock);
526
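        /*
         * Start the datapath once the driver is started and a vhost device
         * is attached; tear it down as soon as either condition is cleared
         * while the VF is still running.
         */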
527         if (!rte_atomic32_read(&internal->running) &&
528             (rte_atomic32_read(&internal->started) &&
529              rte_atomic32_read(&internal->dev_attached))) {
530                 ret = ifcvf_dma_map(internal, 1);
531                 if (ret)
532                         goto err;
533
534                 ret = vdpa_enable_vfio_intr(internal, false);
535                 if (ret)
536                         goto err;
537
538                 ret = vdpa_ifcvf_start(internal);
539                 if (ret)
540                         goto err;
541
542                 ret = setup_notify_relay(internal);
543                 if (ret)
544                         goto err;
545
546                 rte_atomic32_set(&internal->running, 1);
547         } else if (rte_atomic32_read(&internal->running) &&
548                    (!rte_atomic32_read(&internal->started) ||
549                     !rte_atomic32_read(&internal->dev_attached))) {
550                 ret = unset_notify_relay(internal);
551                 if (ret)
552                         goto err;
553
554                 vdpa_ifcvf_stop(internal);
555
556                 ret = vdpa_disable_vfio_intr(internal);
557                 if (ret)
558                         goto err;
559
560                 ret = ifcvf_dma_map(internal, 0);
561                 if (ret)
562                         goto err;
563
564                 rte_atomic32_set(&internal->running, 0);
565         }
566
567         rte_spinlock_unlock(&internal->lock);
568         return 0;
569 err:
570         rte_spinlock_unlock(&internal->lock);
571         return ret;
572 }
573
574 static int
575 m_ifcvf_start(struct ifcvf_internal *internal)
576 {
577         struct ifcvf_hw *hw = &internal->hw;
578         uint32_t i, nr_vring;
579         int vid, ret;
580         struct rte_vhost_vring vq;
581         void *vring_buf;
582         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
583         uint64_t size;
584         uint64_t gpa;
585
586         vid = internal->vid;
587         nr_vring = rte_vhost_get_vring_num(vid);
588         rte_vhost_get_negotiated_features(vid, &hw->req_features);
589
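        /*
         * A mediated vring is allocated for every queue and mapped into the
         * VF at the reserved IFCVF_MEDIATED_VRING IOVA range. Rx queues
         * (even indices) have their used ring pointed at the mediated copy,
         * which the vring relay thread syncs back to the guest so that
         * dirty pages get logged.
         */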
590         for (i = 0; i < nr_vring; i++) {
591                 rte_vhost_get_vhost_vring(vid, i, &vq);
592
593                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
594                                 PAGE_SIZE);
595                 vring_buf = rte_zmalloc("ifcvf", size, PAGE_SIZE);
                if (vring_buf == NULL) {
                        DRV_LOG(ERR, "Failed to allocate mediated vring.");
                        goto error;
                }
596                 vring_init(&internal->m_vring[i], vq.size, vring_buf,
597                                 PAGE_SIZE);
598
599                 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
600                         (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
601                 if (ret < 0) {
602                         DRV_LOG(ERR, "mediated vring DMA map failed.");
603                         goto error;
604                 }
605
606                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
607                 if (gpa == 0) {
608                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
609                         return -1;
610                 }
611                 hw->vring[i].desc = gpa;
612
613                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
614                 if (gpa == 0) {
615                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
616                         return -1;
617                 }
618                 hw->vring[i].avail = gpa;
619
620                 /* Direct I/O for Tx queue, relay for Rx queue */
621                 if (i & 1) {
622                         gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
623                         if (gpa == 0) {
624                                 DRV_LOG(ERR, "Failed to get GPA for used ring.");
625                                 return -1;
626                         }
627                         hw->vring[i].used = gpa;
628                 } else {
629                         hw->vring[i].used = m_vring_iova +
630                                 (char *)internal->m_vring[i].used -
631                                 (char *)internal->m_vring[i].desc;
632                 }
633
634                 hw->vring[i].size = vq.size;
635
636                 rte_vhost_get_vring_base(vid, i,
637                                 &internal->m_vring[i].avail->idx,
638                                 &internal->m_vring[i].used->idx);
639
640                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
641                                 &hw->vring[i].last_used_idx);
642
643                 m_vring_iova += size;
644         }
645         hw->nr_vring = nr_vring;
646
647         return ifcvf_start_hw(&internal->hw);
648
649 error:
650         for (i = 0; i < nr_vring; i++)
651                 if (internal->m_vring[i].desc)
652                         rte_free(internal->m_vring[i].desc);
653
654         return -1;
655 }
656
657 static int
658 m_ifcvf_stop(struct ifcvf_internal *internal)
659 {
660         int vid;
661         uint32_t i;
662         struct rte_vhost_vring vq;
663         struct ifcvf_hw *hw = &internal->hw;
664         uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
665         uint64_t size, len;
666
667         vid = internal->vid;
668         ifcvf_stop_hw(hw);
669
670         for (i = 0; i < hw->nr_vring; i++) {
671                 /* synchronize remaining new used entries if any */
672                 if ((i & 1) == 0)
673                         update_used_ring(internal, i);
674
675                 rte_vhost_get_vhost_vring(vid, i, &vq);
676                 len = IFCVF_USED_RING_LEN(vq.size);
677                 rte_vhost_log_used_vring(vid, i, 0, len);
678
679                 size = RTE_ALIGN_CEIL(vring_size(vq.size, PAGE_SIZE),
680                                 PAGE_SIZE);
681                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
682                         (uint64_t)(uintptr_t)internal->m_vring[i].desc,
683                         m_vring_iova, size);
684
685                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
686                                 hw->vring[i].last_used_idx);
687                 rte_free(internal->m_vring[i].desc);
688                 m_vring_iova += size;
689         }
690
691         return 0;
692 }
693
694 static void
695 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
696 {
697         rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
698         rte_vhost_vring_call(internal->vid, qid);
699 }
700
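/*
 * Relay thread for the sw fallback datapath: forwards guest kicks to the
 * VF and, on a used ring interrupt from an Rx queue, copies new entries
 * from the mediated used ring to the guest and calls the guest's callfd.
 */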
701 static void *
702 vring_relay(void *arg)
703 {
704         int i, vid, epfd, fd, nfds;
705         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
706         struct rte_vhost_vring vring;
707         uint16_t qid, q_num;
708         struct epoll_event events[IFCVF_MAX_QUEUES * 4];
709         struct epoll_event ev;
710         int nbytes;
711         uint64_t buf;
712
713         vid = internal->vid;
714         q_num = rte_vhost_get_vring_num(vid);
715
716         /* add notify fd and interrupt fd to epoll */
717         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
718         if (epfd < 0) {
719                 DRV_LOG(ERR, "failed to create epoll instance.");
720                 return NULL;
721         }
722         internal->epfd = epfd;
723
724         for (qid = 0; qid < q_num; qid++) {
725                 ev.events = EPOLLIN | EPOLLPRI;
726                 rte_vhost_get_vhost_vring(vid, qid, &vring);
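                /* bit 0 clear: kick event; bits 1-31: queue id; fd in high 32 bits */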
727                 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
728                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
729                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
730                         return NULL;
731                 }
732         }
733
734         for (qid = 0; qid < q_num; qid += 2) {
735                 ev.events = EPOLLIN | EPOLLPRI;
736                 /* set bit 0 to mark this entry as an interrupt event */
737                 ev.data.u64 = 1 | qid << 1 |
738                         (uint64_t)internal->intr_fd[qid] << 32;
739                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
740                                 < 0) {
741                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
742                         return NULL;
743                 }
744                 update_used_ring(internal, qid);
745         }
746
747         /* start relay with a first kick */
748         for (qid = 0; qid < q_num; qid++)
749                 ifcvf_notify_queue(&internal->hw, qid);
750
751         /* listen to the events and react accordingly */
752         for (;;) {
753                 nfds = epoll_wait(epfd, events, q_num * 2, -1);
754                 if (nfds < 0) {
755                         if (errno == EINTR)
756                                 continue;
757                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
758                         return NULL;
759                 }
760
761                 for (i = 0; i < nfds; i++) {
762                         fd = (uint32_t)(events[i].data.u64 >> 32);
763                         do {
764                                 nbytes = read(fd, &buf, 8);
765                                 if (nbytes < 0) {
766                                         if (errno == EINTR ||
767                                             errno == EWOULDBLOCK ||
768                                             errno == EAGAIN)
769                                                 continue;
770                                         DRV_LOG(INFO, "Error reading "
771                                                 "kickfd: %s",
772                                                 strerror(errno));
773                                 }
774                                 break;
775                         } while (1);
776
777                         qid = events[i].data.u32 >> 1;
778
779                         if (events[i].data.u32 & 1)
780                                 update_used_ring(internal, qid);
781                         else
782                                 ifcvf_notify_queue(&internal->hw, qid);
783                 }
784         }
785
786         return NULL;
787 }
788
789 static int
790 setup_vring_relay(struct ifcvf_internal *internal)
791 {
792         int ret;
793
794         ret = pthread_create(&internal->tid, NULL, vring_relay,
795                         (void *)internal);
796         if (ret) {
797                 DRV_LOG(ERR, "failed to create ring relay pthread.");
798                 return -1;
799         }
800         return 0;
801 }
802
803 static int
804 unset_vring_relay(struct ifcvf_internal *internal)
805 {
806         void *status;
807
808         if (internal->tid) {
809                 pthread_cancel(internal->tid);
810                 pthread_join(internal->tid, &status);
811         }
812         internal->tid = 0;
813
814         if (internal->epfd >= 0)
815                 close(internal->epfd);
816         internal->epfd = -1;
817
818         return 0;
819 }
820
821 static int
822 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
823 {
824         int ret;
825         int vid = internal->vid;
826
827         /* stop the direct IO data path */
828         unset_notify_relay(internal);
829         vdpa_ifcvf_stop(internal);
830         vdpa_disable_vfio_intr(internal);
831
832         ret = rte_vhost_host_notifier_ctrl(vid, false);
833         if (ret && ret != -ENOTSUP)
834                 goto error;
835
836         /* set up interrupt for interrupt relay */
837         ret = vdpa_enable_vfio_intr(internal, true);
838         if (ret)
839                 goto unmap;
840
841         /* config the VF */
842         ret = m_ifcvf_start(internal);
843         if (ret)
844                 goto unset_intr;
845
846         /* set up vring relay thread */
847         ret = setup_vring_relay(internal);
848         if (ret)
849                 goto stop_vf;
850
851         rte_vhost_host_notifier_ctrl(vid, true);
852
853         internal->sw_fallback_running = true;
854
855         return 0;
856
857 stop_vf:
858         m_ifcvf_stop(internal);
859 unset_intr:
860         vdpa_disable_vfio_intr(internal);
861 unmap:
862         ifcvf_dma_map(internal, 0);
863 error:
864         return -1;
865 }
866
867 static int
868 ifcvf_dev_config(int vid)
869 {
870         int did;
871         struct internal_list *list;
872         struct ifcvf_internal *internal;
873
874         did = rte_vhost_get_vdpa_device_id(vid);
875         list = find_internal_resource_by_did(did);
876         if (list == NULL) {
877                 DRV_LOG(ERR, "Invalid device id: %d", did);
878                 return -1;
879         }
880
881         internal = list->internal;
882         internal->vid = vid;
883         rte_atomic32_set(&internal->dev_attached, 1);
884         update_datapath(internal);
885
886         if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
887                 DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);
888
889         return 0;
890 }
891
892 static int
893 ifcvf_dev_close(int vid)
894 {
895         int did;
896         struct internal_list *list;
897         struct ifcvf_internal *internal;
898
899         did = rte_vhost_get_vdpa_device_id(vid);
900         list = find_internal_resource_by_did(did);
901         if (list == NULL) {
902                 DRV_LOG(ERR, "Invalid device id: %d", did);
903                 return -1;
904         }
905
906         internal = list->internal;
907
908         if (internal->sw_fallback_running) {
909                 /* unset ring relay */
910                 unset_vring_relay(internal);
911
912                 /* reset VF */
913                 m_ifcvf_stop(internal);
914
915                 /* remove interrupt setting */
916                 vdpa_disable_vfio_intr(internal);
917
918                 /* unset DMA map for guest memory */
919                 ifcvf_dma_map(internal, 0);
920
921                 internal->sw_fallback_running = false;
922         } else {
923                 rte_atomic32_set(&internal->dev_attached, 0);
924                 update_datapath(internal);
925         }
926
927         return 0;
928 }
929
930 static int
931 ifcvf_set_features(int vid)
932 {
933         uint64_t features;
934         int did;
935         struct internal_list *list;
936         struct ifcvf_internal *internal;
937         uint64_t log_base, log_size;
938
939         did = rte_vhost_get_vdpa_device_id(vid);
940         list = find_internal_resource_by_did(did);
941         if (list == NULL) {
942                 DRV_LOG(ERR, "Invalid device id: %d", did);
943                 return -1;
944         }
945
946         internal = list->internal;
947         rte_vhost_get_negotiated_features(vid, &features);
948
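        /*
         * Dirty page logging was negotiated for live migration: either
         * switch over to the software relay datapath, or map the vhost
         * log region and let the VF log dirty pages itself.
         */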
949         if (!RTE_VHOST_NEED_LOG(features))
950                 return 0;
951
952         if (internal->sw_lm) {
953                 ifcvf_sw_fallback_switchover(internal);
954         } else {
955                 rte_vhost_get_log_base(vid, &log_base, &log_size);
956                 rte_vfio_container_dma_map(internal->vfio_container_fd,
957                                 log_base, IFCVF_LOG_BASE, log_size);
958                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
959         }
960
961         return 0;
962 }
963
964 static int
965 ifcvf_get_vfio_group_fd(int vid)
966 {
967         int did;
968         struct internal_list *list;
969
970         did = rte_vhost_get_vdpa_device_id(vid);
971         list = find_internal_resource_by_did(did);
972         if (list == NULL) {
973                 DRV_LOG(ERR, "Invalid device id: %d", did);
974                 return -1;
975         }
976
977         return list->internal->vfio_group_fd;
978 }
979
980 static int
981 ifcvf_get_vfio_device_fd(int vid)
982 {
983         int did;
984         struct internal_list *list;
985
986         did = rte_vhost_get_vdpa_device_id(vid);
987         list = find_internal_resource_by_did(did);
988         if (list == NULL) {
989                 DRV_LOG(ERR, "Invalid device id: %d", did);
990                 return -1;
991         }
992
993         return list->internal->vfio_dev_fd;
994 }
995
996 static int
997 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
998 {
999         int did;
1000         struct internal_list *list;
1001         struct ifcvf_internal *internal;
1002         struct vfio_region_info reg = { .argsz = sizeof(reg) };
1003         int ret;
1004
1005         did = rte_vhost_get_vdpa_device_id(vid);
1006         list = find_internal_resource_by_did(did);
1007         if (list == NULL) {
1008                 DRV_LOG(ERR, "Invalid device id: %d", did);
1009                 return -1;
1010         }
1011
1012         internal = list->internal;
1013
1014         reg.index = ifcvf_get_notify_region(&internal->hw);
1015         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1016         if (ret) {
1017                 DRV_LOG(ERR, "Failed to get device region info: %s",
1018                                 strerror(errno));
1019                 return -1;
1020         }
1021
1022         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1023         *size = 0x1000;
1024
1025         return 0;
1026 }
1027
1028 static int
1029 ifcvf_get_queue_num(int did, uint32_t *queue_num)
1030 {
1031         struct internal_list *list;
1032
1033         list = find_internal_resource_by_did(did);
1034         if (list == NULL) {
1035                 DRV_LOG(ERR, "Invalid device id: %d", did);
1036                 return -1;
1037         }
1038
1039         *queue_num = list->internal->max_queues;
1040
1041         return 0;
1042 }
1043
1044 static int
1045 ifcvf_get_vdpa_features(int did, uint64_t *features)
1046 {
1047         struct internal_list *list;
1048
1049         list = find_internal_resource_by_did(did);
1050         if (list == NULL) {
1051                 DRV_LOG(ERR, "Invalid device id: %d", did);
1052                 return -1;
1053         }
1054
1055         *features = list->internal->features;
1056
1057         return 0;
1058 }
1059
1060 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1061                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1062                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1063                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1064                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1065                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
1066 static int
1067 ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
1068 {
1069         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1070         return 0;
1071 }
1072
1073 static struct rte_vdpa_dev_ops ifcvf_ops = {
1074         .get_queue_num = ifcvf_get_queue_num,
1075         .get_features = ifcvf_get_vdpa_features,
1076         .get_protocol_features = ifcvf_get_protocol_features,
1077         .dev_conf = ifcvf_dev_config,
1078         .dev_close = ifcvf_dev_close,
1079         .set_vring_state = NULL,
1080         .set_features = ifcvf_set_features,
1081         .migration_done = NULL,
1082         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1083         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1084         .get_notify_area = ifcvf_get_notify_area,
1085 };
1086
1087 static inline int
1088 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1089 {
1090         uint16_t *n = extra_args;
1091
1092         if (value == NULL || extra_args == NULL)
1093                 return -EINVAL;
1094
        errno = 0;
1095         *n = (uint16_t)strtoul(value, NULL, 0);
1096         if (*n == USHRT_MAX && errno == ERANGE)
1097                 return -1;
1098
1099         return 0;
1100 }
1101
1102 static int
1103 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1104                 struct rte_pci_device *pci_dev)
1105 {
1106         uint64_t features;
1107         struct ifcvf_internal *internal = NULL;
1108         struct internal_list *list = NULL;
1109         int vdpa_mode = 0;
1110         int sw_fallback_lm = 0;
1111         struct rte_kvargs *kvlist = NULL;
1112         int ret = 0;
1113
1114         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1115                 return 0;
1116
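        /*
         * A device bound without devargs cannot ask for vdpa mode; return a
         * positive value so the probe is declined without raising an error.
         */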
1117         if (!pci_dev->device.devargs)
1118                 return 1;
1119
1120         kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1121                         ifcvf_valid_arguments);
1122         if (kvlist == NULL)
1123                 return 1;
1124
1125         /* probe only when vdpa mode is specified */
1126         if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1127                 rte_kvargs_free(kvlist);
1128                 return 1;
1129         }
1130
1131         ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1132                         &vdpa_mode);
1133         if (ret < 0 || vdpa_mode == 0) {
1134                 rte_kvargs_free(kvlist);
1135                 return 1;
1136         }
1137
1138         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1139         if (list == NULL)
1140                 goto error;
1141
1142         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1143         if (internal == NULL)
1144                 goto error;
1145
1146         internal->pdev = pci_dev;
1147         rte_spinlock_init(&internal->lock);
1148
1149         if (ifcvf_vfio_setup(internal) < 0) {
1150                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1151                 goto error;
1152         }
1153
1154         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1155                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1156                 goto error;
1157         }
1158
1159         internal->max_queues = IFCVF_MAX_QUEUES;
1160         features = ifcvf_get_features(&internal->hw);
1161         internal->features = (features &
1162                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1163                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1164                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1165                 (1ULL << VIRTIO_NET_F_STATUS) |
1166                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1167                 (1ULL << VHOST_F_LOG_ALL);
1168
1169         internal->dev_addr.pci_addr = pci_dev->addr;
1170         internal->dev_addr.type = PCI_ADDR;
1171         list->internal = internal;
1172
1173         if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1174                 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1175                                 &open_int, &sw_fallback_lm);
1176                 if (ret < 0)
1177                         goto error;
1178         }
1179         internal->sw_lm = sw_fallback_lm;
1180
1181         internal->did = rte_vdpa_register_device(&internal->dev_addr,
1182                                 &ifcvf_ops);
1183         if (internal->did < 0) {
1184                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1185                 goto error;
1186         }
1187
1188         pthread_mutex_lock(&internal_list_lock);
1189         TAILQ_INSERT_TAIL(&internal_list, list, next);
1190         pthread_mutex_unlock(&internal_list_lock);
1191
1192         rte_atomic32_set(&internal->started, 1);
1193         update_datapath(internal);
1194
1195         rte_kvargs_free(kvlist);
1196         return 0;
1197
1198 error:
1199         rte_kvargs_free(kvlist);
1200         rte_free(list);
1201         rte_free(internal);
1202         return -1;
1203 }
1204
1205 static int
1206 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1207 {
1208         struct ifcvf_internal *internal;
1209         struct internal_list *list;
1210
1211         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1212                 return 0;
1213
1214         list = find_internal_resource_by_dev(pci_dev);
1215         if (list == NULL) {
1216                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1217                 return -1;
1218         }
1219
1220         internal = list->internal;
1221         rte_atomic32_set(&internal->started, 0);
1222         update_datapath(internal);
1223
1224         rte_pci_unmap_device(internal->pdev);
1225         rte_vfio_container_destroy(internal->vfio_container_fd);
1226         rte_vdpa_unregister_device(internal->did);
1227
1228         pthread_mutex_lock(&internal_list_lock);
1229         TAILQ_REMOVE(&internal_list, list, next);
1230         pthread_mutex_unlock(&internal_list_lock);
1231
1232         rte_free(list);
1233         rte_free(internal);
1234
1235         return 0;
1236 }
1237
1238 /*
1239  * IFCVF has the same vendor ID and device ID as virtio net PCI
1240  * device, with its specific subsystem vendor ID and device ID.
1241  */
1242 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1243         { .class_id = RTE_CLASS_ANY_ID,
1244           .vendor_id = IFCVF_VENDOR_ID,
1245           .device_id = IFCVF_DEVICE_ID,
1246           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1247           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1248         },
1249
1250         { .vendor_id = 0, /* sentinel */
1251         },
1252 };
1253
1254 static struct rte_pci_driver rte_ifcvf_vdpa = {
1255         .id_table = pci_id_ifcvf_map,
1256         .drv_flags = 0,
1257         .probe = ifcvf_pci_probe,
1258         .remove = ifcvf_pci_remove,
1259 };
1260
1261 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1262 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1263 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
1264
1265 RTE_INIT(ifcvf_vdpa_init_log)
1266 {
1267         ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
1268         if (ifcvf_vdpa_logtype >= 0)
1269                 rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
1270 }