/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
		"%s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

static int ifcvf_vdpa_logtype;

struct ifcvf_internal {
	struct rte_vdpa_dev_addr dev_addr;
	struct rte_pci_device *pdev;
	struct ifcvf_hw hw;
	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	pthread_t tid;	/* thread for notify relay */
	int epfd;
	int vid;
	int did;
	uint16_t max_queues;
	uint64_t features;
	rte_atomic32_t started;
	rte_atomic32_t dev_attached;
	rte_atomic32_t running;
	rte_spinlock_t lock;
};

struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct internal_list *
find_internal_resource_by_did(int did)
{
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_FOREACH(list, &internal_list, next) {
		if (did == list->internal->did) {
			pthread_mutex_unlock(&internal_list_lock);
			return list;
		}
	}
	pthread_mutex_unlock(&internal_list_lock);

	return NULL;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_FOREACH(list, &internal_list, next) {
		if (pdev == list->internal->pdev) {
			pthread_mutex_unlock(&internal_list_lock);
			return list;
		}
	}
	pthread_mutex_unlock(&internal_list_lock);

	return NULL;
}

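/*
 * Create a dedicated VFIO container for the VF, bind its IOMMU group to
 * that container, map the PCI BARs, and hand the BAR addresses over to
 * the ifcvf base code for hardware initialization.
 */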
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
	struct rte_pci_device *dev = internal->pdev;
	char devname[RTE_DEV_NAME_MAX_LEN] = {0};
	int iommu_group_num;
	int ret = 0;
	int i;

	internal->vfio_dev_fd = -1;
	internal->vfio_group_fd = -1;
	internal->vfio_container_fd = -1;

	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
	rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
			&iommu_group_num);

	internal->vfio_container_fd = rte_vfio_container_create();
	if (internal->vfio_container_fd < 0)
		return -1;

	internal->vfio_group_fd = rte_vfio_container_group_bind(
			internal->vfio_container_fd, iommu_group_num);
	if (internal->vfio_group_fd < 0)
		goto err;

	if (rte_pci_map_device(dev))
		goto err;

	internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
			i++) {
		internal->hw.mem_resource[i].addr =
			internal->pdev->mem_resource[i].addr;
		internal->hw.mem_resource[i].phys_addr =
			internal->pdev->mem_resource[i].phys_addr;
		internal->hw.mem_resource[i].len =
			internal->pdev->mem_resource[i].len;
	}
	ret = ifcvf_init_hw(&internal->hw, internal->pdev);

	return ret;

err:
	rte_vfio_container_destroy(internal->vfio_container_fd);
	return -1;
}

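/*
 * Map (do_map == 1) or unmap (do_map == 0) every region of the guest
 * memory layout in the VF's VFIO container, so the device can DMA
 * directly to and from guest physical addresses.
 */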
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
	uint32_t i;
	int ret;
	struct rte_vhost_memory *mem = NULL;
	int vfio_container_fd;

	ret = rte_vhost_get_mem_table(internal->vid, &mem);
	if (ret < 0) {
		DRV_LOG(ERR, "failed to get VM memory layout.");
		goto exit;
	}

	vfio_container_fd = internal->vfio_container_fd;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg;

		reg = &mem->regions[i];
		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
			do_map ? "DMA map" : "DMA unmap", i,
			reg->host_user_addr, reg->guest_phys_addr, reg->size);

		if (do_map) {
			ret = rte_vfio_container_dma_map(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA map failed.");
				goto exit;
			}
		} else {
			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA unmap failed.");
				goto exit;
			}
		}
	}

exit:
	if (mem)
		free(mem);
	return ret;
}

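/*
 * Translate a vhost queue virtual address (an address in this process'
 * mapping of guest memory) into a guest physical address by walking the
 * guest memory regions. Returns 0 when no region covers the address.
 */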
static uint64_t
qva_to_gpa(int vid, uint64_t qva)
{
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *reg;
	uint32_t i;
	uint64_t gpa = 0;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)
		goto exit;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];

		if (qva >= reg->host_user_addr &&
				qva < reg->host_user_addr + reg->size) {
			gpa = qva - reg->host_user_addr + reg->guest_phys_addr;
			break;
		}
	}

exit:
	if (mem)
		free(mem);
	return gpa;
}

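/*
 * Program the VF with the negotiated features and, for each vring, the
 * guest-physical addresses of its descriptor, available and used rings,
 * then start the hardware datapath.
 */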
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	int i, nr_vring;
	int vid;
	struct rte_vhost_vring vq;
	uint64_t gpa;

	vid = internal->vid;
	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);
		gpa = qva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
		if (gpa == 0) {
			DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
			return -1;
		}
		hw->vring[i].desc = gpa;

		gpa = qva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
		if (gpa == 0) {
			DRV_LOG(ERR, "Failed to get GPA for available ring.");
			return -1;
		}
		hw->vring[i].avail = gpa;

		gpa = qva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
		if (gpa == 0) {
			DRV_LOG(ERR, "Failed to get GPA for used ring.");
			return -1;
		}
		hw->vring[i].used = gpa;

		hw->vring[i].size = vq.size;
		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);
	}
	hw->nr_vring = i;

	return ifcvf_start_hw(&internal->hw);
}

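/*
 * Set the dirty bits covering a queue's used ring in the vhost live
 * migration log. The log is a bitmap with one bit per 4K page of guest
 * physical memory.
 */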
static void
ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
{
	uint32_t i, size;
	uint64_t pfn;

	pfn = hw->vring[queue].used / PAGE_SIZE;
	size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
			sizeof(uint16_t) * 3;

	for (i = 0; i <= size / PAGE_SIZE; i++)
		__sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
				1 << ((pfn + i) % 8));
}

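/*
 * Stop the hardware datapath, push the final ring indexes back to the
 * vhost library and, when dirty logging was negotiated, unmap the log
 * region and mark the used rings dirty on the device's behalf.
 */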
static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	uint32_t i;
	int vid;
	uint64_t features;
	uint64_t log_base, log_size;
	uint8_t *log_buf;

	vid = internal->vid;
	ifcvf_stop_hw(hw);

	for (i = 0; i < hw->nr_vring; i++)
		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);

	rte_vhost_get_negotiated_features(vid, &features);
	if (RTE_VHOST_NEED_LOG(features)) {
		ifcvf_disable_logging(hw);
		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		/*
		 * IFCVF marks dirty memory pages only for packet buffers;
		 * software marks the used rings dirty after the device stops.
		 */
		log_buf = (uint8_t *)(uintptr_t)log_base;
		for (i = 0; i < hw->nr_vring; i++)
			ifcvf_used_ring_log(hw, i, log_buf);
	}
}

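/*
 * The VFIO IRQ-set buffer carries one eventfd per MSI-X vector: vector 0
 * takes the device's own interrupt eventfd, vectors 1..n take the vrings'
 * callfds, so queue interrupts are signalled to the guest by hardware.
 */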
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))

static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;
	int *fd_ptr;
	struct rte_vhost_vring vring;

	nr_vring = rte_vhost_get_vring_num(internal->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;
	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

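/*
 * Notify relay thread: epoll on every vring's kickfd and translate each
 * guest kick into a write to the VF's queue notify register. The qid is
 * kept in the low 32 bits of the epoll data, the kickfd in the high 32.
 */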
static void *
notify_relay(void *arg)
{
	int i, kickfd, epfd, nfds = 0;
	uint32_t qid, q_num;
	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
	struct epoll_event ev;
	uint64_t buf;
	int nbytes;
	struct rte_vhost_vring vring;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct ifcvf_hw *hw = &internal->hw;

	q_num = rte_vhost_get_vring_num(internal->vid);

	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
	if (epfd < 0) {
		DRV_LOG(ERR, "failed to create epoll instance.");
		return NULL;
	}
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
			return NULL;
		}
	}

	for (;;) {
		nfds = epoll_wait(epfd, events, q_num, -1);
		if (nfds < 0) {
			if (errno == EINTR)
				continue;
			DRV_LOG(ERR, "epoll_wait failed.");
			return NULL;
		}

		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);
			do {
				nbytes = read(kickfd, &buf, 8);
				if (nbytes < 0) {
					if (errno == EINTR ||
					    errno == EWOULDBLOCK ||
					    errno == EAGAIN)
						continue;
					DRV_LOG(INFO, "Error reading "
						"kickfd: %s",
						strerror(errno));
				}
				break;
			} while (1);

			ifcvf_notify_queue(hw, qid);
		}
	}

	return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
	int ret;

	ret = pthread_create(&internal->tid, NULL, notify_relay,
			(void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create notify relay pthread.");
		return -1;
	}
	return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
	void *status;

	if (internal->tid) {
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);
	}
	internal->tid = 0;

	if (internal->epfd >= 0)
		close(internal->epfd);
	internal->epfd = -1;

	return 0;
}

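/*
 * Bring the datapath up when the device is both started (probed) and
 * attached (a vhost connection exists), and tear it down again when
 * either condition is lost. The spinlock serializes state changes.
 */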
static int
update_datapath(struct ifcvf_internal *internal)
{
	int ret;

	rte_spinlock_lock(&internal->lock);

	if (!rte_atomic32_read(&internal->running) &&
	    (rte_atomic32_read(&internal->started) &&
	     rte_atomic32_read(&internal->dev_attached))) {
		ret = ifcvf_dma_map(internal, 1);
		if (ret)
			goto err;

		ret = vdpa_enable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = vdpa_ifcvf_start(internal);
		if (ret)
			goto err;

		ret = setup_notify_relay(internal);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 1);
	} else if (rte_atomic32_read(&internal->running) &&
		   (!rte_atomic32_read(&internal->started) ||
		    !rte_atomic32_read(&internal->dev_attached))) {
		ret = unset_notify_relay(internal);
		if (ret)
			goto err;

		vdpa_ifcvf_stop(internal);

		ret = vdpa_disable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = ifcvf_dma_map(internal, 0);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 0);
	}

	rte_spinlock_unlock(&internal->lock);
	return 0;
err:
	rte_spinlock_unlock(&internal->lock);
	return ret;
}

static int
ifcvf_dev_config(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	internal->vid = vid;
	rte_atomic32_set(&internal->dev_attached, 1);
	update_datapath(internal);

	return 0;
}

static int
ifcvf_dev_close(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->dev_attached, 0);
	update_datapath(internal);

	return 0;
}

static int
ifcvf_set_features(int vid)
{
	uint64_t features;
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base, log_size;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features)) {
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
	}

	return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_dev_fd;
}

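/*
 * Report where the notify address of queue @qid lives inside the VFIO
 * device file, so the vhost library can mmap it into the guest (the
 * HOST_NOTIFIER protocol feature).
 */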
static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	int ret;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;

	reg.index = ifcvf_get_notify_region(&internal->hw);
	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
	if (ret) {
		DRV_LOG(ERR, "Failed to get device region info: %s",
				strerror(errno));
		return -1;
	}

	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
	*size = 0x1000;

	return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*queue_num = list->internal->max_queues;

	return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*features = list->internal->features;

	return 0;
}

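/* vhost-user protocol features this vDPA driver supports */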
#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
		 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
		 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	return 0;
}

struct rte_vdpa_dev_ops ifcvf_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.get_protocol_features = ifcvf_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = NULL,
	.set_features = ifcvf_set_features,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
};

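/*
 * Probe runs in the primary process only: set up VFIO, adjust the
 * feature set (dirty logging and protocol features are always offered),
 * register the VF as a vDPA device and mark it started.
 */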
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		struct rte_pci_device *pci_dev)
{
	uint64_t features;
	struct ifcvf_internal *internal = NULL;
	struct internal_list *list = NULL;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
	if (list == NULL)
		goto error;

	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
	if (internal == NULL)
		goto error;

	internal->pdev = pci_dev;
	rte_spinlock_init(&internal->lock);
	if (ifcvf_vfio_setup(internal) < 0)
		goto error;

	internal->max_queues = IFCVF_MAX_QUEUES;
	features = ifcvf_get_features(&internal->hw);
	internal->features = (features &
		~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
		(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
		(1ULL << VIRTIO_NET_F_CTRL_VQ) |
		(1ULL << VIRTIO_NET_F_STATUS) |
		(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
		(1ULL << VHOST_F_LOG_ALL);

	internal->dev_addr.pci_addr = pci_dev->addr;
	internal->dev_addr.type = PCI_ADDR;
	list->internal = internal;

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	internal->did = rte_vdpa_register_device(&internal->dev_addr,
				&ifcvf_ops);
	if (internal->did < 0)
		goto error;

	rte_atomic32_set(&internal->started, 1);
	update_datapath(internal);

	return 0;

error:
	rte_free(list);
	rte_free(internal);
	return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
	struct ifcvf_internal *internal;
	struct internal_list *list;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = find_internal_resource_by_dev(pci_dev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->started, 0);
	update_datapath(internal);

	rte_pci_unmap_device(internal->pdev);
	rte_vfio_container_destroy(internal->vfio_container_fd);
	rte_vdpa_unregister_device(internal->did);

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_free(list);
	rte_free(internal);

	return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as virtio net PCI
 * device, with its specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
	},

	{ .vendor_id = 0, /* sentinel */
	},
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
	.id_table = pci_id_ifcvf_map,
	.drv_flags = 0,
	.probe = ifcvf_pci_probe,
	.remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
	ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
	if (ifcvf_vdpa_logtype >= 0)
		rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}