/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
		"%s(): " fmt "\n", __func__, ##args)

#define PAGE_SIZE 4096

static int ifcvf_vdpa_logtype;

struct ifcvf_internal {
	struct rte_vdpa_dev_addr dev_addr;
	struct rte_pci_device *pdev;
	struct ifcvf_hw hw;
	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	pthread_t tid;	/* thread for notify relay */
	int epfd;
	int vid;
	int did;
	uint16_t max_queues;
	uint64_t features;
	rte_atomic32_t started;
	rte_atomic32_t dev_attached;
	rte_atomic32_t running;
	rte_spinlock_t lock;
};

struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

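/*
 * Helpers to look up a probed device in the global list, either by the
 * vDPA device id assigned at registration time or by its PCI device handle.
 */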
static struct internal_list *
find_internal_resource_by_did(int did)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (did == list->internal->did) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (pdev == list->internal->pdev) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

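/*
 * Create a dedicated VFIO container for the device, bind its IOMMU group
 * to it, map the PCI BARs, and mirror the BAR resources into the ifcvf
 * hardware abstraction.
 */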
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
	struct rte_pci_device *dev = internal->pdev;
	char devname[RTE_DEV_NAME_MAX_LEN] = {0};
	int iommu_group_num;
	int i;

	internal->vfio_dev_fd = -1;
	internal->vfio_group_fd = -1;
	internal->vfio_container_fd = -1;

	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
	rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
			&iommu_group_num);

	internal->vfio_container_fd = rte_vfio_container_create();
	if (internal->vfio_container_fd < 0)
		return -1;

	internal->vfio_group_fd = rte_vfio_container_group_bind(
			internal->vfio_container_fd, iommu_group_num);
	if (internal->vfio_group_fd < 0)
		goto err;

	if (rte_pci_map_device(dev))
		goto err;

	internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
			i++) {
		internal->hw.mem_resource[i].addr =
			internal->pdev->mem_resource[i].addr;
		internal->hw.mem_resource[i].phys_addr =
			internal->pdev->mem_resource[i].phys_addr;
		internal->hw.mem_resource[i].len =
			internal->pdev->mem_resource[i].len;
	}

	return 0;

err:
	rte_vfio_container_destroy(internal->vfio_container_fd);
	return -1;
}

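/*
 * Map (or unmap) every guest memory region into the device's VFIO
 * container so the VF can DMA directly into guest buffers; the IOVA used
 * is the guest physical address of each region.
 */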
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
	uint32_t i;
	int ret;
	struct rte_vhost_memory *mem = NULL;
	int vfio_container_fd;

	ret = rte_vhost_get_mem_table(internal->vid, &mem);
	if (ret < 0) {
		DRV_LOG(ERR, "failed to get VM memory layout.");
		goto exit;
	}

	vfio_container_fd = internal->vfio_container_fd;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg;

		reg = &mem->regions[i];
		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
			do_map ? "DMA map" : "DMA unmap", i,
			reg->host_user_addr, reg->guest_phys_addr, reg->size);

		if (do_map) {
			ret = rte_vfio_container_dma_map(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA map failed.");
				goto exit;
			}
		} else {
			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA unmap failed.");
				goto exit;
			}
		}
	}

exit:
	free(mem);
	return ret;
}

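/*
 * Translate a host virtual address from the vhost memory table into the
 * corresponding guest physical address; returns 0 when no region covers
 * the address.
 */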
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *reg;
	uint32_t i;
	uint64_t gpa = 0;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)
		goto exit;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];

		if (hva >= reg->host_user_addr &&
				hva < reg->host_user_addr + reg->size) {
			gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
			break;
		}
	}

exit:
	free(mem);
	return gpa;
}

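/*
 * Program each virtqueue into the VF: translate the desc/avail/used ring
 * addresses to guest physical addresses, restore the saved avail/used
 * indexes, then start the hardware datapath.
 */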
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	int i, nr_vring;
	int vid;
	struct rte_vhost_vring vq;
	uint64_t gpa;

	vid = internal->vid;
	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);
		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
		if (gpa == 0) {
			DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
			return -1;
		}
		hw->vring[i].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
		if (gpa == 0) {
			DRV_LOG(ERR, "Failed to get GPA for available ring.");
			return -1;
		}
		hw->vring[i].avail = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
		if (gpa == 0) {
			DRV_LOG(ERR, "Failed to get GPA for used ring.");
			return -1;
		}
		hw->vring[i].used = gpa;

		hw->vring[i].size = vq.size;
		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);
	}
	hw->nr_vring = i;

	return ifcvf_start_hw(&internal->hw);
}

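/*
 * Mark the pages backing a virtqueue's used ring as dirty in the vhost
 * log bitmap (one bit per 4K page, set atomically).
 */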
static void
ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
{
	uint32_t i, size;
	uint64_t pfn;

	pfn = hw->vring[queue].used / PAGE_SIZE;
	size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
			sizeof(uint16_t) * 3;

	for (i = 0; i <= size / PAGE_SIZE; i++)
		__sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
				1 << ((pfn + i) % 8));
}

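/*
 * Quiesce the hardware datapath: save the ring indexes back to vhost and,
 * if live-migration logging is active, stop device logging and dirty the
 * used rings in software on the device's behalf.
 */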
static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	uint32_t i;
	int vid;
	uint64_t features;
	uint64_t log_base, log_size;
	uint8_t *log_buf;

	vid = internal->vid;
	ifcvf_stop_hw(hw);

	for (i = 0; i < hw->nr_vring; i++)
		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);

	rte_vhost_get_negotiated_features(vid, &features);
	if (RTE_VHOST_NEED_LOG(features)) {
		ifcvf_disable_logging(hw);
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		/*
		 * IFCVF marks dirty memory pages only for packet buffers;
		 * SW helps to mark the used rings as dirty after the device
		 * stops.
		 */
		log_buf = (uint8_t *)(uintptr_t)log_base;
		for (i = 0; i < hw->nr_vring; i++)
			ifcvf_used_ring_log(hw, i, log_buf);
	}
}

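/*
 * MSI-X vector layout: vector 0 carries the device/config interrupt via
 * the interrupt handle fd, while vectors 1..n are wired to each vring's
 * callfd so the VF can signal used-ring updates directly to the frontend.
 */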
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))

static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;
	int *fd_ptr;
	struct rte_vhost_vring vring;

	nr_vring = rte_vhost_get_vring_num(internal->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;
	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

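/*
 * Notify-relay thread: used when the host notifier cannot be mapped into
 * the frontend. It epolls every vring's kickfd and forwards each guest
 * kick to the VF's notify register. The qid and kickfd are packed into
 * the 64-bit epoll data word (qid in the low half, kickfd in the high).
 */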
static void *
notify_relay(void *arg)
{
	int i, kickfd, epfd, nfds = 0;
	uint32_t qid, q_num;
	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
	struct epoll_event ev;
	uint64_t buf;
	int nbytes;
	struct rte_vhost_vring vring;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct ifcvf_hw *hw = &internal->hw;

	q_num = rte_vhost_get_vring_num(internal->vid);

	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
	if (epfd < 0) {
		DRV_LOG(ERR, "failed to create epoll instance.");
		return NULL;
	}
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
			return NULL;
		}
	}

	for (;;) {
		nfds = epoll_wait(epfd, events, q_num, -1);
		if (nfds < 0) {
			if (errno == EINTR)
				continue;
			DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
			return NULL;
		}

		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);
			do {
				nbytes = read(kickfd, &buf, 8);
				if (nbytes < 0) {
					if (errno == EINTR ||
					    errno == EWOULDBLOCK ||
					    errno == EAGAIN)
						continue;
					DRV_LOG(INFO, "Error reading "
						"kickfd: %s",
						strerror(errno));
				}
				break;
			} while (1);

			ifcvf_notify_queue(hw, qid);
		}
	}

	return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
	int ret;

	ret = pthread_create(&internal->tid, NULL, notify_relay,
			(void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create notify relay pthread.");
		return -1;
	}

	return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
	void *status;

	if (internal->tid) {
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);
	}
	internal->tid = 0;

	if (internal->epfd >= 0)
		close(internal->epfd);
	internal->epfd = -1;

	return 0;
}

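/*
 * Datapath state machine: the device runs only while both "started" (PCI
 * device probed) and "dev_attached" (vhost connection configured) are set.
 * Each transition maps or unmaps DMA, toggles MSI-X and the relay thread,
 * and starts or stops the hardware, all under the internal lock.
 */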
static int
update_datapath(struct ifcvf_internal *internal)
{
	int ret;

	rte_spinlock_lock(&internal->lock);

	if (!rte_atomic32_read(&internal->running) &&
	    (rte_atomic32_read(&internal->started) &&
	     rte_atomic32_read(&internal->dev_attached))) {
		ret = ifcvf_dma_map(internal, 1);
		if (ret)
			goto err;

		ret = vdpa_enable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = vdpa_ifcvf_start(internal);
		if (ret)
			goto err;

		ret = setup_notify_relay(internal);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 1);
	} else if (rte_atomic32_read(&internal->running) &&
		   (!rte_atomic32_read(&internal->started) ||
		    !rte_atomic32_read(&internal->dev_attached))) {
		ret = unset_notify_relay(internal);
		if (ret)
			goto err;

		vdpa_ifcvf_stop(internal);

		ret = vdpa_disable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = ifcvf_dma_map(internal, 0);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 0);
	}

	rte_spinlock_unlock(&internal->lock);
	return 0;
err:
	rte_spinlock_unlock(&internal->lock);
	return ret;
}

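/*
 * vhost-user device ops below: invoked by the vhost library when a virtio
 * device is configured for, or detached from, this vDPA device.
 */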
static int
ifcvf_dev_config(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	internal->vid = vid;
	rte_atomic32_set(&internal->dev_attached, 1);
	update_datapath(internal);

	if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
		DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

	return 0;
}

static int
ifcvf_dev_close(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->dev_attached, 0);
	update_datapath(internal);

	return 0;
}

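/*
 * When the frontend negotiates VHOST_F_LOG_ALL (live migration has begun),
 * map the dirty log buffer into the VFIO container at the fixed IOVA
 * IFCVF_LOG_BASE and turn on hardware dirty-page logging.
 */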
static int
ifcvf_set_features(int vid)
{
	uint64_t features;
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base, log_size;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features)) {
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
	}

	return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_dev_fd;
}

static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	int ret;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;

	reg.index = ifcvf_get_notify_region(&internal->hw);
	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
	if (ret) {
		DRV_LOG(ERR, "Failed to get device region info: %s",
				strerror(errno));
		return -1;
	}

	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
	*size = 0x1000;

	return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*queue_num = list->internal->max_queues;

	return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*features = list->internal->features;

	return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
		 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
		 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	return 0;
}

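/* vDPA ops exposed to the vhost library for this driver. */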
static struct rte_vdpa_dev_ops ifcvf_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.get_protocol_features = ifcvf_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = NULL,
	.set_features = ifcvf_set_features,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
};

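/*
 * Probe: set up VFIO and the hardware, build the advertised feature set,
 * register the device with the vDPA framework, and mark it started.
 */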
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		struct rte_pci_device *pci_dev)
{
	uint64_t features;
	struct ifcvf_internal *internal = NULL;
	struct internal_list *list = NULL;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
	if (list == NULL)
		goto error;

	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
	if (internal == NULL)
		goto error;

	internal->pdev = pci_dev;
	rte_spinlock_init(&internal->lock);
	if (ifcvf_vfio_setup(internal) < 0)
		goto error;

	if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0)
		goto error;

	internal->max_queues = IFCVF_MAX_QUEUES;
	features = ifcvf_get_features(&internal->hw);
	internal->features = (features &
		~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
		(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
		(1ULL << VIRTIO_NET_F_CTRL_VQ) |
		(1ULL << VIRTIO_NET_F_STATUS) |
		(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
		(1ULL << VHOST_F_LOG_ALL);

	internal->dev_addr.pci_addr = pci_dev->addr;
	internal->dev_addr.type = PCI_ADDR;
	list->internal = internal;

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	internal->did = rte_vdpa_register_device(&internal->dev_addr,
			&ifcvf_ops);
	if (internal->did < 0) {
		/* Drop the half-initialized entry before freeing it. */
		pthread_mutex_lock(&internal_list_lock);
		TAILQ_REMOVE(&internal_list, list, next);
		pthread_mutex_unlock(&internal_list_lock);
		goto error;
	}

	rte_atomic32_set(&internal->started, 1);
	update_datapath(internal);

	return 0;

error:
	rte_free(list);
	rte_free(internal);
	return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
	struct ifcvf_internal *internal;
	struct internal_list *list;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = find_internal_resource_by_dev(pci_dev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->started, 0);
	update_datapath(internal);

	rte_pci_unmap_device(internal->pdev);
	rte_vfio_container_destroy(internal->vfio_container_fd);
	rte_vdpa_unregister_device(internal->did);

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_free(list);
	rte_free(internal);

	return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as the virtio net PCI
 * device, with its own specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
	},

	{ .vendor_id = 0, /* sentinel */
	},
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
	.id_table = pci_id_ifcvf_map,
	.probe = ifcvf_pci_probe,
	.remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
	ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
	if (ifcvf_vdpa_logtype >= 0)
		rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}