/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>
#include <stdbool.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
		"IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_VDPA_MODE		"vdpa"
#define IFCVF_SW_FALLBACK_LM	"sw-live-migration"

static const char * const ifcvf_valid_arguments[] = {
	IFCVF_VDPA_MODE,
	IFCVF_SW_FALLBACK_LM,
	NULL
};

static int ifcvf_vdpa_logtype;

struct ifcvf_internal {
	struct rte_vdpa_dev_addr dev_addr;
	struct rte_pci_device *pdev;
	struct ifcvf_hw hw;
	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	pthread_t tid;	/* thread for notify relay */
	int epfd;
	int vid;
	int did;
	uint16_t max_queues;
	uint64_t features;
	rte_atomic32_t started;
	rte_atomic32_t dev_attached;
	rte_atomic32_t running;
	rte_spinlock_t lock;
	bool sw_lm;
};

struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

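/*
 * Look up a device's internal state by its vDPA device id. The global
 * list is protected by internal_list_lock; the returned entry stays
 * owned by the list and must not be freed by the caller.
 */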
static struct internal_list *
find_internal_resource_by_did(int did)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (did == list->internal->did) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (pdev == list->internal->pdev) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

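/*
 * Put the VF into its own VFIO container so each device gets a private
 * DMA (IOMMU) domain, then map the PCI BARs for register access.
 */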
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
	struct rte_pci_device *dev = internal->pdev;
	char devname[RTE_DEV_NAME_MAX_LEN] = {0};
	int iommu_group_num;
	int i;

	internal->vfio_dev_fd = -1;
	internal->vfio_group_fd = -1;
	internal->vfio_container_fd = -1;

	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
	rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
			&iommu_group_num);

	internal->vfio_container_fd = rte_vfio_container_create();
	if (internal->vfio_container_fd < 0)
		return -1;

	internal->vfio_group_fd = rte_vfio_container_group_bind(
			internal->vfio_container_fd, iommu_group_num);
	if (internal->vfio_group_fd < 0)
		goto err;

	if (rte_pci_map_device(dev))
		goto err;

	internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
			i++) {
		internal->hw.mem_resource[i].addr =
			internal->pdev->mem_resource[i].addr;
		internal->hw.mem_resource[i].phys_addr =
			internal->pdev->mem_resource[i].phys_addr;
		internal->hw.mem_resource[i].len =
			internal->pdev->mem_resource[i].len;
	}

	return 0;

err:
	rte_vfio_container_destroy(internal->vfio_container_fd);
	return -1;
}

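/*
 * Walk the VM memory table and program it into the VFIO container
 * (do_map == 1) or remove it (do_map == 0). Each region is mapped with
 * the guest physical address as IOVA, so the VF can DMA into guest
 * memory using guest physical addresses directly.
 */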
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
	uint32_t i;
	int ret;
	struct rte_vhost_memory *mem = NULL;
	int vfio_container_fd;

	ret = rte_vhost_get_mem_table(internal->vid, &mem);
	if (ret < 0) {
		DRV_LOG(ERR, "failed to get VM memory layout.");
		goto exit;
	}

	vfio_container_fd = internal->vfio_container_fd;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg;

		reg = &mem->regions[i];
		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
			do_map ? "DMA map" : "DMA unmap", i,
			reg->host_user_addr, reg->guest_phys_addr, reg->size);

		if (do_map) {
			ret = rte_vfio_container_dma_map(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA map failed.");
				goto exit;
			}
		} else {
			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA unmap failed.");
				goto exit;
			}
		}
	}

exit:
	if (mem)
		free(mem);
	return ret;
}

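/*
 * Translate a host virtual address into a guest physical address by
 * walking the vhost memory regions. Returns 0 when no region covers
 * the address.
 */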
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *reg;
	uint32_t i;
	uint64_t gpa = 0;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)
		goto exit;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];

		if (hva >= reg->host_user_addr &&
				hva < reg->host_user_addr + reg->size) {
			gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
			break;
		}
	}

exit:
	if (mem)
		free(mem);
	return gpa;
}

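/*
 * Relay the vring addresses negotiated over vhost-user (virtual in this
 * process) to the device as guest physical addresses, restore the last
 * ring indexes, and start the hardware datapath.
 */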
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	int i, nr_vring;
	int vid;
	struct rte_vhost_vring vq;
	uint64_t gpa;

	vid = internal->vid;
	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);
		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
			return -1;
		}
		hw->vring[i].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for available ring.");
			return -1;
		}
		hw->vring[i].avail = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for used ring.");
			return -1;
		}
		hw->vring[i].used = gpa;

		hw->vring[i].size = vq.size;
		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);
	}
	hw->nr_vring = i;

	return ifcvf_start_hw(&internal->hw);
}

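/*
 * Mark every page backing a used ring as dirty in the vhost log.
 * The log is a bitmap holding one bit per 4K page of guest memory.
 */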
static void
ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
{
	uint32_t i, size;
	uint64_t pfn;

	pfn = hw->vring[queue].used / PAGE_SIZE;
	size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
			sizeof(uint16_t) * 3;

	for (i = 0; i <= size / PAGE_SIZE; i++)
		__sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
				1 << ((pfn + i) % 8));
}

static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	uint32_t i;
	int vid;
	uint64_t features;
	uint64_t log_base, log_size;
	uint8_t *log_buf;

	vid = internal->vid;
	ifcvf_stop_hw(hw);

	for (i = 0; i < hw->nr_vring; i++)
		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);

	rte_vhost_get_negotiated_features(vid, &features);
	if (RTE_VHOST_NEED_LOG(features)) {
		ifcvf_disable_logging(hw);
		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		/*
		 * IFCVF marks dirty pages for packet buffers only;
		 * software marks the used rings dirty after the
		 * device stops.
		 */
		log_buf = (uint8_t *)(uintptr_t)log_base;
		for (i = 0; i < hw->nr_vring; i++)
			ifcvf_used_ring_log(hw, i, log_buf);
	}
}

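/*
 * Buffer for VFIO_DEVICE_SET_IRQS: struct vfio_irq_set followed by one
 * eventfd per vector, that is one for config/link interrupts plus one
 * per vring (two vrings per queue pair).
 */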
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))

static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;
	int *fd_ptr;
	struct rte_vhost_vring vring;

	nr_vring = rte_vhost_get_vring_num(internal->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;
	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

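/*
 * Notify relay thread: epoll on every vring's kickfd and forward each
 * guest kick to the device notify register. The queue id and kickfd
 * are packed into the 64-bit epoll data word (qid low, fd high).
 */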
static void *
notify_relay(void *arg)
{
	int i, kickfd, epfd, nfds = 0;
	uint32_t qid, q_num;
	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
	struct epoll_event ev;
	uint64_t buf;
	int nbytes;
	struct rte_vhost_vring vring;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct ifcvf_hw *hw = &internal->hw;

	q_num = rte_vhost_get_vring_num(internal->vid);

	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
	if (epfd < 0) {
		DRV_LOG(ERR, "failed to create epoll instance.");
		return NULL;
	}
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
			return NULL;
		}
	}

	for (;;) {
		nfds = epoll_wait(epfd, events, q_num, -1);
		if (nfds < 0) {
			if (errno == EINTR)
				continue;
			DRV_LOG(ERR, "epoll_wait failed.");
			return NULL;
		}

		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);
			do {
				nbytes = read(kickfd, &buf, 8);
				if (nbytes < 0) {
					if (errno == EINTR ||
					    errno == EWOULDBLOCK ||
					    errno == EAGAIN)
						continue;
					DRV_LOG(INFO, "Error reading "
						"kickfd: %s",
						strerror(errno));
				}
				break;
			} while (1);

			ifcvf_notify_queue(hw, qid);
		}
	}

	return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
	int ret;

	ret = pthread_create(&internal->tid, NULL, notify_relay,
			(void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create notify relay pthread.");
		return -1;
	}
	return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
	void *status;

	if (internal->tid) {
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);
	}
	internal->tid = 0;

	if (internal->epfd >= 0)
		close(internal->epfd);
	internal->epfd = -1;

	return 0;
}

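/*
 * Reconcile the datapath with the started/dev_attached flags under the
 * spinlock: when both are set, bring it up (DMA map, MSI-X, HW start,
 * notify relay); when either clears, tear it down in reverse order.
 */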
static int
update_datapath(struct ifcvf_internal *internal)
{
	int ret;

	rte_spinlock_lock(&internal->lock);

	if (!rte_atomic32_read(&internal->running) &&
	    (rte_atomic32_read(&internal->started) &&
	     rte_atomic32_read(&internal->dev_attached))) {
		ret = ifcvf_dma_map(internal, 1);
		if (ret)
			goto err;

		ret = vdpa_enable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = vdpa_ifcvf_start(internal);
		if (ret)
			goto err;

		ret = setup_notify_relay(internal);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 1);
	} else if (rte_atomic32_read(&internal->running) &&
		   (!rte_atomic32_read(&internal->started) ||
		    !rte_atomic32_read(&internal->dev_attached))) {
		ret = unset_notify_relay(internal);
		if (ret)
			goto err;

		vdpa_ifcvf_stop(internal);

		ret = vdpa_disable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = ifcvf_dma_map(internal, 0);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 0);
	}

	rte_spinlock_unlock(&internal->lock);
	return 0;
err:
	rte_spinlock_unlock(&internal->lock);
	return ret;
}

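/*
 * vDPA dev_conf callback, invoked by the vhost library once the vring
 * configuration is complete and the device is ready to start.
 */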
static int
ifcvf_dev_config(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	internal->vid = vid;
	rte_atomic32_set(&internal->dev_attached, 1);
	update_datapath(internal);

	if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
		DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

	return 0;
}

static int
ifcvf_dev_close(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->dev_attached, 0);
	update_datapath(internal);

	return 0;
}

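/*
 * When the frontend negotiates VHOST_F_LOG_ALL (live migration), map
 * the dirty log buffer into the device's IOVA space at IFCVF_LOG_BASE
 * and turn on hardware dirty page logging.
 */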
static int
ifcvf_set_features(int vid)
{
	uint64_t features;
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base, log_size;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features)) {
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
	}

	return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_dev_fd;
}

static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	int ret;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;

	reg.index = ifcvf_get_notify_region(&internal->hw);
	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
	if (ret) {
		DRV_LOG(ERR, "Can not get device region info: %s",
				strerror(errno));
		return -1;
	}

	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
	*size = 0x1000;

	return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*queue_num = list->internal->max_queues;

	return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*features = list->internal->features;

	return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
		 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
		 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	return 0;
}

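/*
 * vDPA ops exposed to the vhost library. Callbacks this driver does
 * not need are left NULL.
 */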
static struct rte_vdpa_dev_ops ifcvf_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.get_protocol_features = ifcvf_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = NULL,
	.set_features = ifcvf_set_features,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
};

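/* kvargs handler: parse a numeric devargs value such as vdpa=1 into *n. */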
static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
	uint16_t *n = extra_args;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	*n = (uint16_t)strtoul(value, NULL, 0);
	if (*n == USHRT_MAX && errno == ERANGE)
		return -ERANGE;

	return 0;
}

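/*
 * Probe claims the VF only when the "vdpa" devargs is given; a positive
 * return value tells the PCI bus layer the device is not taken by this
 * driver, leaving it available to others such as the virtio PMD.
 */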
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		struct rte_pci_device *pci_dev)
{
	uint64_t features;
	struct ifcvf_internal *internal = NULL;
	struct internal_list *list = NULL;
	int vdpa_mode = 0;
	int sw_fallback_lm = 0;
	struct rte_kvargs *kvlist = NULL;
	int ret = 0;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* defensive check: a VF bound without devargs has no args to parse */
	if (pci_dev->device.devargs == NULL)
		return 1;

	kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
			ifcvf_valid_arguments);
	if (kvlist == NULL)
		return 1;

	/* probe only when vdpa mode is specified */
	if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
			&vdpa_mode);
	if (ret < 0 || vdpa_mode == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
	if (list == NULL)
		goto error;

	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
	if (internal == NULL)
		goto error;

	internal->pdev = pci_dev;
	rte_spinlock_init(&internal->lock);

	if (ifcvf_vfio_setup(internal) < 0) {
		DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
		goto error;
	}

	if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
		DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
		goto error;
	}

	internal->max_queues = IFCVF_MAX_QUEUES;
	features = ifcvf_get_features(&internal->hw);
	internal->features = (features &
		~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
		(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
		(1ULL << VIRTIO_NET_F_CTRL_VQ) |
		(1ULL << VIRTIO_NET_F_STATUS) |
		(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
		(1ULL << VHOST_F_LOG_ALL);

	internal->dev_addr.pci_addr = pci_dev->addr;
	internal->dev_addr.type = PCI_ADDR;
	list->internal = internal;

	if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
		ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
				&open_int, &sw_fallback_lm);
		if (ret < 0)
			goto error;
	}
	internal->sw_lm = sw_fallback_lm;

	internal->did = rte_vdpa_register_device(&internal->dev_addr,
				&ifcvf_ops);
	if (internal->did < 0) {
		DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
		goto error;
	}

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_atomic32_set(&internal->started, 1);
	update_datapath(internal);

	rte_kvargs_free(kvlist);
	return 0;

error:
	rte_kvargs_free(kvlist);
	rte_free(list);
	rte_free(internal);
	return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
	struct ifcvf_internal *internal;
	struct internal_list *list;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = find_internal_resource_by_dev(pci_dev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->started, 0);
	update_datapath(internal);

	rte_pci_unmap_device(internal->pdev);
	rte_vfio_container_destroy(internal->vfio_container_fd);
	rte_vdpa_unregister_device(internal->did);

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_free(list);
	rte_free(internal);

	return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as a virtio net PCI
 * device, but its own subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
	},

	{ .vendor_id = 0, /* sentinel */
	},
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
	.id_table = pci_id_ifcvf_map,
	.drv_flags = 0,
	.probe = ifcvf_pci_probe,
	.remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
	ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
	if (ifcvf_vdpa_logtype >= 0)
		rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}