/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
		"IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_VDPA_MODE		"vdpa"

static const char * const ifcvf_valid_arguments[] = {
	IFCVF_VDPA_MODE,
	NULL
};

static int ifcvf_vdpa_logtype;

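/*
 * The driver claims a VF only when the user selects vDPA mode with the
 * "vdpa=1" devarg, e.g. (the PCI address here is illustrative):
 *
 *   testpmd -w 06:00.3,vdpa=1 -- -i
 *
 * Without it, probe returns a positive value so another driver (such as
 * the virtio PMD) may take the device instead.
 */
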
struct ifcvf_internal {
	struct rte_vdpa_dev_addr dev_addr;
	struct rte_pci_device *pdev;
	struct ifcvf_hw hw;
	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	pthread_t tid;	/* thread for notify relay */
	int epfd;
	int vid;
	int did;
	uint16_t max_queues;
	uint64_t features;
	rte_atomic32_t started;
	rte_atomic32_t dev_attached;
	rte_atomic32_t running;
	rte_spinlock_t lock;
};

struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

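/*
 * Probed devices are tracked on a global list, keyed either by the vDPA
 * device id ("did", assigned at registration time) or by the PCI device.
 * Both lookups hold internal_list_lock only for the walk itself.
 */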
static struct internal_list *
find_internal_resource_by_did(int did)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (did == list->internal->did) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (pdev == list->internal->pdev) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

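/*
 * Set up a dedicated VFIO container for this VF so its IOMMU mappings
 * can follow the guest memory layout independently of the default EAL
 * container, and mirror the BAR resources into the ifcvf_hw handle.
 */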
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
	struct rte_pci_device *dev = internal->pdev;
	char devname[RTE_DEV_NAME_MAX_LEN] = {0};
	int iommu_group_num;
	int i;

	internal->vfio_dev_fd = -1;
	internal->vfio_group_fd = -1;
	internal->vfio_container_fd = -1;

	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
	rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
			&iommu_group_num);

	internal->vfio_container_fd = rte_vfio_container_create();
	if (internal->vfio_container_fd < 0)
		return -1;

	internal->vfio_group_fd = rte_vfio_container_group_bind(
			internal->vfio_container_fd, iommu_group_num);
	if (internal->vfio_group_fd < 0)
		goto err;

	if (rte_pci_map_device(dev))
		goto err;

	internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
			i++) {
		internal->hw.mem_resource[i].addr =
			internal->pdev->mem_resource[i].addr;
		internal->hw.mem_resource[i].phys_addr =
			internal->pdev->mem_resource[i].phys_addr;
		internal->hw.mem_resource[i].len =
			internal->pdev->mem_resource[i].len;
	}

	return 0;

err:
	rte_vfio_container_destroy(internal->vfio_container_fd);
	return -1;
}

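/*
 * Program (or tear down) the VF's IOMMU mappings from the vhost memory
 * table: each region is mapped with the guest physical address (GPA) as
 * IOVA, so the device can DMA directly on guest buffer addresses.
 */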
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
	uint32_t i;
	int ret;
	struct rte_vhost_memory *mem = NULL;
	int vfio_container_fd;

	ret = rte_vhost_get_mem_table(internal->vid, &mem);
	if (ret < 0) {
		DRV_LOG(ERR, "failed to get VM memory layout.");
		goto exit;
	}

	vfio_container_fd = internal->vfio_container_fd;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg;

		reg = &mem->regions[i];
		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
			do_map ? "DMA map" : "DMA unmap", i,
			reg->host_user_addr, reg->guest_phys_addr, reg->size);

		if (do_map) {
			ret = rte_vfio_container_dma_map(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA map failed.");
				goto exit;
			}
		} else {
			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA unmap failed.");
				goto exit;
			}
		}
	}

exit:
	if (mem)
		free(mem);
	return ret;
}

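/*
 * Translate a host virtual address (what vhost-user hands us) into the
 * guest physical address the VF must be programmed with. Returns 0 when
 * the address falls in no region, which callers treat as failure.
 */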
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *reg;
	uint32_t i;
	uint64_t gpa = 0;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)
		goto exit;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];

		if (hva >= reg->host_user_addr &&
				hva < reg->host_user_addr + reg->size) {
			gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
			break;
		}
	}

exit:
	if (mem)
		free(mem);
	return gpa;
}

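/*
 * Program every vring's descriptor/avail/used GPA plus the last seen
 * avail/used indices into the VF and start the hardware datapath, so a
 * reconnecting or migrating guest resumes where software vhost left off.
 */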
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	int i, nr_vring;
	int vid;
	struct rte_vhost_vring vq;
	uint64_t gpa;

	vid = internal->vid;
	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);
		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
			return -1;
		}
		hw->vring[i].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for available ring.");
			return -1;
		}
		hw->vring[i].avail = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for used ring.");
			return -1;
		}
		hw->vring[i].used = gpa;

		hw->vring[i].size = vq.size;
		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);
	}
	hw->nr_vring = i;

	return ifcvf_start_hw(&internal->hw);
}

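/*
 * Mark the guest pages backing a used ring dirty in the vhost log, one
 * bit per PAGE_SIZE page. A used ring of num entries occupies
 * num * sizeof(struct vring_used_elem) + 3 * sizeof(uint16_t) bytes
 * (flags, idx, the ring itself, the trailing avail event); e.g. a
 * 256-entry ring is 2054 bytes and marks one page, while a 1024-entry
 * ring (8198 bytes) marks three. This assumes the ring's GPA is page
 * aligned, which is the usual guest layout.
 */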
static void
ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
{
	uint32_t i, size;
	uint64_t pfn;

	pfn = hw->vring[queue].used / PAGE_SIZE;
	size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
			sizeof(uint16_t) * 3;

	for (i = 0; i <= size / PAGE_SIZE; i++)
		__sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
				1 << ((pfn + i) % 8));
}

static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	uint32_t i;
	int vid;
	uint64_t features;
	uint64_t log_base, log_size;
	uint8_t *log_buf;

	vid = internal->vid;
	ifcvf_stop_hw(hw);

	for (i = 0; i < hw->nr_vring; i++)
		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);

	rte_vhost_get_negotiated_features(vid, &features);
	if (RTE_VHOST_NEED_LOG(features)) {
		ifcvf_disable_logging(hw);
		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		/*
		 * IFCVF logs dirty pages only for packet buffers, so
		 * software marks the used rings dirty once the device
		 * has stopped.
		 */
		log_buf = (uint8_t *)(uintptr_t)log_base;
		for (i = 0; i < hw->nr_vring; i++)
			ifcvf_used_ring_log(hw, i, log_buf);
	}
}

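/*
 * Route the VF's MSI-X vectors to eventfds via VFIO: vector 0 carries
 * device config/misc interrupts through the PCI interrupt handle, and
 * vector i + 1 triggers vring i's callfd, so completions are signalled
 * to the guest without a software hop. With two vrings the fd array is
 * { config_fd, callfd0, callfd1 } and count is 3.
 */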
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))

static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;
	int *fd_ptr;
	struct rte_vhost_vring vring;

	nr_vring = rte_vhost_get_vring_num(internal->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;
	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

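/*
 * Software kick relay, used when the guest cannot write the VF's notify
 * register directly (host notifier mmap not available): one thread
 * epoll-waits on every vring's kickfd, drains the eventfd counter and
 * forwards the doorbell with ifcvf_notify_queue().
 */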
static void *
notify_relay(void *arg)
{
	int i, kickfd, epfd, nfds = 0;
	uint32_t qid, q_num;
	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
	struct epoll_event ev;
	uint64_t buf;
	int nbytes;
	struct rte_vhost_vring vring;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct ifcvf_hw *hw = &internal->hw;

	q_num = rte_vhost_get_vring_num(internal->vid);

	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
	if (epfd < 0) {
		DRV_LOG(ERR, "failed to create epoll instance.");
		return NULL;
	}
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
			return NULL;
		}
	}

	for (;;) {
		nfds = epoll_wait(epfd, events, q_num, -1);
		if (nfds < 0) {
			if (errno == EINTR)
				continue;
			DRV_LOG(ERR, "epoll_wait failed: %s",
					strerror(errno));
			return NULL;
		}

		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);
			do {
				nbytes = read(kickfd, &buf, 8);
				if (nbytes < 0) {
					if (errno == EINTR ||
					    errno == EWOULDBLOCK ||
					    errno == EAGAIN)
						continue;
					DRV_LOG(INFO, "Error reading "
						"kickfd: %s",
						strerror(errno));
				}
				break;
			} while (1);

			ifcvf_notify_queue(hw, qid);
		}
	}

	return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
	int ret;

	ret = pthread_create(&internal->tid, NULL, notify_relay,
			(void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create notify relay pthread.");
		return -1;
	}

	return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
	void *status;

	if (internal->tid) {
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);
	}
	internal->tid = 0;

	if (internal->epfd >= 0)
		close(internal->epfd);
	internal->epfd = -1;

	return 0;
}

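/*
 * Single state machine for the datapath, serialized by internal->lock:
 * it comes up once both "started" (PCI probe done) and "dev_attached"
 * (vhost connection configured) are set, in the order DMA map ->
 * interrupts -> hardware start -> kick relay, and is torn down in the
 * reverse order as soon as either flag clears.
 */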
static int
update_datapath(struct ifcvf_internal *internal)
{
	int ret;

	rte_spinlock_lock(&internal->lock);

	if (!rte_atomic32_read(&internal->running) &&
	    (rte_atomic32_read(&internal->started) &&
	     rte_atomic32_read(&internal->dev_attached))) {
		ret = ifcvf_dma_map(internal, 1);
		if (ret)
			goto err;

		ret = vdpa_enable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = vdpa_ifcvf_start(internal);
		if (ret)
			goto err;

		ret = setup_notify_relay(internal);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 1);
	} else if (rte_atomic32_read(&internal->running) &&
		   (!rte_atomic32_read(&internal->started) ||
		    !rte_atomic32_read(&internal->dev_attached))) {
		ret = unset_notify_relay(internal);
		if (ret)
			goto err;

		vdpa_ifcvf_stop(internal);

		ret = vdpa_disable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = ifcvf_dma_map(internal, 0);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 0);
	}

	rte_spinlock_unlock(&internal->lock);
	return 0;
err:
	rte_spinlock_unlock(&internal->lock);
	return ret;
}

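/*
 * vDPA ops, invoked by the vhost-user backend. dev_conf runs when the
 * vhost device becomes ready; if mapping the hardware notify area into
 * the guest fails, the datapath stays up and kicks fall back to the
 * software relay.
 */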
static int
ifcvf_dev_config(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	internal->vid = vid;
	rte_atomic32_set(&internal->dev_attached, 1);
	update_datapath(internal);

	if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
		DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

	return 0;
}

static int
ifcvf_dev_close(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->dev_attached, 0);
	update_datapath(internal);

	return 0;
}

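/*
 * Called on feature changes. Once VHOST_F_LOG_ALL is negotiated (live
 * migration has started), the dirty page log is mapped into the VF's
 * container at the fixed IOVA IFCVF_LOG_BASE and hardware logging is
 * switched on.
 */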
static int
ifcvf_set_features(int vid)
{
	uint64_t features;
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base, log_size;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features)) {
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
	}

	return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_dev_fd;
}

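/*
 * Report where queue qid's notify address lives inside the device's
 * VFIO region so vhost can mmap it straight into the guest; the guest
 * then kicks the hardware doorbell without exiting to software.
 */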
static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	int ret;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;

	reg.index = ifcvf_get_notify_region(&internal->hw);
	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
	if (ret) {
		DRV_LOG(ERR, "Failed to get device region info: %s",
				strerror(errno));
		return -1;
	}

	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
	*size = 0x1000;

	return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*queue_num = list->internal->max_queues;

	return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*features = list->internal->features;

	return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
		 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
		 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.get_protocol_features = ifcvf_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = NULL,
	.set_features = ifcvf_set_features,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
};

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
	uint16_t *n = extra_args;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	*n = (uint16_t)strtoul(value, NULL, 0);
	if (*n == USHRT_MAX && errno == ERANGE)
		return -1;

	return 0;
}

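/*
 * PCI probe: parse devargs, and only when vdpa mode is requested set up
 * VFIO, initialize the hardware handle, advertise features (forcing in
 * GUEST_ANNOUNCE, CTRL_VQ, STATUS, PROTOCOL_FEATURES and LOG_ALL while
 * masking IOMMU_PLATFORM), register the vDPA device and mark it started.
 */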
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		struct rte_pci_device *pci_dev)
{
	uint64_t features;
	struct ifcvf_internal *internal = NULL;
	struct internal_list *list = NULL;
	int vdpa_mode = 0;
	struct rte_kvargs *kvlist = NULL;
	int ret = 0;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
			ifcvf_valid_arguments);
	if (kvlist == NULL)
		return 1;

	/* probe only when vdpa mode is specified */
	if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
			&vdpa_mode);
	if (ret < 0 || vdpa_mode == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
	if (list == NULL)
		goto error;

	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
	if (internal == NULL)
		goto error;

	internal->pdev = pci_dev;
	rte_spinlock_init(&internal->lock);

	if (ifcvf_vfio_setup(internal) < 0) {
		DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
		goto error;
	}

	if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
		DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
		goto error;
	}

	internal->max_queues = IFCVF_MAX_QUEUES;
	features = ifcvf_get_features(&internal->hw);
	internal->features = (features &
		~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
		(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
		(1ULL << VIRTIO_NET_F_CTRL_VQ) |
		(1ULL << VIRTIO_NET_F_STATUS) |
		(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
		(1ULL << VHOST_F_LOG_ALL);

	internal->dev_addr.pci_addr = pci_dev->addr;
	internal->dev_addr.type = PCI_ADDR;
	list->internal = internal;

	internal->did = rte_vdpa_register_device(&internal->dev_addr,
				&ifcvf_ops);
	if (internal->did < 0) {
		DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
		goto error;
	}

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_atomic32_set(&internal->started, 1);
	update_datapath(internal);

	rte_kvargs_free(kvlist);
	return 0;

error:
	rte_kvargs_free(kvlist);
	rte_free(list);
	rte_free(internal);
	return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
	struct ifcvf_internal *internal;
	struct internal_list *list;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = find_internal_resource_by_dev(pci_dev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->started, 0);
	update_datapath(internal);

	rte_pci_unmap_device(internal->pdev);
	rte_vfio_container_destroy(internal->vfio_container_fd);
	rte_vdpa_unregister_device(internal->did);

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_free(list);
	rte_free(internal);

	return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as a virtio-net PCI
 * device, and is distinguished by its specific subsystem vendor ID
 * and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
	},

	{ .vendor_id = 0, /* sentinel */
	},
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
	.id_table = pci_id_ifcvf_map,
	.drv_flags = 0,
	.probe = ifcvf_pci_probe,
	.remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
	ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
	if (ifcvf_vdpa_logtype >= 0)
		rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}