1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2018 Intel Corporation
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
14 #include <rte_eal_paging.h>
15 #include <rte_malloc.h>
16 #include <rte_memory.h>
17 #include <rte_bus_pci.h>
18 #include <rte_vhost.h>
20 #include <vdpa_driver.h>
22 #include <rte_spinlock.h>
24 #include <rte_kvargs.h>
25 #include <rte_devargs.h>
27 #include "base/ifcvf.h"
/* Driver log type registered with the EAL; default level is NOTICE. */
RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);

/* Logging helper: every message is prefixed with "IFCVF <func>(): "
 * and a trailing newline is appended automatically.
 */
#define DRV_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
		"IFCVF %s(): " fmt "\n", __func__, ##args)

/* Byte length of a used ring with 'size' entries: the used-element
 * array plus three uint16_t (flags, idx and the trailing event field).
 */
#define IFCVF_USED_RING_LEN(size) \
	((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)

/* devargs keys recognized by this driver. */
#define IFCVF_VDPA_MODE		"vdpa"
#define IFCVF_SW_FALLBACK_LM	"sw-live-migration"

/* Max length of a relay thread name (pthread limit, incl. NUL). */
#define THREAD_NAME_LEN	16

/* NULL-terminated list of the devargs keys accepted above. */
static const char * const ifcvf_valid_arguments[] = {
/* Per-device driver state: VFIO handles, relay threads, lifecycle
 * flags and the software-fallback (mediated) vrings.
 */
struct ifcvf_internal {
	struct rte_pci_device *pdev;
	int vfio_container_fd;
	pthread_t tid;	/* thread for notify relay */
	pthread_t intr_tid;	/* thread for config space change interrupt relay */
	struct rte_vdpa_device *vdev;
	/* lifecycle flags read/written atomically by update_datapath() */
	rte_atomic32_t started;
	rte_atomic32_t dev_attached;
	rte_atomic32_t running;
	/* true while the SW live-migration fallback datapath is active */
	bool sw_fallback_running;
	/* mediated vring for sw fallback */
	struct vring m_vring[IFCVF_MAX_QUEUES * 2];
	/* eventfd for used ring interrupt */
	int intr_fd[IFCVF_MAX_QUEUES * 2];
/* Node of the global probed-device list, tying the list entry to its
 * per-device state.
 */
struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct ifcvf_internal *internal;
/* vdpa device info includes device features and device operations. */
struct rte_vdpa_dev_info {
	struct rte_vdpa_dev_ops *ops;
/* Global list of all probed ifcvf devices, guarded by
 * internal_list_lock.
 */
TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Forward declaration: used by m_ifcvf_stop()/vring_relay() below. */
static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
/* Find the list node whose device state owns 'vdev'.
 * Scans the global list under internal_list_lock; comparison is by
 * vdpa device pointer identity.
 */
static struct internal_list *
find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (vdev == list->internal->vdev) {

	pthread_mutex_unlock(&internal_list_lock);
/* Find the list node whose device state owns 'pdev'.
 * Same scan as find_internal_resource_by_vdev() but matches by PCI
 * address (rte_pci_addr_cmp() returns 0 on equality).
 */
static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (!rte_pci_addr_cmp(&pdev->addr,
			&list->internal->pdev->addr)) {

	pthread_mutex_unlock(&internal_list_lock);
/* Bind the PCI device into a fresh VFIO container and mirror its BAR
 * mappings into internal->hw so the HW layer can program the device.
 * On error the container created here is destroyed.
 */
ifcvf_vfio_setup(struct ifcvf_internal *internal)
	struct rte_pci_device *dev = internal->pdev;
	char devname[RTE_DEV_NAME_MAX_LEN] = {0};

	/* pre-set all fds to -1 so the error path knows what was opened */
	internal->vfio_dev_fd = -1;
	internal->vfio_group_fd = -1;
	internal->vfio_container_fd = -1;

	/* resolve the device's IOMMU group from its sysfs name */
	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
	ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
		DRV_LOG(ERR, "%s failed to get IOMMU group", devname);

	internal->vfio_container_fd = rte_vfio_container_create();
	if (internal->vfio_container_fd < 0)

	internal->vfio_group_fd = rte_vfio_container_group_bind(
			internal->vfio_container_fd, iommu_group_num);
	if (internal->vfio_group_fd < 0)

	if (rte_pci_map_device(dev))

	internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);

	/* mirror the mapped BAR addresses/lengths into the hw struct */
	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
		internal->hw.mem_resource[i].addr =
			internal->pdev->mem_resource[i].addr;
		internal->hw.mem_resource[i].phys_addr =
			internal->pdev->mem_resource[i].phys_addr;
		internal->hw.mem_resource[i].len =
			internal->pdev->mem_resource[i].len;

	/* error path: tear down the container created above */
	rte_vfio_container_destroy(internal->vfio_container_fd);
/* DMA map (do_map == true) or unmap (false) every guest memory region
 * into/from the device's VFIO container, using the vhost memory table
 * HVA -> GPA pairs as IOVA mappings.
 */
ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map)
	struct rte_vhost_memory *mem = NULL;
	int vfio_container_fd;

	ret = rte_vhost_get_mem_table(internal->vid, &mem);
		DRV_LOG(ERR, "failed to get VM memory layout.");

	vfio_container_fd = internal->vfio_container_fd;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg;

		reg = &mem->regions[i];
		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
			do_map ? "DMA map" : "DMA unmap", i,
			reg->host_user_addr, reg->guest_phys_addr, reg->size);

			/* map: IOVA == guest physical address */
			ret = rte_vfio_container_dma_map(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				DRV_LOG(ERR, "DMA map failed.");

			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				DRV_LOG(ERR, "DMA unmap failed.");
/* Translate a host virtual address into the guest physical address by
 * scanning the vhost memory table of 'vid' for the containing region.
 */
hva_to_gpa(int vid, uint64_t hva)
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *reg;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];

		/* hva falls inside [host_user_addr, host_user_addr + size) */
		if (hva >= reg->host_user_addr &&
				hva < reg->host_user_addr + reg->size) {
			gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
/* Program the HW with the negotiated features and, for each vring, the
 * guest-physical addresses of the desc/avail/used rings, the ring size
 * and the saved avail/used indexes, then start the hardware datapath.
 */
vdpa_ifcvf_start(struct ifcvf_internal *internal)
	struct ifcvf_hw *hw = &internal->hw;
	struct rte_vhost_vring vq;

	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);
		/* vhost hands back HVAs; the device needs GPAs */
		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
		hw->vring[i].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
			DRV_LOG(ERR, "Fail to get GPA for available ring.");
		hw->vring[i].avail = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
			DRV_LOG(ERR, "Fail to get GPA for used ring.");
		hw->vring[i].used = gpa;

		hw->vring[i].size = vq.size;
		/* resume from the indexes vhost last saw */
		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);

	return ifcvf_start_hw(&internal->hw);
/* Stop the HW datapath and hand the final ring indexes back to vhost.
 * For blk devices, drain each queue first so no in-flight request is
 * lost; if dirty logging was negotiated, disable it and mark the used
 * rings dirty on the device's behalf.
 */
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
	struct ifcvf_hw *hw = &internal->hw;
	uint64_t features = 0;
	uint64_t log_base = 0, log_size = 0;

	/* to make sure no packet is lost for blk device
	 * do not stop until last_avail_idx == last_used_idx
	 */
	if (internal->hw.device_type == IFCVF_BLK) {
		for (i = 0; i < hw->nr_vring; i++) {
			/* LM ring state word: low 16 bits last_avail_idx,
			 * high 16 bits last_used_idx
			 */
			if (hw->lm_cfg != NULL)
				ring_state = *(u32 *)(hw->lm_cfg +
					IFCVF_LM_RING_STATE_OFFSET +
					i * IFCVF_LM_CFG_SIZE);
			hw->vring[i].last_avail_idx =
				(u16)(ring_state & IFCVF_16_BIT_MASK);
			hw->vring[i].last_used_idx =
				(u16)(ring_state >> 16);
			if (hw->vring[i].last_avail_idx !=
				hw->vring[i].last_used_idx) {
				/* kick so HW finishes outstanding requests */
				ifcvf_notify_queue(hw, i);
		} while (hw->vring[i].last_avail_idx !=
			hw->vring[i].last_used_idx);

	/* give the final indexes back to vhost for a later restart */
	for (i = 0; i < hw->nr_vring; i++)
		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);

	rte_vhost_get_negotiated_features(vid, &features);
	if (RTE_VHOST_NEED_LOG(features)) {
		ifcvf_disable_logging(hw);
		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		/*
		 * IFCVF marks dirty memory pages for only packet buffer,
		 * SW helps to mark the used ring as dirty after device stops.
		 */
		for (i = 0; i < hw->nr_vring; i++) {
			len = IFCVF_USED_RING_LEN(hw->vring[i].size);
			rte_vhost_log_used_vring(vid, i, 0, len);
/* Size of the VFIO_DEVICE_SET_IRQS buffer: header + one eventfd per
 * vring plus one for the config-change vector.
 */
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))

/* Wire up MSI-X vectors via VFIO. Vector 0 carries config-space change
 * notification; each vring vector is either the guest's callfd
 * (direct) or, when m_rx relay is requested, a private eventfd so the
 * relay thread can intercept used-ring interrupts.
 */
vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;
	struct rte_vhost_vring vring;

	nr_vring = rte_vhost_get_vring_num(internal->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;

	fd_ptr = (int *)&irq_set->data;
	/* The first interrupt is for the configure space change notification */
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
		rte_intr_fd_get(internal->pdev->intr_handle);

	for (i = 0; i < nr_vring; i++)
		internal->intr_fd[i] = -1;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
		/* even queue index == rx queue for net devices */
			((i & 1) == 0 || internal->hw.device_type == IFCVF_BLK)) {
			/* For the net we only need to relay rx queue,
			 * which will change the mem of VM.
			 * For the blk we need to relay all the read cmd
			 */
			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
				DRV_LOG(ERR, "can't setup eventfd: %s",
			internal->intr_fd[i] = fd;
			fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
/* Tear down all MSI-X vectors (DATA_NONE + TRIGGER clears them) and
 * close any relay eventfds created by vdpa_enable_vfio_intr().
 */
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;

	nr_vring = rte_vhost_get_vring_num(internal->vid);
	for (i = 0; i < nr_vring; i++) {
		/* close only the eventfds we created; -1 means direct callfd */
		if (internal->intr_fd[i] >= 0)
			close(internal->intr_fd[i]);
		internal->intr_fd[i] = -1;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
/* Relay thread body: epoll-wait on every vring's kickfd and forward
 * each guest kick to the device doorbell via ifcvf_notify_queue().
 */
notify_relay(void *arg)
	int i, kickfd, epfd, nfds = 0;
	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
	struct epoll_event ev;
	struct rte_vhost_vring vring;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct ifcvf_hw *hw = &internal->hw;

	q_num = rte_vhost_get_vring_num(internal->vid);

	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
		DRV_LOG(ERR, "failed to create epoll instance.");
	/* published so unset_notify_relay() can close it */
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
		/* pack qid in low 32 bits, kickfd in high 32 bits */
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));

		nfds = epoll_wait(epfd, events, q_num, -1);
			DRV_LOG(ERR, "epoll_wait return fail\n");

		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);
			/* drain the eventfd counter before kicking HW */
				nbytes = read(kickfd, &buf, 8);
					if (errno == EINTR ||
						errno == EWOULDBLOCK ||
						DRV_LOG(INFO, "Error reading "

			ifcvf_notify_queue(hw, qid);
/* Spawn the notify_relay() control thread, named per-vid. */
setup_notify_relay(struct ifcvf_internal *internal)
	char name[THREAD_NAME_LEN];

	snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
	ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
		DRV_LOG(ERR, "failed to create notify relay pthread.");
/* Stop the notify relay thread (cancel + join) and close its epoll fd. */
unset_notify_relay(struct ifcvf_internal *internal)
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);

	if (internal->epfd >= 0)
		close(internal->epfd);
/* Forward a device config-space change to the guest via the vhost
 * slave channel.
 */
virtio_interrupt_handler(struct ifcvf_internal *internal)
	int vid = internal->vid;

	ret = rte_vhost_slave_config_change(vid, 1);
		DRV_LOG(ERR, "failed to notify the guest about configuration space change.");
/* Relay thread body for config-space change interrupts: epoll-wait on
 * the device interrupt fd, drain it, then notify the guest through
 * virtio_interrupt_handler().
 */
intr_relay(void *arg)
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct epoll_event csc_event;
	struct epoll_event ev;
	int csc_epfd, csc_val = 0;

	csc_epfd = epoll_create(1);
		DRV_LOG(ERR, "failed to create epoll for config space change.");

	ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
	ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle);
	if (epoll_ctl(csc_epfd, EPOLL_CTL_ADD,
		rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) {
		DRV_LOG(ERR, "epoll add error: %s", strerror(errno));

	/* published so unset_intr_relay() can close it */
	internal->csc_epfd = csc_epfd;

		csc_val = epoll_wait(csc_epfd, &csc_event, 1, -1);
			DRV_LOG(ERR, "epoll_wait return fail.");
		} else if (csc_val == 0) {
			/* drain the eventfd before handling the event */
			nbytes = read(csc_event.data.fd, &buf, 8);
				if (errno == EINTR ||
					errno == EWOULDBLOCK ||
					DRV_LOG(ERR, "Error reading from file descriptor %d: %s\n",
			} else if (nbytes == 0) {
				DRV_LOG(ERR, "Read nothing from file descriptor %d\n",
				virtio_interrupt_handler(internal);

	internal->csc_epfd = -1;
/* Spawn the intr_relay() control thread, named per-vid. */
setup_intr_relay(struct ifcvf_internal *internal)
	char name[THREAD_NAME_LEN];

	snprintf(name, sizeof(name), "ifc-intr-%d", internal->vid);
	ret = rte_ctrl_thread_create(&internal->intr_tid, name, NULL,
			intr_relay, (void *)internal);
		DRV_LOG(ERR, "failed to create notify relay pthread.");
/* Stop the config-change relay thread and close its epoll fd. */
unset_intr_relay(struct ifcvf_internal *internal)
	if (internal->intr_tid) {
		pthread_cancel(internal->intr_tid);
		pthread_join(internal->intr_tid, &status);
	internal->intr_tid = 0;

	if (internal->csc_epfd >= 0)
		close(internal->csc_epfd);
	internal->csc_epfd = -1;
/* Reconcile the datapath with the lifecycle flags, under the device
 * spinlock:
 *  - not running but started && attached -> bring up DMA map, MSI-X,
 *    HW datapath and both relay threads, then set running;
 *  - running but stopped or detached    -> tear everything down in
 *    reverse order and clear running.
 */
update_datapath(struct ifcvf_internal *internal)
	rte_spinlock_lock(&internal->lock);

	if (!rte_atomic32_read(&internal->running) &&
	    (rte_atomic32_read(&internal->started) &&
	     rte_atomic32_read(&internal->dev_attached))) {
		ret = ifcvf_dma_map(internal, true);

		ret = vdpa_enable_vfio_intr(internal, false);

		ret = vdpa_ifcvf_start(internal);

		ret = setup_notify_relay(internal);

		ret = setup_intr_relay(internal);

		rte_atomic32_set(&internal->running, 1);
	} else if (rte_atomic32_read(&internal->running) &&
		   (!rte_atomic32_read(&internal->started) ||
		    !rte_atomic32_read(&internal->dev_attached))) {
		unset_intr_relay(internal);

		ret = unset_notify_relay(internal);

		vdpa_ifcvf_stop(internal);

		ret = vdpa_disable_vfio_intr(internal);

		ret = ifcvf_dma_map(internal, false);

		rte_atomic32_set(&internal->running, 0);

	rte_spinlock_unlock(&internal->lock);

	rte_spinlock_unlock(&internal->lock);
/* Start the software-fallback datapath: allocate page-aligned mediated
 * vrings, DMA-map them at the reserved IFCVF_MEDIATED_VRING IOVA, and
 * point the HW at guest rings for desc/avail but at the mediated used
 * ring wherever relay is needed (all blk queues; net rx queues).
 */
m_ifcvf_start(struct ifcvf_internal *internal)
	struct ifcvf_hw *hw = &internal->hw;
	uint32_t i, nr_vring;
	struct rte_vhost_vring vq;
	uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;

	memset(&vq, 0, sizeof(vq));

	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);

		/* one page-aligned buffer holds the whole mediated ring */
		size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
				rte_mem_page_size());
		vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
		vring_init(&internal->m_vring[i], vq.size, vring_buf,
				rte_mem_page_size());

		ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
			(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
			DRV_LOG(ERR, "mediated vring DMA map failed.");

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
		hw->vring[i].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
			DRV_LOG(ERR, "Fail to get GPA for available ring.");
		hw->vring[i].avail = gpa;

		/* NET: Direct I/O for Tx queue, relay for Rx queue
		 * BLK: relay every queue
		 */
		if ((internal->hw.device_type == IFCVF_NET) && (i & 1)) {
			gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
				DRV_LOG(ERR, "Fail to get GPA for used ring.");
			hw->vring[i].used = gpa;
			/* relay case: HW writes the mediated used ring */
			hw->vring[i].used = m_vring_iova +
				(char *)internal->m_vring[i].used -
				(char *)internal->m_vring[i].desc;

		hw->vring[i].size = vq.size;

		/* seed the mediated ring with vhost's current indexes */
		rte_vhost_get_vring_base(vid, i,
				&internal->m_vring[i].avail->idx,
				&internal->m_vring[i].used->idx);

		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);

		m_vring_iova += size;
	hw->nr_vring = nr_vring;

	return ifcvf_start_hw(&internal->hw);

	/* error path: free any mediated rings already allocated */
	for (i = 0; i < nr_vring; i++)
		rte_free(internal->m_vring[i].desc);
/* Stop the software-fallback datapath: flush remaining used entries to
 * the guest, log the used rings dirty, unmap and free the mediated
 * vrings, and hand the final indexes back to vhost.
 */
m_ifcvf_stop(struct ifcvf_internal *internal)
	struct rte_vhost_vring vq;
	struct ifcvf_hw *hw = &internal->hw;
	uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;

	for (i = 0; i < hw->nr_vring; i++) {
		/* synchronize remaining new used entries if any */
		if (internal->hw.device_type == IFCVF_NET) {
			update_used_ring(internal, i);
		} else if (internal->hw.device_type == IFCVF_BLK) {
			update_used_ring(internal, i);

		rte_vhost_get_vhost_vring(vid, i, &vq);
		len = IFCVF_USED_RING_LEN(vq.size);
		/* mark the used ring dirty for live migration */
		rte_vhost_log_used_vring(vid, i, 0, len);

		size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
				rte_mem_page_size());
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
			(uint64_t)(uintptr_t)internal->m_vring[i].desc,

		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);
		rte_free(internal->m_vring[i].desc);
		m_vring_iova += size;
/* Copy new entries from the mediated used ring to the guest's used
 * ring for queue 'qid', then interrupt the guest via its callfd.
 */
update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
	rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
	rte_vhost_vring_call(internal->vid, qid);
/* SW-fallback relay thread body: epoll on both the guest kickfds and
 * the per-queue device interrupt eventfds. A kick is forwarded to the
 * device doorbell; a device interrupt triggers a used-ring relay to
 * the guest.
 */
vring_relay(void *arg)
	int i, vid, epfd, fd, nfds;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct rte_vhost_vring vring;
	struct epoll_event events[IFCVF_MAX_QUEUES * 4];
	struct epoll_event ev;

	q_num = rte_vhost_get_vring_num(vid);

	/* add notify fd and interrupt fd to epoll */
	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
		DRV_LOG(ERR, "failed to create epoll instance.");
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(vid, qid, &vring);
		/* bit 0 clear == kickfd event; qid stored shifted left */
		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));

	for (qid = 0; qid < q_num; qid += 1) {
		/* net tx queues are direct I/O -- no interrupt relay */
		if ((internal->hw.device_type == IFCVF_NET) && (qid & 1))
		ev.events = EPOLLIN | EPOLLPRI;
		/* leave a flag to mark it's for interrupt */
		ev.data.u64 = 1 | qid << 1 |
			(uint64_t)internal->intr_fd[qid] << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
		update_used_ring(internal, qid);

	/* start relay with a first kick */
	for (qid = 0; qid < q_num; qid++)
		ifcvf_notify_queue(&internal->hw, qid);

	/* listen to the events and react accordingly */
		nfds = epoll_wait(epfd, events, q_num * 2, -1);
			DRV_LOG(ERR, "epoll_wait return fail.");

		for (i = 0; i < nfds; i++) {
			fd = (uint32_t)(events[i].data.u64 >> 32);
			/* drain the eventfd counter */
				nbytes = read(fd, &buf, 8);
					if (errno == EINTR ||
						errno == EWOULDBLOCK ||
						DRV_LOG(INFO, "Error reading "

			qid = events[i].data.u32 >> 1;

			/* bit 0 set == device interrupt -> relay used ring */
			if (events[i].data.u32 & 1)
				update_used_ring(internal, qid);
				ifcvf_notify_queue(&internal->hw, qid);
/* Spawn the vring_relay() control thread, named per-vid. */
setup_vring_relay(struct ifcvf_internal *internal)
	char name[THREAD_NAME_LEN];

	snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
	ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
		DRV_LOG(ERR, "failed to create ring relay pthread.");
/* Stop the vring relay thread (cancel + join) and close its epoll fd. */
unset_vring_relay(struct ifcvf_internal *internal)
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);

	if (internal->epfd >= 0)
		close(internal->epfd);
/* Switch a live device from the direct HW datapath to the mediated SW
 * datapath (used for software-assisted live migration): stop direct
 * I/O, re-arm interrupts in relay mode, start the mediated rings and
 * the vring relay thread.
 */
ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
	int vid = internal->vid;

	/* stop the direct IO data path */
	unset_notify_relay(internal);
	vdpa_ifcvf_stop(internal);

	unset_intr_relay(internal);

	vdpa_disable_vfio_intr(internal);

	/* -ENOTSUP just means the frontend lacks host-notifier support */
	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
	if (ret && ret != -ENOTSUP)

	/* set up interrupt for interrupt relay */
	ret = vdpa_enable_vfio_intr(internal, true);

	ret = m_ifcvf_start(internal);

	/* set up vring relay thread */
	ret = setup_vring_relay(internal);

	rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);

	internal->sw_fallback_running = true;

	/* error unwinding below: undo in reverse order */
	m_ifcvf_stop(internal);

	vdpa_disable_vfio_intr(internal);

	ifcvf_dma_map(internal, false);
/* vDPA dev_conf callback: attach the vhost session 'vid' to the device
 * and bring up the datapath; falls back to software relay when host
 * notifier setup is unavailable.
 */
ifcvf_dev_config(int vid)
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;
	internal->vid = vid;
	rte_atomic32_set(&internal->dev_attached, 1);
	update_datapath(internal);

	if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
		DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
				vdev->device->name);

	internal->configured = 1;
/* vDPA dev_close callback: tear down either the SW-fallback datapath
 * (if it is running) or the regular HW datapath for session 'vid'.
 */
ifcvf_dev_close(int vid)
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;

	if (internal->sw_fallback_running) {
		/* unset ring relay */
		unset_vring_relay(internal);

		m_ifcvf_stop(internal);

		/* remove interrupt setting */
		vdpa_disable_vfio_intr(internal);

		/* unset DMA map for guest memory */
		ifcvf_dma_map(internal, false);

		internal->sw_fallback_running = false;
		/* regular path: let update_datapath() do the teardown */
		rte_atomic32_set(&internal->dev_attached, 0);
		update_datapath(internal);

	internal->configured = 0;
/* vDPA set_features callback: when dirty logging is negotiated, either
 * switch over to the SW fallback (sw_lm mode) or map the log area into
 * the VFIO container and enable HW dirty-page logging.
 */
ifcvf_set_features(int vid)
	uint64_t features = 0;
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base = 0, log_size = 0;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	/* nothing to do unless live-migration logging was requested */
	if (!RTE_VHOST_NEED_LOG(features))

	if (internal->sw_lm) {
		ifcvf_sw_fallback_switchover(internal);
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
/* vDPA callback: return the VFIO group fd of the device backing 'vid'. */
ifcvf_get_vfio_group_fd(int vid)
	struct rte_vdpa_device *vdev;
	struct internal_list *list;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	return list->internal->vfio_group_fd;
/* vDPA callback: return the VFIO device fd of the device backing 'vid'. */
ifcvf_get_vfio_device_fd(int vid)
	struct rte_vdpa_device *vdev;
	struct internal_list *list;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	return list->internal->vfio_dev_fd;
/* vDPA callback: report the BAR offset and size of queue 'qid's
 * doorbell (notify) area so vhost can mmap it for the guest.
 */
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct vfio_region_info reg = { .argsz = sizeof(reg) };

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;

	/* query the BAR region holding the notify area via VFIO */
	reg.index = ifcvf_get_notify_region(&internal->hw);
	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
		DRV_LOG(ERR, "Get not get device region info: %s",

	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
/* vDPA callback: report the max number of queue pairs this device
 * supports.
 */
ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
	struct internal_list *list;

	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	*queue_num = list->internal->max_queues;
/* vDPA callback: report the device feature bits cached at probe time. */
ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
	struct internal_list *list;

	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	*features = list->internal->features;
/* vhost-user protocol features common to net and blk devices. */
#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
		 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
		 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
		 1ULL << VHOST_USER_PROTOCOL_F_STATUS)

/* extra protocol feature needed only by blk devices (config space). */
#define VDPA_BLK_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_CONFIG)

/* vDPA callback (net): report supported vhost-user protocol features. */
ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
/* vDPA callback: enable or disable a single vring. Writes the queue
 * select/enable registers directly and re-arms or clears the VFIO
 * interrupts when the enable state actually changes.
 */
ifcvf_set_vring_state(int vid, int vring, int state)
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct ifcvf_hw *hw;
	struct ifcvf_pci_common_cfg *cfg;

	vdev = rte_vhost_get_vdpa_device(vid);
	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;
	/* bounds-check the ring index against queue pairs * 2 */
	if (vring < 0 || vring >= internal->max_queues * 2) {
		DRV_LOG(ERR, "Vring index %d not correct", vring);

	/* nothing to program before dev_conf has run */
	if (!internal->configured)

	cfg = hw->common_cfg;
	IFCVF_WRITE_REG16(vring, &cfg->queue_select);
	IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);

	/* only touch MSI-X setup on an actual state transition */
	if (!state && hw->vring[vring].enable) {
		ret = vdpa_disable_vfio_intr(internal);

	if (state && !hw->vring[vring].enable) {
		ret = vdpa_enable_vfio_intr(internal, false);

	hw->vring[vring].enable = !!state;
/* vDPA callback: report whether the device is a blk or net device. */
ifcvf_get_device_type(struct rte_vdpa_device *vdev,
	struct ifcvf_internal *internal;
	struct internal_list *list;

	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;

	if (internal->hw.device_type == IFCVF_BLK)
		*type = RTE_VHOST_VDPA_DEVICE_TYPE_BLK;
		*type = RTE_VHOST_VDPA_DEVICE_TYPE_NET;
/* vDPA operations table registered for net-type IFC VF devices. */
static struct rte_vdpa_dev_ops ifcvf_net_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.get_protocol_features = ifcvf_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = ifcvf_set_vring_state,
	.set_features = ifcvf_set_features,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
	.get_dev_type = ifcvf_get_device_type,
/* Kvargs handler: parse a numeric devarg string into a uint16_t.
 *
 * @param key         devarg key (unused)
 * @param value       string to convert (base auto-detected by strtoul)
 * @param extra_args  pointer to the uint16_t receiving the result
 * @return 0 on success, -EINVAL on NULL input, -1 on overflow
 */
static int
open_int(const char *key, const char *value, void *extra_args)
{
	uint16_t *n = extra_args;

	(void)key;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	/* Reset errno before strtoul: a stale ERANGE left over from an
	 * earlier libc call would otherwise make a legitimate value of
	 * USHRT_MAX look like an overflow.
	 */
	errno = 0;
	*n = (uint16_t)strtoul(value, NULL, 0);
	if (*n == USHRT_MAX && errno == ERANGE)
		return -1;

	return 0;
}
/* Derive the virtio device id from the PCI ids: transitional devices
 * (0x1000-0x103f) carry it in the subsystem id, modern devices encode
 * it as PCI device id minus 0x1040.
 */
ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev)
	uint16_t pci_device_id = pci_dev->id.device_id;

	/* virtio PCI device ids live in 0x1000..0x107f */
	if (pci_device_id < 0x1000 || pci_device_id > 0x107f) {
		/* NOTE(review): DRV_LOG already appends '\n'; the extra one
		 * here produces a blank log line.
		 */
		DRV_LOG(ERR, "Probe device is not a virtio device\n");

	if (pci_device_id < 0x1040) {
		/* Transitional devices: use the PCI subsystem device id as
		 * virtio device id, same as legacy driver always did.
		 */
		device_id = pci_dev->id.subsystem_device_id;
		/* Modern devices: simply use PCI device id,
		 * but start from 0x1040.
		 */
		device_id = pci_device_id - 0x1040;
/* vDPA get_config callback (blk): copy the device's virtio-blk config
 * space into 'config' byte by byte (the BAR cannot be read with wide
 * accesses) and log the interesting fields.
 */
ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t size)
	struct virtio_blk_config *dev_cfg;
	struct ifcvf_internal *internal;
	struct rte_vdpa_device *vdev;
	struct internal_list *list;
	uint64_t capacity = 0;

	/* caller must supply a buffer of exactly the blk config size */
	if (size != sizeof(struct virtio_blk_config)) {
		DRV_LOG(ERR, "Invalid len: %u, required: %u",
			size, (uint32_t)sizeof(struct virtio_blk_config));

	vdev = rte_vhost_get_vdpa_device(vid);
		DRV_LOG(ERR, "Invalid vDPA device vid: %d", vid);

	list = find_internal_resource_by_vdev(vdev);
		DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);

	internal = list->internal;

	for (i = 0; i < sizeof(struct virtio_blk_config); i++)
		config[i] = *((u8 *)internal->hw.blk_cfg + i);

	dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;

	/* cannot read 64-bit register in one attempt, so read byte by byte. */
	for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
		byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
		capacity |= (uint64_t)*byte << (i * 8);
	/* The capacity is number of sectors in 512-byte.
	 * So right shift 1 bit  we get in K,
	 * another right shift 10 bits we get in M,
	 * right shift 10 more bits, we get in G.
	 * To show capacity in G, we right shift 21 bits in total.
	 */
	DRV_LOG(DEBUG, "capacity  : %"PRIu64"G", capacity >> 21);

	DRV_LOG(DEBUG, "size_max  : 0x%08x", dev_cfg->size_max);
	DRV_LOG(DEBUG, "seg_max   : 0x%08x", dev_cfg->seg_max);
	DRV_LOG(DEBUG, "blk_size  : 0x%08x", dev_cfg->blk_size);
	DRV_LOG(DEBUG, "geometry");
	DRV_LOG(DEBUG, "    cylinders: %u", dev_cfg->geometry.cylinders);
	DRV_LOG(DEBUG, "    heads    : %u", dev_cfg->geometry.heads);
	DRV_LOG(DEBUG, "    sectors  : %u", dev_cfg->geometry.sectors);
	DRV_LOG(DEBUG, "num_queues: 0x%08x", dev_cfg->num_queues);

	DRV_LOG(DEBUG, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]\n",
		config[0], config[1], config[2], config[3], config[4],
		config[5], config[6], config[7]);
/* vDPA callback (blk): common protocol features plus the blk-only
 * CONFIG feature.
 */
ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	*features |= VDPA_BLK_PROTOCOL_FEATURES;
/* vDPA operations table registered for blk-type IFC VF devices. */
static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.set_features = ifcvf_set_features,
	.get_protocol_features = ifcvf_blk_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = ifcvf_set_vring_state,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
	.get_config = ifcvf_blk_get_config,
	.get_dev_type = ifcvf_get_device_type,
/* Per-device-type feature masks and ops tables, indexed by device
 * type (net entry first, blk entry second).
 * NOTE(review): this table appears file-local; consider 'static' --
 * confirm no other translation unit references it.
 */
struct rte_vdpa_dev_info dev_info[] = {
		.features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
			    (1ULL << VIRTIO_NET_F_CTRL_VQ) |
			    (1ULL << VIRTIO_NET_F_STATUS) |
			    (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
			    (1ULL << VHOST_F_LOG_ALL),
		.ops = &ifcvf_net_ops,
		.features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
			    (1ULL << VHOST_F_LOG_ALL),
		.ops = &ifcvf_blk_ops,
/* Probe an IFC VF and expose it as a vdpa device.
 *
 * Runs only in the EAL primary process and only claims the device when the
 * "vdpa=1" devarg is present, so the same PCI ID can still be taken by the
 * regular virtio PMD. On success the device is registered with the vhost
 * vdpa framework, appended to internal_list, and the datapath is started
 * via update_datapath().
 *
 * NOTE(review): several error-path lines (labels, returns, rte_free calls)
 * are outside this view; comments below describe only the visible flow.
 */
1498 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1499 struct rte_pci_device *pci_dev)
1502 struct ifcvf_internal *internal = NULL;
1503 struct internal_list *list = NULL;
1505 int sw_fallback_lm = 0;
1506 struct rte_kvargs *kvlist = NULL;
1509 uint64_t capacity = 0;
/* vdpa probing is a primary-process-only operation */
1513 if (rte_eal_process_type() != RTE_PROC_PRIMARY)
/* no devargs at all -> nothing selected us; do not claim the device */
1516 if (!pci_dev->device.devargs)
1519 kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1520 ifcvf_valid_arguments);
1524 /* probe only when vdpa mode is specified */
1525 if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1526 rte_kvargs_free(kvlist);
/* "vdpa" must parse as a non-zero integer for us to take the device */
1530 ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1532 if (ret < 0 || vdpa_mode == 0) {
1533 rte_kvargs_free(kvlist);
1537 list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1541 internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1542 if (internal == NULL)
1545 internal->pdev = pci_dev;
1546 rte_spinlock_init(&internal->lock);
/* map BARs and build the VFIO container/group used for guest DMA */
1548 if (ifcvf_vfio_setup(internal) < 0) {
1549 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1553 if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1554 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1558 internal->configured = 0;
1559 internal->max_queues = IFCVF_MAX_QUEUES;
1560 features = ifcvf_get_features(&internal->hw);
1562 device_id = ifcvf_pci_get_device_type(pci_dev);
1563 if (device_id < 0) {
1564 DRV_LOG(ERR, "failed to get device %s type", pci_dev->name);
/* net vs blk: in both cases mask off VIRTIO_F_IOMMU_PLATFORM (the vDPA
 * framework handles the IOMMU through VFIO) and OR in the per-type bits
 * from dev_info[]. */
1568 if (device_id == VIRTIO_ID_NET) {
1569 internal->hw.device_type = IFCVF_NET;
1570 internal->features = features &
1571 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1572 internal->features |= dev_info[IFCVF_NET].features;
1573 } else if (device_id == VIRTIO_ID_BLOCK) {
1574 internal->hw.device_type = IFCVF_BLK;
1575 internal->features = features &
1576 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1577 internal->features |= dev_info[IFCVF_BLK].features;
1579 /* cannot read 64-bit register in one attempt,
1580 * so read byte by byte.
1582 for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1583 byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1584 capacity |= (uint64_t)*byte << (i * 8);
1586 /* The capacity is number of sectors in 512-byte.
1587 * So right shift 1 bit we get in K,
1588 * another right shift 10 bits we get in M,
1589 * right shift 10 more bits, we get in G.
1590 * To show capacity in G, we right shift 21 bits in total.
1592 DRV_LOG(DEBUG, "capacity : %"PRIu64"G", capacity >> 21);
/* dump the rest of the virtio-blk config space for diagnostics */
1594 DRV_LOG(DEBUG, "size_max : 0x%08x",
1595 internal->hw.blk_cfg->size_max);
1596 DRV_LOG(DEBUG, "seg_max : 0x%08x",
1597 internal->hw.blk_cfg->seg_max);
1598 DRV_LOG(DEBUG, "blk_size : 0x%08x",
1599 internal->hw.blk_cfg->blk_size);
1600 DRV_LOG(DEBUG, "geometry");
1601 DRV_LOG(DEBUG, " cylinders: %u",
1602 internal->hw.blk_cfg->geometry.cylinders);
1603 DRV_LOG(DEBUG, " heads : %u",
1604 internal->hw.blk_cfg->geometry.heads);
1605 DRV_LOG(DEBUG, " sectors : %u",
1606 internal->hw.blk_cfg->geometry.sectors);
1607 DRV_LOG(DEBUG, "num_queues: 0x%08x",
1608 internal->hw.blk_cfg->num_queues);
1611 list->internal = internal;
/* optional "sw-live-migration" devarg enables the SW relay fallback */
1613 if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1614 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1615 &open_int, &sw_fallback_lm);
1619 internal->sw_lm = sw_fallback_lm;
/* register with the vhost vdpa framework using the type-specific ops */
1621 internal->vdev = rte_vdpa_register_device(&pci_dev->device,
1622 dev_info[internal->hw.device_type].ops);
1623 if (internal->vdev == NULL) {
1624 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
/* publish the device on the global list under the list lock */
1628 pthread_mutex_lock(&internal_list_lock);
1629 TAILQ_INSERT_TAIL(&internal_list, list, next);
1630 pthread_mutex_unlock(&internal_list_lock);
/* mark started and kick the datapath state machine */
1632 rte_atomic32_set(&internal->started, 1);
1633 update_datapath(internal);
1635 rte_kvargs_free(kvlist);
/* error path: release kvargs (further cleanup is outside this view) */
1639 rte_kvargs_free(kvlist);
/* Remove an IFCVF vdpa device: stop the datapath, unmap the PCI BARs,
 * destroy the VFIO container, unregister from the vdpa framework and drop
 * the device from internal_list. Primary-process-only, mirroring probe.
 *
 * NOTE(review): the trailing rte_free()/return lines are outside this view.
 */
1646 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1648 struct ifcvf_internal *internal;
1649 struct internal_list *list;
1651 if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1654 list = find_internal_resource_by_dev(pci_dev);
1656 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1660 internal = list->internal;
/* clearing 'started' makes update_datapath() tear the relay threads down */
1661 rte_atomic32_set(&internal->started, 0);
1662 update_datapath(internal);
1664 rte_pci_unmap_device(internal->pdev);
1665 rte_vfio_container_destroy(internal->vfio_container_fd);
1666 rte_vdpa_unregister_device(internal->vdev);
/* unlink from the global list under the same lock probe used to insert */
1668 pthread_mutex_lock(&internal_list_lock);
1669 TAILQ_REMOVE(&internal_list, list, next);
1670 pthread_mutex_unlock(&internal_list_lock);
1679 * IFCVF has the same vendor ID and device ID as virtio net PCI
1680 * device, with its specific subsystem vendor ID and device ID.
/* PCI ID table matched by this driver. IFCVF reuses the virtio vendor and
 * device IDs, so the subsystem vendor/device IDs are what actually
 * distinguish an IFC VF from a plain virtio device. */
1682 static const struct rte_pci_id pci_id_ifcvf_map[] = {
/* IFC net VF */
1683 { .class_id = RTE_CLASS_ANY_ID,
1684 .vendor_id = IFCVF_VENDOR_ID,
1685 .device_id = IFCVF_NET_DEVICE_ID,
1686 .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1687 .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
/* IFC block VF, transitional device ID */
1690 { .class_id = RTE_CLASS_ANY_ID,
1691 .vendor_id = IFCVF_VENDOR_ID,
1692 .device_id = IFCVF_BLK_TRANSITIONAL_DEVICE_ID,
1693 .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1694 .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
/* IFC block VF, modern (virtio 1.x) device ID */
1697 { .class_id = RTE_CLASS_ANY_ID,
1698 .vendor_id = IFCVF_VENDOR_ID,
1699 .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
1700 .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1701 .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
/* zero vendor_id terminates the table */
1704 { .vendor_id = 0, /* sentinel */
/* PCI driver descriptor tying the ID table to the probe/remove hooks
 * registered below. */
1708 static struct rte_pci_driver rte_ifcvf_vdpa = {
1709 .id_table = pci_id_ifcvf_map,
1711 .probe = ifcvf_pci_probe,
1712 .remove = ifcvf_pci_remove,
/* Register the PMD as "net_ifcvf", export its PCI ID table for tooling
 * (e.g. dpdk-pmdinfo), and declare that the device must be bound to the
 * vfio-pci kernel driver. */
1715 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1716 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1717 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");