1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2018 Intel Corporation
10 #include <sys/epoll.h>
11 #include <linux/virtio_net.h>
14 #include <rte_eal_paging.h>
15 #include <rte_malloc.h>
16 #include <rte_memory.h>
17 #include <rte_bus_pci.h>
18 #include <rte_vhost.h>
20 #include <vdpa_driver.h>
22 #include <rte_spinlock.h>
24 #include <rte_kvargs.h>
25 #include <rte_devargs.h>
27 #include "base/ifcvf.h"
29 RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
30 #define DRV_LOG(level, fmt, args...) \
31 rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
32 "IFCVF %s(): " fmt "\n", __func__, ##args)
34 #define IFCVF_USED_RING_LEN(size) \
35 ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
37 #define IFCVF_VDPA_MODE "vdpa"
38 #define IFCVF_SW_FALLBACK_LM "sw-live-migration"
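/* Example devargs, with an illustrative PCI address:
 *   -a 0000:06:00.3,vdpa=1,sw-live-migration=1
 * "vdpa=1" makes this driver claim the device; "sw-live-migration=1"
 * selects the software fallback for live migration.
 */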
40 #define THREAD_NAME_LEN 16
42 static const char * const ifcvf_valid_arguments[] = {
48 struct ifcvf_internal {
49 struct rte_pci_device *pdev;
52 int vfio_container_fd;
55 pthread_t tid; /* thread for notify relay */
56 pthread_t intr_tid; /* thread for config space change interrupt relay */
60 struct rte_vdpa_device *vdev;
63 rte_atomic32_t started;
64 rte_atomic32_t dev_attached;
65 rte_atomic32_t running;
68 bool sw_fallback_running;
69 /* mediated vring for sw fallback */
70 struct vring m_vring[IFCVF_MAX_QUEUES * 2];
71 /* eventfd for used ring interrupt */
72 int intr_fd[IFCVF_MAX_QUEUES * 2];
75 struct internal_list {
76 TAILQ_ENTRY(internal_list) next;
77 struct ifcvf_internal *internal;
80 /* vdpa device info includes device features and device operations. */
81 struct rte_vdpa_dev_info {
83 struct rte_vdpa_dev_ops *ops;
86 TAILQ_HEAD(internal_list_head, internal_list);
87 static struct internal_list_head internal_list =
88 TAILQ_HEAD_INITIALIZER(internal_list);
90 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
92 static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
94 static struct internal_list *
95 find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
98 struct internal_list *list;
100 pthread_mutex_lock(&internal_list_lock);
102 TAILQ_FOREACH(list, &internal_list, next) {
103 if (vdev == list->internal->vdev) {
109 pthread_mutex_unlock(&internal_list_lock);
117 static struct internal_list *
118 find_internal_resource_by_dev(struct rte_pci_device *pdev)
121 struct internal_list *list;
123 pthread_mutex_lock(&internal_list_lock);
125 TAILQ_FOREACH(list, &internal_list, next) {
126 if (!rte_pci_addr_cmp(&pdev->addr,
127 &list->internal->pdev->addr)) {
133 pthread_mutex_unlock(&internal_list_lock);
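/* Bind the device into a dedicated VFIO container/group, map its BARs
 * and mirror the BAR addresses into the base code's HW layout.
 */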
142 ifcvf_vfio_setup(struct ifcvf_internal *internal)
144 struct rte_pci_device *dev = internal->pdev;
145 char devname[RTE_DEV_NAME_MAX_LEN] = {0};
149 internal->vfio_dev_fd = -1;
150 internal->vfio_group_fd = -1;
151 internal->vfio_container_fd = -1;
153 rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
154 ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
157 DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
161 internal->vfio_container_fd = rte_vfio_container_create();
162 if (internal->vfio_container_fd < 0)
165 internal->vfio_group_fd = rte_vfio_container_group_bind(
166 internal->vfio_container_fd, iommu_group_num);
167 if (internal->vfio_group_fd < 0)
170 if (rte_pci_map_device(dev))
173 internal->vfio_dev_fd = rte_intr_dev_fd_get(dev->intr_handle);
175 for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
177 internal->hw.mem_resource[i].addr =
178 internal->pdev->mem_resource[i].addr;
179 internal->hw.mem_resource[i].phys_addr =
180 internal->pdev->mem_resource[i].phys_addr;
181 internal->hw.mem_resource[i].len =
182 internal->pdev->mem_resource[i].len;
188 rte_vfio_container_destroy(internal->vfio_container_fd);
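/* Map (do_map == true) or unmap the guest's memory regions in the VFIO
 * container, using guest physical addresses as IOVAs so the device can
 * DMA directly into guest memory.
 */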
193 ifcvf_dma_map(struct ifcvf_internal *internal, bool do_map)
197 struct rte_vhost_memory *mem = NULL;
198 int vfio_container_fd;
200 ret = rte_vhost_get_mem_table(internal->vid, &mem);
202 DRV_LOG(ERR, "failed to get VM memory layout.");
206 vfio_container_fd = internal->vfio_container_fd;
208 for (i = 0; i < mem->nregions; i++) {
209 struct rte_vhost_mem_region *reg;
211 reg = &mem->regions[i];
212 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
213 "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
214 do_map ? "DMA map" : "DMA unmap", i,
215 reg->host_user_addr, reg->guest_phys_addr, reg->size);
218 ret = rte_vfio_container_dma_map(vfio_container_fd,
219 reg->host_user_addr, reg->guest_phys_addr,
222 DRV_LOG(ERR, "DMA map failed.");
226 ret = rte_vfio_container_dma_unmap(vfio_container_fd,
227 reg->host_user_addr, reg->guest_phys_addr,
230 DRV_LOG(ERR, "DMA unmap failed.");
242 hva_to_gpa(int vid, uint64_t hva)
244 struct rte_vhost_memory *mem = NULL;
245 struct rte_vhost_mem_region *reg;
249 if (rte_vhost_get_mem_table(vid, &mem) < 0)
252 for (i = 0; i < mem->nregions; i++) {
253 reg = &mem->regions[i];
255 if (hva >= reg->host_user_addr &&
256 hva < reg->host_user_addr + reg->size) {
257 gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
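/* Program the HW rings with the guest ring addresses translated to GPA
 * and the last ring indexes, then start the device datapath.
 */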
268 vdpa_ifcvf_start(struct ifcvf_internal *internal)
270 struct ifcvf_hw *hw = &internal->hw;
273 struct rte_vhost_vring vq;
277 nr_vring = rte_vhost_get_vring_num(vid);
278 rte_vhost_get_negotiated_features(vid, &hw->req_features);
280 for (i = 0; i < nr_vring; i++) {
281 rte_vhost_get_vhost_vring(vid, i, &vq);
282 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
284 DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
287 hw->vring[i].desc = gpa;
289 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
291 DRV_LOG(ERR, "Failed to get GPA for available ring.");
294 hw->vring[i].avail = gpa;
296 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
298 DRV_LOG(ERR, "Failed to get GPA for used ring.");
301 hw->vring[i].used = gpa;
303 hw->vring[i].size = vq.size;
304 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
305 &hw->vring[i].last_used_idx);
309 return ifcvf_start_hw(&internal->hw);
313 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
315 struct ifcvf_hw *hw = &internal->hw;
318 uint64_t features = 0;
319 uint64_t log_base = 0, log_size = 0;
325 /* To make sure no request is lost for the blk device,
326 * do not stop until last_avail_idx == last_used_idx.
328 if (internal->hw.device_type == IFCVF_BLK) {
329 for (i = 0; i < hw->nr_vring; i++) {
331 if (hw->lm_cfg != NULL)
332 ring_state = *(u32 *)(hw->lm_cfg +
333 IFCVF_LM_RING_STATE_OFFSET +
334 i * IFCVF_LM_CFG_SIZE);
335 hw->vring[i].last_avail_idx =
336 (u16)(ring_state & IFCVF_16_BIT_MASK);
337 hw->vring[i].last_used_idx =
338 (u16)(ring_state >> 16);
339 if (hw->vring[i].last_avail_idx !=
340 hw->vring[i].last_used_idx) {
341 ifcvf_notify_queue(hw, i);
344 } while (hw->vring[i].last_avail_idx !=
345 hw->vring[i].last_used_idx);
351 for (i = 0; i < hw->nr_vring; i++)
352 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
353 hw->vring[i].last_used_idx);
358 rte_vhost_get_negotiated_features(vid, &features);
359 if (RTE_VHOST_NEED_LOG(features)) {
360 ifcvf_disable_logging(hw);
361 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
362 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
363 log_base, IFCVF_LOG_BASE, log_size);
365 * IFCVF marks dirty memory pages only for the packet buffers,
366 * so SW helps to mark the used rings as dirty after the device stops.
368 for (i = 0; i < hw->nr_vring; i++) {
369 len = IFCVF_USED_RING_LEN(hw->vring[i].size);
370 rte_vhost_log_used_vring(vid, i, 0, len);
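/* Buffer for the VFIO_DEVICE_SET_IRQS ioctl: a vfio_irq_set header
 * followed by one eventfd per vring plus one for the config space
 * change interrupt.
 */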
375 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
376 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
378 vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
381 uint32_t i, nr_vring;
382 char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
383 struct vfio_irq_set *irq_set;
385 struct rte_vhost_vring vring;
390 nr_vring = rte_vhost_get_vring_num(internal->vid);
391 if (nr_vring > IFCVF_MAX_QUEUES * 2)
394 irq_set = (struct vfio_irq_set *)irq_set_buf;
395 irq_set->argsz = sizeof(irq_set_buf);
396 irq_set->count = nr_vring + 1;
397 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
398 VFIO_IRQ_SET_ACTION_TRIGGER;
399 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
401 fd_ptr = (int *)&irq_set->data;
402 /* The first interrupt is for the config space change notification */
403 fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
404 rte_intr_fd_get(internal->pdev->intr_handle);
406 for (i = 0; i < nr_vring; i++)
407 internal->intr_fd[i] = -1;
409 for (i = 0; i < nr_vring; i++) {
410 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
411 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
413 ((i & 1) == 0 || internal->hw.device_type == IFCVF_BLK)) {
414 /* For net devices we only need to relay the rx queues,
415 * which write into VM memory.
416 * For blk devices we need to relay the read cmds
419 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
421 DRV_LOG(ERR, "can't setup eventfd: %s",
425 internal->intr_fd[i] = fd;
426 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
430 ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
432 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
441 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
444 uint32_t i, nr_vring;
445 char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
446 struct vfio_irq_set *irq_set;
448 irq_set = (struct vfio_irq_set *)irq_set_buf;
449 irq_set->argsz = sizeof(irq_set_buf);
451 irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
452 irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
455 nr_vring = rte_vhost_get_vring_num(internal->vid);
456 for (i = 0; i < nr_vring; i++) {
457 if (internal->intr_fd[i] >= 0)
458 close(internal->intr_fd[i]);
459 internal->intr_fd[i] = -1;
462 ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
464 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
473 notify_relay(void *arg)
475 int i, kickfd, epfd, nfds = 0;
477 struct epoll_event events[IFCVF_MAX_QUEUES * 2];
478 struct epoll_event ev;
481 struct rte_vhost_vring vring;
482 struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
483 struct ifcvf_hw *hw = &internal->hw;
485 q_num = rte_vhost_get_vring_num(internal->vid);
487 epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
489 DRV_LOG(ERR, "failed to create epoll instance.");
492 internal->epfd = epfd;
495 for (qid = 0; qid < q_num; qid++) {
496 ev.events = EPOLLIN | EPOLLPRI;
497 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
498 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
499 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
500 DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
506 nfds = epoll_wait(epfd, events, q_num, -1);
510 DRV_LOG(ERR, "epoll_wait return fail\n");
514 for (i = 0; i < nfds; i++) {
515 qid = events[i].data.u32;
516 kickfd = (uint32_t)(events[i].data.u64 >> 32);
518 nbytes = read(kickfd, &buf, 8);
520 if (errno == EINTR ||
521 errno == EWOULDBLOCK ||
524 DRV_LOG(INFO, "Error reading "
531 ifcvf_notify_queue(hw, qid);
539 setup_notify_relay(struct ifcvf_internal *internal)
541 char name[THREAD_NAME_LEN];
544 snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
545 ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
548 DRV_LOG(ERR, "failed to create notify relay pthread.");
556 unset_notify_relay(struct ifcvf_internal *internal)
561 pthread_cancel(internal->tid);
562 pthread_join(internal->tid, &status);
566 if (internal->epfd >= 0)
567 close(internal->epfd);
574 virtio_interrupt_handler(struct ifcvf_internal *internal)
576 int vid = internal->vid;
579 ret = rte_vhost_slave_config_change(vid, 1);
581 DRV_LOG(ERR, "failed to notify the guest about configuration space change.");
585 intr_relay(void *arg)
587 struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
588 struct epoll_event csc_event;
589 struct epoll_event ev;
592 int csc_epfd, csc_val = 0;
594 csc_epfd = epoll_create(1);
596 DRV_LOG(ERR, "failed to create epoll for config space change.");
600 ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
601 ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle);
602 if (epoll_ctl(csc_epfd, EPOLL_CTL_ADD,
603 rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) {
604 DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
608 internal->csc_epfd = csc_epfd;
611 csc_val = epoll_wait(csc_epfd, &csc_event, 1, -1);
615 DRV_LOG(ERR, "epoll_wait return fail.");
617 } else if (csc_val == 0) {
621 nbytes = read(csc_event.data.fd, &buf, 8);
623 if (errno == EINTR ||
624 errno == EWOULDBLOCK ||
627 DRV_LOG(ERR, "Error reading from file descriptor %d: %s\n",
631 } else if (nbytes == 0) {
632 DRV_LOG(ERR, "Read nothing from file descriptor %d\n",
636 virtio_interrupt_handler(internal);
644 internal->csc_epfd = -1;
650 setup_intr_relay(struct ifcvf_internal *internal)
652 char name[THREAD_NAME_LEN];
655 snprintf(name, sizeof(name), "ifc-intr-%d", internal->vid);
656 ret = rte_ctrl_thread_create(&internal->intr_tid, name, NULL,
657 intr_relay, (void *)internal);
659 DRV_LOG(ERR, "failed to create notify relay pthread.");
666 unset_intr_relay(struct ifcvf_internal *internal)
670 if (internal->intr_tid) {
671 pthread_cancel(internal->intr_tid);
672 pthread_join(internal->intr_tid, &status);
674 internal->intr_tid = 0;
676 if (internal->csc_epfd >= 0)
677 close(internal->csc_epfd);
678 internal->csc_epfd = -1;
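/* Bring the datapath up or down according to the started/dev_attached
 * state: DMA mappings, VFIO interrupts, HW rings and the relay threads
 * are set up or torn down together under the internal lock.
 */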
682 update_datapath(struct ifcvf_internal *internal)
686 rte_spinlock_lock(&internal->lock);
688 if (!rte_atomic32_read(&internal->running) &&
689 (rte_atomic32_read(&internal->started) &&
690 rte_atomic32_read(&internal->dev_attached))) {
691 ret = ifcvf_dma_map(internal, true);
695 ret = vdpa_enable_vfio_intr(internal, false);
699 ret = vdpa_ifcvf_start(internal);
703 ret = setup_notify_relay(internal);
707 ret = setup_intr_relay(internal);
711 rte_atomic32_set(&internal->running, 1);
712 } else if (rte_atomic32_read(&internal->running) &&
713 (!rte_atomic32_read(&internal->started) ||
714 !rte_atomic32_read(&internal->dev_attached))) {
715 unset_intr_relay(internal);
717 ret = unset_notify_relay(internal);
721 vdpa_ifcvf_stop(internal);
723 ret = vdpa_disable_vfio_intr(internal);
727 ret = ifcvf_dma_map(internal, false);
731 rte_atomic32_set(&internal->running, 0);
734 rte_spinlock_unlock(&internal->lock);
737 rte_spinlock_unlock(&internal->lock);
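/* Start the datapath in software-assisted mode: allocate mediated
 * vrings in host memory, map them into the VFIO container at
 * IFCVF_MEDIATED_VRING, and point each relayed queue's used ring at
 * its mediated copy so used entries can be relayed to the guest.
 */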
742 m_ifcvf_start(struct ifcvf_internal *internal)
744 struct ifcvf_hw *hw = &internal->hw;
745 uint32_t i, nr_vring;
747 struct rte_vhost_vring vq;
749 uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
753 memset(&vq, 0, sizeof(vq));
755 nr_vring = rte_vhost_get_vring_num(vid);
756 rte_vhost_get_negotiated_features(vid, &hw->req_features);
758 for (i = 0; i < nr_vring; i++) {
759 rte_vhost_get_vhost_vring(vid, i, &vq);
761 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
762 rte_mem_page_size());
763 vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
764 vring_init(&internal->m_vring[i], vq.size, vring_buf,
765 rte_mem_page_size());
767 ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
768 (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
770 DRV_LOG(ERR, "mediated vring DMA map failed.");
774 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
776 DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
779 hw->vring[i].desc = gpa;
781 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
783 DRV_LOG(ERR, "Failed to get GPA for available ring.");
786 hw->vring[i].avail = gpa;
788 /* NET: Direct I/O for Tx queue, relay for Rx queue
789 * BLK: relay every queue
791 if ((internal->hw.device_type == IFCVF_NET) && (i & 1)) {
792 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
794 DRV_LOG(ERR, "Failed to get GPA for used ring.");
797 hw->vring[i].used = gpa;
799 hw->vring[i].used = m_vring_iova +
800 (char *)internal->m_vring[i].used -
801 (char *)internal->m_vring[i].desc;
804 hw->vring[i].size = vq.size;
806 rte_vhost_get_vring_base(vid, i,
807 &internal->m_vring[i].avail->idx,
808 &internal->m_vring[i].used->idx);
810 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
811 &hw->vring[i].last_used_idx);
813 m_vring_iova += size;
815 hw->nr_vring = nr_vring;
817 return ifcvf_start_hw(&internal->hw);
820 for (i = 0; i < nr_vring; i++)
821 rte_free(internal->m_vring[i].desc);
827 m_ifcvf_stop(struct ifcvf_internal *internal)
831 struct rte_vhost_vring vq;
832 struct ifcvf_hw *hw = &internal->hw;
833 uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
839 for (i = 0; i < hw->nr_vring; i++) {
840 /* synchronize remaining new used entries if any */
841 if (internal->hw.device_type == IFCVF_NET) {
843 update_used_ring(internal, i);
844 } else if (internal->hw.device_type == IFCVF_BLK) {
845 update_used_ring(internal, i);
848 rte_vhost_get_vhost_vring(vid, i, &vq);
849 len = IFCVF_USED_RING_LEN(vq.size);
850 rte_vhost_log_used_vring(vid, i, 0, len);
852 size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
853 rte_mem_page_size());
854 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
855 (uint64_t)(uintptr_t)internal->m_vring[i].desc,
858 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
859 hw->vring[i].last_used_idx);
860 rte_free(internal->m_vring[i].desc);
861 m_vring_iova += size;
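/* Relay newly used descriptors from the mediated vring to the guest's
 * vring and interrupt the guest through the vhost call fd.
 */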
868 update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
870 rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
871 rte_vhost_vring_call(internal->vid, qid);
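/* Vring relay thread for the sw fallback: epoll on both the kick fds
 * (bit 0 of the event data cleared) and the used ring interrupt fds
 * (bit 0 set), forwarding kicks to HW and used entries to the guest.
 */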
875 vring_relay(void *arg)
877 int i, vid, epfd, fd, nfds;
878 struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
879 struct rte_vhost_vring vring;
881 struct epoll_event events[IFCVF_MAX_QUEUES * 4];
882 struct epoll_event ev;
887 q_num = rte_vhost_get_vring_num(vid);
889 /* add notify fd and interrupt fd to epoll */
890 epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
892 DRV_LOG(ERR, "failed to create epoll instance.");
895 internal->epfd = epfd;
898 for (qid = 0; qid < q_num; qid++) {
899 ev.events = EPOLLIN | EPOLLPRI;
900 rte_vhost_get_vhost_vring(vid, qid, &vring);
901 ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
902 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
903 DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
908 for (qid = 0; qid < q_num; qid += 1) {
909 if ((internal->hw.device_type == IFCVF_NET) && (qid & 1))
911 ev.events = EPOLLIN | EPOLLPRI;
912 /* set bit 0 as a flag to mark this event as an interrupt */
913 ev.data.u64 = 1 | qid << 1 |
914 (uint64_t)internal->intr_fd[qid] << 32;
915 if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
917 DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
920 update_used_ring(internal, qid);
923 /* start relay with a first kick */
924 for (qid = 0; qid < q_num; qid++)
925 ifcvf_notify_queue(&internal->hw, qid);
927 /* listen to the events and react accordingly */
929 nfds = epoll_wait(epfd, events, q_num * 2, -1);
933 DRV_LOG(ERR, "epoll_wait return fail.");
937 for (i = 0; i < nfds; i++) {
938 fd = (uint32_t)(events[i].data.u64 >> 32);
940 nbytes = read(fd, &buf, 8);
942 if (errno == EINTR ||
943 errno == EWOULDBLOCK ||
946 DRV_LOG(INFO, "Error reading "
953 qid = events[i].data.u32 >> 1;
955 if (events[i].data.u32 & 1)
956 update_used_ring(internal, qid);
958 ifcvf_notify_queue(&internal->hw, qid);
966 setup_vring_relay(struct ifcvf_internal *internal)
968 char name[THREAD_NAME_LEN];
971 snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
972 ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
975 DRV_LOG(ERR, "failed to create ring relay pthread.");
983 unset_vring_relay(struct ifcvf_internal *internal)
988 pthread_cancel(internal->tid);
989 pthread_join(internal->tid, &status);
993 if (internal->epfd >= 0)
994 close(internal->epfd);
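/* Switch from the direct HW datapath to the software fallback for live
 * migration: stop the HW rings, re-enable VFIO interrupts in mediated
 * mode and launch the vring relay thread.
 */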
1001 ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
1004 int vid = internal->vid;
1006 /* stop the direct IO data path */
1007 unset_notify_relay(internal);
1008 vdpa_ifcvf_stop(internal);
1010 unset_intr_relay(internal);
1012 vdpa_disable_vfio_intr(internal);
1014 ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
1015 if (ret && ret != -ENOTSUP)
1018 /* set up interrupts so the vring relay can receive used-ring events */
1019 ret = vdpa_enable_vfio_intr(internal, true);
1024 ret = m_ifcvf_start(internal);
1028 /* set up vring relay thread */
1029 ret = setup_vring_relay(internal);
1033 rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
1035 internal->sw_fallback_running = true;
1040 m_ifcvf_stop(internal);
1042 vdpa_disable_vfio_intr(internal);
1044 ifcvf_dma_map(internal, false);
1050 ifcvf_dev_config(int vid)
1052 struct rte_vdpa_device *vdev;
1053 struct internal_list *list;
1054 struct ifcvf_internal *internal;
1056 vdev = rte_vhost_get_vdpa_device(vid);
1057 list = find_internal_resource_by_vdev(vdev);
1059 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1063 internal = list->internal;
1064 internal->vid = vid;
1065 rte_atomic32_set(&internal->dev_attached, 1);
1066 update_datapath(internal);
1068 if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
1069 DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
1070 vdev->device->name);
1072 internal->configured = 1;
1077 ifcvf_dev_close(int vid)
1079 struct rte_vdpa_device *vdev;
1080 struct internal_list *list;
1081 struct ifcvf_internal *internal;
1083 vdev = rte_vhost_get_vdpa_device(vid);
1084 list = find_internal_resource_by_vdev(vdev);
1086 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1090 internal = list->internal;
1092 if (internal->sw_fallback_running) {
1093 /* unset ring relay */
1094 unset_vring_relay(internal);
1097 m_ifcvf_stop(internal);
1099 /* remove interrupt setting */
1100 vdpa_disable_vfio_intr(internal);
1102 /* unset DMA map for guest memory */
1103 ifcvf_dma_map(internal, false);
1105 internal->sw_fallback_running = false;
1107 rte_atomic32_set(&internal->dev_attached, 0);
1108 update_datapath(internal);
1111 internal->configured = 0;
1116 ifcvf_set_features(int vid)
1118 uint64_t features = 0;
1119 struct rte_vdpa_device *vdev;
1120 struct internal_list *list;
1121 struct ifcvf_internal *internal;
1122 uint64_t log_base = 0, log_size = 0;
1124 vdev = rte_vhost_get_vdpa_device(vid);
1125 list = find_internal_resource_by_vdev(vdev);
1127 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1131 internal = list->internal;
1132 rte_vhost_get_negotiated_features(vid, &features);
1134 if (!RTE_VHOST_NEED_LOG(features))
1137 if (internal->sw_lm) {
1138 ifcvf_sw_fallback_switchover(internal);
1140 rte_vhost_get_log_base(vid, &log_base, &log_size);
1141 rte_vfio_container_dma_map(internal->vfio_container_fd,
1142 log_base, IFCVF_LOG_BASE, log_size);
1143 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
1150 ifcvf_get_vfio_group_fd(int vid)
1152 struct rte_vdpa_device *vdev;
1153 struct internal_list *list;
1155 vdev = rte_vhost_get_vdpa_device(vid);
1156 list = find_internal_resource_by_vdev(vdev);
1158 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1162 return list->internal->vfio_group_fd;
1166 ifcvf_get_vfio_device_fd(int vid)
1168 struct rte_vdpa_device *vdev;
1169 struct internal_list *list;
1171 vdev = rte_vhost_get_vdpa_device(vid);
1172 list = find_internal_resource_by_vdev(vdev);
1174 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1178 return list->internal->vfio_dev_fd;
1182 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1184 struct rte_vdpa_device *vdev;
1185 struct internal_list *list;
1186 struct ifcvf_internal *internal;
1187 struct vfio_region_info reg = { .argsz = sizeof(reg) };
1190 vdev = rte_vhost_get_vdpa_device(vid);
1191 list = find_internal_resource_by_vdev(vdev);
1193 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1197 internal = list->internal;
1199 reg.index = ifcvf_get_notify_region(&internal->hw);
1200 ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1202 DRV_LOG(ERR, "Cannot get device region info: %s",
1207 *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1214 ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1216 struct internal_list *list;
1218 list = find_internal_resource_by_vdev(vdev);
1220 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1224 *queue_num = list->internal->max_queues;
1230 ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1232 struct internal_list *list;
1234 list = find_internal_resource_by_vdev(vdev);
1236 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1240 *features = list->internal->features;
1245 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1246 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1247 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1248 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1249 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1250 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
1251 1ULL << VHOST_USER_PROTOCOL_F_STATUS)
1253 #define VDPA_BLK_PROTOCOL_FEATURES \
1254 (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
1257 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1261 *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1266 ifcvf_set_vring_state(int vid, int vring, int state)
1268 struct rte_vdpa_device *vdev;
1269 struct internal_list *list;
1270 struct ifcvf_internal *internal;
1271 struct ifcvf_hw *hw;
1272 struct ifcvf_pci_common_cfg *cfg;
1275 vdev = rte_vhost_get_vdpa_device(vid);
1276 list = find_internal_resource_by_vdev(vdev);
1278 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1282 internal = list->internal;
1283 if (vring < 0 || vring >= internal->max_queues * 2) {
1284 DRV_LOG(ERR, "Vring index %d not correct", vring);
1289 if (!internal->configured)
1292 cfg = hw->common_cfg;
1293 IFCVF_WRITE_REG16(vring, &cfg->queue_select);
1294 IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);
1296 if (!state && hw->vring[vring].enable) {
1297 ret = vdpa_disable_vfio_intr(internal);
1302 if (state && !hw->vring[vring].enable) {
1303 ret = vdpa_enable_vfio_intr(internal, false);
1309 hw->vring[vring].enable = !!state;
1314 ifcvf_get_device_type(struct rte_vdpa_device *vdev,
1317 struct ifcvf_internal *internal;
1318 struct internal_list *list;
1320 list = find_internal_resource_by_vdev(vdev);
1322 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1326 internal = list->internal;
1328 if (internal->hw.device_type == IFCVF_BLK)
1329 *type = RTE_VHOST_VDPA_DEVICE_TYPE_BLK;
1331 *type = RTE_VHOST_VDPA_DEVICE_TYPE_NET;
1336 static struct rte_vdpa_dev_ops ifcvf_net_ops = {
1337 .get_queue_num = ifcvf_get_queue_num,
1338 .get_features = ifcvf_get_vdpa_features,
1339 .get_protocol_features = ifcvf_get_protocol_features,
1340 .dev_conf = ifcvf_dev_config,
1341 .dev_close = ifcvf_dev_close,
1342 .set_vring_state = ifcvf_set_vring_state,
1343 .set_features = ifcvf_set_features,
1344 .migration_done = NULL,
1345 .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1346 .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1347 .get_notify_area = ifcvf_get_notify_area,
1348 .get_dev_type = ifcvf_get_device_type,
1352 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1354 uint16_t *n = extra_args;
1356 if (value == NULL || extra_args == NULL)
1359 *n = (uint16_t)strtoul(value, NULL, 0);
1360 if (*n == USHRT_MAX && errno == ERANGE)
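/* Derive the virtio device ID: transitional devices (0x1000-0x103f)
 * expose it in the PCI subsystem device ID, while modern devices encode
 * it as the PCI device ID minus 0x1040 (e.g. 0x1041 -> VIRTIO_ID_NET).
 */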
1367 ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev)
1369 uint16_t pci_device_id = pci_dev->id.device_id;
1372 if (pci_device_id < 0x1000 || pci_device_id > 0x107f) {
1373 DRV_LOG(ERR, "Probe device is not a virtio device\n");
1377 if (pci_device_id < 0x1040) {
1378 /* Transitional devices: use the PCI subsystem device id as
1379 * virtio device id, same as legacy driver always did.
1381 device_id = pci_dev->id.subsystem_device_id;
1383 /* Modern devices: simply use PCI device id,
1384 * but start from 0x1040.
1386 device_id = pci_device_id - 0x1040;
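/* Serve the vdpa get_config op for blk devices: copy the device's
 * virtio_blk_config out of the BAR byte by byte.
 */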
1393 ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t size)
1395 struct virtio_blk_config *dev_cfg;
1396 struct ifcvf_internal *internal;
1397 struct rte_vdpa_device *vdev;
1398 struct internal_list *list;
1400 uint64_t capacity = 0;
1403 if (size != sizeof(struct virtio_blk_config)) {
1404 DRV_LOG(ERR, "Invalid len: %u, required: %u",
1405 size, (uint32_t)sizeof(struct virtio_blk_config));
1409 vdev = rte_vhost_get_vdpa_device(vid);
1411 DRV_LOG(ERR, "Invalid vDPA device vid: %d", vid);
1415 list = find_internal_resource_by_vdev(vdev);
1417 DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1421 internal = list->internal;
1423 for (i = 0; i < sizeof(struct virtio_blk_config); i++)
1424 config[i] = *((u8 *)internal->hw.blk_cfg + i);
1426 dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;
1428 /* cannot read 64-bit register in one attempt, so read byte by byte. */
1429 for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1430 byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1431 capacity |= (uint64_t)*byte << (i * 8);
1433 /* The capacity is a number of 512-byte sectors.
1434 * Right shift by 1 bit to get KiB,
1435 * by another 10 bits to get MiB,
1436 * and by 10 more bits to get GiB.
1437 * To show the capacity in GiB, right shift by 21 bits in total.
1439 DRV_LOG(DEBUG, "capacity : %"PRIu64"G", capacity >> 21);
1441 DRV_LOG(DEBUG, "size_max : 0x%08x", dev_cfg->size_max);
1442 DRV_LOG(DEBUG, "seg_max : 0x%08x", dev_cfg->seg_max);
1443 DRV_LOG(DEBUG, "blk_size : 0x%08x", dev_cfg->blk_size);
1444 DRV_LOG(DEBUG, "geometry");
1445 DRV_LOG(DEBUG, " cylinders: %u", dev_cfg->geometry.cylinders);
1446 DRV_LOG(DEBUG, " heads : %u", dev_cfg->geometry.heads);
1447 DRV_LOG(DEBUG, " sectors : %u", dev_cfg->geometry.sectors);
1448 DRV_LOG(DEBUG, "num_queues: 0x%08x", dev_cfg->num_queues);
1450 DRV_LOG(DEBUG, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]\n",
1451 config[0], config[1], config[2], config[3], config[4],
1452 config[5], config[6], config[7]);
1457 ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
1462 *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1463 *features |= VDPA_BLK_PROTOCOL_FEATURES;
1467 static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
1468 .get_queue_num = ifcvf_get_queue_num,
1469 .get_features = ifcvf_get_vdpa_features,
1470 .set_features = ifcvf_set_features,
1471 .get_protocol_features = ifcvf_blk_get_protocol_features,
1472 .dev_conf = ifcvf_dev_config,
1473 .dev_close = ifcvf_dev_close,
1474 .set_vring_state = ifcvf_set_vring_state,
1475 .migration_done = NULL,
1476 .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1477 .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1478 .get_notify_area = ifcvf_get_notify_area,
1479 .get_config = ifcvf_blk_get_config,
1480 .get_dev_type = ifcvf_get_device_type,
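/* Per device-type info, indexed by IFCVF_NET/IFCVF_BLK: the feature
 * bits the driver adds on top of the HW features, and the vdpa ops to
 * register for that type.
 */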
1483 struct rte_vdpa_dev_info dev_info[] = {
1485 .features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1486 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1487 (1ULL << VIRTIO_NET_F_STATUS) |
1488 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1489 (1ULL << VHOST_F_LOG_ALL),
1490 .ops = &ifcvf_net_ops,
1493 .features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1494 (1ULL << VHOST_F_LOG_ALL),
1495 .ops = &ifcvf_blk_ops,
1500 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1501 struct rte_pci_device *pci_dev)
1504 struct ifcvf_internal *internal = NULL;
1505 struct internal_list *list = NULL;
1507 int sw_fallback_lm = 0;
1508 struct rte_kvargs *kvlist = NULL;
1511 uint64_t capacity = 0;
1515 if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1518 if (!pci_dev->device.devargs)
1521 kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1522 ifcvf_valid_arguments);
1526 /* probe only when vdpa mode is specified */
1527 if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1528 rte_kvargs_free(kvlist);
1532 ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1534 if (ret < 0 || vdpa_mode == 0) {
1535 rte_kvargs_free(kvlist);
1539 list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1543 internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1544 if (internal == NULL)
1547 internal->pdev = pci_dev;
1548 rte_spinlock_init(&internal->lock);
1550 if (ifcvf_vfio_setup(internal) < 0) {
1551 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1555 if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1556 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1560 internal->configured = 0;
1561 internal->max_queues = IFCVF_MAX_QUEUES;
1562 features = ifcvf_get_features(&internal->hw);
1564 device_id = ifcvf_pci_get_device_type(pci_dev);
1565 if (device_id < 0) {
1566 DRV_LOG(ERR, "failed to get device %s type", pci_dev->name);
1570 if (device_id == VIRTIO_ID_NET) {
1571 internal->hw.device_type = IFCVF_NET;
1572 internal->features = features &
1573 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1574 internal->features |= dev_info[IFCVF_NET].features;
1575 } else if (device_id == VIRTIO_ID_BLOCK) {
1576 internal->hw.device_type = IFCVF_BLK;
1577 internal->features = features &
1578 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1579 internal->features |= dev_info[IFCVF_BLK].features;
1581 /* cannot read 64-bit register in one attempt,
1582 * so read byte by byte.
1584 for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
1585 byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
1586 capacity |= (uint64_t)*byte << (i * 8);
1588 /* The capacity is a number of 512-byte sectors.
1589 * Right shift by 1 bit to get KiB,
1590 * by another 10 bits to get MiB,
1591 * and by 10 more bits to get GiB.
1592 * To show the capacity in GiB, right shift by 21 bits in total.
1594 DRV_LOG(DEBUG, "capacity : %"PRIu64"G", capacity >> 21);
1596 DRV_LOG(DEBUG, "size_max : 0x%08x",
1597 internal->hw.blk_cfg->size_max);
1598 DRV_LOG(DEBUG, "seg_max : 0x%08x",
1599 internal->hw.blk_cfg->seg_max);
1600 DRV_LOG(DEBUG, "blk_size : 0x%08x",
1601 internal->hw.blk_cfg->blk_size);
1602 DRV_LOG(DEBUG, "geometry");
1603 DRV_LOG(DEBUG, " cylinders: %u",
1604 internal->hw.blk_cfg->geometry.cylinders);
1605 DRV_LOG(DEBUG, " heads : %u",
1606 internal->hw.blk_cfg->geometry.heads);
1607 DRV_LOG(DEBUG, " sectors : %u",
1608 internal->hw.blk_cfg->geometry.sectors);
1609 DRV_LOG(DEBUG, "num_queues: 0x%08x",
1610 internal->hw.blk_cfg->num_queues);
1613 list->internal = internal;
1615 if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1616 ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1617 &open_int, &sw_fallback_lm);
1621 internal->sw_lm = sw_fallback_lm;
1623 internal->vdev = rte_vdpa_register_device(&pci_dev->device,
1624 dev_info[internal->hw.device_type].ops);
1625 if (internal->vdev == NULL) {
1626 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1630 pthread_mutex_lock(&internal_list_lock);
1631 TAILQ_INSERT_TAIL(&internal_list, list, next);
1632 pthread_mutex_unlock(&internal_list_lock);
1634 rte_atomic32_set(&internal->started, 1);
1635 update_datapath(internal);
1637 rte_kvargs_free(kvlist);
1641 rte_kvargs_free(kvlist);
1648 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1650 struct ifcvf_internal *internal;
1651 struct internal_list *list;
1653 if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1656 list = find_internal_resource_by_dev(pci_dev);
1658 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1662 internal = list->internal;
1663 rte_atomic32_set(&internal->started, 0);
1664 update_datapath(internal);
1666 rte_pci_unmap_device(internal->pdev);
1667 rte_vfio_container_destroy(internal->vfio_container_fd);
1668 rte_vdpa_unregister_device(internal->vdev);
1670 pthread_mutex_lock(&internal_list_lock);
1671 TAILQ_REMOVE(&internal_list, list, next);
1672 pthread_mutex_unlock(&internal_list_lock);
1681 * IFCVF has the same vendor ID and device ID as virtio net PCI
1682 * device, with its specific subsystem vendor ID and device ID.
1684 static const struct rte_pci_id pci_id_ifcvf_map[] = {
1685 { .class_id = RTE_CLASS_ANY_ID,
1686 .vendor_id = IFCVF_VENDOR_ID,
1687 .device_id = IFCVF_NET_DEVICE_ID,
1688 .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1689 .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1692 { .class_id = RTE_CLASS_ANY_ID,
1693 .vendor_id = IFCVF_VENDOR_ID,
1694 .device_id = IFCVF_BLK_TRANSITIONAL_DEVICE_ID,
1695 .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1696 .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
1699 { .class_id = RTE_CLASS_ANY_ID,
1700 .vendor_id = IFCVF_VENDOR_ID,
1701 .device_id = IFCVF_BLK_MODERN_DEVICE_ID,
1702 .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1703 .subsystem_device_id = IFCVF_BLK_DEVICE_ID,
1706 { .vendor_id = 0, /* sentinel */
1710 static struct rte_pci_driver rte_ifcvf_vdpa = {
1711 .id_table = pci_id_ifcvf_map,
1713 .probe = ifcvf_pci_probe,
1714 .remove = ifcvf_pci_remove,
1717 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1718 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1719 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");