#include <rte_log.h>
#include <rte_ether.h>
-#include <rte_rwlock.h>
#include <rte_malloc.h>
+#include <rte_dmadev.h>
#include "rte_vhost.h"
-#include "rte_vdpa.h"
-#include "rte_vdpa_dev.h"
+#include "vdpa_driver.h"
#include "rte_vhost_async.h"
#define VIRTIO_DEV_FEATURES_FAILED ((uint32_t)1 << 4)
/* Used to indicate that the virtio_net tx code should fill TX ol_flags */
#define VIRTIO_DEV_LEGACY_OL_FLAGS ((uint32_t)1 << 5)
+/* Used to indicate the application has requested statistics collection */
+#define VIRTIO_DEV_STATS_ENABLED ((uint32_t)1 << 6)
/* Backend value set by guest. */
#define VIRTIO_DEV_STOPPED -1
#define MAX_PKT_BURST 32
#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST)
-#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+#define VHOST_MAX_ASYNC_VEC 2048
+#define VIRTIO_MAX_RX_PKTLEN 9728U
+#define VHOST_DMA_MAX_COPY_COMPLETE ((VIRTIO_MAX_RX_PKTLEN / RTE_MBUF_DEFAULT_DATAROOM) \
+ * MAX_PKT_BURST)
#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \
((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
uint32_t count;
};
+/**
+ * Virtqueue statistics
+ *
+ * Per-virtqueue counters. The guest_notifications counter is only
+ * bumped when VIRTIO_DEV_STATS_ENABLED is set on the device (see the
+ * kick paths); presumably the same gating applies to the other
+ * counters as well — confirm against the datapath updates.
+ */
+struct virtqueue_stats {
+ uint64_t packets;
+ uint64_t bytes;
+ uint64_t multicast;
+ uint64_t broadcast;
+ /* Size bins in array as RFC 2819, undersized [0], 64 [1], etc */
+ uint64_t size_bins[8];
+ /* Incremented each time the guest is signalled via the callfd eventfd */
+ uint64_t guest_notifications;
+ uint64_t iotlb_hits;
+ uint64_t iotlb_misses;
+ /* Async (DMA) enqueue bookkeeping: packets submitted vs. completed */
+ uint64_t inflight_submitted;
+ uint64_t inflight_completed;
+};
+
+/**
+ * iovec
+ *
+ * One copy segment: 'len' bytes are to be copied from 'src_addr'
+ * to 'dst_addr'.
+ */
+struct vhost_iovec {
+ void *src_addr;
+ void *dst_addr;
+ size_t len;
+};
+
+/**
+ * iovec iterator
+ *
+ * Describes one logical transfer as an array of copy segments.
+ */
+struct vhost_iov_iter {
+ /** pointer to the iovec array */
+ struct vhost_iovec *iov;
+ /** number of iovec in this iterator */
+ unsigned long nr_segs;
+};
+
+/* Per-DMA-virtual-channel completion-tracking state. */
+struct async_dma_vchan_info {
+ /* circular array to track if packet copy completes */
+ bool **pkts_cmpl_flag_addr;
+
+ /* max elements in 'pkts_cmpl_flag_addr' */
+ uint16_t ring_size;
+ /* ring index mask for 'pkts_cmpl_flag_addr' */
+ uint16_t ring_mask;
+
+ /**
+  * DMA virtual channel lock. Although it is able to bind DMA
+  * virtual channels to data plane threads, vhost control plane
+  * thread could call data plane functions too, thus causing
+  * DMA device contention.
+  *
+  * For example, in the VM exit case, the vhost control plane thread
+  * needs to clear in-flight packets before disabling the vring, but
+  * another data plane thread could be enqueuing packets to the same
+  * vring with the same DMA virtual channel. As dmadev PMD functions
+  * are lock-free, the control plane and data plane threads could
+  * operate the same DMA virtual channel at the same time.
+  */
+ rte_spinlock_t dma_lock;
+};
+
+/* Per-DMA-device tracking state, indexed by dmadev id in dma_copy_track[]. */
+struct async_dma_info {
+ struct async_dma_vchan_info *vchans;
+ /* number of registered virtual channels */
+ uint16_t nr_vchans;
+};
+
+extern struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
+
/**
* inflight async packet information
*/
struct rte_mbuf *mbuf;
uint16_t descs; /* num of descs inflight */
uint16_t nr_buffers; /* num of buffers inflight for packed ring */
+ struct virtio_net_hdr nethdr;
};
struct vhost_async {
- /* operation callbacks for DMA */
- struct rte_vhost_async_channel_ops ops;
-
- struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
- struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
+ struct vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
+ struct vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
uint16_t iter_idx;
uint16_t iovec_idx;
/* data transfer status */
struct async_inflight_info *pkts_info;
+ /**
+ * Packet reorder array. "true" indicates that DMA device
+ * completes all copies for the packet.
+ *
+ * Note that this array could be written by multiple threads
+ * simultaneously. For example, in the case of thread0 and
+ * thread1 RX packets from NIC and then enqueue packets to
+ * vring0 and vring1 with own DMA device DMA0 and DMA1, it's
+ * possible for thread0 to get completed copies belonging to
+ * vring1 from DMA0, while thread0 is calling rte_vhost_poll
+ * _enqueue_completed() for vring0 and thread1 is calling
+ * rte_vhost_submit_enqueue_burst() for vring1. In this case,
+ * vq->access_lock cannot protect pkts_cmpl_flag of vring1.
+ *
+ * However, since offloading is per-packet basis, each packet
+ * flag will only be written by one thread. And single byte
+ * write is atomic, so no lock for pkts_cmpl_flag is needed.
+ */
+ bool *pkts_cmpl_flag;
uint16_t pkts_idx;
uint16_t pkts_inflight_n;
union {
#define VIRTIO_UNINITIALIZED_NOTIF (-1)
struct vhost_vring_addr ring_addrs;
+ struct virtqueue_stats stats;
} __rte_cache_aligned;
/* Virtio device status as per Virtio specification */
struct guest_page {
 uint64_t guest_phys_addr;
- uint64_t host_phys_addr;
+ /* host IOVA backing this guest-physical range (renamed from host_phys_addr) */
+ uint64_t host_iova;
+ /* NOTE(review): per its name, host user-space virtual address of the
+  * same range — confirm against the page-registration code. */
+ uint64_t host_user_addr;
 uint64_t size;
};
uint16_t mtu;
uint8_t status;
- struct vhost_device_ops const *notify_ops;
+ struct rte_vhost_device_ops const *notify_ops;
uint32_t nr_guest_pages;
uint32_t max_guest_pages;
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
-#define MAX_VHOST_DEVICE 1024
-extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+extern struct virtio_net *vhost_devices[RTE_MAX_VHOST_DEVICE];
#define VHOST_BINARY_SEARCH_THRESH 256
return 0;
}
+/*
+ * bsearch() comparator matching a lookup key (p1) against a guest page
+ * range (p2). Returns 0 when key->guest_phys_addr falls inside
+ * [page->guest_phys_addr, page->guest_phys_addr + page->size),
+ * 1 when it lies above that range, and -1 when it lies below.
+ */
+static __rte_always_inline int guest_page_rangecmp(const void *p1, const void *p2)
+{
+	const struct guest_page *page1 = (const struct guest_page *)p1;
+	const struct guest_page *page2 = (const struct guest_page *)p2;
+
+	if (page1->guest_phys_addr >= page2->guest_phys_addr) {
+		if (page1->guest_phys_addr < page2->guest_phys_addr + page2->size)
+			return 0;
+		else
+			return 1;
+	} else
+		return -1;
+}
+
static __rte_always_inline rte_iova_t
gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
uint64_t gpa_size, uint64_t *hpa_size)
*hpa_size = gpa_size;
if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
- key.guest_phys_addr = gpa & ~(dev->guest_pages[0].size - 1);
+ key.guest_phys_addr = gpa;
page = bsearch(&key, dev->guest_pages, dev->nr_guest_pages,
- sizeof(struct guest_page), guest_page_addrcmp);
+ sizeof(struct guest_page), guest_page_rangecmp);
if (page) {
if (gpa + gpa_size <=
page->guest_phys_addr + page->size) {
return gpa - page->guest_phys_addr +
- page->host_phys_addr;
+ page->host_iova;
} else if (gpa < page->guest_phys_addr +
page->size) {
*hpa_size = page->guest_phys_addr +
page->size - gpa;
return gpa - page->guest_phys_addr +
- page->host_phys_addr;
+ page->host_iova;
}
}
} else {
if (gpa + gpa_size <=
page->guest_phys_addr + page->size) {
return gpa - page->guest_phys_addr +
- page->host_phys_addr;
+ page->host_iova;
} else if (gpa < page->guest_phys_addr +
page->size) {
*hpa_size = page->guest_phys_addr +
page->size - gpa;
return gpa - page->guest_phys_addr +
- page->host_phys_addr;
+ page->host_iova;
}
}
}
void vhost_attach_vdpa_device(int vid, struct rte_vdpa_device *dev);
void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
-void vhost_setup_virtio_net(int vid, bool enable, bool legacy_ol_flags);
+void vhost_setup_virtio_net(int vid, bool enable, bool legacy_ol_flags, bool stats_enabled);
void vhost_enable_extbuf(int vid);
void vhost_enable_linearbuf(int vid);
int vhost_enable_guest_notification(struct virtio_net *dev,
struct vhost_virtqueue *vq, int enable);
-struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
+struct rte_vhost_device_ops const *vhost_driver_callback_get(const char *path);
/*
* Backend-specific cleanup.
(vq->callfd >= 0)) ||
unlikely(!signalled_used_valid)) {
eventfd_write(vq->callfd, (eventfd_t) 1);
+ if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+ vq->stats.guest_notifications++;
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);
}
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0)) {
eventfd_write(vq->callfd, (eventfd_t)1);
+ if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+ vq->stats.guest_notifications++;
if (dev->notify_ops->guest_notified)
dev->notify_ops->guest_notified(dev->vid);
}
return true;
}
-
#endif /* _VHOST_NET_CDEV_H_ */