X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Fvhost%2Fvhost.h;h=a9edc271aa7f6fd4457b5c1f0f1839b00d6d259c;hb=1fd3de64ff47f8865f9cbd13bb8f20611beb3cfa;hp=1e3f1bad91786d0bad00a3a8d2645774897b2bdc;hpb=d5d25cfd855d57f450cbf17113f88fba675ca6e9;p=dpdk.git

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 1e3f1bad91..a9edc271aa 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -17,12 +17,11 @@
 #include
 #include
-#include
 #include
+#include
 
 #include "rte_vhost.h"
-#include "rte_vdpa.h"
-#include "rte_vdpa_dev.h"
+#include "vdpa_driver.h"
 
 #include "rte_vhost_async.h"
 
@@ -49,7 +48,10 @@
 #define MAX_PKT_BURST 32
 
 #define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST)
-#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+#define VHOST_MAX_ASYNC_VEC 2048
+#define VIRTIO_MAX_RX_PKTLEN 9728U
+#define VHOST_DMA_MAX_COPY_COMPLETE ((VIRTIO_MAX_RX_PKTLEN / RTE_MBUF_DEFAULT_DATAROOM) \
+        * MAX_PKT_BURST)
 
 #define PACKED_DESC_ENQUEUE_USED_FLAG(w) \
     ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
@@ -119,6 +121,58 @@ struct vring_used_elem_packed {
     uint32_t count;
 };
 
+/**
+ * iovec
+ */
+struct vhost_iovec {
+    void *src_addr;
+    void *dst_addr;
+    size_t len;
+};
+
+/**
+ * iovec iterator
+ */
+struct vhost_iov_iter {
+    /** pointer to the iovec array */
+    struct vhost_iovec *iov;
+    /** number of iovec in this iterator */
+    unsigned long nr_segs;
+};
+
+struct async_dma_vchan_info {
+    /* circular array to track if packet copy completes */
+    bool **pkts_cmpl_flag_addr;
+
+    /* max elements in 'pkts_cmpl_flag_addr' */
+    uint16_t ring_size;
+    /* ring index mask for 'pkts_cmpl_flag_addr' */
+    uint16_t ring_mask;
+
+    /**
+     * DMA virtual channel lock. Although it is possible to bind DMA
+     * virtual channels to data plane threads, the vhost control plane
+     * thread could call data plane functions too, thus causing
+     * DMA device contention.
+     *
+     * For example, in the VM exit case, the vhost control plane thread
+     * needs to clear in-flight packets before disabling the vring, while
+     * another data plane thread could be enqueuing packets to the same
+     * vring with the same DMA virtual channel. As dmadev PMD functions
+     * are lock-free, the control plane and data plane threads could
+     * operate on the same DMA virtual channel at the same time.
+     */
+    rte_spinlock_t dma_lock;
+};
+
+struct async_dma_info {
+    struct async_dma_vchan_info *vchans;
+    /* number of registered virtual channels */
+    uint16_t nr_vchans;
+};
+
+extern struct async_dma_info dma_copy_track[RTE_DMADEV_DEFAULT_MAX];
+
 /**
  * inflight async packet information
  */
@@ -129,19 +183,34 @@ struct async_inflight_info {
 };
 
 struct vhost_async {
-    /* operation callbacks for DMA */
-    struct rte_vhost_async_channel_ops ops;
-
-    struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
-    struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
+    struct vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
+    struct vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
     uint16_t iter_idx;
     uint16_t iovec_idx;
 
     /* data transfer status */
     struct async_inflight_info *pkts_info;
+    /**
+     * Packet reorder array. "true" indicates that the DMA device
+     * has completed all copies for the packet.
+     *
+     * Note that this array could be written by multiple threads
+     * simultaneously.
+     * For example, if thread0 and thread1 receive packets from
+     * the NIC and enqueue them to vring0 and vring1 with their own
+     * DMA devices DMA0 and DMA1, it is possible for thread0 to get
+     * completed copies belonging to vring1 from DMA0, while thread0
+     * is calling rte_vhost_poll_enqueue_completed() for vring0 and
+     * thread1 is calling rte_vhost_submit_enqueue_burst() for vring1.
+     * In this case, vq->access_lock cannot protect pkts_cmpl_flag of
+     * vring1.
+     *
+     * However, since offloading is done on a per-packet basis, each
+     * packet flag will only be written by one thread. And a single-byte
+     * write is atomic, so no lock is needed for pkts_cmpl_flag.
+     */
+    bool *pkts_cmpl_flag;
     uint16_t pkts_idx;
     uint16_t pkts_inflight_n;
-    uint16_t last_pkts_n;
     union {
         struct vring_used_elem *descs_split;
         struct vring_used_elem_packed *buffers_packed;
@@ -355,7 +424,8 @@ struct vring_packed_desc_event {
 
 struct guest_page {
     uint64_t guest_phys_addr;
-    uint64_t host_phys_addr;
+    uint64_t host_iova;
+    uint64_t host_user_addr;
     uint64_t size;
 };
 
@@ -395,7 +465,7 @@ struct virtio_net {
     uint16_t  mtu;
     uint8_t   status;
 
-    struct vhost_device_ops const *notify_ops;
+    struct rte_vhost_device_ops const *notify_ops;
 
     uint32_t  nr_guest_pages;
     uint32_t  max_guest_pages;
@@ -569,8 +639,7 @@ extern int vhost_data_log_level;
 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
 #endif
 
-#define MAX_VHOST_DEVICE 1024
-extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+extern struct virtio_net *vhost_devices[RTE_MAX_VHOST_DEVICE];
 
 #define VHOST_BINARY_SEARCH_THRESH 256
 
@@ -588,6 +657,20 @@ static __rte_always_inline int guest_page_addrcmp(const void *p1,
     return 0;
 }
 
+static __rte_always_inline int guest_page_rangecmp(const void *p1, const void *p2)
+{
+    const struct guest_page *page1 = (const struct guest_page *)p1;
+    const struct guest_page *page2 = (const struct guest_page *)p2;
+
+    if (page1->guest_phys_addr >= page2->guest_phys_addr) {
+        if (page1->guest_phys_addr < page2->guest_phys_addr + page2->size)
+            return 0;
+        else
+            return 1;
+    } else
+        return -1;
+}
+
 static __rte_always_inline rte_iova_t
 gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
     uint64_t gpa_size, uint64_t *hpa_size)
@@ -598,20 +681,20 @@ gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
     *hpa_size = gpa_size;
     if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
-        key.guest_phys_addr = gpa & ~(dev->guest_pages[0].size - 1);
+        key.guest_phys_addr = gpa;
         page = bsearch(&key, dev->guest_pages, dev->nr_guest_pages,
-                sizeof(struct guest_page), guest_page_addrcmp);
+                sizeof(struct guest_page), guest_page_rangecmp);
         if (page) {
             if (gpa + gpa_size <= page->guest_phys_addr + page->size) {
                 return gpa - page->guest_phys_addr +
-                    page->host_phys_addr;
+                    page->host_iova;
             } else if (gpa < page->guest_phys_addr + page->size) {
                 *hpa_size = page->guest_phys_addr +
                     page->size - gpa;
                 return gpa - page->guest_phys_addr +
-                    page->host_phys_addr;
+                    page->host_iova;
             }
         }
     } else {
@@ -622,13 +705,13 @@ gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
             if (gpa + gpa_size <=
                     page->guest_phys_addr + page->size) {
                 return gpa - page->guest_phys_addr +
-                    page->host_phys_addr;
+                    page->host_iova;
             } else if (gpa < page->guest_phys_addr +
                         page->size) {
                 *hpa_size = page->guest_phys_addr +
                     page->size - gpa;
                 return gpa - page->guest_phys_addr +
-                    page->host_phys_addr;
+                    page->host_iova;
             }
         }
     }
@@ -703,7 +786,7 @@ void vhost_enable_linearbuf(int vid);
 int vhost_enable_guest_notification(struct virtio_net *dev,
         struct vhost_virtqueue *vq, int enable);
 
-struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
+struct rte_vhost_device_ops const *vhost_driver_callback_get(const char *path);
 
 /*
  * Backend-specific cleanup.
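
A note on the gpa_to_first_hpa() change above: the old code masked the bsearch() key to a boundary derived from dev->guest_pages[0].size and relied on the exact-match comparator guest_page_addrcmp(), whereas the patched code passes the raw GPA and lets the new range comparator guest_page_rangecmp() report whether the key falls inside a page's [guest_phys_addr, guest_phys_addr + size) interval. The standalone sketch below only illustrates that lookup pattern and is not code from the patch: the struct and comparator mirror the diff, while the gpa_to_hiova() helper, the sample memory map, and main() are hypothetical and exist purely for demonstration.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct guest_page {
	uint64_t guest_phys_addr;
	uint64_t host_iova;
	uint64_t host_user_addr;
	uint64_t size;
};

/* Same logic as the comparator added by the patch: return 0 when the
 * key GPA (p1) falls inside the page range described by p2. */
static int
guest_page_rangecmp(const void *p1, const void *p2)
{
	const struct guest_page *page1 = p1;
	const struct guest_page *page2 = p2;

	if (page1->guest_phys_addr >= page2->guest_phys_addr) {
		if (page1->guest_phys_addr < page2->guest_phys_addr + page2->size)
			return 0;	/* GPA lies within this page */
		return 1;		/* GPA is above this page */
	}
	return -1;			/* GPA is below this page */
}

/* Hypothetical helper: translate a GPA to a host IOVA the way the
 * binary-search branch of gpa_to_first_hpa() does after the patch. */
static uint64_t
gpa_to_hiova(const struct guest_page *pages, size_t nr_pages, uint64_t gpa)
{
	struct guest_page key = { .guest_phys_addr = gpa };
	const struct guest_page *page;

	/* 'pages' must be sorted by guest_phys_addr for bsearch(). */
	page = bsearch(&key, pages, nr_pages, sizeof(*pages), guest_page_rangecmp);
	if (page == NULL)
		return 0; /* no mapping found */

	return gpa - page->guest_phys_addr + page->host_iova;
}

int
main(void)
{
	/* Made-up guest memory map: two contiguous 2 MiB regions. */
	const struct guest_page pages[] = {
		{ .guest_phys_addr = 0x000000, .host_iova = 0x7f0000000000ULL, .size = 0x200000 },
		{ .guest_phys_addr = 0x200000, .host_iova = 0x7f0000400000ULL, .size = 0x200000 },
	};

	printf("GPA 0x201000 -> host IOVA 0x%" PRIx64 "\n",
	       gpa_to_hiova(pages, 2, 0x201000));
	return 0;
}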