diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index d75ae026f0..5d9cc91de8 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -31,48 +31,48 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <arpa/inet.h>
-#include <getopt.h>
-#include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/virtio_net.h>
-#include <linux/virtio_ring.h>
-#include <signal.h>
 #include <stdint.h>
-#include <sys/eventfd.h>
-#include <sys/param.h>
-#include <unistd.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
 
-#include <rte_atomic.h>
-#include <rte_cycles.h>
-#include <rte_ethdev.h>
-#include <rte_log.h>
-#include <rte_prefetch.h>
-#include <rte_string_fns.h>
+#include <rte_mbuf.h>
+#include <rte_memcpy.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_virtio_net.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
 
-#include "main.h"
-#include "virtio-net.h"
-#include "vhost-net-cdev.h"
+#include "vhost-net.h"
 
-#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
+#define MAX_PKT_BURST 32
 
-/*
+static bool
+is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
+{
+	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
+}
+
+/**
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that were succesfully
- * added to the RX queue. This function works when mergeable is disabled.
+ * count is returned to indicate the number of packets that are successfully
+ * added to the RX queue. This function works when the mbuf is scattered, but
+ * it doesn't support the mergeable feature.
  */
 static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
+virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count)
 {
 	struct vhost_virtqueue *vq;
 	struct vring_desc *desc;
 	struct rte_mbuf *buff;
 	/* The virtio_hdr is initialised to 0. */
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
+	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
 	uint64_t buff_addr = 0;
 	uint64_t buff_hdr_addr = 0;
-	uint32_t head[MAX_PKT_BURST], packet_len = 0;
+	uint32_t head[MAX_PKT_BURST];
 	uint32_t head_idx, packet_success = 0;
 	uint16_t avail_idx, res_cur_idx;
 	uint16_t res_base_idx, res_end_idx;
@@ -80,10 +80,23 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 	uint8_t success = 0;
 
 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
-	vq = dev->virtqueue[VIRTIO_RXQ];
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
+			__func__, dev->device_fh, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(vq->enabled == 0))
+		return 0;
+
 	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
 
-	/* As many data cores may want access to available buffers, they need to be reserved. */
+	/*
+	 * As many data cores may want access to available buffers,
+	 * they need to be reserved.
+	 */
 	do {
 		res_base_idx = vq->last_used_idx_res;
 		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
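The is_valid_virt_queue_idx() helper introduced above encodes the virtio multiqueue layout: every queue pair exposes an even RX ring and an odd TX ring, so a valid index must have the parity that matches the requested direction and must stay below qp_nb * VIRTIO_QNUM. A minimal standalone sketch of that convention (VIRTIO_QNUM is 2 in the virtio headers; the main() driver is illustrative only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VIRTIO_QNUM 2	/* one RX ring + one TX ring per queue pair */

static bool
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
{
	/* RX rings sit at even indexes, TX rings at odd indexes. */
	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

int
main(void)
{
	printf("%d\n", is_valid_virt_queue_idx(2, 0, 2)); /* 1: RX ring of pair 1 */
	printf("%d\n", is_valid_virt_queue_idx(2, 1, 2)); /* 0: even index is never TX */
	printf("%d\n", is_valid_virt_queue_idx(4, 0, 2)); /* 0: beyond two queue pairs */
	return 0;
}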
@@ -98,23 +111,30 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
 		res_end_idx = res_base_idx + count;
 		/* vq->last_used_idx_res is atomically updated. */
-		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
-						res_end_idx);
+		/* TODO: Allow to disable cmpset if no concurrency in application. */
+		success = rte_atomic16_cmpset(&vq->last_used_idx_res,
+						res_base_idx, res_end_idx);
 	} while (unlikely(success == 0));
 	res_cur_idx = res_base_idx;
 
-	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
+		dev->device_fh, res_cur_idx, res_end_idx);
 
 	/* Prefetch available ring to retrieve indexes. */
 	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
 
 	/* Retrieve all of the head indexes first to avoid caching issues. */
 	for (head_idx = 0; head_idx < count; head_idx++)
-		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
+		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
+					(vq->size - 1)];
 
 	/*Prefetch descriptor index. */
 	rte_prefetch0(&vq->desc[head[packet_success]]);
 
 	while (res_cur_idx != res_end_idx) {
+		uint32_t offset = 0, vb_offset = 0;
+		uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
+		uint8_t hdr = 0, uncompleted_pkt = 0;
+
 		/* Get descriptor from available ring */
 		desc = &vq->desc[head[packet_success]];
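Several data cores may enqueue into the same virtqueue, which is why the code above claims a window [res_base_idx, res_end_idx) of the ring with a 16-bit compare-and-set before touching any descriptor. A sketch of the same reservation pattern using C11 atomics in place of rte_atomic16_cmpset(); the helper name reserve_slots() is made up for illustration:

#include <stdatomic.h>
#include <stdint.h>

/* Stand-in for vq->last_used_idx_res in struct vhost_virtqueue. */
static _Atomic uint16_t last_used_idx_res;

/* Claim `count` consecutive ring slots and return the base index.
 * The caller has already clamped `count` against avail->idx. */
static uint16_t
reserve_slots(uint16_t count)
{
	uint16_t base, end;

	do {
		base = atomic_load(&last_used_idx_res);
		end = base + count;	/* free-running index, wraps mod 2^16 */
	} while (!atomic_compare_exchange_weak(&last_used_idx_res,
						&base, end));
	return base;
}

Losers of the race simply retry from the new base, so no lock is held while the ring is carved up.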
@@ -123,41 +143,85 @@
 		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
 		buff_addr = gpa_to_vva(dev, desc->addr);
 		/* Prefetch buffer address. */
-		rte_prefetch0((void*)(uintptr_t)buff_addr);
+		rte_prefetch0((void *)(uintptr_t)buff_addr);
 
 		/* Copy virtio_hdr to packet and increment buffer address */
 		buff_hdr_addr = buff_addr;
-		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
 
 		/*
 		 * If the descriptors are chained the header and data are
 		 * placed in separate buffers.
 		 */
-		if (desc->flags & VRING_DESC_F_NEXT) {
-			desc->len = vq->vhost_hlen;
+		if ((desc->flags & VRING_DESC_F_NEXT) &&
+			(desc->len == vq->vhost_hlen)) {
 			desc = &vq->desc[desc->next];
 			/* Buffer address translation. */
 			buff_addr = gpa_to_vva(dev, desc->addr);
-			desc->len = rte_pktmbuf_data_len(buff);
 		} else {
-			buff_addr += vq->vhost_hlen;
-			desc->len = packet_len;
+			vb_offset += vq->vhost_hlen;
+			hdr = 1;
+		}
+
+		pkt_len = rte_pktmbuf_pkt_len(buff);
+		data_len = rte_pktmbuf_data_len(buff);
+		len_to_cpy = RTE_MIN(data_len,
+			hdr ? desc->len - vq->vhost_hlen : desc->len);
+		while (total_copied < pkt_len) {
+			/* Copy mbuf data to buffer */
+			rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
+				rte_pktmbuf_mtod_offset(buff, const void *, offset),
+				len_to_cpy);
+			PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
+				len_to_cpy, 0);
+
+			offset += len_to_cpy;
+			vb_offset += len_to_cpy;
+			total_copied += len_to_cpy;
+
+			/* The whole packet completes */
+			if (total_copied == pkt_len)
+				break;
+
+			/* The current segment completes */
+			if (offset == data_len) {
+				buff = buff->next;
+				offset = 0;
+				data_len = rte_pktmbuf_data_len(buff);
+			}
+
+			/* The current vring descriptor done */
+			if (vb_offset == desc->len) {
+				if (desc->flags & VRING_DESC_F_NEXT) {
+					desc = &vq->desc[desc->next];
+					buff_addr = gpa_to_vva(dev, desc->addr);
+					vb_offset = 0;
+				} else {
+					/* Room in vring buffer is not enough */
+					uncompleted_pkt = 1;
+					break;
+				}
+			}
+			len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
 		}
 
 		/* Update used ring with desc information */
-		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
-		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
+		vq->used->ring[res_cur_idx & (vq->size - 1)].id =
+							head[packet_success];
 
-		/* Copy mbuf data to buffer */
-		rte_memcpy((void *)(uintptr_t)buff_addr,
-			rte_pktmbuf_mtod(buff, const void *),
-			rte_pktmbuf_data_len(buff));
-		PRINT_PACKET(dev, (uintptr_t)buff_addr,
-			rte_pktmbuf_data_len(buff), 0);
+		/* Drop the packet if it is uncompleted */
+		if (unlikely(uncompleted_pkt == 1))
+			vq->used->ring[res_cur_idx & (vq->size - 1)].len =
+							vq->vhost_hlen;
+		else
+			vq->used->ring[res_cur_idx & (vq->size - 1)].len =
+						pkt_len + vq->vhost_hlen;
 
 		res_cur_idx++;
 		packet_success++;
 
+		if (unlikely(uncompleted_pkt == 1))
+			continue;
+
 		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
 			(const void *)&virtio_hdr, vq->vhost_hlen);
 
@@ -178,16 +242,19 @@
 	*(volatile uint16_t *)&vq->used->idx += count;
 	vq->last_used_idx = res_end_idx;
 
+	/* flush used->idx update before we read avail->flags. */
+	rte_mb();
+
 	/* Kick the guest if necessary. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
-		eventfd_write((int)vq->kickfd, 1);
+		eventfd_write(vq->callfd, (eventfd_t)1);
 	return count;
 }
 
 static inline uint32_t __attribute__((always_inline))
-copy_from_mbuf_to_vring(struct virtio_net *dev,
-	uint16_t res_base_idx, uint16_t res_end_idx,
-	struct rte_mbuf *pkt)
+copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
+	uint16_t res_base_idx, uint16_t res_end_idx,
+	struct rte_mbuf *pkt)
 {
 	uint32_t vec_idx = 0;
 	uint32_t entry_success = 0;
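The new copy loop advances two cursors at once: offset inside the current mbuf segment and vb_offset inside the current guest descriptor, moving RTE_MIN(data_len - offset, desc->len - vb_offset) bytes per round and stepping whichever side ran out. A self-contained sketch of the same two-cursor walk over plain chains (struct seg and struct buf are simplified stand-ins, not DPDK types):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct seg { const uint8_t *data; size_t len; struct seg *next; };	/* mbuf-like */
struct buf { uint8_t *data; size_t len; struct buf *next; };	/* descriptor-like */

/* Copy pkt_len bytes from a segment chain into a buffer chain.
 * Returns bytes copied; a short count means the buffers ran out,
 * which mirrors the uncompleted_pkt case above. */
static size_t
scatter_copy(const struct seg *s, struct buf *b, size_t pkt_len)
{
	size_t off = 0, boff = 0, total = 0;

	while (total < pkt_len) {
		size_t n = s->len - off < b->len - boff ?
			s->len - off : b->len - boff;

		memcpy(b->data + boff, s->data + off, n);
		off += n; boff += n; total += n;

		if (total == pkt_len)
			break;
		if (off == s->len) {	/* mbuf segment exhausted */
			s = s->next;
			off = 0;
		}
		if (boff == b->len) {	/* vring descriptor exhausted */
			if (b->next == NULL)
				break;	/* not enough room: short copy */
			b = b->next;
			boff = 0;
		}
	}
	return total;
}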
@@ -215,9 +282,9 @@ copy_from_mbuf_to_vring(struct virtio_net *dev,
 	 * Convert from gpa to vva
 	 * (guest physical addr -> vhost virtual addr)
 	 */
-	vq = dev->virtqueue[VIRTIO_RXQ];
-	vb_addr =
-		gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+	vq = dev->virtqueue[queue_id];
+
+	vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
 	vb_hdr_addr = vb_addr;
 
 	/* Prefetch buffer address. */
@@ -235,15 +302,13 @@ copy_from_mbuf_to_vring(struct virtio_net *dev,
 	seg_avail = rte_pktmbuf_data_len(pkt);
 	vb_offset = vq->vhost_hlen;
 
-	vb_avail =
-		vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
+	vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
 
 	entry_len = vq->vhost_hlen;
 
 	if (vb_avail == 0) {
 		uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;
 
-		vq->desc[desc_idx].len = vq->vhost_hlen;
-
 		if ((vq->desc[desc_idx].flags &
 			VRING_DESC_F_NEXT) == 0) {
@@ -259,8 +324,7 @@ copy_from_mbuf_to_vring(struct virtio_net *dev,
 		}
 
 		vec_idx++;
-		vb_addr =
-			gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+		vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
 
 		/* Prefetch buffer address. */
 		rte_prefetch0((void *)(uintptr_t)vb_addr);
@@ -273,7 +337,7 @@ copy_from_mbuf_to_vring(struct virtio_net *dev,
 	while (cpy_len > 0) {
 		/* Copy mbuf data to vring buffer */
 		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
-			(const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
+			rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
 			cpy_len);
 
 		PRINT_PACKET(dev,
@@ -328,7 +392,6 @@ copy_from_mbuf_to_vring(struct virtio_net *dev,
 			 */
 			uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;
-			vq->desc[desc_idx].len = vb_offset;
 
 			if ((vq->desc[desc_idx].flags &
 				VRING_DESC_F_NEXT) == 0) {
@@ -363,26 +426,13 @@ copy_from_mbuf_to_vring(struct virtio_net *dev,
 				/*
 				 * This whole packet completes.
 				 */
-				uint32_t desc_idx =
-					vq->buf_vec[vec_idx].desc_idx;
-				vq->desc[desc_idx].len = vb_offset;
-
-				while (vq->desc[desc_idx].flags &
-					VRING_DESC_F_NEXT) {
-					desc_idx = vq->desc[desc_idx].next;
-					vq->desc[desc_idx].len = 0;
-				}
-
 				/* Update used ring with desc information */
 				vq->used->ring[cur_idx & (vq->size - 1)].id =
 					vq->buf_vec[vec_idx].desc_idx;
 				vq->used->ring[cur_idx & (vq->size - 1)].len =
 					entry_len;
-				entry_len = 0;
-
 				cur_idx++;
 				entry_success++;
-				seg_avail = 0;
-				cpy_len = RTE_MIN(vb_avail, seg_avail);
+				break;
 			}
 		}
 	}
@@ -390,73 +440,87 @@
 	return entry_success;
 }
 
+static inline void __attribute__((always_inline))
+update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
+	uint32_t *secure_len, uint32_t *vec_idx)
+{
+	uint16_t wrapped_idx = id & (vq->size - 1);
+	uint32_t idx = vq->avail->ring[wrapped_idx];
+	uint8_t next_desc;
+	uint32_t len = *secure_len;
+	uint32_t vec_id = *vec_idx;
+
+	do {
+		next_desc = 0;
+		len += vq->desc[idx].len;
+		vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
+		vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
+		vq->buf_vec[vec_id].desc_idx = idx;
+		vec_id++;
+
+		if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
+			idx = vq->desc[idx].next;
+			next_desc = 1;
+		}
+	} while (next_desc);
+
+	*secure_len = len;
+	*vec_idx = vec_id;
+}
+
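update_secure_len() walks one descriptor chain taken from the avail ring, recording each link in vq->buf_vec and accumulating the chain's byte capacity so the caller can tell when enough guest space has been reserved. A reduced sketch of that walk (struct desc and struct vec are simplified stand-ins for the vring structures):

#include <stdint.h>

#define F_NEXT 0x1	/* stands in for VRING_DESC_F_NEXT */

struct desc { uint64_t addr; uint32_t len; uint16_t flags; uint16_t next; };
struct vec { uint64_t buf_addr; uint32_t buf_len; uint32_t desc_idx; };

/* Append the chain starting at `idx` to `v` and return the accumulated
 * capacity; `*vec_id` advances past the entries written. */
static uint32_t
gather_chain(const struct desc *table, uint32_t idx,
	struct vec *v, uint32_t *vec_id, uint32_t len)
{
	uint8_t next;

	do {
		next = 0;
		len += table[idx].len;
		v[*vec_id].buf_addr = table[idx].addr;
		v[*vec_id].buf_len = table[idx].len;
		v[*vec_id].desc_idx = idx;
		(*vec_id)++;

		if (table[idx].flags & F_NEXT) {
			idx = table[idx].next;
			next = 1;
		}
	} while (next);

	return len;
}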
 /*
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that were succesfully
- * added to the RX queue. This function works for mergeable RX.
+ * This function works for mergeable RX.
  */
 static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
-	uint32_t count)
+virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint32_t count)
 {
 	struct vhost_virtqueue *vq;
 	uint32_t pkt_idx = 0, entry_success = 0;
-	uint16_t avail_idx, res_cur_idx;
-	uint16_t res_base_idx, res_end_idx;
+	uint16_t avail_idx;
+	uint16_t res_base_idx, res_cur_idx;
 	uint8_t success = 0;
 
 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
 		dev->device_fh);
-	vq = dev->virtqueue[VIRTIO_RXQ];
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
+			__func__, dev->device_fh, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(vq->enabled == 0))
+		return 0;
+
 	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
 
 	if (count == 0)
 		return 0;
 
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t secure_len = 0;
-		uint16_t need_cnt;
-		uint32_t vec_idx = 0;
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
-		uint16_t i, id;
 
 		do {
 			/*
 			 * As many data cores may want access to available
 			 * buffers, they need to be reserved.
 			 */
+			uint32_t secure_len = 0;
+			uint32_t vec_idx = 0;
+
 			res_base_idx = vq->last_used_idx_res;
 			res_cur_idx = res_base_idx;
 
 			do {
 				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-				if (unlikely(res_cur_idx == avail_idx)) {
-					LOG_DEBUG(VHOST_DATA,
-						"(%"PRIu64") Failed "
-						"to get enough desc from "
-						"vring\n",
-						dev->device_fh);
-					return pkt_idx;
-				} else {
-					uint16_t wrapped_idx =
-						(res_cur_idx) & (vq->size - 1);
-					uint32_t idx =
-						vq->avail->ring[wrapped_idx];
-					uint8_t next_desc;
-
-					do {
-						next_desc = 0;
-						secure_len += vq->desc[idx].len;
-						if (vq->desc[idx].flags &
-							VRING_DESC_F_NEXT) {
-							idx = vq->desc[idx].next;
-							next_desc = 1;
-						}
-					} while (next_desc);
+				if (unlikely(res_cur_idx == avail_idx))
+					goto merge_rx_exit;
 
-					res_cur_idx++;
-				}
+				update_secure_len(vq, res_cur_idx,
+					&secure_len, &vec_idx);
+				res_cur_idx++;
 			} while (pkt_len > secure_len);
 
 			/* vq->last_used_idx_res is atomically updated. */
@@ -465,33 +529,8 @@ virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
 						res_cur_idx);
 		} while (success == 0);
 
-		id = res_base_idx;
-		need_cnt = res_cur_idx - res_base_idx;
-
-		for (i = 0; i < need_cnt; i++, id++) {
-			uint16_t wrapped_idx = id & (vq->size - 1);
-			uint32_t idx = vq->avail->ring[wrapped_idx];
-			uint8_t next_desc;
-			do {
-				next_desc = 0;
-				vq->buf_vec[vec_idx].buf_addr =
-					vq->desc[idx].addr;
-				vq->buf_vec[vec_idx].buf_len =
-					vq->desc[idx].len;
-				vq->buf_vec[vec_idx].desc_idx = idx;
-				vec_idx++;
-
-				if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
-					idx = vq->desc[idx].next;
-					next_desc = 1;
-				}
-			} while (next_desc);
-		}
-
-		res_end_idx = res_cur_idx;
-
-		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
-			res_end_idx, pkts[pkt_idx]);
+		entry_success = copy_from_mbuf_to_vring(dev, queue_id,
+			res_base_idx, res_cur_idx, pkts[pkt_idx]);
 
 		rte_compiler_barrier();
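For mergeable RX the reservation is sized by the packet rather than one descriptor per packet: the inner loop keeps consuming avail-ring entries, each possibly a whole chain, until the accumulated capacity covers pkt_len, and only then is the window committed with the compare-and-set. A condensed sketch of that sizing loop; chain_capacity() is a dummy stand-in for what update_secure_len() computes:

#include <stdbool.h>
#include <stdint.h>

/* Dummy: byte capacity of the chain behind one avail-ring entry.
 * The real code walks vq->desc and fills vq->buf_vec as it goes. */
static uint32_t
chain_capacity(uint16_t slot)
{
	(void)slot;
	return 2048;
}

/* Grow the window [base, *end) until it can hold pkt_len bytes;
 * returns false when the avail ring runs dry first (the
 * goto merge_rx_exit case above). */
static bool
size_window(uint16_t base, uint16_t avail_idx, uint32_t pkt_len,
	uint16_t *end)
{
	uint32_t secure_len = 0;
	uint16_t cur = base;

	do {
		if (cur == avail_idx)
			return false;
		secure_len += chain_capacity(cur);
		cur++;
	} while (pkt_len > secure_len);

	*end = cur;
	return true;
}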
@@ -503,38 +542,157 @@ virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
 			rte_pause();
 
 		*(volatile uint16_t *)&vq->used->idx += entry_success;
-		vq->last_used_idx = res_end_idx;
+		vq->last_used_idx = res_cur_idx;
+	}
+
+merge_rx_exit:
+	if (likely(pkt_idx)) {
+		/* flush used->idx update before we read avail->flags. */
+		rte_mb();
 
 		/* Kick the guest if necessary. */
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
-			eventfd_write((int)vq->kickfd, 1);
+			eventfd_write(vq->callfd, (eventfd_t)1);
 	}
 
-	return count;
+	return pkt_idx;
+}
+
+uint16_t
+rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
+{
+	if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
+		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
+	else
+		return virtio_dev_rx(dev, queue_id, pkts, count);
+}
+
+static void
+parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	void *l3_hdr = NULL;
+	struct ether_hdr *eth_hdr;
+	uint16_t ethertype;
+
+	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+	m->l2_len = sizeof(struct ether_hdr);
+	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
+
+	if (ethertype == ETHER_TYPE_VLAN) {
+		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
+
+		m->l2_len += sizeof(struct vlan_hdr);
+		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
+	}
+
+	l3_hdr = (char *)eth_hdr + m->l2_len;
+
+	switch (ethertype) {
+	case ETHER_TYPE_IPv4:
+		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
+		*l4_proto = ipv4_hdr->next_proto_id;
+		m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
+		*l4_hdr = (char *)l3_hdr + m->l3_len;
+		m->ol_flags |= PKT_TX_IPV4;
+		break;
+	case ETHER_TYPE_IPv6:
+		ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
+		*l4_proto = ipv6_hdr->proto;
+		m->l3_len = sizeof(struct ipv6_hdr);
+		*l4_hdr = (char *)l3_hdr + m->l3_len;
+		m->ol_flags |= PKT_TX_IPV6;
+		break;
+	default:
+		m->l3_len = 0;
+		*l4_proto = 0;
+		break;
+	}
+}
 
-/* This function works for TX packets with mergeable feature enabled. */
-static inline uint16_t __attribute__((always_inline))
-virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
+static inline void __attribute__((always_inline))
+vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
+{
+	uint16_t l4_proto = 0;
+	void *l4_hdr = NULL;
+	struct tcp_hdr *tcp_hdr = NULL;
+
+	parse_ethernet(m, &l4_proto, &l4_hdr);
+	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
+			switch (hdr->csum_offset) {
+			case (offsetof(struct tcp_hdr, cksum)):
+				if (l4_proto == IPPROTO_TCP)
+					m->ol_flags |= PKT_TX_TCP_CKSUM;
+				break;
+			case (offsetof(struct udp_hdr, dgram_cksum)):
+				if (l4_proto == IPPROTO_UDP)
+					m->ol_flags |= PKT_TX_UDP_CKSUM;
+				break;
+			case (offsetof(struct sctp_hdr, cksum)):
+				if (l4_proto == IPPROTO_SCTP)
+					m->ol_flags |= PKT_TX_SCTP_CKSUM;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+		case VIRTIO_NET_HDR_GSO_TCPV4:
+		case VIRTIO_NET_HDR_GSO_TCPV6:
+			tcp_hdr = (struct tcp_hdr *)l4_hdr;
+			m->ol_flags |= PKT_TX_TCP_SEG;
+			m->tso_segsz = hdr->gso_size;
+			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+			break;
+		default:
+			RTE_LOG(WARNING, VHOST_DATA,
+				"unsupported gso type %u.\n", hdr->gso_type);
+			break;
+		}
+	}
+}
+
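vhost_dequeue_offload() above trusts exactly two fields of the guest's virtio_net_hdr: csum_start must equal l2_len + l3_len, and csum_offset then selects which L4 checksum flag to set. That dispatch can be read as a small offset-to-flag table; a sketch with abbreviated structs (only the checksum fields are laid out, so these are not the real protocol headers):

#include <stddef.h>
#include <stdint.h>

/* Abbreviated L4 headers: the padding just places the checksum
 * field at its real offset (16 for TCP, 6 for UDP). */
struct tcp_h { uint8_t pad[16]; uint16_t cksum; };
struct udp_h { uint8_t pad[6]; uint16_t dgram_cksum; };

enum { TX_TCP_CKSUM = 1 << 0, TX_UDP_CKSUM = 1 << 1 };

/* Map a guest-provided csum_offset to an mbuf-style TX flag,
 * mirroring the switch in vhost_dequeue_offload(). */
static uint64_t
csum_offset_to_flag(uint16_t csum_offset)
{
	switch (csum_offset) {
	case offsetof(struct tcp_h, cksum):
		return TX_TCP_CKSUM;
	case offsetof(struct udp_h, dgram_cksum):
		return TX_UDP_CKSUM;
	default:
		return 0;	/* unknown offset: leave it to software */
	}
}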
+uint16_t
+rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
+	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
 {
 	struct rte_mbuf *m, *prev;
 	struct vhost_virtqueue *vq;
 	struct vring_desc *desc;
 	uint64_t vb_addr = 0;
+	uint64_t vb_net_hdr_addr = 0;
 	uint32_t head[MAX_PKT_BURST];
 	uint32_t used_idx;
 	uint32_t i;
 	uint16_t free_entries, entry_success = 0;
 	uint16_t avail_idx;
+	struct virtio_net_hdr *hdr = NULL;
+
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
+			__func__, dev->device_fh, queue_id);
+		return 0;
+	}
+
+	vq = dev->virtqueue[queue_id];
+	if (unlikely(vq->enabled == 0))
+		return 0;
 
-	vq = dev->virtqueue[VIRTIO_TXQ];
 	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
 
 	/* If there are no available buffers then return. */
 	if (vq->last_used_idx == avail_idx)
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
+	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
 		dev->device_fh);
 
 	/* Prefetch available ring to retrieve head indexes. */
@@ -548,7 +706,7 @@ virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struc
 	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
 
 	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
-		dev->device_fh, free_entries);
+			dev->device_fh, free_entries);
 	/* Retrieve all of the head indexes first to avoid caching issues. */
 	for (i = 0; i < free_entries; i++)
 		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
@@ -567,8 +725,18 @@ virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struc
 
 		desc = &vq->desc[head[entry_success]];
 
+		vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
+		hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
+
 		/* Discard first buffer as it is the virtio header */
-		desc = &vq->desc[desc->next];
+		if (desc->flags & VRING_DESC_F_NEXT) {
+			desc = &vq->desc[desc->next];
+			vb_offset = 0;
+			vb_avail = desc->len;
+		} else {
+			vb_offset = vq->vhost_hlen;
+			vb_avail = desc->len - vb_offset;
+		}
 
 		/* Buffer address translation. */
 		vb_addr = gpa_to_vva(dev, desc->addr);
@@ -587,14 +755,12 @@ virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struc
 		vq->used->ring[used_idx].id = head[entry_success];
 		vq->used->ring[used_idx].len = 0;
 
-		vb_offset = 0;
-		vb_avail = desc->len;
-
 		/* Allocate an mbuf and populate the structure. */
 		m = rte_pktmbuf_alloc(mbuf_pool);
 		if (unlikely(m == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
-			return entry_success;
+			break;
 		}
 		seg_offset = 0;
 		seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
@@ -606,7 +772,7 @@ virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struc
 		cur = m;
 		prev = m;
 		while (cpy_len != 0) {
-			rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
+			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
 				(void *)((uintptr_t)(vb_addr + vb_offset)),
 				cpy_len);
@@ -700,6 +866,8 @@ virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struc
 			break;
 
 		m->nb_segs = seg_num;
+		if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
+			vhost_dequeue_offload(hdr, m);
 
 		pkts[entry_success] = m;
 		vq->last_used_idx++;
@@ -710,7 +878,6 @@ virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool, struc
 	vq->used->idx += entry_success;
 	/* Kick guest if required. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
-		eventfd_write((int)vq->kickfd, 1);
+		eventfd_write(vq->callfd, (eventfd_t)1);
 	return entry_success;
-
 }
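With this patch the burst APIs are queue-addressed: callers pass the flat virtqueue index, where rings 0 and 1 of queue pair 0 are the classic VIRTIO_RXQ/VIRTIO_TXQ. A hedged usage sketch of one polling iteration; `dev` and `mp` are assumed to come from the application's vhost new_device() callback and mempool setup, and vhost_qp0_poll() is an invented name:

#include <rte_mbuf.h>
#include <rte_virtio_net.h>

#define BURST 32

/* Drain the guest's TX ring and feed its RX ring for queue pair 0. */
static void
vhost_qp0_poll(struct virtio_net *dev, struct rte_mempool *mp,
	struct rte_mbuf **fwd, uint16_t nb_fwd)
{
	struct rte_mbuf *pkts[BURST];
	uint16_t n;

	/* Guest -> host: VIRTIO_TXQ is ring index 1 of queue pair 0. */
	n = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mp, pkts, BURST);
	/* ... forward the n dequeued packets, then transmit or free them ... */

	/* Host -> guest: VIRTIO_RXQ is ring index 0 of queue pair 0. */
	n = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, fwd, nb_fwd);
	/* Fewer than nb_fwd may be accepted; the caller frees the rest. */
	(void)n;
}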