mbuf: replace data pointer by an offset
[dpdk.git] / examples / vhost / main.c
index e0fc2c9..85ee8b8 100644 (file)
 #define BURST_RX_WAIT_US 15    /* Defines how long we wait between retries on RX */
 #define BURST_RX_RETRIES 4             /* Number of retries on RX. */
 
+#define JUMBO_FRAME_MAX_SIZE    0x2600
+
 /* State of virtio device. */
 #define DEVICE_MAC_LEARNING 0
 #define DEVICE_RX                      1
 /* Max number of devices. Limited by vmdq. */
 #define MAX_DEVICES 64
 
-/* Size of buffers used for rte_snprintfs. */
+/* Size of buffers used for snprintfs. */
 #define MAX_PRINT_BUFF 6072
 
 /* Maximum character device basename size. */
@@ -498,7 +500,7 @@ us_vhost_parse_basename(const char *q_arg)
        if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
                return -1;
        else
-               rte_snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
+               snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
 
        return 0;
 }
@@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv)
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
-                                       if (ret)
+                                       if (ret) {
+                                               vmdq_conf_default.rxmode.jumbo_frame = 1;
+                                               vmdq_conf_default.rxmode.max_rx_pkt_len
+                                                       = JUMBO_FRAME_MAX_SIZE;
                                                VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
+                                       }
                                }
                        }
 
@@ -727,10 +733,10 @@ us_vhost_parse_args(int argc, char **argv)
                                        zero_copy = ret;
 
                                if (zero_copy) {
-#ifdef RTE_MBUF_SCATTER_GATHER
+#ifdef RTE_MBUF_REFCNT
                                        RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                                        "zero copy vhost APP, please "
-                                       "disable RTE_MBUF_SCATTER_GATHER\n"
+                                       "disable RTE_MBUF_REFCNT\n"
                                        "in config file and then rebuild DPDK "
                                        "core lib!\n"
                                        "Otherwise please disable zero copy "
@@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv)
                return -1;
        }
 
+       if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
+               RTE_LOG(INFO, VHOST_PORT,
+                       "Vhost zero copy doesn't support jumbo frame,"
+                       "please specify '--mergeable 0' to disable the "
+                       "mergeable feature.\n");
+               return -1;
+       }
+
        return 0;
 }
 
@@ -837,14 +851,14 @@ static unsigned check_ports_num(unsigned nb_ports)
        char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
                                                                                                                                                                                                                                        \
        if ((header))                                                                                                                                                                                                   \
-               rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                              \
+               snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
        else                                                                                                                                                                                                                    \
-               rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                              \
+               snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
        for (index = 0; index < (size); index++) {                                                                                                                                              \
-               rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),        \
+               snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
                        "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
        }                                                                                                                                                                                                                               \
-       rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
+       snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
                                                                                                                                                                                                                                        \
        LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
 } while(0)
@@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
  * be received from the physical port or from another virtio device. A packet
  * count is returned to indicate the number of packets that were succesfully
- * added to the RX queue.
+ * added to the RX queue. This function works when mergeable is disabled.
  */
 static inline uint32_t __attribute__((always_inline))
 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
@@ -930,7 +944,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
        uint64_t buff_hdr_addr = 0;
        uint32_t head[MAX_PKT_BURST], packet_len = 0;
        uint32_t head_idx, packet_success = 0;
-       uint32_t mergeable, mrg_count = 0;
        uint32_t retry = 0;
        uint16_t avail_idx, res_cur_idx;
        uint16_t res_base_idx, res_end_idx;
@@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
        vq = dev->virtqueue[VIRTIO_RXQ];
        count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
+
        /* As many data cores may want access to available buffers, they need to be reserved. */
        do {
                res_base_idx = vq->last_used_idx_res;
@@ -976,9 +990,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
        /* Prefetch available ring to retrieve indexes. */
        rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
 
-       /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
-       mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
-
        /* Retrieve all of the head indexes first to avoid caching issues. */
        for (head_idx = 0; head_idx < count; head_idx++)
                head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
@@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
                /* Prefetch buffer address. */
                rte_prefetch0((void*)(uintptr_t)buff_addr);
 
-               if (mergeable && (mrg_count != 0)) {
-                       desc->len = packet_len = rte_pktmbuf_data_len(buff);
-               } else {
-                       /* Copy virtio_hdr to packet and increment buffer address */
-                       buff_hdr_addr = buff_addr;
-                       packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
+               /* Copy virtio_hdr to packet and increment buffer address */
+               buff_hdr_addr = buff_addr;
+               packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
 
-                       /*
-                        * If the descriptors are chained the header and data are placed in
-                        * separate buffers.
-                        */
-                       if (desc->flags & VRING_DESC_F_NEXT) {
-                               desc->len = vq->vhost_hlen;
-                               desc = &vq->desc[desc->next];
-                               /* Buffer address translation. */
-                               buff_addr = gpa_to_vva(dev, desc->addr);
-                               desc->len = rte_pktmbuf_data_len(buff);
-                       } else {
-                               buff_addr += vq->vhost_hlen;
-                               desc->len = packet_len;
-                       }
+               /*
+                * If the descriptors are chained the header and data are
+                * placed in separate buffers.
+                */
+               if (desc->flags & VRING_DESC_F_NEXT) {
+                       desc->len = vq->vhost_hlen;
+                       desc = &vq->desc[desc->next];
+                       /* Buffer address translation. */
+                       buff_addr = gpa_to_vva(dev, desc->addr);
+                       desc->len = rte_pktmbuf_data_len(buff);
+               } else {
+                       buff_addr += vq->vhost_hlen;
+                       desc->len = packet_len;
                }
 
-               PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
-
                /* Update used ring with desc information */
                vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
                vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
 
                /* Copy mbuf data to buffer */
-               rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
+               rte_memcpy((void *)(uintptr_t)buff_addr,
+                       rte_pktmbuf_mtod(buff, const void *),
+                       rte_pktmbuf_data_len(buff));
+               PRINT_PACKET(dev, (uintptr_t)buff_addr,
+                       rte_pktmbuf_data_len(buff), 0);
 
                res_cur_idx++;
                packet_success++;
 
-               /* If mergeable is disabled then a header is required per buffer. */
-               if (!mergeable) {
-                       rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
-                       PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-               } else {
-                       mrg_count++;
-                       /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
-                       if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
-                               virtio_hdr.num_buffers = mrg_count;
-                               LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
-                               rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
-                               PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-                               mrg_count = 0;
-                       }
-               }
+               rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
+                       (const void *)&virtio_hdr, vq->vhost_hlen);
+
+               PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
+
                if (res_cur_idx < res_end_idx) {
                        /* Prefetch descriptor index. */
                        rte_prefetch0(&vq->desc[head[packet_success]]);
@@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
        return count;
 }
 
+static inline uint32_t __attribute__((always_inline))
+copy_from_mbuf_to_vring(struct virtio_net *dev,
+       uint16_t res_base_idx, uint16_t res_end_idx,
+       struct rte_mbuf *pkt)
+{
+       uint32_t vec_idx = 0;
+       uint32_t entry_success = 0;
+       struct vhost_virtqueue *vq;
+       /* The virtio_hdr is initialised to 0. */
+       struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
+               {0, 0, 0, 0, 0, 0}, 0};
+       uint16_t cur_idx = res_base_idx;
+       uint64_t vb_addr = 0;
+       uint64_t vb_hdr_addr = 0;
+       uint32_t seg_offset = 0;
+       uint32_t vb_offset = 0;
+       uint32_t seg_avail;
+       uint32_t vb_avail;
+       uint32_t cpy_len, entry_len;
+
+       if (pkt == NULL)
+               return 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
+               "End Index %d\n",
+               dev->device_fh, cur_idx, res_end_idx);
+
+       /*
+        * Convert from gpa to vva
+        * (guest physical addr -> vhost virtual addr)
+        */
+       vq = dev->virtqueue[VIRTIO_RXQ];
+       vb_addr =
+               gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+       vb_hdr_addr = vb_addr;
+
+       /* Prefetch buffer address. */
+       rte_prefetch0((void *)(uintptr_t)vb_addr);
+
+       virtio_hdr.num_buffers = res_end_idx - res_base_idx;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
+               dev->device_fh, virtio_hdr.num_buffers);
+
+       rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
+               (const void *)&virtio_hdr, vq->vhost_hlen);
+
+       PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
+
+       seg_avail = rte_pktmbuf_data_len(pkt);
+       vb_offset = vq->vhost_hlen;
+       vb_avail =
+               vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
+
+       entry_len = vq->vhost_hlen;
+
+       if (vb_avail == 0) {
+               uint32_t desc_idx =
+                       vq->buf_vec[vec_idx].desc_idx;
+               vq->desc[desc_idx].len = vq->vhost_hlen;
+
+               if ((vq->desc[desc_idx].flags
+                       & VRING_DESC_F_NEXT) == 0) {
+                       /* Update used ring with desc information */
+                       vq->used->ring[cur_idx & (vq->size - 1)].id
+                               = vq->buf_vec[vec_idx].desc_idx;
+                       vq->used->ring[cur_idx & (vq->size - 1)].len
+                               = entry_len;
+
+                       entry_len = 0;
+                       cur_idx++;
+                       entry_success++;
+               }
+
+               vec_idx++;
+               vb_addr =
+                       gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
+
+               /* Prefetch buffer address. */
+               rte_prefetch0((void *)(uintptr_t)vb_addr);
+               vb_offset = 0;
+               vb_avail = vq->buf_vec[vec_idx].buf_len;
+       }
+
+       cpy_len = RTE_MIN(vb_avail, seg_avail);
+
+       while (cpy_len > 0) {
+               /* Copy mbuf data to vring buffer */
+               rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
+                       (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
+                       cpy_len);
+
+               PRINT_PACKET(dev,
+                       (uintptr_t)(vb_addr + vb_offset),
+                       cpy_len, 0);
+
+               seg_offset += cpy_len;
+               vb_offset += cpy_len;
+               seg_avail -= cpy_len;
+               vb_avail -= cpy_len;
+               entry_len += cpy_len;
+
+               if (seg_avail != 0) {
+                       /*
+                        * The virtio buffer in this vring
+                        * entry has reached its end, but
+                        * the segment is not yet complete.
+                        */
+                       if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
+                               VRING_DESC_F_NEXT) == 0) {
+                               /* Update used ring with desc information */
+                               vq->used->ring[cur_idx & (vq->size - 1)].id
+                                       = vq->buf_vec[vec_idx].desc_idx;
+                               vq->used->ring[cur_idx & (vq->size - 1)].len
+                                       = entry_len;
+                               entry_len = 0;
+                               cur_idx++;
+                               entry_success++;
+                       }
+
+                       vec_idx++;
+                       vb_addr = gpa_to_vva(dev,
+                               vq->buf_vec[vec_idx].buf_addr);
+                       vb_offset = 0;
+                       vb_avail = vq->buf_vec[vec_idx].buf_len;
+                       cpy_len = RTE_MIN(vb_avail, seg_avail);
+               } else {
+                       /*
+                        * The current segment is complete; continue to
+                        * check whether the whole packet is complete.
+                        */
+                       pkt = pkt->next;
+                       if (pkt != NULL) {
+                               /*
+                                * There are more segments.
+                                */
+                               if (vb_avail == 0) {
+                                       /*
+                                        * The current buffer from the vring is
+                                        * used up; fetch the next buffer
+                                        * from buf_vec.
+                                        */
+                                       uint32_t desc_idx =
+                                               vq->buf_vec[vec_idx].desc_idx;
+                                       vq->desc[desc_idx].len = vb_offset;
+
+                                       if ((vq->desc[desc_idx].flags &
+                                               VRING_DESC_F_NEXT) == 0) {
+                                               uint16_t wrapped_idx =
+                                                       cur_idx & (vq->size - 1);
+                                               /*
+                                                * Update used ring with the
+                                                * descriptor information
+                                                */
+                                               vq->used->ring[wrapped_idx].id
+                                                       = desc_idx;
+                                               vq->used->ring[wrapped_idx].len
+                                                       = entry_len;
+                                               entry_success++;
+                                               entry_len = 0;
+                                               cur_idx++;
+                                       }
+
+                                       /* Get next buffer from buf_vec. */
+                                       vec_idx++;
+                                       vb_addr = gpa_to_vva(dev,
+                                               vq->buf_vec[vec_idx].buf_addr);
+                                       vb_avail =
+                                               vq->buf_vec[vec_idx].buf_len;
+                                       vb_offset = 0;
+                               }
+
+                               seg_offset = 0;
+                               seg_avail = rte_pktmbuf_data_len(pkt);
+                               cpy_len = RTE_MIN(vb_avail, seg_avail);
+                       } else {
+                               /*
+                                * This whole packet completes.
+                                */
+                               uint32_t desc_idx =
+                                       vq->buf_vec[vec_idx].desc_idx;
+                               vq->desc[desc_idx].len = vb_offset;
+
+                               while (vq->desc[desc_idx].flags &
+                                       VRING_DESC_F_NEXT) {
+                                       desc_idx = vq->desc[desc_idx].next;
+                                        vq->desc[desc_idx].len = 0;
+                               }
+
+                               /* Update used ring with desc information */
+                               vq->used->ring[cur_idx & (vq->size - 1)].id
+                                       = vq->buf_vec[vec_idx].desc_idx;
+                               vq->used->ring[cur_idx & (vq->size - 1)].len
+                                       = entry_len;
+                               entry_len = 0;
+                               cur_idx++;
+                               entry_success++;
+                               seg_avail = 0;
+                               cpy_len = RTE_MIN(vb_avail, seg_avail);
+                       }
+               }
+       }
+
+       return entry_success;
+}
+
+/*
+ * This function adds buffers to the virtio devices RX virtqueue. Buffers can
+ * be received from the physical port or from another virtio device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue. This function works for mergeable RX.
+ */
+static inline uint32_t __attribute__((always_inline))
+virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
+       uint32_t count)
+{
+       struct vhost_virtqueue *vq;
+       uint32_t pkt_idx = 0, entry_success = 0;
+       uint32_t retry = 0;
+       uint16_t avail_idx, res_cur_idx;
+       uint16_t res_base_idx, res_end_idx;
+       uint8_t success = 0;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
+               dev->device_fh);
+       vq = dev->virtqueue[VIRTIO_RXQ];
+       count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
+
+       if (count == 0)
+               return 0;
+
+       for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+               uint32_t secure_len = 0;
+               uint16_t need_cnt;
+               uint32_t vec_idx = 0;
+               uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
+               uint16_t i, id;
+
+               do {
+                       /*
+                        * As many data cores may want access to available
+                        * buffers, they need to be reserved.
+                        */
+                       res_base_idx = vq->last_used_idx_res;
+                       res_cur_idx = res_base_idx;
+
+                       do {
+                               avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+                               if (unlikely(res_cur_idx == avail_idx)) {
+                                       /*
+                                        * If retry is enabled and the queue is
+                                        * full then we wait and retry to avoid
+                                        * packet loss.
+                                        */
+                                       if (enable_retry) {
+                                               uint8_t cont = 0;
+                                               for (retry = 0; retry < burst_rx_retry_num; retry++) {
+                                                       rte_delay_us(burst_rx_delay_time);
+                                                       avail_idx =
+                                                               *((volatile uint16_t *)&vq->avail->idx);
+                                                       if (likely(res_cur_idx != avail_idx)) {
+                                                               cont = 1;
+                                                               break;
+                                                       }
+                                               }
+                                               if (cont == 1)
+                                                       continue;
+                                       }
+
+                                       LOG_DEBUG(VHOST_DATA,
+                                               "(%"PRIu64") Failed "
+                                               "to get enough desc from "
+                                               "vring\n",
+                                               dev->device_fh);
+                                       return pkt_idx;
+                               } else {
+                                       uint16_t wrapped_idx =
+                                               (res_cur_idx) & (vq->size - 1);
+                                       uint32_t idx =
+                                               vq->avail->ring[wrapped_idx];
+                                       uint8_t next_desc;
+
+                                       do {
+                                               next_desc = 0;
+                                               secure_len += vq->desc[idx].len;
+                                               if (vq->desc[idx].flags &
+                                                       VRING_DESC_F_NEXT) {
+                                                       idx = vq->desc[idx].next;
+                                                       next_desc = 1;
+                                               }
+                                       } while (next_desc);
+
+                                       res_cur_idx++;
+                               }
+                       } while (pkt_len > secure_len);
+
+                       /* vq->last_used_idx_res is atomically updated. */
+                       success = rte_atomic16_cmpset(&vq->last_used_idx_res,
+                                                       res_base_idx,
+                                                       res_cur_idx);
+               } while (success == 0);
+
+               id = res_base_idx;
+               need_cnt = res_cur_idx - res_base_idx;
+
+               for (i = 0; i < need_cnt; i++, id++) {
+                       uint16_t wrapped_idx = id & (vq->size - 1);
+                       uint32_t idx = vq->avail->ring[wrapped_idx];
+                       uint8_t next_desc;
+                       do {
+                               next_desc = 0;
+                               vq->buf_vec[vec_idx].buf_addr =
+                                       vq->desc[idx].addr;
+                               vq->buf_vec[vec_idx].buf_len =
+                                       vq->desc[idx].len;
+                               vq->buf_vec[vec_idx].desc_idx = idx;
+                               vec_idx++;
+
+                               if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
+                                       idx = vq->desc[idx].next;
+                                       next_desc = 1;
+                               }
+                       } while (next_desc);
+               }
+
+               res_end_idx = res_cur_idx;
+
+               entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
+                       res_end_idx, pkts[pkt_idx]);
+
+               rte_compiler_barrier();
+
+               /*
+                * Wait until it's our turn to add our buffer
+                * to the used ring.
+                */
+               while (unlikely(vq->last_used_idx != res_base_idx))
+                       rte_pause();
+
+               *(volatile uint16_t *)&vq->used->idx += entry_success;
+               vq->last_used_idx = res_end_idx;
+
+               /* Kick the guest if necessary. */
+               if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
+                       eventfd_write((int)vq->kickfd, 1);
+       }
+
+       return count;
+}
+
 /*
  * Compares a packet destination MAC address to a device MAC address.
  */
@@ -1089,7 +1438,7 @@ link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
        int i, ret;
 
        /* Learn MAC address of guest device from packet */
-       pkt_hdr = (struct ether_hdr *)m->pkt.data;
+       pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
        dev_ll = ll_root_used;
 
@@ -1176,7 +1525,7 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
 
-       pkt_hdr = (struct ether_hdr *)m->pkt.data;
+       pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
        /*get the used devices list*/
        dev_ll = ll_root_used;
@@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
                                /*drop the packet if the device is marked for removal*/
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
                        } else {
+                               uint32_t mergeable =
+                                       dev_ll->dev->features &
+                                       (1 << VIRTIO_NET_F_MRG_RXBUF);
+
                                /*send the packet to the local virtio device*/
-                               ret = virtio_dev_rx(dev_ll->dev, &m, 1);
+                               if (likely(mergeable == 0))
+                                       ret = virtio_dev_rx(dev_ll->dev, &m, 1);
+                               else
+                                       ret = virtio_dev_merge_rx(dev_ll->dev,
+                                               &m, 1);
+
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
@@ -1231,11 +1589,11 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
        struct mbuf_table *tx_q;
        struct vlan_ethhdr *vlan_hdr;
        struct rte_mbuf **m_table;
-       struct rte_mbuf *mbuf;
+       struct rte_mbuf *mbuf, *prev;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net_data_ll *dev_ll = ll_root_used;
-       struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
+       struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 
        /*check if destination is local VM*/
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
@@ -1284,26 +1642,54 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *
        /* Allocate an mbuf and populate the structure. */
        mbuf = rte_pktmbuf_alloc(mbuf_pool);
        if (unlikely(mbuf == NULL)) {
-               RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
+               RTE_LOG(ERR, VHOST_DATA,
+                       "Failed to allocate memory for mbuf.\n");
                return;
        }
 
-       mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
-       mbuf->pkt.pkt_len = mbuf->pkt.data_len;
+       mbuf->data_len = m->data_len + VLAN_HLEN + offset;
+       mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
+       mbuf->nb_segs = m->nb_segs;
 
        /* Copy ethernet header to mbuf. */
-       rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
+       rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+               rte_pktmbuf_mtod(m, const void *),
+               ETH_HLEN);
 
 
        /* Setup vlan header. Bytes need to be re-ordered for network with htons()*/
-       vlan_hdr = (struct vlan_ethhdr *) mbuf->pkt.data;
+       vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
        vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
        vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
        vlan_hdr->h_vlan_TCI = htons(vlan_tag);
 
        /* Copy the remaining packet contents to the mbuf. */
-       rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
-               (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
+       rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
+               (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
+               (m->data_len - ETH_HLEN));
+
+       /* Copy the remaining segments for the whole packet. */
+       prev = mbuf;
+       while (m->next) {
+               /* Allocate an mbuf and populate the structure. */
+               struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
+               if (unlikely(next_mbuf == NULL)) {
+                       rte_pktmbuf_free(mbuf);
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       return;
+               }
+
+               m = m->next;
+               prev->next = next_mbuf;
+               prev = next_mbuf;
+               next_mbuf->data_len = m->data_len;
+
+               /* Copy data to next mbuf. */
+               rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
+                       rte_pktmbuf_mtod(m, const void *), m->data_len);
+       }
+
        tx_q->m_table[len] = mbuf;
        len++;
        if (enable_stats) {
@@ -1393,8 +1779,9 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
                vq->used->ring[used_idx].len = 0;
 
                /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
-               m.pkt.data_len = desc->len;
-               m.pkt.data = (void*)(uintptr_t)buff_addr;
+               m.data_len = desc->len;
+               m.pkt_len = desc->len;
+               m.data_off = 0;
 
                PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
 
@@ -1420,6 +1807,227 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
                eventfd_write((int)vq->kickfd, 1);
 }
 
+/* This function works for TX packets with mergeable feature enabled. */
+static inline void __attribute__((always_inline))
+virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
+{
+       struct rte_mbuf *m, *prev;
+       struct vhost_virtqueue *vq;
+       struct vring_desc *desc;
+       uint64_t vb_addr = 0;
+       uint32_t head[MAX_PKT_BURST];
+       uint32_t used_idx;
+       uint32_t i;
+       uint16_t free_entries, entry_success = 0;
+       uint16_t avail_idx;
+       uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
+                       + RTE_PKTMBUF_HEADROOM);
+
+       vq = dev->virtqueue[VIRTIO_TXQ];
+       avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
+
+       /* If there are no available buffers then return. */
+       if (vq->last_used_idx == avail_idx)
+               return;
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
+               dev->device_fh);
+
+       /* Prefetch available ring to retrieve head indexes. */
+       rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
+
+       /*get the number of free entries in the ring*/
+       free_entries = (avail_idx - vq->last_used_idx);
+
+       /* Limit to MAX_PKT_BURST. */
+       free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
+
+       LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
+               dev->device_fh, free_entries);
+       /* Retrieve all of the head indexes first to avoid caching issues. */
+       for (i = 0; i < free_entries; i++)
+               head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
+
+       /* Prefetch descriptor index. */
+       rte_prefetch0(&vq->desc[head[entry_success]]);
+       rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
+
+       while (entry_success < free_entries) {
+               uint32_t vb_avail, vb_offset;
+               uint32_t seg_avail, seg_offset;
+               uint32_t cpy_len;
+               uint32_t seg_num = 0;
+               struct rte_mbuf *cur;
+               uint8_t alloc_err = 0;
+
+               desc = &vq->desc[head[entry_success]];
+
+               /* Discard first buffer as it is the virtio header */
+               desc = &vq->desc[desc->next];
+
+               /* Buffer address translation. */
+               vb_addr = gpa_to_vva(dev, desc->addr);
+               /* Prefetch buffer address. */
+               rte_prefetch0((void *)(uintptr_t)vb_addr);
+
+               used_idx = vq->last_used_idx & (vq->size - 1);
+
+               if (entry_success < (free_entries - 1)) {
+                       /* Prefetch descriptor index. */
+                       rte_prefetch0(&vq->desc[head[entry_success+1]]);
+                       rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
+               }
+
+               /* Update used index buffer information. */
+               vq->used->ring[used_idx].id = head[entry_success];
+               vq->used->ring[used_idx].len = 0;
+
+               vb_offset = 0;
+               vb_avail = desc->len;
+               seg_offset = 0;
+               seg_avail = buf_size;
+               cpy_len = RTE_MIN(vb_avail, seg_avail);
+
+               PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
+
+               /* Allocate an mbuf and populate the structure. */
+               m = rte_pktmbuf_alloc(mbuf_pool);
+               if (unlikely(m == NULL)) {
+                       RTE_LOG(ERR, VHOST_DATA,
+                               "Failed to allocate memory for mbuf.\n");
+                       return;
+               }
+
+               seg_num++;
+               cur = m;
+               prev = m;
+               while (cpy_len != 0) {
+                       rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
+                               (void *)((uintptr_t)(vb_addr + vb_offset)),
+                               cpy_len);
+
+                       seg_offset += cpy_len;
+                       vb_offset += cpy_len;
+                       vb_avail -= cpy_len;
+                       seg_avail -= cpy_len;
+
+                       if (vb_avail != 0) {
+                               /*
+                                * The segment reaches its end,
+                                * while the virtio buffer in TX vring has
+                                * more data to be copied.
+                                */
+                               cur->data_len = seg_offset;
+                               m->pkt_len += seg_offset;
+                               /* Allocate mbuf and populate the structure. */
+                               cur = rte_pktmbuf_alloc(mbuf_pool);
+                               if (unlikely(cur == NULL)) {
+                                       RTE_LOG(ERR, VHOST_DATA, "Failed to "
+                                               "allocate memory for mbuf.\n");
+                                       rte_pktmbuf_free(m);
+                                       alloc_err = 1;
+                                       break;
+                               }
+
+                               seg_num++;
+                               prev->next = cur;
+                               prev = cur;
+                               seg_offset = 0;
+                               seg_avail = buf_size;
+                       } else {
+                               if (desc->flags & VRING_DESC_F_NEXT) {
+                                       /*
+                                        * There are more virtio buffers in
+                                        * same vring entry need to be copied.
+                                        */
+                                       if (seg_avail == 0) {
+                                               /*
+                                                * The current segment hasn't
+                                                * room to accommodate more
+                                                * data.
+                                                */
+                                               cur->data_len = seg_offset;
+                                               m->pkt_len += seg_offset;
+                                               /*
+                                                * Allocate an mbuf and
+                                                * populate the structure.
+                                                */
+                                               cur = rte_pktmbuf_alloc(mbuf_pool);
+                                               if (unlikely(cur == NULL)) {
+                                                       RTE_LOG(ERR,
+                                                               VHOST_DATA,
+                                                               "Failed to "
+                                                               "allocate memory "
+                                                               "for mbuf\n");
+                                                       rte_pktmbuf_free(m);
+                                                       alloc_err = 1;
+                                                       break;
+                                               }
+                                               seg_num++;
+                                               prev->next = cur;
+                                               prev = cur;
+                                               seg_offset = 0;
+                                               seg_avail = buf_size;
+                                       }
+
+                                       desc = &vq->desc[desc->next];
+
+                                       /* Buffer address translation. */
+                                       vb_addr = gpa_to_vva(dev, desc->addr);
+                                       /* Prefetch buffer address. */
+                                       rte_prefetch0((void *)(uintptr_t)vb_addr);
+                                       vb_offset = 0;
+                                       vb_avail = desc->len;
+
+                                       PRINT_PACKET(dev, (uintptr_t)vb_addr,
+                                               desc->len, 0);
+                               } else {
+                                       /* The whole packet completes. */
+                                       cur->data_len = seg_offset;
+                                       m->pkt_len += seg_offset;
+                                       vb_avail = 0;
+                               }
+                       }
+
+                       cpy_len = RTE_MIN(vb_avail, seg_avail);
+               }
+
+               if (unlikely(alloc_err == 1))
+                       break;
+
+               m->nb_segs = seg_num;
+
+               /*
+                * If this is the first received packet we need to learn
+                * the MAC and setup VMDQ
+                */
+               if (dev->ready == DEVICE_MAC_LEARNING) {
+                       if (dev->remove || (link_vmdq(dev, m) == -1)) {
+                               /*
+                                * Discard frame if device is scheduled for
+                                * removal or a duplicate MAC address is found.
+                                */
+                               entry_success = free_entries;
+                               vq->last_used_idx += entry_success;
+                               rte_pktmbuf_free(m);
+                               break;
+                       }
+               }
+
+               virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
+               vq->last_used_idx++;
+               entry_success++;
+               rte_pktmbuf_free(m);
+       }
+
+       rte_compiler_barrier();
+       vq->used->idx += entry_success;
+       /* Kick guest if required. */
+       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
+               eventfd_write((int)vq->kickfd, 1);
+
+}
+
 /*
  * This function is called by each data core. It handles all RX/TX registered with the
  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
@@ -1440,8 +2048,9 @@ switch_worker(__attribute__((unused)) void *arg)
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
+       uint32_t mergeable = 0;
 
-       RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", lcore_id);
+       RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
        lcore_ll = lcore_info[lcore_id].lcore_ll;
        prev_tsc = 0;
 
@@ -1497,6 +2106,8 @@ switch_worker(__attribute__((unused)) void *arg)
                while (dev_ll != NULL) {
                        /*get virtio device ID*/
                        dev = dev_ll->dev;
+                       mergeable =
+                               dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
 
                        if (dev->remove) {
                                dev_ll = dev_ll->next;
@@ -1510,7 +2121,15 @@ switch_worker(__attribute__((unused)) void *arg)
                                        (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
 
                                if (rx_count) {
-                                       ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
+                                       if (likely(mergeable == 0))
+                                               ret_count =
+                                                       virtio_dev_rx(dev,
+                                                       pkts_burst, rx_count);
+                                       else
+                                               ret_count =
+                                                       virtio_dev_merge_rx(dev,
+                                                       pkts_burst, rx_count);
+
                                        if (enable_stats) {
                                                rte_atomic64_add(
                                                &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
@@ -1520,15 +2139,19 @@ switch_worker(__attribute__((unused)) void *arg)
                                        }
                                        while (likely(rx_count)) {
                                                rx_count--;
-                                               rte_pktmbuf_free_seg(pkts_burst[rx_count]);
+                                               rte_pktmbuf_free(pkts_burst[rx_count]);
                                        }
 
                                }
                        }
 
-                       if (!dev->remove)
+                       if (!dev->remove) {
                                /*Handle guest TX*/
-                               virtio_dev_tx(dev, mbuf_pool);
+                               if (likely(mergeable == 0))
+                                       virtio_dev_tx(dev, mbuf_pool);
+                               else
+                                       virtio_dev_merge_tx(dev, mbuf_pool);
+                       }
 
                        /*move to the next device in the list*/
                        dev_ll = dev_ll->next;
@@ -1713,9 +2336,9 @@ attach_rxmbuf_zcp(struct virtio_net *dev)
        }
 
        mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
-       mbuf->pkt.data = (void *)(uintptr_t)(buff_addr);
+       mbuf->data_off = RTE_PKTMBUF_HEADROOM;
        mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
-       mbuf->pkt.data_len = desc->len;
+       mbuf->data_len = desc->len;
        MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
 
        LOG_DEBUG(VHOST_DATA,
@@ -1750,9 +2373,9 @@ static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
 
        buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
                        RTE_PKTMBUF_HEADROOM : m->buf_len;
-       m->pkt.data = (char *) m->buf_addr + buf_ofs;
+       m->data_off = buf_ofs;
 
-       m->pkt.data_len = 0;
+       m->data_len = 0;
 }
 
 /*
@@ -1984,7 +2607,7 @@ virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
        unsigned len, ret, offset = 0;
        struct vpool *vpool;
        struct virtio_net_data_ll *dev_ll = ll_root_used;
-       struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
+       struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
        uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
 
        /*Add packet to the port tx queue*/
@@ -2055,24 +2678,24 @@ virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
                }
        }
 
-       mbuf->pkt.nb_segs = m->pkt.nb_segs;
-       mbuf->pkt.next = m->pkt.next;
-       mbuf->pkt.data_len = m->pkt.data_len + offset;
-       mbuf->pkt.pkt_len = mbuf->pkt.data_len;
+       mbuf->nb_segs = m->nb_segs;
+       mbuf->next = m->next;
+       mbuf->data_len = m->data_len + offset;
+       mbuf->pkt_len = mbuf->data_len;
        if (unlikely(need_copy)) {
                /* Copy the packet contents to the mbuf. */
-               rte_memcpy((void *)((uint8_t *)mbuf->pkt.data),
-                       (const void *) ((uint8_t *)m->pkt.data),
-                       m->pkt.data_len);
+               rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+                       rte_pktmbuf_mtod(m, void *),
+                       m->data_len);
        } else {
-               mbuf->pkt.data = m->pkt.data;
+               mbuf->data_off = m->data_off;
                mbuf->buf_physaddr = m->buf_physaddr;
                mbuf->buf_addr = m->buf_addr;
        }
        mbuf->ol_flags = PKT_TX_VLAN_PKT;
-       mbuf->pkt.vlan_macip.f.vlan_tci = vlan_tag;
-       mbuf->pkt.vlan_macip.f.l2_len = sizeof(struct ether_hdr);
-       mbuf->pkt.vlan_macip.f.l3_len = sizeof(struct ipv4_hdr);
+       mbuf->vlan_tci = vlan_tag;
+       mbuf->l2_len = sizeof(struct ether_hdr);
+       mbuf->l3_len = sizeof(struct ipv4_hdr);
        MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
 
        tx_q->m_table[len] = mbuf;
@@ -2081,8 +2704,8 @@ virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
        LOG_DEBUG(VHOST_DATA,
                "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
                dev->device_fh,
-               mbuf->pkt.nb_segs,
-               (mbuf->pkt.next == NULL) ? "null" : "non-null");
+               mbuf->nb_segs,
+               (mbuf->next == NULL) ? "null" : "non-null");
 
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
@@ -2196,11 +2819,11 @@ virtio_dev_tx_zcp(struct virtio_net *dev)
                 * Setup dummy mbuf. This is copied to a real mbuf if
                 * transmitted out the physical port.
                 */
-               m.pkt.data_len = desc->len;
-               m.pkt.nb_segs = 1;
-               m.pkt.next = NULL;
-               m.pkt.data = (void *)(uintptr_t)buff_addr;
-               m.buf_addr = m.pkt.data;
+               m.data_len = desc->len;
+               m.nb_segs = 1;
+               m.next = NULL;
+               m.data_off = 0;
+               m.buf_addr = (void *)(uintptr_t)buff_addr;
                m.buf_physaddr = phys_addr;
 
                /*
@@ -2992,9 +3615,9 @@ MAIN(int argc, char *argv[])
                        + num_switching_cores * MAX_PKT_BURST;
 
                for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
-                       rte_snprintf(pool_name, sizeof(pool_name),
+                       snprintf(pool_name, sizeof(pool_name),
                                "rxmbuf_pool_%u", queue_id);
-                       rte_snprintf(ring_name, sizeof(ring_name),
+                       snprintf(ring_name, sizeof(ring_name),
                                "rxmbuf_ring_%u", queue_id);
                        setup_mempool_tbl(rte_socket_id(), queue_id,
                                pool_name, ring_name, nb_mbuf);
@@ -3005,9 +3628,9 @@ MAIN(int argc, char *argv[])
                                + num_switching_cores * MAX_PKT_BURST;
 
                for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
-                       rte_snprintf(pool_name, sizeof(pool_name),
+                       snprintf(pool_name, sizeof(pool_name),
                                "txmbuf_pool_%u", queue_id);
-                       rte_snprintf(ring_name, sizeof(ring_name),
+                       snprintf(ring_name, sizeof(ring_name),
                                "txmbuf_ring_%u", queue_id);
                        setup_mempool_tbl(rte_socket_id(),
                                (queue_id + MAX_QUEUES),