doc: add Meson coding style to contributors guide
[dpdk.git] / lib / librte_vhost / virtio_net.c
index 0307956..ff39878 100644 (file)
@@ -117,31 +117,6 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
                sizeof(vq->used->idx));
 }
 
-static __rte_always_inline void
-async_flush_shadow_used_ring_split(struct virtio_net *dev,
-       struct vhost_virtqueue *vq)
-{
-       uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
-
-       if (used_idx + vq->shadow_used_idx <= vq->size) {
-               do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
-                                         vq->shadow_used_idx);
-       } else {
-               uint16_t size;
-
-               /* update used ring interval [used_idx, vq->size] */
-               size = vq->size - used_idx;
-               do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
-
-               /* update the left half used ring interval [0, left_size] */
-               do_flush_shadow_used_ring_split(dev, vq, 0, size,
-                                         vq->shadow_used_idx - size);
-       }
-
-       vq->last_used_idx += vq->shadow_used_idx;
-       vq->shadow_used_idx = 0;
-}
-
 static __rte_always_inline void
 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
                         uint16_t desc_idx, uint32_t len)
@@ -171,7 +146,8 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
                        used_idx -= vq->size;
        }
 
-       rte_smp_wmb();
+       /* The ordering for storing desc flags needs to be enforced. */
+       rte_atomic_thread_fence(__ATOMIC_RELEASE);
 
        for (i = 0; i < vq->shadow_used_idx; i++) {
                uint16_t flags;
@@ -222,8 +198,9 @@ vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
        struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
 
        vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
-       rte_smp_wmb();
-       vq->desc_packed[vq->shadow_last_used_idx].flags = used_elem->flags;
+       /* desc flags is the synchronization point for virtio packed vring */
+       __atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
+                        used_elem->flags, __ATOMIC_RELEASE);
 
        vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
                                   sizeof(struct vring_packed_desc),
@@ -253,7 +230,7 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
                vq->desc_packed[vq->last_used_idx + i].len = lens[i];
        }
 
-       rte_smp_wmb();
+       rte_atomic_thread_fence(__ATOMIC_RELEASE);
 
        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
                vq->desc_packed[vq->last_used_idx + i].flags = flags;
@@ -312,7 +289,7 @@ vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
                vq->desc_packed[vq->last_used_idx + i].len = 0;
        }
 
-       rte_smp_wmb();
+       rte_atomic_thread_fence(__ATOMIC_RELEASE);
        vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
                vq->desc_packed[vq->last_used_idx + i].flags = flags;
 
@@ -571,10 +548,11 @@ fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
                        return -1;
                }
 
-               len += descs[idx].len;
+               dlen = descs[idx].len;
+               len += dlen;
 
                if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
-                                               descs[idx].addr, descs[idx].len,
+                                               descs[idx].addr, dlen,
                                                perm))) {
                        free_ind_table(idesc);
                        return -1;
@@ -691,9 +669,10 @@ fill_vec_buf_packed_indirect(struct virtio_net *dev,
                        return -1;
                }
 
-               *len += descs[i].len;
+               dlen = descs[i].len;
+               *len += dlen;
                if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
-                                               descs[i].addr, descs[i].len,
+                                               descs[i].addr, dlen,
                                                perm)))
                        return -1;
        }
@@ -714,6 +693,7 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
        bool wrap_counter = vq->avail_wrap_counter;
        struct vring_packed_desc *descs = vq->desc_packed;
        uint16_t vec_id = *vec_idx;
+       uint64_t dlen;
 
        if (avail_idx < vq->last_avail_idx)
                wrap_counter ^= 1;
@@ -746,11 +726,12 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
                                                        len, perm) < 0))
                                return -1;
                } else {
-                       *len += descs[avail_idx].len;
+                       dlen = descs[avail_idx].len;
+                       *len += dlen;
 
                        if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
                                                        descs[avail_idx].addr,
-                                                       descs[avail_idx].len,
+                                                       dlen,
                                                        perm)))
                                return -1;
                }
@@ -831,9 +812,10 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
        hdr_mbuf = m;
        hdr_addr = buf_addr;
-       if (unlikely(buf_len < dev->vhost_hlen))
+       if (unlikely(buf_len < dev->vhost_hlen)) {
+               memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
                hdr = &tmp_hdr;
-       else
+       } else
                hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
 
        VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
@@ -1004,9 +986,10 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
        hdr_mbuf = m;
        hdr_addr = buf_addr;
-       if (unlikely(buf_len < dev->vhost_hlen))
+       if (unlikely(buf_len < dev->vhost_hlen)) {
+               memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
                hdr = &tmp_hdr;
-       else
+       } else
                hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
 
        VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
@@ -1128,8 +1111,12 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
        }
 
 out:
-       async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
-       async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+       if (tlen) {
+               async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+               async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+       } else {
+               src_it->count = 0;
+       }
 
        return error;
 }
@@ -1281,8 +1268,6 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev,
                        return -1;
        }
 
-       rte_smp_rmb();
-
        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
                lens[i] = descs[avail_idx + i].len;
 
@@ -1343,7 +1328,6 @@ virtio_dev_rx_single_packed(struct virtio_net *dev,
        struct buf_vector buf_vec[BUF_VECTOR_MAX];
        uint16_t nr_descs = 0;
 
-       rte_smp_rmb();
        if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
                                                 &nr_descs) < 0)) {
                VHOST_LOG_DATA(DEBUG,
@@ -1418,13 +1402,13 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 
        rte_spinlock_lock(&vq->access_lock);
 
-       if (unlikely(vq->enabled == 0))
+       if (unlikely(!vq->enabled))
                goto out_access_unlock;
 
        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
                vhost_user_iotlb_rd_lock(vq);
 
-       if (unlikely(vq->access_ok == 0))
+       if (unlikely(!vq->access_ok))
                if (unlikely(vring_translate(dev, vq) < 0))
                        goto out;
 
@@ -1477,7 +1461,8 @@ virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
 static __rte_noinline uint32_t
 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
        struct vhost_virtqueue *vq, uint16_t queue_id,
-       struct rte_mbuf **pkts, uint32_t count)
+       struct rte_mbuf **pkts, uint32_t count,
+       struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
        uint32_t pkt_idx = 0, pkt_burst_idx = 0;
        uint16_t num_buffers;
@@ -1491,19 +1476,20 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
        struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
        struct rte_vhost_iov_iter *src_it = it_pool;
        struct rte_vhost_iov_iter *dst_it = it_pool + 1;
-       uint16_t n_free_slot, slot_idx;
-       uint16_t pkt_err = 0;
+       uint16_t slot_idx = 0;
        uint16_t segs_await = 0;
        struct async_inflight_info *pkts_info = vq->async_pkts_info;
-       int n_pkts = 0;
-
-       avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
+       uint32_t n_pkts = 0, pkt_err = 0;
+       uint32_t num_async_pkts = 0, num_done_pkts = 0;
+       struct {
+               uint16_t pkt_idx;
+               uint16_t last_avail_idx;
+       } async_pkts_log[MAX_PKT_BURST];
 
        /*
-        * The ordering between avail index and
-        * desc reads needs to be enforced.
+        * The ordering between avail index and desc reads needs to be enforced.
         */
-       rte_smp_rmb();
+       avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
 
        rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
 
@@ -1532,34 +1518,61 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
                        break;
                }
 
-               slot_idx = (vq->async_pkts_idx + pkt_idx) & (vq->size - 1);
+               slot_idx = (vq->async_pkts_idx + num_async_pkts) &
+                       (vq->size - 1);
                if (src_it->count) {
-                       async_fill_desc(&tdes[pkt_burst_idx], src_it, dst_it);
-                       pkt_burst_idx++;
+                       uint16_t from, to;
+
+                       async_fill_desc(&tdes[pkt_burst_idx++], src_it, dst_it);
                        pkts_info[slot_idx].descs = num_buffers;
-                       pkts_info[slot_idx].segs = src_it->nr_segs;
+                       pkts_info[slot_idx].mbuf = pkts[pkt_idx];
+                       async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
+                       async_pkts_log[num_async_pkts++].last_avail_idx =
+                               vq->last_avail_idx;
                        src_iovec += src_it->nr_segs;
                        dst_iovec += dst_it->nr_segs;
                        src_it += 2;
                        dst_it += 2;
                        segs_await += src_it->nr_segs;
-               } else {
-                       pkts_info[slot_idx].info = num_buffers;
-                       vq->async_pkts_inflight_n++;
-               }
+
+                       /**
+                        * recover shadow used ring and keep DMA-occupied
+                        * descriptors.
+                        */
+                       from = vq->shadow_used_idx - num_buffers;
+                       to = vq->async_desc_idx & (vq->size - 1);
+                       if (num_buffers + to <= vq->size) {
+                               rte_memcpy(&vq->async_descs_split[to],
+                                               &vq->shadow_used_split[from],
+                                               num_buffers *
+                                               sizeof(struct vring_used_elem));
+                       } else {
+                               int size = vq->size - to;
+
+                               rte_memcpy(&vq->async_descs_split[to],
+                                               &vq->shadow_used_split[from],
+                                               size *
+                                               sizeof(struct vring_used_elem));
+                               rte_memcpy(vq->async_descs_split,
+                                               &vq->shadow_used_split[from +
+                                               size], (num_buffers - size) *
+                                          sizeof(struct vring_used_elem));
+                       }
+                       vq->async_desc_idx += num_buffers;
+                       vq->shadow_used_idx -= num_buffers;
+               } else
+                       comp_pkts[num_done_pkts++] = pkts[pkt_idx];
 
                vq->last_avail_idx += num_buffers;
 
                /*
                 * conditions to trigger async device transfer:
                 * - buffered packet number reaches transfer threshold
-                * - this is the last packet in the burst enqueue
                 * - unused async iov number is less than max vhost vector
                 */
-               if (pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
-                       (pkt_idx == count - 1 && pkt_burst_idx) ||
-                       (VHOST_MAX_ASYNC_VEC / 2 - segs_await <
-                       BUF_VECTOR_MAX)) {
+               if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
+                       ((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
+                       BUF_VECTOR_MAX))) {
                        n_pkts = vq->async_ops.transfer_data(dev->vid,
                                        queue_id, tdes, 0, pkt_burst_idx);
                        src_iovec = vec_pool;
@@ -1569,7 +1582,7 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
                        segs_await = 0;
                        vq->async_pkts_inflight_n += n_pkts;
 
-                       if (unlikely(n_pkts < (int)pkt_burst_idx)) {
+                       if (unlikely(n_pkts < pkt_burst_idx)) {
                                /*
                                 * log error packets number here and do actual
                                 * error processing when applications poll
@@ -1589,38 +1602,39 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev,
                                queue_id, tdes, 0, pkt_burst_idx);
                vq->async_pkts_inflight_n += n_pkts;
 
-               if (unlikely(n_pkts < (int)pkt_burst_idx))
+               if (unlikely(n_pkts < pkt_burst_idx))
                        pkt_err = pkt_burst_idx - n_pkts;
        }
 
        do_data_copy_enqueue(dev, vq);
 
-       while (unlikely(pkt_err && pkt_idx)) {
-               if (pkts_info[slot_idx].segs)
-                       pkt_err--;
-               vq->last_avail_idx -= pkts_info[slot_idx].descs;
-               vq->shadow_used_idx -= pkts_info[slot_idx].descs;
-               vq->async_pkts_inflight_n--;
-               slot_idx = (slot_idx - 1) & (vq->size - 1);
-               pkt_idx--;
-       }
-
-       n_free_slot = vq->size - vq->async_pkts_idx;
-       if (n_free_slot > pkt_idx) {
-               rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
-                       pkts, pkt_idx * sizeof(uintptr_t));
-               vq->async_pkts_idx += pkt_idx;
-       } else {
-               rte_memcpy(&vq->async_pkts_pending[vq->async_pkts_idx],
-                       pkts, n_free_slot * sizeof(uintptr_t));
-               rte_memcpy(&vq->async_pkts_pending[0],
-                       &pkts[n_free_slot],
-                       (pkt_idx - n_free_slot) * sizeof(uintptr_t));
-               vq->async_pkts_idx = pkt_idx - n_free_slot;
+       if (unlikely(pkt_err)) {
+               uint16_t num_descs = 0;
+
+               num_async_pkts -= pkt_err;
+               /* calculate the sum of descriptors of DMA-error packets. */
+               while (pkt_err-- > 0) {
+                       num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
+                       slot_idx--;
+               }
+               vq->async_desc_idx -= num_descs;
+               /* recover shadow used ring and available ring */
+               vq->shadow_used_idx -= (vq->last_avail_idx -
+                               async_pkts_log[num_async_pkts].last_avail_idx -
+                               num_descs);
+               vq->last_avail_idx =
+                       async_pkts_log[num_async_pkts].last_avail_idx;
+               pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
+               num_done_pkts = pkt_idx - num_async_pkts;
        }
 
-       if (likely(vq->shadow_used_idx))
-               async_flush_shadow_used_ring_split(dev, vq);
+       vq->async_pkts_idx += num_async_pkts;
+       *comp_count = num_done_pkts;
+
+       if (likely(vq->shadow_used_idx)) {
+               flush_shadow_used_ring_split(dev, vq);
+               vhost_vring_call_split(dev, vq);
+       }
 
        return pkt_idx;
 }
@@ -1632,8 +1646,8 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
        struct vhost_virtqueue *vq;
        uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0;
        uint16_t start_idx, pkts_idx, vq_size;
-       uint16_t n_inflight;
        struct async_inflight_info *pkts_info;
+       uint16_t from, i;
 
        if (!dev)
                return 0;
@@ -1655,8 +1669,7 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
        rte_spinlock_lock(&vq->access_lock);
 
-       n_inflight = vq->async_pkts_inflight_n;
-       pkts_idx = vq->async_pkts_idx;
+       pkts_idx = vq->async_pkts_idx & (vq->size - 1);
        pkts_info = vq->async_pkts_info;
        vq_size = vq->size;
        start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
@@ -1667,42 +1680,61 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
                        queue_id, 0, count - vq->async_last_pkts_n);
        n_pkts_cpl += vq->async_last_pkts_n;
 
-       rte_smp_wmb();
-
-       while (likely((n_pkts_put < count) && n_inflight)) {
-               uint16_t info_idx = (start_idx + n_pkts_put) & (vq_size - 1);
-               if (n_pkts_cpl && pkts_info[info_idx].segs)
-                       n_pkts_cpl--;
-               else if (!n_pkts_cpl && pkts_info[info_idx].segs)
-                       break;
-               n_pkts_put++;
-               n_inflight--;
-               n_descs += pkts_info[info_idx].descs;
-       }
-
-       vq->async_last_pkts_n = n_pkts_cpl;
+       n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+       if (unlikely(n_pkts_put == 0)) {
+               vq->async_last_pkts_n = n_pkts_cpl;
+               goto done;
+       }
+
+       for (i = 0; i < n_pkts_put; i++) {
+               from = (start_idx + i) & (vq_size - 1);
+               n_descs += pkts_info[from].descs;
+               pkts[i] = pkts_info[from].mbuf;
+       }
+       vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+       vq->async_pkts_inflight_n -= n_pkts_put;
+
+       if (likely(vq->enabled && vq->access_ok)) {
+               uint16_t nr_left = n_descs;
+               uint16_t nr_copy;
+               uint16_t to;
+
+               /* write back completed descriptors to used ring */
+               do {
+                       from = vq->last_async_desc_idx & (vq->size - 1);
+                       nr_copy = nr_left + from <= vq->size ? nr_left :
+                               vq->size - from;
+                       to = vq->last_used_idx & (vq->size - 1);
+
+                       if (to + nr_copy <= vq->size) {
+                               rte_memcpy(&vq->used->ring[to],
+                                               &vq->async_descs_split[from],
+                                               nr_copy *
+                                               sizeof(struct vring_used_elem));
+                       } else {
+                               uint16_t size = vq->size - to;
+
+                               rte_memcpy(&vq->used->ring[to],
+                                               &vq->async_descs_split[from],
+                                               size *
+                                               sizeof(struct vring_used_elem));
+                               rte_memcpy(vq->used->ring,
+                                               &vq->async_descs_split[from +
+                                               size], (nr_copy - size) *
+                                               sizeof(struct vring_used_elem));
+                       }
 
-       if (n_pkts_put) {
-               vq->async_pkts_inflight_n = n_inflight;
-               if (likely(vq->enabled && vq->access_ok)) {
-                       __atomic_add_fetch(&vq->used->idx,
-                                       n_descs, __ATOMIC_RELEASE);
-                       vhost_vring_call_split(dev, vq);
-               }
+                       vq->last_async_desc_idx += nr_copy;
+                       vq->last_used_idx += nr_copy;
+                       nr_left -= nr_copy;
+               } while (nr_left > 0);
 
-               if (start_idx + n_pkts_put <= vq_size) {
-                       rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-                               n_pkts_put * sizeof(uintptr_t));
-               } else {
-                       rte_memcpy(pkts, &vq->async_pkts_pending[start_idx],
-                               (vq_size - start_idx) * sizeof(uintptr_t));
-                       rte_memcpy(&pkts[vq_size - start_idx],
-                               vq->async_pkts_pending,
-                               (n_pkts_put + start_idx - vq_size) *
-                               sizeof(uintptr_t));
-               }
-       }
+               __atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
+               vhost_vring_call_split(dev, vq);
+       } else
+               vq->last_async_desc_idx += n_descs;
 
+done:
        rte_spinlock_unlock(&vq->access_lock);
 
        return n_pkts_put;
@@ -1710,7 +1742,8 @@ uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
 
 static __rte_always_inline uint32_t
 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
-       struct rte_mbuf **pkts, uint32_t count)
+       struct rte_mbuf **pkts, uint32_t count,
+       struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
        struct vhost_virtqueue *vq;
        uint32_t nb_tx = 0;
@@ -1726,13 +1759,13 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
 
        rte_spinlock_lock(&vq->access_lock);
 
-       if (unlikely(vq->enabled == 0 || !vq->async_registered))
+       if (unlikely(!vq->enabled || !vq->async_registered))
                goto out_access_unlock;
 
        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
                vhost_user_iotlb_rd_lock(vq);
 
-       if (unlikely(vq->access_ok == 0))
+       if (unlikely(!vq->access_ok))
                if (unlikely(vring_translate(dev, vq) < 0))
                        goto out;
 
@@ -1745,7 +1778,8 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
                nb_tx = 0;
        else
                nb_tx = virtio_dev_rx_async_submit_split(dev,
-                               vq, queue_id, pkts, count);
+                               vq, queue_id, pkts, count, comp_pkts,
+                               comp_count);
 
 out:
        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@@ -1759,10 +1793,12 @@ out_access_unlock:
 
 uint16_t
 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
-               struct rte_mbuf **pkts, uint16_t count)
+               struct rte_mbuf **pkts, uint16_t count,
+               struct rte_mbuf **comp_pkts, uint32_t *comp_count)
 {
        struct virtio_net *dev = get_device(vid);
 
+       *comp_count = 0;
        if (!dev)
                return 0;
 
@@ -1773,7 +1809,8 @@ rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
                return 0;
        }
 
-       return virtio_dev_rx_async_submit(dev, queue_id, pkts, count);
+       return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, comp_pkts,
+                       comp_count);
 }
 
 static inline bool
@@ -2232,7 +2269,6 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
 {
        bool wrap = vq->avail_wrap_counter;
        struct vring_packed_desc *descs = vq->desc_packed;
-       struct virtio_net_hdr *hdr;
        uint64_t lens[PACKED_BATCH_SIZE];
        uint64_t buf_lens[PACKED_BATCH_SIZE];
        uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
@@ -2251,7 +2287,7 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
                        return -1;
        }
 
-       rte_smp_rmb();
+       rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
 
        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
                lens[i] = descs[avail_idx + i].len;
@@ -2284,18 +2320,11 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
        }
 
        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-               pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset;
+               pkts[i]->pkt_len = lens[i] - buf_offset;
                pkts[i]->data_len = pkts[i]->pkt_len;
                ids[i] = descs[avail_idx + i].id;
        }
 
-       if (virtio_net_with_host_offload(dev)) {
-               vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-                       hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
-                       vhost_dequeue_offload(hdr, pkts[i]);
-               }
-       }
-
        return 0;
 
 free_buf:
@@ -2313,6 +2342,7 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev,
 {
        uint16_t avail_idx = vq->last_avail_idx;
        uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+       struct virtio_net_hdr *hdr;
        uintptr_t desc_addrs[PACKED_BATCH_SIZE];
        uint16_t ids[PACKED_BATCH_SIZE];
        uint16_t i;
@@ -2329,6 +2359,13 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev,
                           (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
                           pkts[i]->pkt_len);
 
+       if (virtio_net_with_host_offload(dev)) {
+               vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+                       hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
+                       vhost_dequeue_offload(hdr, pkts[i]);
+               }
+       }
+
        if (virtio_net_is_inorder(dev))
                vhost_shadow_dequeue_batch_packed_inorder(vq,
                        ids[PACKED_BATCH_SIZE - 1]);
@@ -2487,7 +2524,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
        if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
                return 0;
 
-       if (unlikely(vq->enabled == 0)) {
+       if (unlikely(!vq->enabled)) {
                count = 0;
                goto out_access_unlock;
        }
@@ -2495,7 +2532,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
        if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
                vhost_user_iotlb_rd_lock(vq);
 
-       if (unlikely(vq->access_ok == 0))
+       if (unlikely(!vq->access_ok))
                if (unlikely(vring_translate(dev, vq) < 0)) {
                        count = 0;
                        goto out;