This reconnect option is enabled by default. However, it can be turned off
by setting this flag.
- - ``RTE_VHOST_USER_DEQUEUE_ZERO_COPY``
-
- Dequeue zero copy will be enabled when this flag is set. It is disabled by
- default.
-
- There are some truths (including limitations) you might want to know while
- setting this flag:
-
- * zero copy is not good for small packets (typically for packet size below
- 512).
-
- * zero copy is really good for VM2VM case. For iperf between two VMs, the
- boost could be above 70% (when TSO is enabled).
-
- * For zero copy in VM2NIC case, guest Tx used vring may be starved if the
- PMD driver consume the mbuf but not release them timely.
-
- For example, i40e driver has an optimization to maximum NIC pipeline which
- postpones returning transmitted mbuf until only tx_free_threshold free
- descs left. The virtio TX used ring will be starved if the formula
- (num_i40e_tx_desc - num_virtio_tx_desc > tx_free_threshold) is true, since
- i40e will not return back mbuf.
-
- A performance tip for tuning zero copy in VM2NIC case is to adjust the
- frequency of mbuf free (i.e. adjust tx_free_threshold of i40e driver) to
- balance consumer and producer.
-
- * Guest memory should be backended with huge pages to achieve better
- performance. Using 1G page size is the best.
-
- When dequeue zero copy is enabled, the guest phys address and host phys
- address mapping has to be established. Using non-huge pages means far
- more page segments. To make it simple, DPDK vhost does a linear search
- of those segments, thus the fewer the segments, the quicker we will get
- the mapping. NOTE: we may speed it by using tree searching in future.
-
- * zero copy can not work when using vfio-pci with iommu mode currently, this
- is because we don't setup iommu dma mapping for guest memory. If you have
- to use vfio-pci driver, please insert vfio-pci kernel module in noiommu
- mode.
-
- * The consumer of zero copy mbufs should consume these mbufs as soon as
- possible, otherwise it may block the operations in vhost.
-
- ``RTE_VHOST_USER_IOMMU_SUPPORT``
IOMMU support will be enabled when this flag is set. It is disabled by
* Memory pre-allocation
- For non-zerocopy non-async data path, guest memory pre-allocation is not a
+   For the non-async data path, guest memory pre-allocation is not a
must. This helps to save memory. If users really want the guest memory
to be pre-allocated (e.g., for performance reasons), we can add the
``-mem-prealloc`` option when starting QEMU, or lock all memory at the
vhost side, which forces memory to be allocated when it is mmapped
there; the ``--mlockall`` option in ovs-dpdk is one example (see the
sketch below).
- For async and zerocopy data path, we force the VM memory to be
- pre-allocated at vhost lib when mapping the guest memory; and also we need
- to lock the memory to prevent pages being swapped out to disk.
+   For the async data path, we force the VM memory to be pre-allocated in
+   the vhost library when mapping the guest memory; we also need to lock
+   the memory to prevent pages from being swapped out to disk.
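+
+   As a rough illustration (plain POSIX, not a vhost API; the helper name
+   ``lock_all_memory`` is made up for this sketch), locking the process
+   memory before the guest regions are mmapped forces the pages to be
+   populated and keeps them resident::
+
+      #include <stdio.h>
+      #include <sys/mman.h>
+
+      /* Lock current and future mappings: guest memory mmapped by the
+       * vhost library afterwards is faulted in immediately and cannot
+       * be swapped out.
+       */
+      static int
+      lock_all_memory(void)
+      {
+              if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+                      perror("mlockall");
+                      return -1;
+              }
+              return 0;
+      }
+
+      int
+      main(void)
+      {
+              return lock_all_memory() ? 1 : 0;
+      }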
* Memory sharing
return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
}
-/*
- * Reclaim all the outstanding zmbufs for a virtqueue.
- */
-static void
-drain_zmbuf_list(struct vhost_virtqueue *vq)
-{
- struct zcopy_mbuf *zmbuf, *next;
-
- for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
- zmbuf != NULL; zmbuf = next) {
- next = TAILQ_NEXT(zmbuf, next);
-
- while (!mbuf_is_consumed(zmbuf->mbuf))
- usleep(1000);
-
- TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
- restore_mbuf(zmbuf->mbuf);
- rte_pktmbuf_free(zmbuf->mbuf);
- put_zmbuf(zmbuf);
- vq->nr_zmbuf -= 1;
- }
-}
-
static void
free_mem_region(struct virtio_net *dev)
{
uint32_t i;
struct rte_vhost_mem_region *reg;
- struct vhost_virtqueue *vq;
if (!dev || !dev->mem)
return;
- if (dev->dequeue_zero_copy) {
- for (i = 0; i < dev->nr_vring; i++) {
- vq = dev->virtqueue[i];
- if (vq)
- drain_zmbuf_list(vq);
- }
- }
-
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
if (reg->host_user_addr) {
return RTE_VHOST_MSG_RESULT_ERR;
}
- if (dev->dequeue_zero_copy) {
- vq->nr_zmbuf = 0;
- vq->last_zmbuf_idx = 0;
- vq->zmbuf_size = vq->size;
- if (vq->zmbufs)
- rte_free(vq->zmbufs);
- vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
- sizeof(struct zcopy_mbuf), 0);
- if (vq->zmbufs == NULL) {
- VHOST_LOG_CONFIG(WARNING,
- "failed to allocate mem for zero copy; "
- "zero copy is force disabled\n");
- dev->dequeue_zero_copy = 0;
- }
- TAILQ_INIT(&vq->zmbuf_list);
- }
-
if (vq_is_packed(dev)) {
if (vq->shadow_used_packed)
rte_free(vq->shadow_used_packed);
int oldnode, newnode;
struct virtio_net *old_dev;
struct vhost_virtqueue *old_vq, *vq;
- struct zcopy_mbuf *new_zmbuf;
struct vring_used_elem *new_shadow_used_split;
struct vring_used_elem_packed *new_shadow_used_packed;
struct batch_copy_elem *new_batch_copy_elems;
return dev;
memcpy(vq, old_vq, sizeof(*vq));
- TAILQ_INIT(&vq->zmbuf_list);
-
- if (dev->dequeue_zero_copy) {
- new_zmbuf = rte_malloc_socket(NULL, vq->zmbuf_size *
- sizeof(struct zcopy_mbuf), 0, newnode);
- if (new_zmbuf) {
- rte_free(vq->zmbufs);
- vq->zmbufs = new_zmbuf;
- }
- }
if (vq_is_packed(dev)) {
new_shadow_used_packed = rte_malloc_socket(NULL,
goto err_mmap;
}
- populate = (dev->dequeue_zero_copy || dev->async_copy) ?
- MAP_POPULATE : 0;
+ populate = dev->async_copy ? MAP_POPULATE : 0;
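+ /* Pre-fault the mapping so that the async copy path operates on
+  * already-populated guest memory.
+  */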
mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_SHARED | populate, fd, 0);
reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
mmap_offset;
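+ /* Record the guest-physical to host-physical page mapping that
+  * the async copy path relies on.
+  */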
- if (dev->dequeue_zero_copy || dev->async_copy)
+ if (dev->async_copy)
if (add_guest_pages(dev, reg, alignment) < 0) {
VHOST_LOG_CONFIG(ERR,
"adding guest pages to region %u failed.\n",
return RTE_VHOST_MSG_RESULT_OK;
}
-static void
-free_zmbufs(struct vhost_virtqueue *vq)
-{
- drain_zmbuf_list(vq);
-
- rte_free(vq->zmbufs);
- vq->zmbufs = NULL;
-}
-
/*
* when virtio is stopped, qemu will send us the GET_VRING_BASE message.
*/
vq->signalled_used_valid = false;
- if (dev->dequeue_zero_copy)
- free_zmbufs(vq);
if (vq_is_packed(dev)) {
rte_free(vq->shadow_used_packed);
vq->shadow_used_packed = NULL;
}
}
- /* On disable, rings have to be stopped being processed. */
- if (!enable && dev->dequeue_zero_copy)
- drain_zmbuf_list(dev->virtqueue[index]);
-
dev->virtqueue[index]->enabled = enable;
return RTE_VHOST_MSG_RESULT_OK;
struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
{
uint32_t buf_avail, buf_offset;
- uint64_t buf_addr, buf_iova, buf_len;
+ uint64_t buf_addr, buf_len;
uint32_t mbuf_avail, mbuf_offset;
uint32_t cpy_len;
struct rte_mbuf *cur = m, *prev = m;
int error = 0;
buf_addr = buf_vec[vec_idx].buf_addr;
- buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
buf_offset = dev->vhost_hlen - buf_len;
vec_idx++;
buf_addr = buf_vec[vec_idx].buf_addr;
- buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_avail = buf_len - buf_offset;
} else if (buf_len == dev->vhost_hlen) {
if (unlikely(++vec_idx >= nr_vec))
goto out;
buf_addr = buf_vec[vec_idx].buf_addr;
- buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_offset = 0;
mbuf_offset = 0;
mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
while (1) {
- uint64_t hpa;
-
cpy_len = RTE_MIN(buf_avail, mbuf_avail);
- /*
- * A desc buf might across two host physical pages that are
- * not continuous. In such case (gpa_to_hpa returns 0), data
- * will be copied even though zero copy is enabled.
- */
- if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
- buf_iova + buf_offset, cpy_len)))) {
- cur->data_len = cpy_len;
- cur->data_off = 0;
- cur->buf_addr =
- (void *)(uintptr_t)(buf_addr + buf_offset);
- cur->buf_iova = hpa;
-
- /*
- * In zero copy mode, one mbuf can only reference data
- * for one or partial of one desc buff.
- */
- mbuf_avail = cpy_len;
- } else {
- if (likely(cpy_len > MAX_BATCH_LEN ||
- vq->batch_copy_nb_elems >= vq->size ||
- (hdr && cur == m))) {
- rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
- mbuf_offset),
- (void *)((uintptr_t)(buf_addr +
- buf_offset)),
- cpy_len);
- } else {
- batch_copy[vq->batch_copy_nb_elems].dst =
- rte_pktmbuf_mtod_offset(cur, void *,
- mbuf_offset);
- batch_copy[vq->batch_copy_nb_elems].src =
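+ /* Large copies, a full batch-copy list, or the header-bearing
+  * first mbuf are copied synchronously; small copies are queued
+  * in the batch list and flushed by do_data_copy_dequeue().
+  */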
+ if (likely(cpy_len > MAX_BATCH_LEN ||
+ vq->batch_copy_nb_elems >= vq->size ||
+ (hdr && cur == m))) {
+ rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
+ mbuf_offset),
(void *)((uintptr_t)(buf_addr +
- buf_offset));
- batch_copy[vq->batch_copy_nb_elems].len =
- cpy_len;
- vq->batch_copy_nb_elems++;
- }
+ buf_offset)), cpy_len);
+ } else {
+ batch_copy[vq->batch_copy_nb_elems].dst =
+ rte_pktmbuf_mtod_offset(cur, void *,
+ mbuf_offset);
+ batch_copy[vq->batch_copy_nb_elems].src =
+ (void *)((uintptr_t)(buf_addr + buf_offset));
+ batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+ vq->batch_copy_nb_elems++;
}
mbuf_avail -= cpy_len;
break;
buf_addr = buf_vec[vec_idx].buf_addr;
- buf_iova = buf_vec[vec_idx].buf_iova;
buf_len = buf_vec[vec_idx].buf_len;
buf_offset = 0;
error = -1;
goto out;
}
- if (unlikely(dev->dequeue_zero_copy))
- rte_mbuf_refcnt_update(cur, 1);
prev->next = cur;
prev->data_len = mbuf_offset;
return error;
}
-static __rte_always_inline struct zcopy_mbuf *
-get_zmbuf(struct vhost_virtqueue *vq)
-{
- uint16_t i;
- uint16_t last;
- int tries = 0;
-
- /* search [last_zmbuf_idx, zmbuf_size) */
- i = vq->last_zmbuf_idx;
- last = vq->zmbuf_size;
-
-again:
- for (; i < last; i++) {
- if (vq->zmbufs[i].in_use == 0) {
- vq->last_zmbuf_idx = i + 1;
- vq->zmbufs[i].in_use = 1;
- return &vq->zmbufs[i];
- }
- }
-
- tries++;
- if (tries == 1) {
- /* search [0, last_zmbuf_idx) */
- i = 0;
- last = vq->last_zmbuf_idx;
- goto again;
- }
-
- return NULL;
-}
-
static void
virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
{
uint16_t dropped = 0;
static bool allocerr_warned;
- if (unlikely(dev->dequeue_zero_copy)) {
- struct zcopy_mbuf *zmbuf, *next;
-
- for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
- zmbuf != NULL; zmbuf = next) {
- next = TAILQ_NEXT(zmbuf, next);
-
- if (mbuf_is_consumed(zmbuf->mbuf)) {
- update_shadow_used_ring_split(vq,
- zmbuf->desc_idx, 0);
- TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
- restore_mbuf(zmbuf->mbuf);
- rte_pktmbuf_free(zmbuf->mbuf);
- put_zmbuf(zmbuf);
- vq->nr_zmbuf -= 1;
- }
- }
-
- if (likely(vq->shadow_used_idx)) {
- flush_shadow_used_ring_split(dev, vq);
- vhost_vring_call_split(dev, vq);
- }
- }
-
/*
* The ordering between avail index and
* desc reads needs to be enforced.
VHOST_ACCESS_RO) < 0))
break;
- if (likely(dev->dequeue_zero_copy == 0))
- update_shadow_used_ring_split(vq, head_idx, 0);
+ update_shadow_used_ring_split(vq, head_idx, 0);
pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
if (unlikely(pkts[i] == NULL)) {
i++;
break;
}
-
- if (unlikely(dev->dequeue_zero_copy)) {
- struct zcopy_mbuf *zmbuf;
-
- zmbuf = get_zmbuf(vq);
- if (!zmbuf) {
- rte_pktmbuf_free(pkts[i]);
- dropped += 1;
- i++;
- break;
- }
- zmbuf->mbuf = pkts[i];
- zmbuf->desc_idx = head_idx;
-
- /*
- * Pin lock the mbuf; we will check later to see
- * whether the mbuf is freed (when we are the last
- * user) or not. If that's the case, we then could
- * update the used ring safely.
- */
- rte_mbuf_refcnt_update(pkts[i], 1);
-
- vq->nr_zmbuf += 1;
- TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
- }
}
+
vq->last_avail_idx += i;
- if (likely(dev->dequeue_zero_copy == 0)) {
- do_data_copy_dequeue(vq);
- if (unlikely(i < count))
- vq->shadow_used_idx = i;
- if (likely(vq->shadow_used_idx)) {
- flush_shadow_used_ring_split(dev, vq);
- vhost_vring_call_split(dev, vq);
- }
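+ /* Complete the deferred batched copies, then flush the shadow
+  * used ring and notify the guest.
+  */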
+ do_data_copy_dequeue(vq);
+ if (unlikely(i < count))
+ vq->shadow_used_idx = i;
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
}
return (i - dropped);
return ret;
}
-static __rte_always_inline int
-virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev,
- struct vhost_virtqueue *vq,
- struct rte_mempool *mbuf_pool,
- struct rte_mbuf **pkts)
-{
- struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE];
- uintptr_t desc_addrs[PACKED_BATCH_SIZE];
- uint16_t ids[PACKED_BATCH_SIZE];
- uint16_t i;
-
- uint16_t avail_idx = vq->last_avail_idx;
-
- if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
- avail_idx, desc_addrs, ids))
- return -1;
-
- vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
- zmbufs[i] = get_zmbuf(vq);
-
- vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
- if (!zmbufs[i])
- goto free_pkt;
- }
-
- vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
- zmbufs[i]->mbuf = pkts[i];
- zmbufs[i]->desc_idx = ids[i];
- zmbufs[i]->desc_count = 1;
- }
-
- vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
- rte_mbuf_refcnt_update(pkts[i], 1);
-
- vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
- TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next);
-
- vq->nr_zmbuf += PACKED_BATCH_SIZE;
- vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
-
- return 0;
-
-free_pkt:
- vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
- rte_pktmbuf_free(pkts[i]);
-
- return -1;
-}
-
-static __rte_always_inline int
-virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev,
- struct vhost_virtqueue *vq,
- struct rte_mempool *mbuf_pool,
- struct rte_mbuf **pkts)
-{
- uint16_t buf_id, desc_count;
- struct zcopy_mbuf *zmbuf;
-
- if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
- &desc_count))
- return -1;
-
- zmbuf = get_zmbuf(vq);
- if (!zmbuf) {
- rte_pktmbuf_free(*pkts);
- return -1;
- }
- zmbuf->mbuf = *pkts;
- zmbuf->desc_idx = buf_id;
- zmbuf->desc_count = desc_count;
-
- rte_mbuf_refcnt_update(*pkts, 1);
-
- vq->nr_zmbuf += 1;
- TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
-
- vq_inc_last_avail_packed(vq, desc_count);
- return 0;
-}
-
-static __rte_always_inline void
-free_zmbuf(struct vhost_virtqueue *vq)
-{
- struct zcopy_mbuf *next = NULL;
- struct zcopy_mbuf *zmbuf;
-
- for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
- zmbuf != NULL; zmbuf = next) {
- next = TAILQ_NEXT(zmbuf, next);
-
- uint16_t last_used_idx = vq->last_used_idx;
-
- if (mbuf_is_consumed(zmbuf->mbuf)) {
- uint16_t flags;
- flags = vq->desc_packed[last_used_idx].flags;
- if (vq->used_wrap_counter) {
- flags |= VRING_DESC_F_USED;
- flags |= VRING_DESC_F_AVAIL;
- } else {
- flags &= ~VRING_DESC_F_USED;
- flags &= ~VRING_DESC_F_AVAIL;
- }
-
- vq->desc_packed[last_used_idx].id = zmbuf->desc_idx;
- vq->desc_packed[last_used_idx].len = 0;
-
- rte_smp_wmb();
- vq->desc_packed[last_used_idx].flags = flags;
-
- vq_inc_last_used_packed(vq, zmbuf->desc_count);
-
- TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
- restore_mbuf(zmbuf->mbuf);
- rte_pktmbuf_free(zmbuf->mbuf);
- put_zmbuf(zmbuf);
- vq->nr_zmbuf -= 1;
- }
- }
-}
-
-static __rte_noinline uint16_t
-virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
- struct vhost_virtqueue *__rte_restrict vq,
- struct rte_mempool *mbuf_pool,
- struct rte_mbuf **__rte_restrict pkts,
- uint32_t count)
-{
- uint32_t pkt_idx = 0;
- uint32_t remained = count;
-
- free_zmbuf(vq);
-
- do {
- if (remained >= PACKED_BATCH_SIZE) {
- if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq,
- mbuf_pool, &pkts[pkt_idx])) {
- pkt_idx += PACKED_BATCH_SIZE;
- remained -= PACKED_BATCH_SIZE;
- continue;
- }
- }
-
- if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool,
- &pkts[pkt_idx]))
- break;
- pkt_idx++;
- remained--;
-
- } while (remained);
-
- if (pkt_idx)
- vhost_vring_call_packed(dev, vq);
-
- return pkt_idx;
-}
-
static __rte_noinline uint16_t
virtio_dev_tx_packed(struct virtio_net *dev,
struct vhost_virtqueue *__rte_restrict vq,
count -= 1;
}
- if (vq_is_packed(dev)) {
- if (unlikely(dev->dequeue_zero_copy))
- count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool,
- pkts, count);
- else
- count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts,
- count);
- } else
+ if (vq_is_packed(dev))
+ count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
+ else
count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
out: