vhost: support indirect Tx descriptors
author    Maxime Coquelin <maxime.coquelin@redhat.com>
          Tue, 27 Sep 2016 08:42:49 +0000 (10:42 +0200)
committer Yuanhan Liu <yuanhan.liu@linux.intel.com>
          Wed, 28 Sep 2016 00:18:33 +0000 (02:18 +0200)
Indirect descriptors are usually supported by virtio-net devices,
allowing a larger number of requests to be dispatched.

When the virtio device sends a packet using indirect descriptors,
only one slot is used in the ring, even for large packets.
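
As a point of reference, here is a minimal sketch of the mechanism
(descriptor layout per the virtio spec; set_indirect() and its
arguments are illustrative, not part of this patch): the ring slot's
addr/len point at a separate descriptor table instead of packet data,
so a chain of N buffers costs a single ring entry. With a 256-entry
virtqueue where each packet otherwise needs two descriptors
(virtio-net header plus data), this roughly doubles the number of
packets the ring can hold.

    #include <stdint.h>

    /* Split-virtqueue descriptor, as defined by the virtio spec. */
    struct vring_desc {
            uint64_t addr;  /* guest-physical address of the buffer */
            uint32_t len;   /* buffer length in bytes */
            uint16_t flags; /* NEXT / WRITE / INDIRECT */
            uint16_t next;  /* chaining index when F_NEXT is set */
    };

    #define VRING_DESC_F_NEXT     1
    #define VRING_DESC_F_INDIRECT 4

    /*
     * Illustrative guest-side setup: the ring slot describes a table
     * of 'n' descriptors, which in turn describe the actual packet.
     */
    static void
    set_indirect(struct vring_desc *ring_slot, uint64_t table_gpa,
                 unsigned int n)
    {
            ring_slot->addr  = table_gpa; /* table, not packet data */
            ring_slot->len   = n * sizeof(struct vring_desc);
            /* The spec forbids combining F_INDIRECT with F_NEXT. */
            ring_slot->flags = VRING_DESC_F_INDIRECT;
    }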

The main effect is to improve the 0% packet loss benchmark. A PVP
benchmark using MoonGen (64-byte packets) on the TE, and testpmd
(fwd io for the host, macswap for the VM) on the DUT, shows a +50%
gain for zero loss.

On the downside, a micro-benchmark using testpmd txonly in the VM and
rxonly on the host shows a loss of between 1 and 4%. But depending on
the needs, the feature can be disabled at VM boot time by passing the
indirect_desc=off argument to the vhost-user device in QEMU.
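
For example (an illustrative QEMU invocation; the socket path, IDs,
and elided options are placeholders):

    qemu-system-x86_64 ... \
        -chardev socket,id=char0,path=/tmp/vhost-user.sock \
        -netdev type=vhost-user,id=net0,chardev=char0 \
        -device virtio-net-pci,netdev=net0,indirect_desc=off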

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Acked-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
doc/guides/rel_notes/release_16_11.rst
lib/librte_vhost/vhost.c
lib/librte_vhost/virtio_net.c

diff --git a/doc/guides/rel_notes/release_16_11.rst b/doc/guides/rel_notes/release_16_11.rst
index a9a6095..cc4b4d7 100644
@@ -36,6 +36,18 @@ New Features
 
      This section is a comment. Make sure to start the actual text at the margin.
 
+* **Added vhost-user indirect descriptors support.**
+
+  If the indirect descriptor feature is negotiated, each packet sent by the
+  guest will take exactly one slot in the enqueue virtqueue. Without the
+  feature, in the current version, even 64-byte packets take two slots with
+  the Virtio PMD on the guest side.
+
+  The main impact is better performance for 0% packet loss use cases, as it
+  behaves as if the virtqueue size were enlarged, so more packets can be
+  buffered in case of system perturbations. On the downside, a small
+  performance degradation is measured when running micro-benchmarks.
+
 
 Resolved Issues
 ---------------
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..30bb0ce 100644
@@ -65,7 +65,8 @@
                                (1ULL << VIRTIO_NET_F_CSUM)    | \
                                (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
                                (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-                               (1ULL << VIRTIO_NET_F_GUEST_TSO6))
+                               (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+                               (1ULL << VIRTIO_RING_F_INDIRECT_DESC))
 
 uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
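
With this change, VIRTIO_RING_F_INDIRECT_DESC is advertised by
default. A host application that prefers to avoid the micro-benchmark
regression can also mask the feature out before any guest connects; a
minimal sketch, assuming the 16.11-era rte_virtio_net.h API:

    #include <linux/virtio_ring.h> /* VIRTIO_RING_F_INDIRECT_DESC */
    #include <rte_virtio_net.h>    /* rte_vhost_feature_disable() */

    static void
    disable_indirect_descs(void)
    {
            /* Call before guests connect, so negotiation excludes it. */
            rte_vhost_feature_disable(1ULL << VIRTIO_RING_F_INDIRECT_DESC);
    }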
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..a59c39b 100644
@@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
 }
 
 static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
-                 struct rte_mbuf *m, uint16_t desc_idx,
+copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
+                 uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
                  struct rte_mempool *mbuf_pool)
 {
        struct vring_desc *desc;
@@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
        /* A counter to avoid desc dead loop chain */
        uint32_t nr_desc = 1;
 
-       desc = &vq->desc[desc_idx];
-       if (unlikely(desc->len < dev->vhost_hlen))
+       desc = &descs[desc_idx];
+       if (unlikely((desc->len < dev->vhost_hlen) ||
+                       (desc->flags & VRING_DESC_F_INDIRECT)))
                return -1;
 
        desc_addr = gpa_to_vva(dev, desc->addr);
@@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
         */
        if (likely((desc->len == dev->vhost_hlen) &&
                   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
-               desc = &vq->desc[desc->next];
+               desc = &descs[desc->next];
+               if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+                       return -1;
 
                desc_addr = gpa_to_vva(dev, desc->addr);
                if (unlikely(!desc_addr))
@@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
                        if ((desc->flags & VRING_DESC_F_NEXT) == 0)
                                break;
 
-                       if (unlikely(desc->next >= vq->size ||
-                                    ++nr_desc > vq->size))
+                       if (unlikely(desc->next >= max_desc ||
+                                    ++nr_desc > max_desc))
+                               return -1;
+                       desc = &descs[desc->next];
+                       if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
                                return -1;
-                       desc = &vq->desc[desc->next];
 
                        desc_addr = gpa_to_vva(dev, desc->addr);
                        if (unlikely(!desc_addr))
@@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
        /* Prefetch descriptor index. */
        rte_prefetch0(&vq->desc[desc_indexes[0]]);
        for (i = 0; i < count; i++) {
+               struct vring_desc *desc;
+               uint16_t sz, idx;
                int err;
 
                if (likely(i + 1 < count))
                        rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
 
+               if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+                       desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
+                                       vq->desc[desc_indexes[i]].addr);
+                       if (unlikely(!desc))
+                               break;
+
+                       rte_prefetch0(desc);
+                       sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
+                       idx = 0;
+               } else {
+                       desc = vq->desc;
+                       sz = vq->size;
+                       idx = desc_indexes[i];
+               }
+
                pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
                if (unlikely(pkts[i] == NULL)) {
                        RTE_LOG(ERR, VHOST_DATA,
                                "Failed to allocate memory for mbuf.\n");
                        break;
                }
-               err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
-                                       mbuf_pool);
+               err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
                if (unlikely(err)) {
                        rte_pktmbuf_free(pkts[i]);
                        break;
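
To summarize the dequeue-side change above: when the head descriptor
carries VRING_DESC_F_INDIRECT, its guest-physical address is
translated once with gpa_to_vva() and copy_desc_to_mbuf() then walks
the indirect table from index 0, bounded by len / sizeof(struct
vring_desc); otherwise it walks the ring itself as before. The extra
VRING_DESC_F_INDIRECT checks inside copy_desc_to_mbuf() reject nested
tables, which the virtio spec forbids (a driver must not set the
INDIRECT flag within an indirect descriptor). A simplified
restatement of the hunk above, with error handling trimmed:

    /* 'dev', 'vq', 'head' and gpa_to_vva() as in the vhost library. */
    struct vring_desc *descs;
    uint16_t max_desc, idx;

    if (vq->desc[head].flags & VRING_DESC_F_INDIRECT) {
            /* Translate the table's guest-physical address... */
            descs = (struct vring_desc *)(uintptr_t)
                    gpa_to_vva(dev, vq->desc[head].addr);
            /* ...and walk it from its first entry. */
            max_desc = vq->desc[head].len / sizeof(struct vring_desc);
            idx = 0;
    } else {
            descs = vq->desc;      /* walk the ring itself */
            max_desc = vq->size;
            idx = head;
    }
    /* copy_desc_to_mbuf(dev, descs, max_desc, mbuf, idx, pool); */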