1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
12 #include <rte_ip.h>
13 #include <rte_vhost.h>
14 #include <rte_tcp.h>
15 #include <rte_udp.h>
16 #include <rte_sctp.h>
17 #include <rte_arp.h>
18 #include <rte_spinlock.h>
19
20 #include "iotlb.h"
21 #include "vhost.h"
22
23 #define MAX_PKT_BURST 32
24
25 #define MAX_BATCH_LEN 256
26
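/*
 * Queue index layout: even indexes are guest RX (enqueue) virtqueues and
 * odd indexes are guest TX (dequeue) virtqueues, so a valid index must
 * match the expected parity and be below the number of vrings.
 */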
27 static bool
28 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
29 {
30         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
31 }
32
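/*
 * Shadow used ring helpers: the mergeable RX path accumulates used-ring
 * entries in a per-virtqueue shadow array and flushes them with at most
 * two contiguous copies (to handle wrap-around), so the used index is
 * published to the guest only once per burst.
 */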
33 static __rte_always_inline void
34 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
35                           uint16_t to, uint16_t from, uint16_t size)
36 {
37         rte_memcpy(&vq->used->ring[to],
38                         &vq->shadow_used_ring[from],
39                         size * sizeof(struct vring_used_elem));
40         vhost_log_used_vring(dev, vq,
41                         offsetof(struct vring_used, ring[to]),
42                         size * sizeof(struct vring_used_elem));
43 }
44
45 static __rte_always_inline void
46 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
47 {
48         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
49
50         if (used_idx + vq->shadow_used_idx <= vq->size) {
51                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
52                                           vq->shadow_used_idx);
53         } else {
54                 uint16_t size;
55
56                 /* flush the tail of the used ring first: entries [used_idx, vq->size) */
57                 size = vq->size - used_idx;
58                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
59
60                 /* then wrap around and flush the remaining entries from index 0 */
61                 do_flush_shadow_used_ring(dev, vq, 0, size,
62                                           vq->shadow_used_idx - size);
63         }
64         vq->last_used_idx += vq->shadow_used_idx;
65
66         rte_smp_wmb();
67
68         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
69         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
70                 sizeof(vq->used->idx));
71 }
72
73 static __rte_always_inline void
74 update_shadow_used_ring(struct vhost_virtqueue *vq,
75                          uint16_t desc_idx, uint16_t len)
76 {
77         uint16_t i = vq->shadow_used_idx++;
78
79         vq->shadow_used_ring[i].id  = desc_idx;
80         vq->shadow_used_ring[i].len = len;
81 }
82
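/*
 * Perform the small copies that were deferred into the batch_copy array
 * during enqueue, logging each destination range for live-migration
 * dirty page tracking.
 */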
83 static inline void
84 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
85 {
86         struct batch_copy_elem *elem = vq->batch_copy_elems;
87         uint16_t count = vq->batch_copy_nb_elems;
88         int i;
89
90         for (i = 0; i < count; i++) {
91                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
92                 vhost_log_write(dev, elem[i].log_addr, elem[i].len);
93                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
94         }
95 }
96
97 static inline void
98 do_data_copy_dequeue(struct vhost_virtqueue *vq)
99 {
100         struct batch_copy_elem *elem = vq->batch_copy_elems;
101         uint16_t count = vq->batch_copy_nb_elems;
102         int i;
103
104         for (i = 0; i < count; i++)
105                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
106 }
107
108 /* avoid the write when the value is already equal, to lessen cache issues */
109 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
110         if ((var) != (val))                     \
111                 (var) = (val);                  \
112 } while (0)
113
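/*
 * Translate the mbuf Tx offload requests (L4 checksum, TSO/UFO) into the
 * virtio net header that is prepended to the packet for the guest. The
 * IPv4 header checksum, which the virtio header cannot describe, is
 * computed in place here.
 */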
114 static void
115 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
116 {
117         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
118
119         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
120                 csum_l4 |= PKT_TX_TCP_CKSUM;
121
122         if (csum_l4) {
123                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
124                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
125
126                 switch (csum_l4) {
127                 case PKT_TX_TCP_CKSUM:
128                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
129                                                 cksum));
130                         break;
131                 case PKT_TX_UDP_CKSUM:
132                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
133                                                 dgram_cksum));
134                         break;
135                 case PKT_TX_SCTP_CKSUM:
136                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
137                                                 cksum));
138                         break;
139                 }
140         } else {
141                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
142                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
143                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
144         }
145
146         /* virtio has no IP checksum offload, so compute the IPv4 header checksum here */
147         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
148                 struct ipv4_hdr *ipv4_hdr;
149
150                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
151                                                    m_buf->l2_len);
152                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
153         }
154
155         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
156                 if (m_buf->ol_flags & PKT_TX_IPV4)
157                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
158                 else
159                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
160                 net_hdr->gso_size = m_buf->tso_segsz;
161                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
162                                         + m_buf->l4_len;
163         } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
164                 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
165                 net_hdr->gso_size = m_buf->tso_segsz;
166                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
167                         m_buf->l4_len;
168         } else {
169                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
170                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
171                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
172         }
173 }
174
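/*
 * Copy a single mbuf chain into one descriptor chain (non-mergeable RX
 * path). The first descriptor must be large enough for the virtio net
 * header; 'size' bounds the chain walk so a malformed or looping chain
 * cannot be followed forever.
 */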
175 static __rte_always_inline int
176 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
177                   struct vring_desc *descs, struct rte_mbuf *m,
178                   uint16_t desc_idx, uint32_t size)
179 {
180         uint32_t desc_avail, desc_offset;
181         uint32_t mbuf_avail, mbuf_offset;
182         uint32_t cpy_len;
183         uint64_t dlen;
184         struct vring_desc *desc;
185         uint64_t desc_addr;
186         /* Counter to guard against a looping descriptor chain */
187         uint16_t nr_desc = 1;
188         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
189         uint16_t copy_nb = vq->batch_copy_nb_elems;
190         int error = 0;
191
192         desc = &descs[desc_idx];
193         dlen = desc->len;
194         desc_addr = vhost_iova_to_vva(dev, vq, desc->addr,
195                                         &dlen, VHOST_ACCESS_RW);
196         /*
197          * The check of 'desc_addr' is placed outside the 'unlikely' macro to
198          * avoid a performance issue with some gcc versions (4.8.4 and 5.3.0),
199          * which otherwise store the offset on the stack instead of in a register.
200          */
201         if (unlikely(dlen != desc->len || desc->len < dev->vhost_hlen) ||
202                         !desc_addr) {
203                 error = -1;
204                 goto out;
205         }
206
207         rte_prefetch0((void *)(uintptr_t)desc_addr);
208
209         virtio_enqueue_offload(m, (struct virtio_net_hdr *)(uintptr_t)desc_addr);
210         vhost_log_write(dev, desc->addr, dev->vhost_hlen);
211         PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
212
213         desc_offset = dev->vhost_hlen;
214         desc_avail  = desc->len - dev->vhost_hlen;
215
216         mbuf_avail  = rte_pktmbuf_data_len(m);
217         mbuf_offset = 0;
218         while (mbuf_avail != 0 || m->next != NULL) {
219                 /* done with current mbuf, fetch next */
220                 if (mbuf_avail == 0) {
221                         m = m->next;
222
223                         mbuf_offset = 0;
224                         mbuf_avail  = rte_pktmbuf_data_len(m);
225                 }
226
227                 /* done with current desc buf, fetch next */
228                 if (desc_avail == 0) {
229                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
230                                 /* Not enough room left in the vring buffer */
231                                 error = -1;
232                                 goto out;
233                         }
234                         if (unlikely(desc->next >= size || ++nr_desc > size)) {
235                                 error = -1;
236                                 goto out;
237                         }
238
239                         desc = &descs[desc->next];
240                         dlen = desc->len;
241                         desc_addr = vhost_iova_to_vva(dev, vq, desc->addr,
242                                                         &dlen,
243                                                         VHOST_ACCESS_RW);
244                         if (unlikely(!desc_addr || dlen != desc->len)) {
245                                 error = -1;
246                                 goto out;
247                         }
248
249                         desc_offset = 0;
250                         desc_avail  = desc->len;
251                 }
252
253                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
254                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
255                         rte_memcpy((void *)((uintptr_t)(desc_addr +
256                                                         desc_offset)),
257                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
258                                 cpy_len);
259                         vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
260                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
261                                      cpy_len, 0);
262                 } else {
263                         batch_copy[copy_nb].dst =
264                                 (void *)((uintptr_t)(desc_addr + desc_offset));
265                         batch_copy[copy_nb].src =
266                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
267                         batch_copy[copy_nb].log_addr = desc->addr + desc_offset;
268                         batch_copy[copy_nb].len = cpy_len;
269                         copy_nb++;
270                 }
271
272                 mbuf_avail  -= cpy_len;
273                 mbuf_offset += cpy_len;
274                 desc_avail  -= cpy_len;
275                 desc_offset += cpy_len;
276         }
277
278 out:
279         vq->batch_copy_nb_elems = copy_nb;
280
281         return error;
282 }
283
284 /**
285  * This function adds buffers to the virtio device's RX virtqueue. Buffers
286  * can come from the physical port or from another virtio device. The number
287  * of packets successfully added to the RX queue is returned. Scattered
288  * (multi-segment) mbufs are supported, but the mergeable RX buffers feature
289  * is not.
290  */
291 static __rte_always_inline uint32_t
292 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
293               struct rte_mbuf **pkts, uint32_t count)
294 {
295         struct vhost_virtqueue *vq;
296         uint16_t avail_idx, free_entries, start_idx;
297         uint16_t desc_indexes[MAX_PKT_BURST];
298         struct vring_desc *descs;
299         uint16_t used_idx;
300         uint32_t i, sz;
301
302         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
303         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
304                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
305                         dev->vid, __func__, queue_id);
306                 return 0;
307         }
308
309         vq = dev->virtqueue[queue_id];
310
311         rte_spinlock_lock(&vq->access_lock);
312
313         if (unlikely(vq->enabled == 0))
314                 goto out_access_unlock;
315
316         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
317                 vhost_user_iotlb_rd_lock(vq);
318
319         if (unlikely(vq->access_ok == 0)) {
320                 if (unlikely(vring_translate(dev, vq) < 0)) {
321                         count = 0;
322                         goto out;
323                 }
324         }
325
326         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
327         start_idx = vq->last_used_idx;
328         free_entries = avail_idx - start_idx;
329         count = RTE_MIN(count, free_entries);
330         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
331         if (count == 0)
332                 goto out;
333
334         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
335                 dev->vid, start_idx, start_idx + count);
336
337         vq->batch_copy_nb_elems = 0;
338
339         /* Retrieve all of the desc indexes first to avoid caching issues. */
340         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
341         for (i = 0; i < count; i++) {
342                 used_idx = (start_idx + i) & (vq->size - 1);
343                 desc_indexes[i] = vq->avail->ring[used_idx];
344                 vq->used->ring[used_idx].id = desc_indexes[i];
345                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
346                                                dev->vhost_hlen;
347                 vhost_log_used_vring(dev, vq,
348                         offsetof(struct vring_used, ring[used_idx]),
349                         sizeof(vq->used->ring[used_idx]));
350         }
351
352         rte_prefetch0(&vq->desc[desc_indexes[0]]);
353         for (i = 0; i < count; i++) {
354                 uint16_t desc_idx = desc_indexes[i];
355                 int err;
356
357                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
358                         uint64_t dlen = vq->desc[desc_idx].len;
359                         descs = (struct vring_desc *)(uintptr_t)
360                                 vhost_iova_to_vva(dev,
361                                                 vq, vq->desc[desc_idx].addr,
362                                                 &dlen, VHOST_ACCESS_RO);
363                         if (unlikely(!descs ||
364                                         dlen != vq->desc[desc_idx].len)) {
365                                 count = i;
366                                 break;
367                         }
368
369                         sz = vq->desc[desc_idx].len / sizeof(*descs);
370                         desc_idx = 0;
371                 } else {
372                         descs = vq->desc;
373                         sz = vq->size;
374                 }
375
376                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
377                 if (unlikely(err)) {
378                         count = i;
379                         break;
380                 }
381
382                 if (i + 1 < count)
383                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
384         }
385
386         do_data_copy_enqueue(dev, vq);
387
388         rte_smp_wmb();
389
390         *(volatile uint16_t *)&vq->used->idx += count;
391         vq->last_used_idx += count;
392         vhost_log_used_vring(dev, vq,
393                 offsetof(struct vring_used, idx),
394                 sizeof(vq->used->idx));
395
396         vhost_vring_call(dev, vq);
397 out:
398         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
399                 vhost_user_iotlb_rd_unlock(vq);
400
401 out_access_unlock:
402         rte_spinlock_unlock(&vq->access_lock);
403
404         return count;
405 }
406
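/*
 * Walk the descriptor chain that starts at the given avail ring slot,
 * following an indirect table when present, and record each buffer into
 * 'buf_vec'. The chain head index and total chain length are returned
 * through 'desc_chain_head' and 'desc_chain_len'.
 */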
407 static __rte_always_inline int
408 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
409                          uint32_t avail_idx, uint32_t *vec_idx,
410                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
411                          uint16_t *desc_chain_len)
412 {
413         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
414         uint32_t vec_id = *vec_idx;
415         uint32_t len    = 0;
416         uint64_t dlen;
417         struct vring_desc *descs = vq->desc;
418
419         *desc_chain_head = idx;
420
421         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
422                 dlen = vq->desc[idx].len;
423                 descs = (struct vring_desc *)(uintptr_t)
424                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
425                                                 &dlen,
426                                                 VHOST_ACCESS_RO);
427                 if (unlikely(!descs || dlen != vq->desc[idx].len))
428                         return -1;
429
430                 idx = 0;
431         }
432
433         while (1) {
434                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
435                         return -1;
436
437                 len += descs[idx].len;
438                 buf_vec[vec_id].buf_addr = descs[idx].addr;
439                 buf_vec[vec_id].buf_len  = descs[idx].len;
440                 buf_vec[vec_id].desc_idx = idx;
441                 vec_id++;
442
443                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
444                         break;
445
446                 idx = descs[idx].next;
447         }
448
449         *desc_chain_len = len;
450         *vec_idx = vec_id;
451
452         return 0;
453 }
454
455 /*
456  * Returns -1 on failure, 0 on success
457  */
458 static inline int
459 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
460                                 uint32_t size, struct buf_vector *buf_vec,
461                                 uint16_t *num_buffers, uint16_t avail_head)
462 {
463         uint16_t cur_idx;
464         uint32_t vec_idx = 0;
465         uint16_t tries = 0;
466
467         uint16_t head_idx = 0;
468         uint16_t len = 0;
469
470         *num_buffers = 0;
471         cur_idx  = vq->last_avail_idx;
472
473         while (size > 0) {
474                 if (unlikely(cur_idx == avail_head))
475                         return -1;
476
477                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
478                                                 &head_idx, &len) < 0))
479                         return -1;
480                 len = RTE_MIN(len, size);
481                 update_shadow_used_ring(vq, head_idx, len);
482                 size -= len;
483
484                 cur_idx++;
485                 tries++;
486                 *num_buffers += 1;
487
488                 /*
489                  * If we have tried every available ring entry and still
490                  * cannot gather enough buffers, something abnormal
491                  * happened.
492                  */
493                 if (unlikely(tries >= vq->size))
494                         return -1;
495         }
496
497         return 0;
498 }
499
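/*
 * Copy one mbuf chain into the guest buffers previously gathered in
 * 'buf_vec' (mergeable RX path). The virtio net header, including
 * 'num_buffers', is written into the first buffer on the first pass
 * through the copy loop.
 */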
500 static __rte_always_inline int
501 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
502                             struct rte_mbuf *m, struct buf_vector *buf_vec,
503                             uint16_t num_buffers)
504 {
505         uint32_t vec_idx = 0;
506         uint64_t desc_addr;
507         uint32_t mbuf_offset, mbuf_avail;
508         uint32_t desc_offset, desc_avail;
509         uint32_t cpy_len;
510         uint64_t dlen;
511         uint64_t hdr_addr, hdr_phys_addr;
512         struct rte_mbuf *hdr_mbuf;
513         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
514         uint16_t copy_nb = vq->batch_copy_nb_elems;
515         int error = 0;
516
517         if (unlikely(m == NULL)) {
518                 error = -1;
519                 goto out;
520         }
521
522         dlen = buf_vec[vec_idx].buf_len;
523         desc_addr = vhost_iova_to_vva(dev, vq, buf_vec[vec_idx].buf_addr,
524                                                 &dlen, VHOST_ACCESS_RW);
525         if (dlen != buf_vec[vec_idx].buf_len ||
526                         buf_vec[vec_idx].buf_len < dev->vhost_hlen ||
527                         !desc_addr) {
528                 error = -1;
529                 goto out;
530         }
531
532         hdr_mbuf = m;
533         hdr_addr = desc_addr;
534         hdr_phys_addr = buf_vec[vec_idx].buf_addr;
535         rte_prefetch0((void *)(uintptr_t)hdr_addr);
536
537         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
538                 dev->vid, num_buffers);
539
540         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
541         desc_offset = dev->vhost_hlen;
542
543         mbuf_avail  = rte_pktmbuf_data_len(m);
544         mbuf_offset = 0;
545         while (mbuf_avail != 0 || m->next != NULL) {
546                 /* done with current desc buf, get the next one */
547                 if (desc_avail == 0) {
548                         vec_idx++;
549                         dlen = buf_vec[vec_idx].buf_len;
550                         desc_addr =
551                                 vhost_iova_to_vva(dev, vq,
552                                         buf_vec[vec_idx].buf_addr,
553                                         &dlen,
554                                         VHOST_ACCESS_RW);
555                         if (unlikely(!desc_addr ||
556                                         dlen != buf_vec[vec_idx].buf_len)) {
557                                 error = -1;
558                                 goto out;
559                         }
560
561                         /* Prefetch buffer address. */
562                         rte_prefetch0((void *)(uintptr_t)desc_addr);
563                         desc_offset = 0;
564                         desc_avail  = buf_vec[vec_idx].buf_len;
565                 }
566
567                 /* done with current mbuf, get the next one */
568                 if (mbuf_avail == 0) {
569                         m = m->next;
570
571                         mbuf_offset = 0;
572                         mbuf_avail  = rte_pktmbuf_data_len(m);
573                 }
574
575                 if (hdr_addr) {
576                         struct virtio_net_hdr_mrg_rxbuf *hdr;
577
578                         hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)
579                                 hdr_addr;
580                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
581                         ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
582
583                         vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
584                         PRINT_PACKET(dev, (uintptr_t)hdr_addr,
585                                      dev->vhost_hlen, 0);
586
587                         hdr_addr = 0;
588                 }
589
590                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
591
592                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
593                         rte_memcpy((void *)((uintptr_t)(desc_addr +
594                                                         desc_offset)),
595                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
596                                 cpy_len);
597                         vhost_log_write(dev,
598                                 buf_vec[vec_idx].buf_addr + desc_offset,
599                                 cpy_len);
600                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
601                                 cpy_len, 0);
602                 } else {
603                         batch_copy[copy_nb].dst =
604                                 (void *)((uintptr_t)(desc_addr + desc_offset));
605                         batch_copy[copy_nb].src =
606                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
607                         batch_copy[copy_nb].log_addr =
608                                 buf_vec[vec_idx].buf_addr + desc_offset;
609                         batch_copy[copy_nb].len = cpy_len;
610                         copy_nb++;
611                 }
612
613                 mbuf_avail  -= cpy_len;
614                 mbuf_offset += cpy_len;
615                 desc_avail  -= cpy_len;
616                 desc_offset += cpy_len;
617         }
618
619 out:
620         vq->batch_copy_nb_elems = copy_nb;
621
622         return error;
623 }
624
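/*
 * Enqueue a burst of packets using the mergeable RX buffers path: for each
 * packet, reserve enough descriptor chains to hold it, copy the data, then
 * publish all used entries at once through the shadow used ring.
 */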
625 static __rte_always_inline uint32_t
626 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
627         struct rte_mbuf **pkts, uint32_t count)
628 {
629         struct vhost_virtqueue *vq;
630         uint32_t pkt_idx = 0;
631         uint16_t num_buffers;
632         struct buf_vector buf_vec[BUF_VECTOR_MAX];
633         uint16_t avail_head;
634
635         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
636         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
637                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
638                         dev->vid, __func__, queue_id);
639                 return 0;
640         }
641
642         vq = dev->virtqueue[queue_id];
643
644         rte_spinlock_lock(&vq->access_lock);
645
646         if (unlikely(vq->enabled == 0))
647                 goto out_access_unlock;
648
649         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
650                 vhost_user_iotlb_rd_lock(vq);
651
652         if (unlikely(vq->access_ok == 0))
653                 if (unlikely(vring_translate(dev, vq) < 0))
654                         goto out;
655
656         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
657         if (count == 0)
658                 goto out;
659
660         vq->batch_copy_nb_elems = 0;
661
662         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
663
664         vq->shadow_used_idx = 0;
665         avail_head = *((volatile uint16_t *)&vq->avail->idx);
666         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
667                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
668
669                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
670                                                 pkt_len, buf_vec, &num_buffers,
671                                                 avail_head) < 0)) {
672                         VHOST_LOG_DEBUG(VHOST_DATA,
673                                 "(%d) failed to get enough desc from vring\n",
674                                 dev->vid);
675                         vq->shadow_used_idx -= num_buffers;
676                         break;
677                 }
678
679                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
680                         dev->vid, vq->last_avail_idx,
681                         vq->last_avail_idx + num_buffers);
682
683                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
684                                                 buf_vec, num_buffers) < 0) {
685                         vq->shadow_used_idx -= num_buffers;
686                         break;
687                 }
688
689                 vq->last_avail_idx += num_buffers;
690         }
691
692         do_data_copy_enqueue(dev, vq);
693
694         if (likely(vq->shadow_used_idx)) {
695                 flush_shadow_used_ring(dev, vq);
696                 vhost_vring_call(dev, vq);
697         }
698
699 out:
700         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
701                 vhost_user_iotlb_rd_unlock(vq);
702
703 out_access_unlock:
704         rte_spinlock_unlock(&vq->access_lock);
705
706         return pkt_idx;
707 }
708
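/*
 * Public enqueue API: transmit a burst of host packets to the guest.
 * Dispatches to the mergeable or non-mergeable RX path depending on
 * whether VIRTIO_NET_F_MRG_RXBUF was negotiated.
 *
 * A minimal usage sketch (illustrative only; 'vid', 'pkts' and 'nb_pkts'
 * are assumed to be provided by the application):
 *
 *	enqueued = rte_vhost_enqueue_burst(vid, 0, pkts, nb_pkts);
 *	for (i = 0; i < nb_pkts; i++)
 *		rte_pktmbuf_free(pkts[i]);
 *
 * Queue 0 is the first guest RX queue. The data is copied into guest
 * buffers, so the caller keeps ownership of the mbufs and frees (or
 * reuses) them regardless of how many were accepted.
 */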
709 uint16_t
710 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
711         struct rte_mbuf **pkts, uint16_t count)
712 {
713         struct virtio_net *dev = get_device(vid);
714
715         if (!dev)
716                 return 0;
717
718         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
719                 RTE_LOG(ERR, VHOST_DATA,
720                         "(%d) %s: built-in vhost net backend is disabled.\n",
721                         dev->vid, __func__);
722                 return 0;
723         }
724
725         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
726                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
727         else
728                 return virtio_dev_rx(dev, queue_id, pkts, count);
729 }
730
731 static inline bool
732 virtio_net_with_host_offload(struct virtio_net *dev)
733 {
734         if (dev->features &
735                         ((1ULL << VIRTIO_NET_F_CSUM) |
736                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
737                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
738                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
739                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
740                 return true;
741
742         return false;
743 }
744
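/*
 * Parse the Ethernet header (plus a single optional VLAN tag) and the L3
 * header of a packet received from the guest, setting l2_len/l3_len and
 * returning the L4 protocol number and a pointer to the L4 header.
 */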
745 static void
746 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
747 {
748         struct ipv4_hdr *ipv4_hdr;
749         struct ipv6_hdr *ipv6_hdr;
750         void *l3_hdr = NULL;
751         struct ether_hdr *eth_hdr;
752         uint16_t ethertype;
753
754         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
755
756         m->l2_len = sizeof(struct ether_hdr);
757         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
758
759         if (ethertype == ETHER_TYPE_VLAN) {
760                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
761
762                 m->l2_len += sizeof(struct vlan_hdr);
763                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
764         }
765
766         l3_hdr = (char *)eth_hdr + m->l2_len;
767
768         switch (ethertype) {
769         case ETHER_TYPE_IPv4:
770                 ipv4_hdr = l3_hdr;
771                 *l4_proto = ipv4_hdr->next_proto_id;
772                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
773                 *l4_hdr = (char *)l3_hdr + m->l3_len;
774                 m->ol_flags |= PKT_TX_IPV4;
775                 break;
776         case ETHER_TYPE_IPv6:
777                 ipv6_hdr = l3_hdr;
778                 *l4_proto = ipv6_hdr->proto;
779                 m->l3_len = sizeof(struct ipv6_hdr);
780                 *l4_hdr = (char *)l3_hdr + m->l3_len;
781                 m->ol_flags |= PKT_TX_IPV6;
782                 break;
783         default:
784                 m->l3_len = 0;
785                 *l4_proto = 0;
786                 *l4_hdr = NULL;
787                 break;
788         }
789 }
790
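/*
 * Convert the virtio net header written by the guest into mbuf offload
 * flags (pending L4 checksum, TSO/UFO segment size) so that a NIC or a
 * software checksum/GSO path can finish the work on transmit.
 */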
791 static __rte_always_inline void
792 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
793 {
794         uint16_t l4_proto = 0;
795         void *l4_hdr = NULL;
796         struct tcp_hdr *tcp_hdr = NULL;
797
798         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
799                 return;
800
801         parse_ethernet(m, &l4_proto, &l4_hdr);
802         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
803                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
804                         switch (hdr->csum_offset) {
805                         case (offsetof(struct tcp_hdr, cksum)):
806                                 if (l4_proto == IPPROTO_TCP)
807                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
808                                 break;
809                         case (offsetof(struct udp_hdr, dgram_cksum)):
810                                 if (l4_proto == IPPROTO_UDP)
811                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
812                                 break;
813                         case (offsetof(struct sctp_hdr, cksum)):
814                                 if (l4_proto == IPPROTO_SCTP)
815                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
816                                 break;
817                         default:
818                                 break;
819                         }
820                 }
821         }
822
823         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
824                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
825                 case VIRTIO_NET_HDR_GSO_TCPV4:
826                 case VIRTIO_NET_HDR_GSO_TCPV6:
827                         tcp_hdr = l4_hdr;
828                         m->ol_flags |= PKT_TX_TCP_SEG;
829                         m->tso_segsz = hdr->gso_size;
830                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
831                         break;
832                 case VIRTIO_NET_HDR_GSO_UDP:
833                         m->ol_flags |= PKT_TX_UDP_SEG;
834                         m->tso_segsz = hdr->gso_size;
835                         m->l4_len = sizeof(struct udp_hdr);
836                         break;
837                 default:
838                         RTE_LOG(WARNING, VHOST_DATA,
839                                 "unsupported gso type %u.\n", hdr->gso_type);
840                         break;
841                 }
842         }
843 }
844
845 static __rte_always_inline void
846 put_zmbuf(struct zcopy_mbuf *zmbuf)
847 {
848         zmbuf->in_use = 0;
849 }
850
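/*
 * Copy the data of one descriptor chain into an mbuf chain, allocating
 * extra mbufs from 'mbuf_pool' as needed. With dequeue zero-copy enabled,
 * the mbuf is pointed directly at the guest buffer instead of copying.
 * Offload hints are recovered from the virtio net header.
 */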
851 static __rte_always_inline int
852 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
853                   struct vring_desc *descs, uint16_t max_desc,
854                   struct rte_mbuf *m, uint16_t desc_idx,
855                   struct rte_mempool *mbuf_pool)
856 {
857         struct vring_desc *desc;
858         uint64_t desc_addr;
859         uint32_t desc_avail, desc_offset;
860         uint32_t mbuf_avail, mbuf_offset;
861         uint32_t cpy_len;
862         uint64_t dlen;
863         struct rte_mbuf *cur = m, *prev = m;
864         struct virtio_net_hdr *hdr = NULL;
865         /* A counter to avoid desc dead loop chain */
866         uint32_t nr_desc = 1;
867         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
868         uint16_t copy_nb = vq->batch_copy_nb_elems;
869         int error = 0;
870
871         desc = &descs[desc_idx];
872         if (unlikely((desc->len < dev->vhost_hlen)) ||
873                         (desc->flags & VRING_DESC_F_INDIRECT)) {
874                 error = -1;
875                 goto out;
876         }
877
878         dlen = desc->len;
879         desc_addr = vhost_iova_to_vva(dev,
880                                         vq, desc->addr,
881                                         &dlen,
882                                         VHOST_ACCESS_RO);
883         if (unlikely(!desc_addr || dlen != desc->len)) {
884                 error = -1;
885                 goto out;
886         }
887
888         if (virtio_net_with_host_offload(dev)) {
889                 hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
890                 rte_prefetch0(hdr);
891         }
892
893         /*
894          * A virtio driver normally uses at least 2 desc buffers
895          * for Tx: the first for storing the header, and the others
896          * for storing the data.
897          */
898         if (likely((desc->len == dev->vhost_hlen) &&
899                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
900                 desc = &descs[desc->next];
901                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
902                         error = -1;
903                         goto out;
904                 }
905
906                 dlen = desc->len;
907                 desc_addr = vhost_iova_to_vva(dev,
908                                                         vq, desc->addr,
909                                                         &dlen,
910                                                         VHOST_ACCESS_RO);
911                 if (unlikely(!desc_addr || dlen != desc->len)) {
912                         error = -1;
913                         goto out;
914                 }
915
916                 desc_offset = 0;
917                 desc_avail  = desc->len;
918                 nr_desc    += 1;
919         } else {
920                 desc_avail  = desc->len - dev->vhost_hlen;
921                 desc_offset = dev->vhost_hlen;
922         }
923
924         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
925
926         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);
927
928         mbuf_offset = 0;
929         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
930         while (1) {
931                 uint64_t hpa;
932
933                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
934
935                 /*
936                  * A desc buf might span two host physical pages that are
937                  * not contiguous. In that case (gpa_to_hpa returns 0), the
938                  * data is copied even though zero copy is enabled.
939                  */
940                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
941                                         desc->addr + desc_offset, cpy_len)))) {
942                         cur->data_len = cpy_len;
943                         cur->data_off = 0;
944                         cur->buf_addr = (void *)(uintptr_t)(desc_addr
945                                 + desc_offset);
946                         cur->buf_iova = hpa;
947
948                         /*
949                          * In zero copy mode, one mbuf can only reference data
950                          * from a single desc buffer (or part of one).
951                          */
952                         mbuf_avail = cpy_len;
953                 } else {
954                         if (likely(cpy_len > MAX_BATCH_LEN ||
955                                    copy_nb >= vq->size ||
956                                    (hdr && cur == m))) {
957                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
958                                                                    mbuf_offset),
959                                            (void *)((uintptr_t)(desc_addr +
960                                                                 desc_offset)),
961                                            cpy_len);
962                         } else {
963                                 batch_copy[copy_nb].dst =
964                                         rte_pktmbuf_mtod_offset(cur, void *,
965                                                                 mbuf_offset);
966                                 batch_copy[copy_nb].src =
967                                         (void *)((uintptr_t)(desc_addr +
968                                                              desc_offset));
969                                 batch_copy[copy_nb].len = cpy_len;
970                                 copy_nb++;
971                         }
972                 }
973
974                 mbuf_avail  -= cpy_len;
975                 mbuf_offset += cpy_len;
976                 desc_avail  -= cpy_len;
977                 desc_offset += cpy_len;
978
979                 /* This desc buffer is exhausted, get the next one */
980                 if (desc_avail == 0) {
981                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
982                                 break;
983
984                         if (unlikely(desc->next >= max_desc ||
985                                      ++nr_desc > max_desc)) {
986                                 error = -1;
987                                 goto out;
988                         }
989                         desc = &descs[desc->next];
990                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
991                                 error = -1;
992                                 goto out;
993                         }
994
995                         dlen = desc->len;
996                         desc_addr = vhost_iova_to_vva(dev,
997                                                         vq, desc->addr,
998                                                         &dlen, VHOST_ACCESS_RO);
999                         if (unlikely(!desc_addr || dlen != desc->len)) {
1000                                 error = -1;
1001                                 goto out;
1002                         }
1003
1004                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1005
1006                         desc_offset = 0;
1007                         desc_avail  = desc->len;
1008
1009                         PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
1010                 }
1011
1012                 /*
1013                  * This mbuf is full, allocate a new one
1014                  * to hold more data.
1015                  */
1016                 if (mbuf_avail == 0) {
1017                         cur = rte_pktmbuf_alloc(mbuf_pool);
1018                         if (unlikely(cur == NULL)) {
1019                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1020                                         "allocate memory for mbuf.\n");
1021                                 error = -1;
1022                                 goto out;
1023                         }
1024                         if (unlikely(dev->dequeue_zero_copy))
1025                                 rte_mbuf_refcnt_update(cur, 1);
1026
1027                         prev->next = cur;
1028                         prev->data_len = mbuf_offset;
1029                         m->nb_segs += 1;
1030                         m->pkt_len += mbuf_offset;
1031                         prev = cur;
1032
1033                         mbuf_offset = 0;
1034                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1035                 }
1036         }
1037
1038         prev->data_len = mbuf_offset;
1039         m->pkt_len    += mbuf_offset;
1040
1041         if (hdr)
1042                 vhost_dequeue_offload(hdr, m);
1043
1044 out:
1045         vq->batch_copy_nb_elems = copy_nb;
1046
1047         return error;
1048 }
1049
1050 static __rte_always_inline void
1051 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1052                  uint32_t used_idx, uint32_t desc_idx)
1053 {
1054         vq->used->ring[used_idx].id  = desc_idx;
1055         vq->used->ring[used_idx].len = 0;
1056         vhost_log_used_vring(dev, vq,
1057                         offsetof(struct vring_used, ring[used_idx]),
1058                         sizeof(vq->used->ring[used_idx]));
1059 }
1060
1061 static __rte_always_inline void
1062 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1063                 uint32_t count)
1064 {
1065         if (unlikely(count == 0))
1066                 return;
1067
1068         rte_smp_wmb();
1069         rte_smp_rmb();
1070
1071         vq->used->idx += count;
1072         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1073                         sizeof(vq->used->idx));
1074         vhost_vring_call(dev, vq);
1075 }
1076
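/*
 * Find a free zero-copy tracking slot, scanning from the last allocation
 * point and wrapping around to the beginning once before giving up.
 */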
1077 static __rte_always_inline struct zcopy_mbuf *
1078 get_zmbuf(struct vhost_virtqueue *vq)
1079 {
1080         uint16_t i;
1081         uint16_t last;
1082         int tries = 0;
1083
1084         /* search [last_zmbuf_idx, zmbuf_size) */
1085         i = vq->last_zmbuf_idx;
1086         last = vq->zmbuf_size;
1087
1088 again:
1089         for (; i < last; i++) {
1090                 if (vq->zmbufs[i].in_use == 0) {
1091                         vq->last_zmbuf_idx = i + 1;
1092                         vq->zmbufs[i].in_use = 1;
1093                         return &vq->zmbufs[i];
1094                 }
1095         }
1096
1097         tries++;
1098         if (tries == 1) {
1099                 /* search [0, last_zmbuf_idx) */
1100                 i = 0;
1101                 last = vq->last_zmbuf_idx;
1102                 goto again;
1103         }
1104
1105         return NULL;
1106 }
1107
1108 static __rte_always_inline bool
1109 mbuf_is_consumed(struct rte_mbuf *m)
1110 {
1111         while (m) {
1112                 if (rte_mbuf_refcnt_read(m) > 1)
1113                         return false;
1114                 m = m->next;
1115         }
1116
1117         return true;
1118 }
1119
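/*
 * Restore buf_addr/buf_iova of mbufs whose buffers were pointed at guest
 * memory by the zero-copy path, so they can safely go back to the mempool.
 */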
1120 static __rte_always_inline void
1121 restore_mbuf(struct rte_mbuf *m)
1122 {
1123         uint32_t mbuf_size, priv_size;
1124
1125         while (m) {
1126                 priv_size = rte_pktmbuf_priv_size(m->pool);
1127                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1128                 /* start of buffer is after mbuf structure and priv data */
1129
1130                 m->buf_addr = (char *)m + mbuf_size;
1131                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1132                 m = m->next;
1133         }
1134 }
1135
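/*
 * Public dequeue API: receive a burst of packets sent by the guest. After
 * a live migration, a RARP packet may be injected at the head of the
 * returned burst so that switches relearn the guest MAC address.
 *
 * A minimal polling sketch (illustrative only; 'vid' and 'mbuf_pool' are
 * assumed to be set up by the application, and 32 is just an example
 * burst size):
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool,
 *						 pkts, 32);
 *
 * Queue 1 is the first guest TX queue; the caller owns the returned mbufs
 * and must transmit or free them.
 */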
1136 uint16_t
1137 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1138         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1139 {
1140         struct virtio_net *dev;
1141         struct rte_mbuf *rarp_mbuf = NULL;
1142         struct vhost_virtqueue *vq;
1143         uint32_t desc_indexes[MAX_PKT_BURST];
1144         uint32_t used_idx;
1145         uint32_t i = 0;
1146         uint16_t free_entries;
1147         uint16_t avail_idx;
1148
1149         dev = get_device(vid);
1150         if (!dev)
1151                 return 0;
1152
1153         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1154                 RTE_LOG(ERR, VHOST_DATA,
1155                         "(%d) %s: built-in vhost net backend is disabled.\n",
1156                         dev->vid, __func__);
1157                 return 0;
1158         }
1159
1160         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1161                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1162                         dev->vid, __func__, queue_id);
1163                 return 0;
1164         }
1165
1166         vq = dev->virtqueue[queue_id];
1167
1168         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1169                 return 0;
1170
1171         if (unlikely(vq->enabled == 0))
1172                 goto out_access_unlock;
1173
1174         vq->batch_copy_nb_elems = 0;
1175
1176         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1177                 vhost_user_iotlb_rd_lock(vq);
1178
1179         if (unlikely(vq->access_ok == 0))
1180                 if (unlikely(vring_translate(dev, vq) < 0))
1181                         goto out;
1182
1183         if (unlikely(dev->dequeue_zero_copy)) {
1184                 struct zcopy_mbuf *zmbuf, *next;
1185                 int nr_updated = 0;
1186
1187                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1188                      zmbuf != NULL; zmbuf = next) {
1189                         next = TAILQ_NEXT(zmbuf, next);
1190
1191                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1192                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1193                                 update_used_ring(dev, vq, used_idx,
1194                                                  zmbuf->desc_idx);
1195                                 nr_updated += 1;
1196
1197                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1198                                 restore_mbuf(zmbuf->mbuf);
1199                                 rte_pktmbuf_free(zmbuf->mbuf);
1200                                 put_zmbuf(zmbuf);
1201                                 vq->nr_zmbuf -= 1;
1202                         }
1203                 }
1204
1205                 update_used_idx(dev, vq, nr_updated);
1206         }
1207
1208         /*
1209          * Construct a RARP broadcast packet and inject it into the "pkts"
1210          * array, so that it looks as if the guest itself sent the packet.
1211          *
1212          * Check user_send_rarp() for more information.
1213          *
1214          * broadcast_rarp shares a cache line in the virtio_net structure
1215          * with fields that are accessed during enqueue, and
1216          * rte_atomic16_cmpset() causes a write when using cmpxchg; this
1217          * could result in false sharing between enqueue and dequeue.
1218          *
1219          * Prevent unnecessary false sharing by reading broadcast_rarp first
1220          * and only performing cmpset if the read indicates it is likely to
1221          * be set.
1222          */
1223
1224         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1225                         rte_atomic16_cmpset((volatile uint16_t *)
1226                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1227
1228                 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
1229                 if (rarp_mbuf == NULL) {
1230                         RTE_LOG(ERR, VHOST_DATA,
1231                                 "Failed to make RARP packet.\n");
1232                                 goto out;
1233                 }
1234                 count -= 1;
1235         }
1236
1237         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1238                         vq->last_avail_idx;
1239         if (free_entries == 0)
1240                 goto out;
1241
1242         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1243
1244         /* Prefetch available and used ring */
1245         avail_idx = vq->last_avail_idx & (vq->size - 1);
1246         used_idx  = vq->last_used_idx  & (vq->size - 1);
1247         rte_prefetch0(&vq->avail->ring[avail_idx]);
1248         rte_prefetch0(&vq->used->ring[used_idx]);
1249
1250         count = RTE_MIN(count, MAX_PKT_BURST);
1251         count = RTE_MIN(count, free_entries);
1252         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1253                         dev->vid, count);
1254
1255         /* Retrieve all of the head indexes first to avoid caching issues. */
1256         for (i = 0; i < count; i++) {
1257                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1258                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1259                 desc_indexes[i] = vq->avail->ring[avail_idx];
1260
1261                 if (likely(dev->dequeue_zero_copy == 0))
1262                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1263         }
1264
1265         /* Prefetch the first descriptor. */
1266         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1267         for (i = 0; i < count; i++) {
1268                 struct vring_desc *desc;
1269                 uint16_t sz, idx;
1270                 uint64_t dlen;
1271                 int err;
1272
1273                 if (likely(i + 1 < count))
1274                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1275
1276                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1277                         dlen = vq->desc[desc_indexes[i]].len;
1278                         desc = (struct vring_desc *)(uintptr_t)
1279                                 vhost_iova_to_vva(dev, vq,
1280                                                 vq->desc[desc_indexes[i]].addr,
1281                                                 &dlen,
1282                                                 VHOST_ACCESS_RO);
1283                         if (unlikely(!desc ||
1284                                         dlen != vq->desc[desc_indexes[i]].len))
1285                                 break;
1286
1287                         rte_prefetch0(desc);
1288                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1289                         idx = 0;
1290                 } else {
1291                         desc = vq->desc;
1292                         sz = vq->size;
1293                         idx = desc_indexes[i];
1294                 }
1295
1296                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1297                 if (unlikely(pkts[i] == NULL)) {
1298                         RTE_LOG(ERR, VHOST_DATA,
1299                                 "Failed to allocate memory for mbuf.\n");
1300                         break;
1301                 }
1302
1303                 err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
1304                                         mbuf_pool);
1305                 if (unlikely(err)) {
1306                         rte_pktmbuf_free(pkts[i]);
1307                         break;
1308                 }
1309
1310                 if (unlikely(dev->dequeue_zero_copy)) {
1311                         struct zcopy_mbuf *zmbuf;
1312
1313                         zmbuf = get_zmbuf(vq);
1314                         if (!zmbuf) {
1315                                 rte_pktmbuf_free(pkts[i]);
1316                                 break;
1317                         }
1318                         zmbuf->mbuf = pkts[i];
1319                         zmbuf->desc_idx = desc_indexes[i];
1320
1321                         /*
1322                          * Pin the mbuf by taking an extra reference; we check
1323                          * later whether it has been freed by all other users
1324                          * (i.e. we are the last user), in which case the used
1325                          * ring can be updated safely.
1326                          */
1327                         rte_mbuf_refcnt_update(pkts[i], 1);
1328
1329                         vq->nr_zmbuf += 1;
1330                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1331                 }
1332         }
1333         vq->last_avail_idx += i;
1334
1335         if (likely(dev->dequeue_zero_copy == 0)) {
1336                 do_data_copy_dequeue(vq);
1337                 vq->last_used_idx += i;
1338                 update_used_idx(dev, vq, i);
1339         }
1340
1341 out:
1342         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1343                 vhost_user_iotlb_rd_unlock(vq);
1344
1345 out_access_unlock:
1346         rte_spinlock_unlock(&vq->access_lock);
1347
1348         if (unlikely(rarp_mbuf != NULL)) {
1349                 /*
1350                  * Insert it at the head of the "pkts" array, so that the
1351                  * switch's MAC learning table gets updated first.
1352                  */
1353                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1354                 pkts[0] = rarp_mbuf;
1355                 i += 1;
1356         }
1357
1358         return i;
1359 }