dpdk.git / lib/librte_vhost/virtio_net.c (commit 531425792d48fb1bd8d4de303247042cdfc2b392)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
12 #include <rte_ip.h>
13 #include <rte_vhost.h>
14 #include <rte_tcp.h>
15 #include <rte_udp.h>
16 #include <rte_sctp.h>
17 #include <rte_arp.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
20
21 #include "iotlb.h"
22 #include "vhost.h"
23
24 #define MAX_PKT_BURST 32
25
26 #define MAX_BATCH_LEN 256
27
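/*
 * vhost-user lays out virtqueues as RX/TX pairs: even indexes are RX rings
 * (host enqueue side) and odd indexes are TX rings (host dequeue side).
 * The XOR below checks that the requested index matches the expected
 * direction and is within the number of configured vrings.
 */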
28 static bool
29 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
30 {
31         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
32 }
33
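/*
 * Translate a guest indirect descriptor table into a host-local copy. Used
 * when the table is not contiguous in the host process VA space and thus
 * cannot be accessed through a single translation. Returns the copy (to be
 * released with free_ind_table()) or NULL on failure.
 */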
34 static __rte_always_inline struct vring_desc *
35 alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
36                                          struct vring_desc *desc)
37 {
38         struct vring_desc *idesc;
39         uint64_t src, dst;
40         uint64_t len, remain = desc->len;
41         uint64_t desc_addr = desc->addr;
42
43         idesc = rte_malloc(__func__, desc->len, 0);
44         if (unlikely(!idesc))
45                 return 0;
46
47         dst = (uint64_t)(uintptr_t)idesc;
48
49         while (remain) {
50                 len = remain;
51                 src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
52                                 VHOST_ACCESS_RO);
53                 if (unlikely(!src || !len)) {
54                         rte_free(idesc);
55                         return 0;
56                 }
57
58                 rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
59
60                 remain -= len;
61                 dst += len;
62                 desc_addr += len;
63         }
64
65         return idesc;
66 }
67
68 static __rte_always_inline void
69 free_ind_table(struct vring_desc *idesc)
70 {
71         rte_free(idesc);
72 }
73
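/*
 * The shadow used ring batches used-ring updates locally; the ring shared
 * with the guest is then written back once (or twice, on wrap) per burst,
 * which limits cache-line bouncing between the vhost thread and the guest
 * driver.
 */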
74 static __rte_always_inline void
75 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
76                           uint16_t to, uint16_t from, uint16_t size)
77 {
78         rte_memcpy(&vq->used->ring[to],
79                         &vq->shadow_used_ring[from],
80                         size * sizeof(struct vring_used_elem));
81         vhost_log_used_vring(dev, vq,
82                         offsetof(struct vring_used, ring[to]),
83                         size * sizeof(struct vring_used_elem));
84 }
85
86 static __rte_always_inline void
87 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
88 {
89         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
90
91         if (used_idx + vq->shadow_used_idx <= vq->size) {
92                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
93                                           vq->shadow_used_idx);
94         } else {
95                 uint16_t size;
96
97                 /* first flush used ring entries in [used_idx, vq->size) */
98                 size = vq->size - used_idx;
99                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
100
101                 /* then flush the wrapped-around entries in [0, shadow_used_idx - size) */
102                 do_flush_shadow_used_ring(dev, vq, 0, size,
103                                           vq->shadow_used_idx - size);
104         }
105         vq->last_used_idx += vq->shadow_used_idx;
106
107         rte_smp_wmb();
108
109         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
110         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
111                 sizeof(vq->used->idx));
112 }
113
114 static __rte_always_inline void
115 update_shadow_used_ring(struct vhost_virtqueue *vq,
116                          uint16_t desc_idx, uint16_t len)
117 {
118         uint16_t i = vq->shadow_used_idx++;
119
120         vq->shadow_used_ring[i].id  = desc_idx;
121         vq->shadow_used_ring[i].len = len;
122 }
123
124 static inline void
125 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
126 {
127         struct batch_copy_elem *elem = vq->batch_copy_elems;
128         uint16_t count = vq->batch_copy_nb_elems;
129         int i;
130
131         for (i = 0; i < count; i++) {
132                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
133                 vhost_log_write(dev, elem[i].log_addr, elem[i].len);
134                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
135         }
136 }
137
138 static inline void
139 do_data_copy_dequeue(struct vhost_virtqueue *vq)
140 {
141         struct batch_copy_elem *elem = vq->batch_copy_elems;
142         uint16_t count = vq->batch_copy_nb_elems;
143         int i;
144
145         for (i = 0; i < count; i++)
146                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
147 }
148
149 /* skip the write when the value is already set, to lessen cache pressure */
150 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
151         if ((var) != (val))                     \
152                 (var) = (val);                  \
153 } while (0)
154
155 static void
156 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
157 {
158         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
159
160         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
161                 csum_l4 |= PKT_TX_TCP_CKSUM;
162
163         if (csum_l4) {
164                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
165                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
166
167                 switch (csum_l4) {
168                 case PKT_TX_TCP_CKSUM:
169                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
170                                                 cksum));
171                         break;
172                 case PKT_TX_UDP_CKSUM:
173                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
174                                                 dgram_cksum));
175                         break;
176                 case PKT_TX_SCTP_CKSUM:
177                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
178                                                 cksum));
179                         break;
180                 }
181         } else {
182                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
183                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
184                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
185         }
186
187         /* The virtio-net header cannot request IP checksum offload, so compute it here */
188         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
189                 struct ipv4_hdr *ipv4_hdr;
190
191                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
192                                                    m_buf->l2_len);
193                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
194         }
195
196         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
197                 if (m_buf->ol_flags & PKT_TX_IPV4)
198                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
199                 else
200                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
201                 net_hdr->gso_size = m_buf->tso_segsz;
202                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
203                                         + m_buf->l4_len;
204         } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
205                 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
206                 net_hdr->gso_size = m_buf->tso_segsz;
207                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
208                         m_buf->l4_len;
209         } else {
210                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
211                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
212                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
213         }
214 }
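/*
 * For illustration, with assumed header sizes (14-byte Ethernet, 20-byte
 * IPv4, 20-byte TCP) and tso_segsz = 1448, a TSO mbuf carrying
 * PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_SEG would leave
 * virtio_enqueue_offload() with:
 *   flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM
 *   csum_start  = 34 (l2_len + l3_len)
 *   csum_offset = offsetof(struct tcp_hdr, cksum)
 *   gso_type    = VIRTIO_NET_HDR_GSO_TCPV4
 *   gso_size    = 1448
 *   hdr_len     = 54 (l2_len + l3_len + l4_len)
 * and the IPv4 header checksum already computed in place.
 */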
215
216 static __rte_always_inline int
217 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
218                   struct vring_desc *descs, struct rte_mbuf *m,
219                   uint16_t desc_idx, uint32_t size)
220 {
221         uint32_t desc_avail, desc_offset;
222         uint32_t mbuf_avail, mbuf_offset;
223         uint32_t cpy_len;
224         uint64_t desc_chunck_len;
225         struct vring_desc *desc;
226         uint64_t desc_addr, desc_gaddr;
227         /* A counter to guard against a looped (malformed) desc chain */
228         uint16_t nr_desc = 1;
229         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
230         uint16_t copy_nb = vq->batch_copy_nb_elems;
231         int error = 0;
232
233         desc = &descs[desc_idx];
234         desc_chunck_len = desc->len;
235         desc_gaddr = desc->addr;
236         desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
237                                         &desc_chunck_len, VHOST_ACCESS_RW);
238         /*
239          * The check of 'desc_addr' is placed outside of the 'unlikely' macro to
240          * avoid a performance issue with some versions of gcc (4.8.4 and 5.3.0),
241          * which otherwise store the offset on the stack instead of in a register.
242          */
243         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) {
244                 error = -1;
245                 goto out;
246         }
247
248         rte_prefetch0((void *)(uintptr_t)desc_addr);
249
250         if (likely(desc_chunck_len >= dev->vhost_hlen)) {
251                 virtio_enqueue_offload(m,
252                                 (struct virtio_net_hdr *)(uintptr_t)desc_addr);
253                 PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
254                 vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
255         } else {
256                 struct virtio_net_hdr vnet_hdr;
257                 uint64_t remain = dev->vhost_hlen;
258                 uint64_t len;
259                 uint64_t src = (uint64_t)(uintptr_t)&vnet_hdr, dst;
260                 uint64_t guest_addr = desc_gaddr;
261
262                 virtio_enqueue_offload(m, &vnet_hdr);
263
264                 while (remain) {
265                         len = remain;
266                         dst = vhost_iova_to_vva(dev, vq, guest_addr,
267                                         &len, VHOST_ACCESS_RW);
268                         if (unlikely(!dst || !len)) {
269                                 error = -1;
270                                 goto out;
271                         }
272
273                         rte_memcpy((void *)(uintptr_t)dst,
274                                         (void *)(uintptr_t)src, len);
275
276                         PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
277                         vhost_log_write(dev, guest_addr, len);
278                         remain -= len;
279                         guest_addr += len;
280                         dst += len;
281                 }
282         }
283
284         desc_avail  = desc->len - dev->vhost_hlen;
285         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
286                 desc_chunck_len = desc_avail;
287                 desc_gaddr = desc->addr + dev->vhost_hlen;
288                 desc_addr = vhost_iova_to_vva(dev,
289                                 vq, desc_gaddr,
290                                 &desc_chunck_len,
291                                 VHOST_ACCESS_RW);
292                 if (unlikely(!desc_addr)) {
293                         error = -1;
294                         goto out;
295                 }
296
297                 desc_offset = 0;
298         } else {
299                 desc_offset = dev->vhost_hlen;
300                 desc_chunck_len -= dev->vhost_hlen;
301         }
302
303         mbuf_avail  = rte_pktmbuf_data_len(m);
304         mbuf_offset = 0;
305         while (mbuf_avail != 0 || m->next != NULL) {
306                 /* done with current mbuf, fetch next */
307                 if (mbuf_avail == 0) {
308                         m = m->next;
309
310                         mbuf_offset = 0;
311                         mbuf_avail  = rte_pktmbuf_data_len(m);
312                 }
313
314                 /* done with current desc buf, fetch next */
315                 if (desc_avail == 0) {
316                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
317                                 /* Not enough room left in the vring buffer */
318                                 error = -1;
319                                 goto out;
320                         }
321                         if (unlikely(desc->next >= size || ++nr_desc > size)) {
322                                 error = -1;
323                                 goto out;
324                         }
325
326                         desc = &descs[desc->next];
327                         desc_chunck_len = desc->len;
328                         desc_gaddr = desc->addr;
329                         desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
330                                                         &desc_chunck_len,
331                                                         VHOST_ACCESS_RW);
332                         if (unlikely(!desc_addr)) {
333                                 error = -1;
334                                 goto out;
335                         }
336
337                         desc_offset = 0;
338                         desc_avail  = desc->len;
339                 } else if (unlikely(desc_chunck_len == 0)) {
340                         desc_chunck_len = desc_avail;
341                         desc_gaddr += desc_offset;
342                         desc_addr = vhost_iova_to_vva(dev,
343                                         vq, desc_gaddr,
344                                         &desc_chunck_len, VHOST_ACCESS_RW);
345                         if (unlikely(!desc_addr)) {
346                                 error = -1;
347                                 goto out;
348                         }
349                         desc_offset = 0;
350                 }
351
352                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
353                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
354                         rte_memcpy((void *)((uintptr_t)(desc_addr +
355                                                         desc_offset)),
356                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
357                                 cpy_len);
358                         vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
359                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
360                                      cpy_len, 0);
361                 } else {
362                         batch_copy[copy_nb].dst =
363                                 (void *)((uintptr_t)(desc_addr + desc_offset));
364                         batch_copy[copy_nb].src =
365                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
366                         batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
367                         batch_copy[copy_nb].len = cpy_len;
368                         copy_nb++;
369                 }
370
371                 mbuf_avail  -= cpy_len;
372                 mbuf_offset += cpy_len;
373                 desc_avail  -= cpy_len;
374                 desc_offset += cpy_len;
375                 desc_chunck_len -= cpy_len;
376         }
377
378 out:
379         vq->batch_copy_nb_elems = copy_nb;
380
381         return error;
382 }
383
384 /**
385  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
386  * be received from the physical port or from another virtio device. The
387  * return value is the number of packets successfully added to the RX queue.
388  * This function handles scattered (multi-segment) mbufs, but it does not
389  * support the mergeable-buffers feature.
390  */
391 static __rte_always_inline uint32_t
392 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
393               struct rte_mbuf **pkts, uint32_t count)
394 {
395         struct vhost_virtqueue *vq;
396         uint16_t avail_idx, free_entries, start_idx;
397         uint16_t desc_indexes[MAX_PKT_BURST];
398         struct vring_desc *descs;
399         uint16_t used_idx;
400         uint32_t i, sz;
401
402         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
403         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
404                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
405                         dev->vid, __func__, queue_id);
406                 return 0;
407         }
408
409         vq = dev->virtqueue[queue_id];
410
411         rte_spinlock_lock(&vq->access_lock);
412
413         if (unlikely(vq->enabled == 0))
414                 goto out_access_unlock;
415
416         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
417                 vhost_user_iotlb_rd_lock(vq);
418
419         if (unlikely(vq->access_ok == 0)) {
420                 if (unlikely(vring_translate(dev, vq) < 0)) {
421                         count = 0;
422                         goto out;
423                 }
424         }
425
426         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
427         start_idx = vq->last_used_idx;
428         free_entries = avail_idx - start_idx;
429         count = RTE_MIN(count, free_entries);
430         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
431         if (count == 0)
432                 goto out;
433
434         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
435                 dev->vid, start_idx, start_idx + count);
436
437         vq->batch_copy_nb_elems = 0;
438
439         /* Retrieve all of the desc indexes first to avoid caching issues. */
440         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
441         for (i = 0; i < count; i++) {
442                 used_idx = (start_idx + i) & (vq->size - 1);
443                 desc_indexes[i] = vq->avail->ring[used_idx];
444                 vq->used->ring[used_idx].id = desc_indexes[i];
445                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
446                                                dev->vhost_hlen;
447                 vhost_log_used_vring(dev, vq,
448                         offsetof(struct vring_used, ring[used_idx]),
449                         sizeof(vq->used->ring[used_idx]));
450         }
451
452         rte_prefetch0(&vq->desc[desc_indexes[0]]);
453         for (i = 0; i < count; i++) {
454                 struct vring_desc *idesc = NULL;
455                 uint16_t desc_idx = desc_indexes[i];
456                 int err;
457
458                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
459                         uint64_t dlen = vq->desc[desc_idx].len;
460                         descs = (struct vring_desc *)(uintptr_t)
461                                 vhost_iova_to_vva(dev,
462                                                 vq, vq->desc[desc_idx].addr,
463                                                 &dlen, VHOST_ACCESS_RO);
464                         if (unlikely(!descs)) {
465                                 count = i;
466                                 break;
467                         }
468
469                         if (unlikely(dlen < vq->desc[desc_idx].len)) {
470                                 /*
471                                  * The indirect desc table is not contiguous
472                                  * in process VA space, so we have to copy it.
473                                  */
474                                 idesc = alloc_copy_ind_table(dev, vq,
475                                                         &vq->desc[desc_idx]);
476                                 if (unlikely(!idesc)) {
477                                         count = i;
                                        break;
                                }
478
479                                 descs = idesc;
480                         }
481
482                         sz = vq->desc[desc_idx].len / sizeof(*descs);
483                         desc_idx = 0;
484                 } else {
485                         descs = vq->desc;
486                         sz = vq->size;
487                 }
488
489                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
490                 if (unlikely(err)) {
491                         count = i;
492                         free_ind_table(idesc);
493                         break;
494                 }
495
496                 if (i + 1 < count)
497                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
498
499                 if (unlikely(!!idesc))
500                         free_ind_table(idesc);
501         }
502
503         do_data_copy_enqueue(dev, vq);
504
505         rte_smp_wmb();
506
507         *(volatile uint16_t *)&vq->used->idx += count;
508         vq->last_used_idx += count;
509         vhost_log_used_vring(dev, vq,
510                 offsetof(struct vring_used, idx),
511                 sizeof(vq->used->idx));
512
513         vhost_vring_call(dev, vq);
514 out:
515         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
516                 vhost_user_iotlb_rd_unlock(vq);
517
518 out_access_unlock:
519         rte_spinlock_unlock(&vq->access_lock);
520
521         return count;
522 }
523
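/*
 * Walk the descriptor chain whose head is published at 'avail_idx' and
 * record each buffer (guest address, length, descriptor index) in
 * 'buf_vec', following an indirect table if the head descriptor has
 * VRING_DESC_F_INDIRECT set. The chain head index and the summed length
 * are returned through 'desc_chain_head' and 'desc_chain_len'.
 */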
524 static __rte_always_inline int
525 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
526                          uint32_t avail_idx, uint32_t *vec_idx,
527                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
528                          uint16_t *desc_chain_len)
529 {
530         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
531         uint32_t vec_id = *vec_idx;
532         uint32_t len    = 0;
533         uint64_t dlen;
534         struct vring_desc *descs = vq->desc;
535         struct vring_desc *idesc = NULL;
536
537         *desc_chain_head = idx;
538
539         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
540                 dlen = vq->desc[idx].len;
541                 descs = (struct vring_desc *)(uintptr_t)
542                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
543                                                 &dlen,
544                                                 VHOST_ACCESS_RO);
545                 if (unlikely(!descs))
546                         return -1;
547
548                 if (unlikely(dlen < vq->desc[idx].len)) {
549                         /*
550                          * The indirect desc table is not contiguous
551                          * in process VA space, so we have to copy it.
552                          */
553                         idesc = alloc_copy_ind_table(dev, vq, &vq->desc[idx]);
554                         if (unlikely(!idesc))
555                                 return -1;
556
557                         descs = idesc;
558                 }
559
560                 idx = 0;
561         }
562
563         while (1) {
564                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
565                         free_ind_table(idesc);
566                         return -1;
567                 }
568
569                 len += descs[idx].len;
570                 buf_vec[vec_id].buf_addr = descs[idx].addr;
571                 buf_vec[vec_id].buf_len  = descs[idx].len;
572                 buf_vec[vec_id].desc_idx = idx;
573                 vec_id++;
574
575                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
576                         break;
577
578                 idx = descs[idx].next;
579         }
580
581         *desc_chain_len = len;
582         *vec_idx = vec_id;
583
584         if (unlikely(!!idesc))
585                 free_ind_table(idesc);
586
587         return 0;
588 }
589
590 /*
591  * Returns -1 on fail, 0 on success
592  */
593 static inline int
594 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
595                                 uint32_t size, struct buf_vector *buf_vec,
596                                 uint16_t *num_buffers, uint16_t avail_head)
597 {
598         uint16_t cur_idx;
599         uint32_t vec_idx = 0;
600         uint16_t tries = 0;
601
602         uint16_t head_idx = 0;
603         uint16_t len = 0;
604
605         *num_buffers = 0;
606         cur_idx  = vq->last_avail_idx;
607
608         while (size > 0) {
609                 if (unlikely(cur_idx == avail_head))
610                         return -1;
611
612                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
613                                                 &head_idx, &len) < 0))
614                         return -1;
615                 len = RTE_MIN(len, size);
616                 update_shadow_used_ring(vq, head_idx, len);
617                 size -= len;
618
619                 cur_idx++;
620                 tries++;
621                 *num_buffers += 1;
622
623                 /*
624                  * If we have tried all the available ring entries and
625                  * still can't get enough buffers, something abnormal has
626                  * happened.
627                  */
628                 if (unlikely(tries >= vq->size))
629                         return -1;
630         }
631
632         return 0;
633 }
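/*
 * Example (sizes assumed): a 3000-byte packet plus the virtio-net header,
 * facing 2048-byte guest buffers, makes this loop reserve two descriptor
 * chains and set *num_buffers to 2; that value is later written into the
 * mergeable header's num_buffers field so the guest can reassemble the
 * packet from the chained buffers.
 */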
634
635 static __rte_always_inline int
636 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
637                             struct rte_mbuf *m, struct buf_vector *buf_vec,
638                             uint16_t num_buffers)
639 {
640         uint32_t vec_idx = 0;
641         uint64_t desc_addr;
642         uint32_t mbuf_offset, mbuf_avail;
643         uint32_t desc_offset, desc_avail;
644         uint32_t cpy_len;
645         uint64_t dlen;
646         uint64_t hdr_addr, hdr_phys_addr;
647         struct rte_mbuf *hdr_mbuf;
648         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
649         uint16_t copy_nb = vq->batch_copy_nb_elems;
650         int error = 0;
651
652         if (unlikely(m == NULL)) {
653                 error = -1;
654                 goto out;
655         }
656
657         dlen = buf_vec[vec_idx].buf_len;
658         desc_addr = vhost_iova_to_vva(dev, vq, buf_vec[vec_idx].buf_addr,
659                                                 &dlen, VHOST_ACCESS_RW);
660         if (dlen != buf_vec[vec_idx].buf_len ||
661                         buf_vec[vec_idx].buf_len < dev->vhost_hlen ||
662                         !desc_addr) {
663                 error = -1;
664                 goto out;
665         }
666
667         hdr_mbuf = m;
668         hdr_addr = desc_addr;
669         hdr_phys_addr = buf_vec[vec_idx].buf_addr;
670         rte_prefetch0((void *)(uintptr_t)hdr_addr);
671
672         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
673                 dev->vid, num_buffers);
674
675         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
676         desc_offset = dev->vhost_hlen;
677
678         mbuf_avail  = rte_pktmbuf_data_len(m);
679         mbuf_offset = 0;
680         while (mbuf_avail != 0 || m->next != NULL) {
681                 /* done with current desc buf, get the next one */
682                 if (desc_avail == 0) {
683                         vec_idx++;
684                         dlen = buf_vec[vec_idx].buf_len;
685                         desc_addr =
686                                 vhost_iova_to_vva(dev, vq,
687                                         buf_vec[vec_idx].buf_addr,
688                                         &dlen,
689                                         VHOST_ACCESS_RW);
690                         if (unlikely(!desc_addr ||
691                                         dlen != buf_vec[vec_idx].buf_len)) {
692                                 error = -1;
693                                 goto out;
694                         }
695
696                         /* Prefetch buffer address. */
697                         rte_prefetch0((void *)(uintptr_t)desc_addr);
698                         desc_offset = 0;
699                         desc_avail  = buf_vec[vec_idx].buf_len;
700                 }
701
702                 /* done with current mbuf, get the next one */
703                 if (mbuf_avail == 0) {
704                         m = m->next;
705
706                         mbuf_offset = 0;
707                         mbuf_avail  = rte_pktmbuf_data_len(m);
708                 }
709
710                 if (hdr_addr) {
711                         struct virtio_net_hdr_mrg_rxbuf *hdr;
712
713                         hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)
714                                 hdr_addr;
715                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
716                         ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
717
718                         vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
719                         PRINT_PACKET(dev, (uintptr_t)hdr_addr,
720                                      dev->vhost_hlen, 0);
721
722                         hdr_addr = 0;
723                 }
724
725                 cpy_len = RTE_MIN(desc_avail, mbuf_avail);
726
727                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
728                         rte_memcpy((void *)((uintptr_t)(desc_addr +
729                                                         desc_offset)),
730                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
731                                 cpy_len);
732                         vhost_log_write(dev,
733                                 buf_vec[vec_idx].buf_addr + desc_offset,
734                                 cpy_len);
735                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
736                                 cpy_len, 0);
737                 } else {
738                         batch_copy[copy_nb].dst =
739                                 (void *)((uintptr_t)(desc_addr + desc_offset));
740                         batch_copy[copy_nb].src =
741                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
742                         batch_copy[copy_nb].log_addr =
743                                 buf_vec[vec_idx].buf_addr + desc_offset;
744                         batch_copy[copy_nb].len = cpy_len;
745                         copy_nb++;
746                 }
747
748                 mbuf_avail  -= cpy_len;
749                 mbuf_offset += cpy_len;
750                 desc_avail  -= cpy_len;
751                 desc_offset += cpy_len;
752         }
753
754 out:
755         vq->batch_copy_nb_elems = copy_nb;
756
757         return error;
758 }
759
760 static __rte_always_inline uint32_t
761 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
762         struct rte_mbuf **pkts, uint32_t count)
763 {
764         struct vhost_virtqueue *vq;
765         uint32_t pkt_idx = 0;
766         uint16_t num_buffers;
767         struct buf_vector buf_vec[BUF_VECTOR_MAX];
768         uint16_t avail_head;
769
770         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
771         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
772                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
773                         dev->vid, __func__, queue_id);
774                 return 0;
775         }
776
777         vq = dev->virtqueue[queue_id];
778
779         rte_spinlock_lock(&vq->access_lock);
780
781         if (unlikely(vq->enabled == 0))
782                 goto out_access_unlock;
783
784         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
785                 vhost_user_iotlb_rd_lock(vq);
786
787         if (unlikely(vq->access_ok == 0))
788                 if (unlikely(vring_translate(dev, vq) < 0))
789                         goto out;
790
791         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
792         if (count == 0)
793                 goto out;
794
795         vq->batch_copy_nb_elems = 0;
796
797         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
798
799         vq->shadow_used_idx = 0;
800         avail_head = *((volatile uint16_t *)&vq->avail->idx);
801         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
802                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
803
804                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
805                                                 pkt_len, buf_vec, &num_buffers,
806                                                 avail_head) < 0)) {
807                         VHOST_LOG_DEBUG(VHOST_DATA,
808                                 "(%d) failed to get enough desc from vring\n",
809                                 dev->vid);
810                         vq->shadow_used_idx -= num_buffers;
811                         break;
812                 }
813
814                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
815                         dev->vid, vq->last_avail_idx,
816                         vq->last_avail_idx + num_buffers);
817
818                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
819                                                 buf_vec, num_buffers) < 0) {
820                         vq->shadow_used_idx -= num_buffers;
821                         break;
822                 }
823
824                 vq->last_avail_idx += num_buffers;
825         }
826
827         do_data_copy_enqueue(dev, vq);
828
829         if (likely(vq->shadow_used_idx)) {
830                 flush_shadow_used_ring(dev, vq);
831                 vhost_vring_call(dev, vq);
832         }
833
834 out:
835         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
836                 vhost_user_iotlb_rd_unlock(vq);
837
838 out_access_unlock:
839         rte_spinlock_unlock(&vq->access_lock);
840
841         return pkt_idx;
842 }
843
844 uint16_t
845 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
846         struct rte_mbuf **pkts, uint16_t count)
847 {
848         struct virtio_net *dev = get_device(vid);
849
850         if (!dev)
851                 return 0;
852
853         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
854                 RTE_LOG(ERR, VHOST_DATA,
855                         "(%d) %s: built-in vhost net backend is disabled.\n",
856                         dev->vid, __func__);
857                 return 0;
858         }
859
860         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
861                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
862         else
863                 return virtio_dev_rx(dev, queue_id, pkts, count);
864 }
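/*
 * A minimal caller-side sketch, assuming a connected vhost device 'vid' and
 * a burst of mbufs obtained elsewhere; the function and its name are
 * illustrative only. Queue 0 is the first guest RX ring per the numbering
 * checked in is_valid_virt_queue_idx(). The enqueue path copies packet data
 * into guest buffers, so the caller keeps ownership of the mbufs and frees
 * them afterwards, whether or not they were accepted.
 */
static inline void
example_enqueue_and_free(int vid, struct rte_mbuf **pkts, uint16_t nb_pkts)
{
        uint16_t i;

        /* Copy as many packets as the guest RX ring currently has room for. */
        rte_vhost_enqueue_burst(vid, 0, pkts, nb_pkts);

        /* The data was copied (or dropped); the mbufs stay with the caller. */
        for (i = 0; i < nb_pkts; i++)
                rte_pktmbuf_free(pkts[i]);
}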
865
866 static inline bool
867 virtio_net_with_host_offload(struct virtio_net *dev)
868 {
869         if (dev->features &
870                         ((1ULL << VIRTIO_NET_F_CSUM) |
871                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
872                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
873                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
874                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
875                 return true;
876
877         return false;
878 }
879
880 static void
881 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
882 {
883         struct ipv4_hdr *ipv4_hdr;
884         struct ipv6_hdr *ipv6_hdr;
885         void *l3_hdr = NULL;
886         struct ether_hdr *eth_hdr;
887         uint16_t ethertype;
888
889         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
890
891         m->l2_len = sizeof(struct ether_hdr);
892         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
893
894         if (ethertype == ETHER_TYPE_VLAN) {
895                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
896
897                 m->l2_len += sizeof(struct vlan_hdr);
898                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
899         }
900
901         l3_hdr = (char *)eth_hdr + m->l2_len;
902
903         switch (ethertype) {
904         case ETHER_TYPE_IPv4:
905                 ipv4_hdr = l3_hdr;
906                 *l4_proto = ipv4_hdr->next_proto_id;
907                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
908                 *l4_hdr = (char *)l3_hdr + m->l3_len;
909                 m->ol_flags |= PKT_TX_IPV4;
910                 break;
911         case ETHER_TYPE_IPv6:
912                 ipv6_hdr = l3_hdr;
913                 *l4_proto = ipv6_hdr->proto;
914                 m->l3_len = sizeof(struct ipv6_hdr);
915                 *l4_hdr = (char *)l3_hdr + m->l3_len;
916                 m->ol_flags |= PKT_TX_IPV6;
917                 break;
918         default:
919                 m->l3_len = 0;
920                 *l4_proto = 0;
921                 *l4_hdr = NULL;
922                 break;
923         }
924 }
925
926 static __rte_always_inline void
927 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
928 {
929         uint16_t l4_proto = 0;
930         void *l4_hdr = NULL;
931         struct tcp_hdr *tcp_hdr = NULL;
932
933         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
934                 return;
935
936         parse_ethernet(m, &l4_proto, &l4_hdr);
937         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
938                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
939                         switch (hdr->csum_offset) {
940                         case (offsetof(struct tcp_hdr, cksum)):
941                                 if (l4_proto == IPPROTO_TCP)
942                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
943                                 break;
944                         case (offsetof(struct udp_hdr, dgram_cksum)):
945                                 if (l4_proto == IPPROTO_UDP)
946                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
947                                 break;
948                         case (offsetof(struct sctp_hdr, cksum)):
949                                 if (l4_proto == IPPROTO_SCTP)
950                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
951                                 break;
952                         default:
953                                 break;
954                         }
955                 }
956         }
957
958         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
959                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
960                 case VIRTIO_NET_HDR_GSO_TCPV4:
961                 case VIRTIO_NET_HDR_GSO_TCPV6:
962                         tcp_hdr = l4_hdr;
963                         m->ol_flags |= PKT_TX_TCP_SEG;
964                         m->tso_segsz = hdr->gso_size;
965                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
966                         break;
967                 case VIRTIO_NET_HDR_GSO_UDP:
968                         m->ol_flags |= PKT_TX_UDP_SEG;
969                         m->tso_segsz = hdr->gso_size;
970                         m->l4_len = sizeof(struct udp_hdr);
971                         break;
972                 default:
973                         RTE_LOG(WARNING, VHOST_DATA,
974                                 "unsupported gso type %u.\n", hdr->gso_type);
975                         break;
976                 }
977         }
978 }
979
980 static __rte_always_inline void
981 put_zmbuf(struct zcopy_mbuf *zmbuf)
982 {
983         zmbuf->in_use = 0;
984 }
985
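/*
 * Copy one guest descriptor chain (a packet the guest transmitted) into an
 * mbuf chain, allocating extra segments from 'mbuf_pool' as needed. When
 * dequeue zero copy is enabled and the buffer maps to contiguous host
 * physical memory, the mbuf is pointed directly at the guest buffer
 * instead of copying.
 */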
986 static __rte_always_inline int
987 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
988                   struct vring_desc *descs, uint16_t max_desc,
989                   struct rte_mbuf *m, uint16_t desc_idx,
990                   struct rte_mempool *mbuf_pool)
991 {
992         struct vring_desc *desc;
993         uint64_t desc_addr, desc_gaddr;
994         uint32_t desc_avail, desc_offset;
995         uint32_t mbuf_avail, mbuf_offset;
996         uint32_t cpy_len;
997         uint64_t desc_chunck_len;
998         struct rte_mbuf *cur = m, *prev = m;
999         struct virtio_net_hdr tmp_hdr;
1000         struct virtio_net_hdr *hdr = NULL;
1001         /* A counter to guard against a looped (malformed) desc chain */
1002         uint32_t nr_desc = 1;
1003         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1004         uint16_t copy_nb = vq->batch_copy_nb_elems;
1005         int error = 0;
1006
1007         desc = &descs[desc_idx];
1008         if (unlikely((desc->len < dev->vhost_hlen)) ||
1009                         (desc->flags & VRING_DESC_F_INDIRECT)) {
1010                 error = -1;
1011                 goto out;
1012         }
1013
1014         desc_chunck_len = desc->len;
1015         desc_gaddr = desc->addr;
1016         desc_addr = vhost_iova_to_vva(dev,
1017                                         vq, desc_gaddr,
1018                                         &desc_chunck_len,
1019                                         VHOST_ACCESS_RO);
1020         if (unlikely(!desc_addr)) {
1021                 error = -1;
1022                 goto out;
1023         }
1024
1025         if (virtio_net_with_host_offload(dev)) {
1026                 if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
1027                         uint64_t len = desc_chunck_len;
1028                         uint64_t remain = sizeof(struct virtio_net_hdr);
1029                         uint64_t src = desc_addr;
1030                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1031                         uint64_t guest_addr = desc_gaddr;
1032
1033                         /*
1034                          * No luck, the virtio-net header doesn't fit
1035                          * in a contiguous virtual area.
1036                          */
1037                         while (remain) {
1038                                 len = remain;
1039                                 src = vhost_iova_to_vva(dev, vq,
1040                                                 guest_addr, &len,
1041                                                 VHOST_ACCESS_RO);
1042                                 if (unlikely(!src || !len)) {
1043                                         error = -1;
1044                                         goto out;
1045                                 }
1046
1047                                 rte_memcpy((void *)(uintptr_t)dst,
1048                                                    (void *)(uintptr_t)src, len);
1049
1050                                 guest_addr += len;
1051                                 remain -= len;
1052                                 dst += len;
1053                         }
1054
1055                         hdr = &tmp_hdr;
1056                 } else {
1057                         hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
1058                         rte_prefetch0(hdr);
1059                 }
1060         }
1061
1062         /*
1063          * A virtio driver normally uses at least 2 desc buffers
1064          * for Tx: the first for storing the header, and the others
1065          * for storing the data.
1066          */
1067         if (likely((desc->len == dev->vhost_hlen) &&
1068                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
1069                 desc = &descs[desc->next];
1070                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1071                         error = -1;
1072                         goto out;
1073                 }
1074
1075                 desc_chunck_len = desc->len;
1076                 desc_gaddr = desc->addr;
1077                 desc_addr = vhost_iova_to_vva(dev,
1078                                                         vq, desc_gaddr,
1079                                                         &desc_chunck_len,
1080                                                         VHOST_ACCESS_RO);
1081                 if (unlikely(!desc_addr)) {
1082                         error = -1;
1083                         goto out;
1084                 }
1085
1086                 desc_offset = 0;
1087                 desc_avail  = desc->len;
1088                 nr_desc    += 1;
1089         } else {
1090                 desc_avail  = desc->len - dev->vhost_hlen;
1091
1092                 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
1093                         desc_chunck_len = desc_avail;
1094                         desc_gaddr += dev->vhost_hlen;
1095                         desc_addr = vhost_iova_to_vva(dev,
1096                                         vq, desc_gaddr,
1097                                         &desc_chunck_len,
1098                                         VHOST_ACCESS_RO);
1099                         if (unlikely(!desc_addr)) {
1100                                 error = -1;
1101                                 goto out;
1102                         }
1103
1104                         desc_offset = 0;
1105                 } else {
1106                         desc_offset = dev->vhost_hlen;
1107                         desc_chunck_len -= dev->vhost_hlen;
1108                 }
1109         }
1110
1111         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
1112
1113         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
1114                         (uint32_t)desc_chunck_len, 0);
1115
1116         mbuf_offset = 0;
1117         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1118         while (1) {
1119                 uint64_t hpa;
1120
1121                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
1122
1123                 /*
1124                  * A desc buf might span two host physical pages that are
1125                  * not contiguous. In that case (gpa_to_hpa() returns 0), the
1126                  * data is copied even though zero copy is enabled.
1127                  */
1128                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1129                                         desc_gaddr + desc_offset, cpy_len)))) {
1130                         cur->data_len = cpy_len;
1131                         cur->data_off = 0;
1132                         cur->buf_addr = (void *)(uintptr_t)(desc_addr
1133                                 + desc_offset);
1134                         cur->buf_iova = hpa;
1135
1136                         /*
1137                          * In zero copy mode, one mbuf can only reference
1138                          * one desc buf, or part of one.
1139                          */
1140                         mbuf_avail = cpy_len;
1141                 } else {
1142                         if (likely(cpy_len > MAX_BATCH_LEN ||
1143                                    copy_nb >= vq->size ||
1144                                    (hdr && cur == m) ||
1145                                    desc->len != desc_chunck_len)) {
1146                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1147                                                                    mbuf_offset),
1148                                            (void *)((uintptr_t)(desc_addr +
1149                                                                 desc_offset)),
1150                                            cpy_len);
1151                         } else {
1152                                 batch_copy[copy_nb].dst =
1153                                         rte_pktmbuf_mtod_offset(cur, void *,
1154                                                                 mbuf_offset);
1155                                 batch_copy[copy_nb].src =
1156                                         (void *)((uintptr_t)(desc_addr +
1157                                                              desc_offset));
1158                                 batch_copy[copy_nb].len = cpy_len;
1159                                 copy_nb++;
1160                         }
1161                 }
1162
1163                 mbuf_avail  -= cpy_len;
1164                 mbuf_offset += cpy_len;
1165                 desc_avail  -= cpy_len;
1166                 desc_chunck_len -= cpy_len;
1167                 desc_offset += cpy_len;
1168
1169                 /* This desc buf is exhausted, get the next one */
1170                 if (desc_avail == 0) {
1171                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1172                                 break;
1173
1174                         if (unlikely(desc->next >= max_desc ||
1175                                      ++nr_desc > max_desc)) {
1176                                 error = -1;
1177                                 goto out;
1178                         }
1179                         desc = &descs[desc->next];
1180                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1181                                 error = -1;
1182                                 goto out;
1183                         }
1184
1185                         desc_chunck_len = desc->len;
1186                         desc_gaddr = desc->addr;
1187                         desc_addr = vhost_iova_to_vva(dev,
1188                                                         vq, desc_gaddr,
1189                                                         &desc_chunck_len,
1190                                                         VHOST_ACCESS_RO);
1191                         if (unlikely(!desc_addr)) {
1192                                 error = -1;
1193                                 goto out;
1194                         }
1195
1196                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1197
1198                         desc_offset = 0;
1199                         desc_avail  = desc->len;
1200
1201                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1202                                         (uint32_t)desc_chunck_len, 0);
1203                 } else if (unlikely(desc_chunck_len == 0)) {
1204                         desc_chunck_len = desc_avail;
1205                         desc_gaddr += desc_offset;
1206                         desc_addr = vhost_iova_to_vva(dev, vq,
1207                                         desc_gaddr,
1208                                         &desc_chunck_len,
1209                                         VHOST_ACCESS_RO);
1210                         if (unlikely(!desc_addr)) {
1211                                 error = -1;
1212                                 goto out;
1213                         }
1214                         desc_offset = 0;
1215
1216                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1217                                         (uint32_t)desc_chunck_len, 0);
1218                 }
1219
1220                 /*
1221                  * This mbuf is full, allocate a new one
1222                  * to hold more data.
1223                  */
1224                 if (mbuf_avail == 0) {
1225                         cur = rte_pktmbuf_alloc(mbuf_pool);
1226                         if (unlikely(cur == NULL)) {
1227                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1228                                         "allocate memory for mbuf.\n");
1229                                 error = -1;
1230                                 goto out;
1231                         }
1232                         if (unlikely(dev->dequeue_zero_copy))
1233                                 rte_mbuf_refcnt_update(cur, 1);
1234
1235                         prev->next = cur;
1236                         prev->data_len = mbuf_offset;
1237                         m->nb_segs += 1;
1238                         m->pkt_len += mbuf_offset;
1239                         prev = cur;
1240
1241                         mbuf_offset = 0;
1242                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1243                 }
1244         }
1245
1246         prev->data_len = mbuf_offset;
1247         m->pkt_len    += mbuf_offset;
1248
1249         if (hdr)
1250                 vhost_dequeue_offload(hdr, m);
1251
1252 out:
1253         vq->batch_copy_nb_elems = copy_nb;
1254
1255         return error;
1256 }
1257
1258 static __rte_always_inline void
1259 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1260                  uint32_t used_idx, uint32_t desc_idx)
1261 {
1262         vq->used->ring[used_idx].id  = desc_idx;
1263         vq->used->ring[used_idx].len = 0;
1264         vhost_log_used_vring(dev, vq,
1265                         offsetof(struct vring_used, ring[used_idx]),
1266                         sizeof(vq->used->ring[used_idx]));
1267 }
1268
1269 static __rte_always_inline void
1270 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1271                 uint32_t count)
1272 {
1273         if (unlikely(count == 0))
1274                 return;
1275
1276         rte_smp_wmb();
1277         rte_smp_rmb();
1278
1279         vq->used->idx += count;
1280         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1281                         sizeof(vq->used->idx));
1282         vhost_vring_call(dev, vq);
1283 }
1284
1285 static __rte_always_inline struct zcopy_mbuf *
1286 get_zmbuf(struct vhost_virtqueue *vq)
1287 {
1288         uint16_t i;
1289         uint16_t last;
1290         int tries = 0;
1291
1292         /* search [last_zmbuf_idx, zmbuf_size) */
1293         i = vq->last_zmbuf_idx;
1294         last = vq->zmbuf_size;
1295
1296 again:
1297         for (; i < last; i++) {
1298                 if (vq->zmbufs[i].in_use == 0) {
1299                         vq->last_zmbuf_idx = i + 1;
1300                         vq->zmbufs[i].in_use = 1;
1301                         return &vq->zmbufs[i];
1302                 }
1303         }
1304
1305         tries++;
1306         if (tries == 1) {
1307                 /* search [0, last_zmbuf_idx) */
1308                 i = 0;
1309                 last = vq->last_zmbuf_idx;
1310                 goto again;
1311         }
1312
1313         return NULL;
1314 }
1315
1316 static __rte_always_inline bool
1317 mbuf_is_consumed(struct rte_mbuf *m)
1318 {
1319         while (m) {
1320                 if (rte_mbuf_refcnt_read(m) > 1)
1321                         return false;
1322                 m = m->next;
1323         }
1324
1325         return true;
1326 }
1327
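/*
 * Zero-copy dequeue points mbufs straight into guest memory; before such an
 * mbuf goes back to its mempool, reset buf_addr/buf_iova to the mbuf's own
 * data room right after the structure and private area.
 */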
1328 static __rte_always_inline void
1329 restore_mbuf(struct rte_mbuf *m)
1330 {
1331         uint32_t mbuf_size, priv_size;
1332
1333         while (m) {
1334                 priv_size = rte_pktmbuf_priv_size(m->pool);
1335                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1336                 /* start of buffer is after mbuf structure and priv data */
1337
1338                 m->buf_addr = (char *)m + mbuf_size;
1339                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1340                 m = m->next;
1341         }
1342 }
1343
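/*
 * Dequeue packets the guest has placed on its TX ring into mbufs allocated
 * from 'mbuf_pool'; the caller owns the returned mbufs. A RARP packet may
 * be injected into 'pkts' when the device requested it (e.g. after a live
 * migration).
 */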
1344 uint16_t
1345 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1346         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1347 {
1348         struct virtio_net *dev;
1349         struct rte_mbuf *rarp_mbuf = NULL;
1350         struct vhost_virtqueue *vq;
1351         uint32_t desc_indexes[MAX_PKT_BURST];
1352         uint32_t used_idx;
1353         uint32_t i = 0;
1354         uint16_t free_entries;
1355         uint16_t avail_idx;
1356
1357         dev = get_device(vid);
1358         if (!dev)
1359                 return 0;
1360
1361         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1362                 RTE_LOG(ERR, VHOST_DATA,
1363                         "(%d) %s: built-in vhost net backend is disabled.\n",
1364                         dev->vid, __func__);
1365                 return 0;
1366         }
1367
1368         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1369                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1370                         dev->vid, __func__, queue_id);
1371                 return 0;
1372         }
1373
1374         vq = dev->virtqueue[queue_id];
1375
1376         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1377                 return 0;
1378
1379         if (unlikely(vq->enabled == 0))
1380                 goto out_access_unlock;
1381
1382         vq->batch_copy_nb_elems = 0;
1383
1384         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1385                 vhost_user_iotlb_rd_lock(vq);
1386
1387         if (unlikely(vq->access_ok == 0))
1388                 if (unlikely(vring_translate(dev, vq) < 0))
1389                         goto out;
1390
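        /*
         * Zero-copy mode: first reclaim mbufs handed out earlier that the
         * application has finished with, returning their descriptors to
         * the used ring before new buffers are dequeued.
         */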
1391         if (unlikely(dev->dequeue_zero_copy)) {
1392                 struct zcopy_mbuf *zmbuf, *next;
1393                 int nr_updated = 0;
1394
1395                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1396                      zmbuf != NULL; zmbuf = next) {
1397                         next = TAILQ_NEXT(zmbuf, next);
1398
1399                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1400                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1401                                 update_used_ring(dev, vq, used_idx,
1402                                                  zmbuf->desc_idx);
1403                                 nr_updated += 1;
1404
1405                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1406                                 restore_mbuf(zmbuf->mbuf);
1407                                 rte_pktmbuf_free(zmbuf->mbuf);
1408                                 put_zmbuf(zmbuf);
1409                                 vq->nr_zmbuf -= 1;
1410                         }
1411                 }
1412
1413                 update_used_idx(dev, vq, nr_updated);
1414         }
1415
1416         /*
1417          * Construct a RARP broadcast packet and inject it into the "pkts"
1418          * array, so it looks like the guest actually sent such a packet.
1419          *
1420          * Check vhost_user_send_rarp() for more information.
1421          *
1422          * broadcast_rarp shares a cacheline in the virtio_net structure
1423          * with some fields that are accessed during enqueue and
1424          * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
1425          * result in false sharing between enqueue and dequeue.
1426          *
1427          * Prevent unnecessary false sharing by reading broadcast_rarp first
1428          * and only performing cmpset if the read indicates it is likely to
1429          * be set.
1430          */
1431
1432         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1433                         rte_atomic16_cmpset((volatile uint16_t *)
1434                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1435
1436                 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
1437                 if (rarp_mbuf == NULL) {
1438                         RTE_LOG(ERR, VHOST_DATA,
1439                                 "Failed to make RARP packet.\n");
1440                         goto out;
1441                 }
1442                 count -= 1;
1443         }
1444
1445         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1446                         vq->last_avail_idx;
1447         if (free_entries == 0)
1448                 goto out;
1449
1450         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1451
1452         /* Prefetch available and used ring */
1453         avail_idx = vq->last_avail_idx & (vq->size - 1);
1454         used_idx  = vq->last_used_idx  & (vq->size - 1);
1455         rte_prefetch0(&vq->avail->ring[avail_idx]);
1456         rte_prefetch0(&vq->used->ring[used_idx]);
1457
1458         count = RTE_MIN(count, MAX_PKT_BURST);
1459         count = RTE_MIN(count, free_entries);
1460         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1461                         dev->vid, count);
1462
1463         /* Retrieve all of the head indexes first to avoid caching issues. */
1464         for (i = 0; i < count; i++) {
1465                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1466                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1467                 desc_indexes[i] = vq->avail->ring[avail_idx];
1468
1469                 if (likely(dev->dequeue_zero_copy == 0))
1470                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1471         }
1472
1473         /* Prefetch the descriptor referenced by the first head index. */
1474         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1475         for (i = 0; i < count; i++) {
1476                 struct vring_desc *desc, *idesc = NULL;
1477                 uint16_t sz, idx;
1478                 uint64_t dlen;
1479                 int err;
1480
1481                 if (likely(i + 1 < count))
1482                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1483
1484                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1485                         dlen = vq->desc[desc_indexes[i]].len;
1486                         desc = (struct vring_desc *)(uintptr_t)
1487                                 vhost_iova_to_vva(dev, vq,
1488                                                 vq->desc[desc_indexes[i]].addr,
1489                                                 &dlen,
1490                                                 VHOST_ACCESS_RO);
1491                         if (unlikely(!desc))
1492                                 break;
1493
1494                         if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
1495                                 /*
1496                                  * The indirect desc table is not contiguous
1497                                  * in process VA space, so we have to copy it.
1498                                  */
1499                                 idesc = alloc_copy_ind_table(dev, vq,
1500                                                 &vq->desc[desc_indexes[i]]);
1501                                 if (unlikely(!idesc))
1502                                         break;
1503
1504                                 desc = idesc;
1505                         }
1506
1507                         rte_prefetch0(desc);
1508                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1509                         idx = 0;
1510                 } else {
1511                         desc = vq->desc;
1512                         sz = vq->size;
1513                         idx = desc_indexes[i];
1514                 }
1515
1516                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1517                 if (unlikely(pkts[i] == NULL)) {
1518                         RTE_LOG(ERR, VHOST_DATA,
1519                                 "Failed to allocate memory for mbuf.\n");
1520                         free_ind_table(idesc);
1521                         break;
1522                 }
1523
1524                 err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
1525                                         mbuf_pool);
1526                 if (unlikely(err)) {
1527                         rte_pktmbuf_free(pkts[i]);
1528                         free_ind_table(idesc);
1529                         break;
1530                 }
1531
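                /*
                 * Zero-copy mode: the mbuf data area still points into guest
                 * memory, so track it in the zmbuf list and defer completion
                 * until the application frees it.
                 */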
1532                 if (unlikely(dev->dequeue_zero_copy)) {
1533                         struct zcopy_mbuf *zmbuf;
1534
1535                         zmbuf = get_zmbuf(vq);
1536                         if (!zmbuf) {
1537                                 rte_pktmbuf_free(pkts[i]);
1538                                 free_ind_table(idesc);
1539                                 break;
1540                         }
1541                         zmbuf->mbuf = pkts[i];
1542                         zmbuf->desc_idx = desc_indexes[i];
1543
1544                         /*
1545                          * Pin the mbuf with an extra reference; we check later
1546                          * whether the application has freed it (i.e. we are the
1547                          * last user), and only then is it safe to update the
1548                          * used ring.
1549                          */
1550                         rte_mbuf_refcnt_update(pkts[i], 1);
1551
1552                         vq->nr_zmbuf += 1;
1553                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1554                 }
1555
1556                 if (unlikely(!!idesc))
1557                         free_ind_table(idesc);
1558         }
1559         vq->last_avail_idx += i;
1560
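        /*
         * Copy mode: perform the batched small copies now and publish the
         * used entries that were pre-filled above.
         */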
1561         if (likely(dev->dequeue_zero_copy == 0)) {
1562                 do_data_copy_dequeue(vq);
1563                 vq->last_used_idx += i;
1564                 update_used_idx(dev, vq, i);
1565         }
1566
1567 out:
1568         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1569                 vhost_user_iotlb_rd_unlock(vq);
1570
1571 out_access_unlock:
1572         rte_spinlock_unlock(&vq->access_lock);
1573
1574         if (unlikely(rarp_mbuf != NULL)) {
1575                 /*
1576                  * Inject it at the head of the "pkts" array, so that the
1577                  * switch's MAC learning table gets updated first.
1578                  */
1579                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1580                 pkts[0] = rarp_mbuf;
1581                 i += 1;
1582         }
1583
1584         return i;
1585 }
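/*
 * Usage sketch (illustrative only, not part of this library): a typical
 * application polls the guest TX ring and forwards the dequeued mbufs,
 * for example to a physical port. "vid", "port_id" and "mbuf_pool" are
 * assumed to be set up by the application; queue index 1 is the first
 * guest TX queue (odd indexes are TX).
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx, nb_tx;
 *
 *	nb_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, 32);
 *	nb_tx = rte_eth_tx_burst(port_id, 0, pkts, nb_rx);
 *	while (unlikely(nb_tx < nb_rx))
 *		rte_pktmbuf_free(pkts[nb_tx++]);
 */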