lib/librte_vhost/virtio_net.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
12 #include <rte_ip.h>
13 #include <rte_vhost.h>
14 #include <rte_tcp.h>
15 #include <rte_udp.h>
16 #include <rte_sctp.h>
17 #include <rte_arp.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
20
21 #include "iotlb.h"
22 #include "vhost.h"
23
24 #define MAX_PKT_BURST 32
25
26 #define MAX_BATCH_LEN 256
27
28 static bool
29 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
30 {
31         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
32 }
33
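/*
 * Copy an indirect descriptor table that is not contiguous in host virtual
 * address space into a private rte_malloc'd buffer, translating each guest
 * chunk with vhost_iova_to_vva(). Returns NULL on allocation or translation
 * failure.
 */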
34 static __rte_always_inline struct vring_desc *
35 alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
36                                          struct vring_desc *desc)
37 {
38         struct vring_desc *idesc;
39         uint64_t src, dst;
40         uint64_t len, remain = desc->len;
41         uint64_t desc_addr = desc->addr;
42
43         idesc = rte_malloc(__func__, desc->len, 0);
44         if (unlikely(!idesc))
45                 return 0;
46
47         dst = (uint64_t)(uintptr_t)idesc;
48
49         while (remain) {
50                 len = remain;
51                 src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
52                                 VHOST_ACCESS_RO);
53                 if (unlikely(!src || !len)) {
54                         rte_free(idesc);
55                         return 0;
56                 }
57
58                 rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
59
60                 remain -= len;
61                 dst += len;
62                 desc_addr += len;
63         }
64
65         return idesc;
66 }
67
68 static __rte_always_inline void
69 free_ind_table(struct vring_desc *idesc)
70 {
71         rte_free(idesc);
72 }
73
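/*
 * The shadow used ring batches used-ring updates locally; the helpers below
 * copy the shadow entries into the real used ring (in one or two chunks when
 * the ring wraps) and then publish the new used index after a write barrier.
 */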
74 static __rte_always_inline void
75 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
76                           uint16_t to, uint16_t from, uint16_t size)
77 {
78         rte_memcpy(&vq->used->ring[to],
79                         &vq->shadow_used_ring[from],
80                         size * sizeof(struct vring_used_elem));
81         vhost_log_used_vring(dev, vq,
82                         offsetof(struct vring_used, ring[to]),
83                         size * sizeof(struct vring_used_elem));
84 }
85
86 static __rte_always_inline void
87 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
88 {
89         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
90
91         if (used_idx + vq->shadow_used_idx <= vq->size) {
92                 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
93                                           vq->shadow_used_idx);
94         } else {
95                 uint16_t size;
96
97                 /* update used ring interval [used_idx, vq->size] */
98                 size = vq->size - used_idx;
99                 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
100
101                 /* update the remaining used ring interval [0, shadow_used_idx - size] */
102                 do_flush_shadow_used_ring(dev, vq, 0, size,
103                                           vq->shadow_used_idx - size);
104         }
105         vq->last_used_idx += vq->shadow_used_idx;
106
107         rte_smp_wmb();
108
109         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
110         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
111                 sizeof(vq->used->idx));
112 }
113
114 static __rte_always_inline void
115 update_shadow_used_ring(struct vhost_virtqueue *vq,
116                          uint16_t desc_idx, uint16_t len)
117 {
118         uint16_t i = vq->shadow_used_idx++;
119
120         vq->shadow_used_ring[i].id  = desc_idx;
121         vq->shadow_used_ring[i].len = len;
122 }
123
124 static inline void
125 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
126 {
127         struct batch_copy_elem *elem = vq->batch_copy_elems;
128         uint16_t count = vq->batch_copy_nb_elems;
129         int i;
130
131         for (i = 0; i < count; i++) {
132                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
133                 vhost_log_write(dev, elem[i].log_addr, elem[i].len);
134                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
135         }
136 }
137
138 static inline void
139 do_data_copy_dequeue(struct vhost_virtqueue *vq)
140 {
141         struct batch_copy_elem *elem = vq->batch_copy_elems;
142         uint16_t count = vq->batch_copy_nb_elems;
143         int i;
144
145         for (i = 0; i < count; i++)
146                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
147 }
148
149 /* avoid unnecessary write operations, to lessen cache issues */
150 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
151         if ((var) != (val))                     \
152                 (var) = (val);                  \
153 } while (0)
154
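/*
 * Translate the offload requests carried in the mbuf (L4 checksum, TSO/UFO)
 * into the virtio-net header handed to the guest. The IPv4 header checksum,
 * which the header cannot describe, is computed in place when requested.
 */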
155 static void
156 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
157 {
158         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
159
160         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
161                 csum_l4 |= PKT_TX_TCP_CKSUM;
162
163         if (csum_l4) {
164                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
165                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
166
167                 switch (csum_l4) {
168                 case PKT_TX_TCP_CKSUM:
169                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
170                                                 cksum));
171                         break;
172                 case PKT_TX_UDP_CKSUM:
173                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
174                                                 dgram_cksum));
175                         break;
176                 case PKT_TX_SCTP_CKSUM:
177                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
178                                                 cksum));
179                         break;
180                 }
181         } else {
182                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
183                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
184                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
185         }
186
187         /* IP cksum verification cannot be bypassed, so calculate it here */
188         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
189                 struct ipv4_hdr *ipv4_hdr;
190
191                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
192                                                    m_buf->l2_len);
193                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
194         }
195
196         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
197                 if (m_buf->ol_flags & PKT_TX_IPV4)
198                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
199                 else
200                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
201                 net_hdr->gso_size = m_buf->tso_segsz;
202                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
203                                         + m_buf->l4_len;
204         } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
205                 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
206                 net_hdr->gso_size = m_buf->tso_segsz;
207                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
208                         m_buf->l4_len;
209         } else {
210                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
211                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
212                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
213         }
214 }
215
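/*
 * Non-mergeable Rx path: copy one mbuf chain into a single descriptor chain.
 * The virtio-net header is written first, then the mbuf segments and the
 * descriptor chain are walked in parallel, splitting copies whenever a
 * descriptor buffer is not contiguous in host virtual address space.
 */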
216 static __rte_always_inline int
217 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
218                   struct vring_desc *descs, struct rte_mbuf *m,
219                   uint16_t desc_idx, uint32_t size)
220 {
221         uint32_t desc_avail, desc_offset;
222         uint32_t mbuf_avail, mbuf_offset;
223         uint32_t cpy_len;
224         uint64_t desc_chunck_len;
225         struct vring_desc *desc;
226         uint64_t desc_addr, desc_gaddr;
227         /* A counter to avoid an endless loop over the desc chain */
228         uint16_t nr_desc = 1;
229         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
230         uint16_t copy_nb = vq->batch_copy_nb_elems;
231         int error = 0;
232
233         desc = &descs[desc_idx];
234         desc_chunck_len = desc->len;
235         desc_gaddr = desc->addr;
236         desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
237                                         &desc_chunck_len, VHOST_ACCESS_RW);
238         /*
239          * The check of 'desc_addr' is placed outside of the 'unlikely' macro to
240          * avoid a performance issue with some versions of gcc (4.8.4 and 5.3.0),
241          * which otherwise store the offset on the stack instead of in a register.
242          */
243         if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) {
244                 error = -1;
245                 goto out;
246         }
247
248         rte_prefetch0((void *)(uintptr_t)desc_addr);
249
250         if (likely(desc_chunck_len >= dev->vhost_hlen)) {
251                 virtio_enqueue_offload(m,
252                                 (struct virtio_net_hdr *)(uintptr_t)desc_addr);
253                 PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
254                 vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
255         } else {
256                 struct virtio_net_hdr vnet_hdr;
257                 uint64_t remain = dev->vhost_hlen;
258                 uint64_t len;
259                 uint64_t src = (uint64_t)(uintptr_t)&vnet_hdr, dst;
260                 uint64_t guest_addr = desc_gaddr;
261
262                 virtio_enqueue_offload(m, &vnet_hdr);
263
264                 while (remain) {
265                         len = remain;
266                         dst = vhost_iova_to_vva(dev, vq, guest_addr,
267                                         &len, VHOST_ACCESS_RW);
268                         if (unlikely(!dst || !len)) {
269                                 error = -1;
270                                 goto out;
271                         }
272
273                         rte_memcpy((void *)(uintptr_t)dst,
274                                         (void *)(uintptr_t)src, len);
275
276                         PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
277                         vhost_log_write(dev, guest_addr, len);
278                         remain -= len;
279                         guest_addr += len;
280                         src += len;
281                 }
282         }
283
284         desc_avail  = desc->len - dev->vhost_hlen;
285         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
286                 desc_chunck_len = desc_avail;
287                 desc_gaddr = desc->addr + dev->vhost_hlen;
288                 desc_addr = vhost_iova_to_vva(dev,
289                                 vq, desc_gaddr,
290                                 &desc_chunck_len,
291                                 VHOST_ACCESS_RW);
292                 if (unlikely(!desc_addr)) {
293                         error = -1;
294                         goto out;
295                 }
296
297                 desc_offset = 0;
298         } else {
299                 desc_offset = dev->vhost_hlen;
300                 desc_chunck_len -= dev->vhost_hlen;
301         }
302
303         mbuf_avail  = rte_pktmbuf_data_len(m);
304         mbuf_offset = 0;
305         while (mbuf_avail != 0 || m->next != NULL) {
306                 /* done with current mbuf, fetch next */
307                 if (mbuf_avail == 0) {
308                         m = m->next;
309
310                         mbuf_offset = 0;
311                         mbuf_avail  = rte_pktmbuf_data_len(m);
312                 }
313
314                 /* done with current desc buf, fetch next */
315                 if (desc_avail == 0) {
316                         if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
317                                 /* Not enough room in the vring buffer */
318                                 error = -1;
319                                 goto out;
320                         }
321                         if (unlikely(desc->next >= size || ++nr_desc > size)) {
322                                 error = -1;
323                                 goto out;
324                         }
325
326                         desc = &descs[desc->next];
327                         desc_chunck_len = desc->len;
328                         desc_gaddr = desc->addr;
329                         desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
330                                                         &desc_chunck_len,
331                                                         VHOST_ACCESS_RW);
332                         if (unlikely(!desc_addr)) {
333                                 error = -1;
334                                 goto out;
335                         }
336
337                         desc_offset = 0;
338                         desc_avail  = desc->len;
339                 } else if (unlikely(desc_chunck_len == 0)) {
340                         desc_chunck_len = desc_avail;
341                         desc_gaddr += desc_offset;
342                         desc_addr = vhost_iova_to_vva(dev,
343                                         vq, desc_gaddr,
344                                         &desc_chunck_len, VHOST_ACCESS_RW);
345                         if (unlikely(!desc_addr)) {
346                                 error = -1;
347                                 goto out;
348                         }
349                         desc_offset = 0;
350                 }
351
352                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
353                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
354                         rte_memcpy((void *)((uintptr_t)(desc_addr +
355                                                         desc_offset)),
356                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
357                                 cpy_len);
358                         vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
359                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
360                                      cpy_len, 0);
361                 } else {
362                         batch_copy[copy_nb].dst =
363                                 (void *)((uintptr_t)(desc_addr + desc_offset));
364                         batch_copy[copy_nb].src =
365                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
366                         batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
367                         batch_copy[copy_nb].len = cpy_len;
368                         copy_nb++;
369                 }
370
371                 mbuf_avail  -= cpy_len;
372                 mbuf_offset += cpy_len;
373                 desc_avail  -= cpy_len;
374                 desc_offset += cpy_len;
375                 desc_chunck_len -= cpy_len;
376         }
377
378 out:
379         vq->batch_copy_nb_elems = copy_nb;
380
381         return error;
382 }
383
384 /**
385  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
386  * be received from the physical port or from another virtio device. The
387  * number of packets successfully added to the RX queue is returned. This
388  * function handles scattered (multi-segment) mbufs, but it does not support
389  * the mergeable buffers feature.
390  */
391 static __rte_always_inline uint32_t
392 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
393               struct rte_mbuf **pkts, uint32_t count)
394 {
395         struct vhost_virtqueue *vq;
396         uint16_t avail_idx, free_entries, start_idx;
397         uint16_t desc_indexes[MAX_PKT_BURST];
398         struct vring_desc *descs;
399         uint16_t used_idx;
400         uint32_t i, sz;
401
402         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
403         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
404                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
405                         dev->vid, __func__, queue_id);
406                 return 0;
407         }
408
409         vq = dev->virtqueue[queue_id];
410
411         rte_spinlock_lock(&vq->access_lock);
412
413         if (unlikely(vq->enabled == 0))
414                 goto out_access_unlock;
415
416         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
417                 vhost_user_iotlb_rd_lock(vq);
418
419         if (unlikely(vq->access_ok == 0)) {
420                 if (unlikely(vring_translate(dev, vq) < 0)) {
421                         count = 0;
422                         goto out;
423                 }
424         }
425
426         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
427         start_idx = vq->last_used_idx;
428         free_entries = avail_idx - start_idx;
429         count = RTE_MIN(count, free_entries);
430         count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
431         if (count == 0)
432                 goto out;
433
434         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
435                 dev->vid, start_idx, start_idx + count);
436
437         vq->batch_copy_nb_elems = 0;
438
439         /* Retrieve all of the desc indexes first to avoid caching issues. */
440         rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
441         for (i = 0; i < count; i++) {
442                 used_idx = (start_idx + i) & (vq->size - 1);
443                 desc_indexes[i] = vq->avail->ring[used_idx];
444                 vq->used->ring[used_idx].id = desc_indexes[i];
445                 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
446                                                dev->vhost_hlen;
447                 vhost_log_used_vring(dev, vq,
448                         offsetof(struct vring_used, ring[used_idx]),
449                         sizeof(vq->used->ring[used_idx]));
450         }
451
452         rte_prefetch0(&vq->desc[desc_indexes[0]]);
453         for (i = 0; i < count; i++) {
454                 struct vring_desc *idesc = NULL;
455                 uint16_t desc_idx = desc_indexes[i];
456                 int err;
457
458                 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
459                         uint64_t dlen = vq->desc[desc_idx].len;
460                         descs = (struct vring_desc *)(uintptr_t)
461                                 vhost_iova_to_vva(dev,
462                                                 vq, vq->desc[desc_idx].addr,
463                                                 &dlen, VHOST_ACCESS_RO);
464                         if (unlikely(!descs)) {
465                                 count = i;
466                                 break;
467                         }
468
469                         if (unlikely(dlen < vq->desc[desc_idx].len)) {
470                                 /*
471                                  * The indirect desc table is not contiguous
472                                  * in process VA space, so we have to copy it.
473                                  */
474                                 idesc = alloc_copy_ind_table(dev, vq,
475                                                         &vq->desc[desc_idx]);
476                                 if (unlikely(!idesc))
477                                         break;
478
479                                 descs = idesc;
480                         }
481
482                         sz = vq->desc[desc_idx].len / sizeof(*descs);
483                         desc_idx = 0;
484                 } else {
485                         descs = vq->desc;
486                         sz = vq->size;
487                 }
488
489                 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
490                 if (unlikely(err)) {
491                         count = i;
492                         free_ind_table(idesc);
493                         break;
494                 }
495
496                 if (i + 1 < count)
497                         rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
498
499                 if (unlikely(!!idesc))
500                         free_ind_table(idesc);
501         }
502
503         do_data_copy_enqueue(dev, vq);
504
505         rte_smp_wmb();
506
507         *(volatile uint16_t *)&vq->used->idx += count;
508         vq->last_used_idx += count;
509         vhost_log_used_vring(dev, vq,
510                 offsetof(struct vring_used, idx),
511                 sizeof(vq->used->idx));
512
513         vhost_vring_call(dev, vq);
514 out:
515         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
516                 vhost_user_iotlb_rd_unlock(vq);
517
518 out_access_unlock:
519         rte_spinlock_unlock(&vq->access_lock);
520
521         return count;
522 }
523
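/*
 * Gather the guest buffers of the descriptor chain advertised at 'avail_idx'
 * into buf_vec[] (following an indirect table if present), and report the
 * chain head index and total buffer length.
 */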
524 static __rte_always_inline int
525 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
526                          uint32_t avail_idx, uint32_t *vec_idx,
527                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
528                          uint16_t *desc_chain_len)
529 {
530         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
531         uint32_t vec_id = *vec_idx;
532         uint32_t len    = 0;
533         uint64_t dlen;
534         struct vring_desc *descs = vq->desc;
535         struct vring_desc *idesc = NULL;
536
537         *desc_chain_head = idx;
538
539         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
540                 dlen = vq->desc[idx].len;
541                 descs = (struct vring_desc *)(uintptr_t)
542                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
543                                                 &dlen,
544                                                 VHOST_ACCESS_RO);
545                 if (unlikely(!descs))
546                         return -1;
547
548                 if (unlikely(dlen < vq->desc[idx].len)) {
549                         /*
550                          * The indirect desc table is not contiguous
551                          * in process VA space, so we have to copy it.
552                          */
553                         idesc = alloc_copy_ind_table(dev, vq, &vq->desc[idx]);
554                         if (unlikely(!idesc))
555                                 return -1;
556
557                         descs = idesc;
558                 }
559
560                 idx = 0;
561         }
562
563         while (1) {
564                 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
565                         free_ind_table(idesc);
566                         return -1;
567                 }
568
569                 len += descs[idx].len;
570                 buf_vec[vec_id].buf_addr = descs[idx].addr;
571                 buf_vec[vec_id].buf_len  = descs[idx].len;
572                 buf_vec[vec_id].desc_idx = idx;
573                 vec_id++;
574
575                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
576                         break;
577
578                 idx = descs[idx].next;
579         }
580
581         *desc_chain_len = len;
582         *vec_idx = vec_id;
583
584         if (unlikely(!!idesc))
585                 free_ind_table(idesc);
586
587         return 0;
588 }
589
590 /*
591  * Returns -1 on failure, 0 on success
592  */
593 static inline int
594 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
595                                 uint32_t size, struct buf_vector *buf_vec,
596                                 uint16_t *num_buffers, uint16_t avail_head)
597 {
598         uint16_t cur_idx;
599         uint32_t vec_idx = 0;
600         uint16_t tries = 0;
601
602         uint16_t head_idx = 0;
603         uint16_t len = 0;
604
605         *num_buffers = 0;
606         cur_idx  = vq->last_avail_idx;
607
608         while (size > 0) {
609                 if (unlikely(cur_idx == avail_head))
610                         return -1;
611
612                 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
613                                                 &head_idx, &len) < 0))
614                         return -1;
615                 len = RTE_MIN(len, size);
616                 update_shadow_used_ring(vq, head_idx, len);
617                 size -= len;
618
619                 cur_idx++;
620                 tries++;
621                 *num_buffers += 1;
622
623                 /*
624                  * If we have tried all available ring items and still
625                  * cannot get enough buffers, something abnormal has
626                  * happened.
627                  */
628                 if (unlikely(tries >= vq->size))
629                         return -1;
630         }
631
632         return 0;
633 }
634
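/*
 * Mergeable Rx path: copy one mbuf chain into the guest buffers collected in
 * buf_vec[]. The virtio-net header, including the num_buffers field, is
 * written on the first pass of the copy loop, through a bounce buffer when
 * the header spans non-contiguous host mappings.
 */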
635 static __rte_always_inline int
636 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
637                             struct rte_mbuf *m, struct buf_vector *buf_vec,
638                             uint16_t num_buffers)
639 {
640         uint32_t vec_idx = 0;
641         uint64_t desc_addr, desc_gaddr;
642         uint32_t mbuf_offset, mbuf_avail;
643         uint32_t desc_offset, desc_avail;
644         uint32_t cpy_len;
645         uint64_t desc_chunck_len;
646         uint64_t hdr_addr, hdr_phys_addr;
647         struct rte_mbuf *hdr_mbuf;
648         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
649         struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
650         uint16_t copy_nb = vq->batch_copy_nb_elems;
651         int error = 0;
652
653         if (unlikely(m == NULL)) {
654                 error = -1;
655                 goto out;
656         }
657
658         desc_chunck_len = buf_vec[vec_idx].buf_len;
659         desc_gaddr = buf_vec[vec_idx].buf_addr;
660         desc_addr = vhost_iova_to_vva(dev, vq,
661                                         desc_gaddr,
662                                         &desc_chunck_len,
663                                         VHOST_ACCESS_RW);
664         if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
665                 error = -1;
666                 goto out;
667         }
668
669         hdr_mbuf = m;
670         hdr_addr = desc_addr;
671         if (unlikely(desc_chunck_len < dev->vhost_hlen))
672                 hdr = &tmp_hdr;
673         else
674                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
675         hdr_phys_addr = desc_gaddr;
676         rte_prefetch0((void *)(uintptr_t)hdr_addr);
677
678         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
679                 dev->vid, num_buffers);
680
681         desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
682         if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
683                 desc_chunck_len = desc_avail;
684                 desc_gaddr += dev->vhost_hlen;
685                 desc_addr = vhost_iova_to_vva(dev, vq,
686                                 desc_gaddr,
687                                 &desc_chunck_len,
688                                 VHOST_ACCESS_RW);
689                 if (unlikely(!desc_addr)) {
690                         error = -1;
691                         goto out;
692                 }
693
694                 desc_offset = 0;
695         } else {
696                 desc_offset = dev->vhost_hlen;
697                 desc_chunck_len -= dev->vhost_hlen;
698         }
699
700
701         mbuf_avail  = rte_pktmbuf_data_len(m);
702         mbuf_offset = 0;
703         while (mbuf_avail != 0 || m->next != NULL) {
704                 /* done with current desc buf, get the next one */
705                 if (desc_avail == 0) {
706                         vec_idx++;
707                         desc_chunck_len = buf_vec[vec_idx].buf_len;
708                         desc_gaddr = buf_vec[vec_idx].buf_addr;
709                         desc_addr =
710                                 vhost_iova_to_vva(dev, vq,
711                                         desc_gaddr,
712                                         &desc_chunck_len,
713                                         VHOST_ACCESS_RW);
714                         if (unlikely(!desc_addr)) {
715                                 error = -1;
716                                 goto out;
717                         }
718
719                         /* Prefetch buffer address. */
720                         rte_prefetch0((void *)(uintptr_t)desc_addr);
721                         desc_offset = 0;
722                         desc_avail  = buf_vec[vec_idx].buf_len;
723                 } else if (unlikely(desc_chunck_len == 0)) {
724                         desc_chunck_len = desc_avail;
725                         desc_gaddr += desc_offset;
726                         desc_addr = vhost_iova_to_vva(dev, vq,
727                                         desc_gaddr,
728                                         &desc_chunck_len, VHOST_ACCESS_RW);
729                         if (unlikely(!desc_addr)) {
730                                 error = -1;
731                                 goto out;
732                         }
733                         desc_offset = 0;
734                 }
735
736                 /* done with current mbuf, get the next one */
737                 if (mbuf_avail == 0) {
738                         m = m->next;
739
740                         mbuf_offset = 0;
741                         mbuf_avail  = rte_pktmbuf_data_len(m);
742                 }
743
744                 if (hdr_addr) {
745                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
746                         ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
747
748                         if (unlikely(hdr == &tmp_hdr)) {
749                                 uint64_t len;
750                                 uint64_t remain = dev->vhost_hlen;
751                                 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
752                                 uint64_t guest_addr = hdr_phys_addr;
753
754                                 while (remain) {
755                                         len = remain;
756                                         dst = vhost_iova_to_vva(dev, vq,
757                                                         guest_addr, &len,
758                                                         VHOST_ACCESS_RW);
759                                         if (unlikely(!dst || !len)) {
760                                                 error = -1;
761                                                 goto out;
762                                         }
763
764                                         rte_memcpy((void *)(uintptr_t)dst,
765                                                         (void *)(uintptr_t)src,
766                                                         len);
767
768                                         PRINT_PACKET(dev, (uintptr_t)dst,
769                                                         (uint32_t)len, 0);
770                                         vhost_log_write(dev, guest_addr, len);
771
772                                         remain -= len;
773                                         guest_addr += len;
774                                         src += len;
775                                 }
776                         } else {
777                                 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
778                                                 dev->vhost_hlen, 0);
779                                 vhost_log_write(dev, hdr_phys_addr,
780                                                 dev->vhost_hlen);
781                         }
782
783                         hdr_addr = 0;
784                 }
785
786                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
787
788                 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
789                         rte_memcpy((void *)((uintptr_t)(desc_addr +
790                                                         desc_offset)),
791                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
792                                 cpy_len);
793                         vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
794                         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
795                                 cpy_len, 0);
796                 } else {
797                         batch_copy[copy_nb].dst =
798                                 (void *)((uintptr_t)(desc_addr + desc_offset));
799                         batch_copy[copy_nb].src =
800                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
801                         batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
802                         batch_copy[copy_nb].len = cpy_len;
803                         copy_nb++;
804                 }
805
806                 mbuf_avail  -= cpy_len;
807                 mbuf_offset += cpy_len;
808                 desc_avail  -= cpy_len;
809                 desc_offset += cpy_len;
810                 desc_chunck_len -= cpy_len;
811         }
812
813 out:
814         vq->batch_copy_nb_elems = copy_nb;
815
816         return error;
817 }
818
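/*
 * Rx path used when VIRTIO_NET_F_MRG_RXBUF is negotiated: each packet may
 * consume several descriptor chains, which are reserved up front with
 * reserve_avail_buf_mergeable() and committed through the shadow used ring.
 */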
819 static __rte_always_inline uint32_t
820 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
821         struct rte_mbuf **pkts, uint32_t count)
822 {
823         struct vhost_virtqueue *vq;
824         uint32_t pkt_idx = 0;
825         uint16_t num_buffers;
826         struct buf_vector buf_vec[BUF_VECTOR_MAX];
827         uint16_t avail_head;
828
829         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
830         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
831                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
832                         dev->vid, __func__, queue_id);
833                 return 0;
834         }
835
836         vq = dev->virtqueue[queue_id];
837
838         rte_spinlock_lock(&vq->access_lock);
839
840         if (unlikely(vq->enabled == 0))
841                 goto out_access_unlock;
842
843         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
844                 vhost_user_iotlb_rd_lock(vq);
845
846         if (unlikely(vq->access_ok == 0))
847                 if (unlikely(vring_translate(dev, vq) < 0))
848                         goto out;
849
850         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
851         if (count == 0)
852                 goto out;
853
854         vq->batch_copy_nb_elems = 0;
855
856         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
857
858         vq->shadow_used_idx = 0;
859         avail_head = *((volatile uint16_t *)&vq->avail->idx);
860         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
861                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
862
863                 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
864                                                 pkt_len, buf_vec, &num_buffers,
865                                                 avail_head) < 0)) {
866                         VHOST_LOG_DEBUG(VHOST_DATA,
867                                 "(%d) failed to get enough desc from vring\n",
868                                 dev->vid);
869                         vq->shadow_used_idx -= num_buffers;
870                         break;
871                 }
872
873                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
874                         dev->vid, vq->last_avail_idx,
875                         vq->last_avail_idx + num_buffers);
876
877                 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
878                                                 buf_vec, num_buffers) < 0) {
879                         vq->shadow_used_idx -= num_buffers;
880                         break;
881                 }
882
883                 vq->last_avail_idx += num_buffers;
884         }
885
886         do_data_copy_enqueue(dev, vq);
887
888         if (likely(vq->shadow_used_idx)) {
889                 flush_shadow_used_ring(dev, vq);
890                 vhost_vring_call(dev, vq);
891         }
892
893 out:
894         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
895                 vhost_user_iotlb_rd_unlock(vq);
896
897 out_access_unlock:
898         rte_spinlock_unlock(&vq->access_lock);
899
900         return pkt_idx;
901 }
902
903 uint16_t
904 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
905         struct rte_mbuf **pkts, uint16_t count)
906 {
907         struct virtio_net *dev = get_device(vid);
908
909         if (!dev)
910                 return 0;
911
912         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
913                 RTE_LOG(ERR, VHOST_DATA,
914                         "(%d) %s: built-in vhost net backend is disabled.\n",
915                         dev->vid, __func__);
916                 return 0;
917         }
918
919         if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
920                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
921         else
922                 return virtio_dev_rx(dev, queue_id, pkts, count);
923 }
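
/*
 * Illustrative application-side sketch (an assumption, not code from this
 * library): a switching application typically feeds rte_vhost_enqueue_burst()
 * with a burst received from a physical port. The packet data is copied into
 * the guest buffers, so the caller keeps ownership of its mbufs and frees
 * them afterwards, e.g.
 *
 *	nb_rx = rte_eth_rx_burst(port, 0, pkts, MAX_PKT_BURST);
 *	rte_vhost_enqueue_burst(vid, rx_queue_id, pkts, nb_rx);
 *	for (i = 0; i < nb_rx; i++)
 *		rte_pktmbuf_free(pkts[i]);
 *
 * where 'rx_queue_id' is an RX virtqueue index (0 for a single-queue device).
 */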
924
925 static inline bool
926 virtio_net_with_host_offload(struct virtio_net *dev)
927 {
928         if (dev->features &
929                         ((1ULL << VIRTIO_NET_F_CSUM) |
930                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
931                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
932                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
933                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
934                 return true;
935
936         return false;
937 }
938
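/*
 * Parse the Ethernet (and optional VLAN) header plus the IPv4/IPv6 header of
 * a packet coming from the guest, filling m->l2_len/l3_len and reporting the
 * L4 protocol and header location for vhost_dequeue_offload().
 */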
939 static void
940 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
941 {
942         struct ipv4_hdr *ipv4_hdr;
943         struct ipv6_hdr *ipv6_hdr;
944         void *l3_hdr = NULL;
945         struct ether_hdr *eth_hdr;
946         uint16_t ethertype;
947
948         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
949
950         m->l2_len = sizeof(struct ether_hdr);
951         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
952
953         if (ethertype == ETHER_TYPE_VLAN) {
954                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
955
956                 m->l2_len += sizeof(struct vlan_hdr);
957                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
958         }
959
960         l3_hdr = (char *)eth_hdr + m->l2_len;
961
962         switch (ethertype) {
963         case ETHER_TYPE_IPv4:
964                 ipv4_hdr = l3_hdr;
965                 *l4_proto = ipv4_hdr->next_proto_id;
966                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
967                 *l4_hdr = (char *)l3_hdr + m->l3_len;
968                 m->ol_flags |= PKT_TX_IPV4;
969                 break;
970         case ETHER_TYPE_IPv6:
971                 ipv6_hdr = l3_hdr;
972                 *l4_proto = ipv6_hdr->proto;
973                 m->l3_len = sizeof(struct ipv6_hdr);
974                 *l4_hdr = (char *)l3_hdr + m->l3_len;
975                 m->ol_flags |= PKT_TX_IPV6;
976                 break;
977         default:
978                 m->l3_len = 0;
979                 *l4_proto = 0;
980                 *l4_hdr = NULL;
981                 break;
982         }
983 }
984
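/*
 * Counterpart of virtio_enqueue_offload(): translate the virtio-net header
 * sent by the guest back into mbuf offload flags (L4 checksum requests and
 * TSO/UFO information).
 */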
985 static __rte_always_inline void
986 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
987 {
988         uint16_t l4_proto = 0;
989         void *l4_hdr = NULL;
990         struct tcp_hdr *tcp_hdr = NULL;
991
992         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
993                 return;
994
995         parse_ethernet(m, &l4_proto, &l4_hdr);
996         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
997                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
998                         switch (hdr->csum_offset) {
999                         case (offsetof(struct tcp_hdr, cksum)):
1000                                 if (l4_proto == IPPROTO_TCP)
1001                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
1002                                 break;
1003                         case (offsetof(struct udp_hdr, dgram_cksum)):
1004                                 if (l4_proto == IPPROTO_UDP)
1005                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
1006                                 break;
1007                         case (offsetof(struct sctp_hdr, cksum)):
1008                                 if (l4_proto == IPPROTO_SCTP)
1009                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
1010                                 break;
1011                         default:
1012                                 break;
1013                         }
1014                 }
1015         }
1016
1017         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1018                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1019                 case VIRTIO_NET_HDR_GSO_TCPV4:
1020                 case VIRTIO_NET_HDR_GSO_TCPV6:
1021                         tcp_hdr = l4_hdr;
1022                         m->ol_flags |= PKT_TX_TCP_SEG;
1023                         m->tso_segsz = hdr->gso_size;
1024                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1025                         break;
1026                 case VIRTIO_NET_HDR_GSO_UDP:
1027                         m->ol_flags |= PKT_TX_UDP_SEG;
1028                         m->tso_segsz = hdr->gso_size;
1029                         m->l4_len = sizeof(struct udp_hdr);
1030                         break;
1031                 default:
1032                         RTE_LOG(WARNING, VHOST_DATA,
1033                                 "unsupported gso type %u.\n", hdr->gso_type);
1034                         break;
1035                 }
1036         }
1037 }
1038
1039 static __rte_always_inline void
1040 put_zmbuf(struct zcopy_mbuf *zmbuf)
1041 {
1042         zmbuf->in_use = 0;
1043 }
1044
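/*
 * Dequeue path: copy one descriptor chain into an mbuf chain, allocating
 * extra mbufs from mbuf_pool as needed. With dequeue zero-copy, the mbuf is
 * pointed directly at the guest buffer whenever gpa_to_hpa() returns a
 * contiguous host physical address.
 */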
1045 static __rte_always_inline int
1046 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1047                   struct vring_desc *descs, uint16_t max_desc,
1048                   struct rte_mbuf *m, uint16_t desc_idx,
1049                   struct rte_mempool *mbuf_pool)
1050 {
1051         struct vring_desc *desc;
1052         uint64_t desc_addr, desc_gaddr;
1053         uint32_t desc_avail, desc_offset;
1054         uint32_t mbuf_avail, mbuf_offset;
1055         uint32_t cpy_len;
1056         uint64_t desc_chunck_len;
1057         struct rte_mbuf *cur = m, *prev = m;
1058         struct virtio_net_hdr tmp_hdr;
1059         struct virtio_net_hdr *hdr = NULL;
1060         /* A counter to avoid an endless loop over the desc chain */
1061         uint32_t nr_desc = 1;
1062         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1063         uint16_t copy_nb = vq->batch_copy_nb_elems;
1064         int error = 0;
1065
1066         desc = &descs[desc_idx];
1067         if (unlikely((desc->len < dev->vhost_hlen)) ||
1068                         (desc->flags & VRING_DESC_F_INDIRECT)) {
1069                 error = -1;
1070                 goto out;
1071         }
1072
1073         desc_chunck_len = desc->len;
1074         desc_gaddr = desc->addr;
1075         desc_addr = vhost_iova_to_vva(dev,
1076                                         vq, desc_gaddr,
1077                                         &desc_chunck_len,
1078                                         VHOST_ACCESS_RO);
1079         if (unlikely(!desc_addr)) {
1080                 error = -1;
1081                 goto out;
1082         }
1083
1084         if (virtio_net_with_host_offload(dev)) {
1085                 if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
1086                         uint64_t len = desc_chunck_len;
1087                         uint64_t remain = sizeof(struct virtio_net_hdr);
1088                         uint64_t src = desc_addr;
1089                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1090                         uint64_t guest_addr = desc_gaddr;
1091
1092                         /*
1093                          * No luck, the virtio-net header doesn't fit
1094                          * in a contiguous virtual area.
1095                          */
1096                         while (remain) {
1097                                 len = remain;
1098                                 src = vhost_iova_to_vva(dev, vq,
1099                                                 guest_addr, &len,
1100                                                 VHOST_ACCESS_RO);
1101                                 if (unlikely(!src || !len)) {
1102                                         error = -1;
1103                                         goto out;
1104                                 }
1105
1106                                 rte_memcpy((void *)(uintptr_t)dst,
1107                                                    (void *)(uintptr_t)src, len);
1108
1109                                 guest_addr += len;
1110                                 remain -= len;
1111                                 dst += len;
1112                         }
1113
1114                         hdr = &tmp_hdr;
1115                 } else {
1116                         hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
1117                         rte_prefetch0(hdr);
1118                 }
1119         }
1120
1121         /*
1122          * A virtio driver normally uses at least 2 desc buffers
1123          * for Tx: the first for storing the header, and the others
1124          * for storing the data.
1125          */
1126         if (likely((desc->len == dev->vhost_hlen) &&
1127                    (desc->flags & VRING_DESC_F_NEXT) != 0)) {
1128                 desc = &descs[desc->next];
1129                 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1130                         error = -1;
1131                         goto out;
1132                 }
1133
1134                 desc_chunck_len = desc->len;
1135                 desc_gaddr = desc->addr;
1136                 desc_addr = vhost_iova_to_vva(dev,
1137                                                         vq, desc_gaddr,
1138                                                         &desc_chunck_len,
1139                                                         VHOST_ACCESS_RO);
1140                 if (unlikely(!desc_addr)) {
1141                         error = -1;
1142                         goto out;
1143                 }
1144
1145                 desc_offset = 0;
1146                 desc_avail  = desc->len;
1147                 nr_desc    += 1;
1148         } else {
1149                 desc_avail  = desc->len - dev->vhost_hlen;
1150
1151                 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
1152                         desc_chunck_len = desc_avail;
1153                         desc_gaddr += dev->vhost_hlen;
1154                         desc_addr = vhost_iova_to_vva(dev,
1155                                         vq, desc_gaddr,
1156                                         &desc_chunck_len,
1157                                         VHOST_ACCESS_RO);
1158                         if (unlikely(!desc_addr)) {
1159                                 error = -1;
1160                                 goto out;
1161                         }
1162
1163                         desc_offset = 0;
1164                 } else {
1165                         desc_offset = dev->vhost_hlen;
1166                         desc_chunck_len -= dev->vhost_hlen;
1167                 }
1168         }
1169
1170         rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
1171
1172         PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
1173                         (uint32_t)desc_chunck_len, 0);
1174
1175         mbuf_offset = 0;
1176         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1177         while (1) {
1178                 uint64_t hpa;
1179
1180                 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
1181
1182                 /*
1183                  * A desc buf might span two host physical pages that are
1184                  * not contiguous. In such a case (gpa_to_hpa returns 0), data
1185                  * will be copied even though zero copy is enabled.
1186                  */
1187                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1188                                         desc_gaddr + desc_offset, cpy_len)))) {
1189                         cur->data_len = cpy_len;
1190                         cur->data_off = 0;
1191                         cur->buf_addr = (void *)(uintptr_t)(desc_addr
1192                                 + desc_offset);
1193                         cur->buf_iova = hpa;
1194
1195                         /*
1196                          * In zero copy mode, one mbuf can only reference data
1197                          * from one desc buffer, or part of one.
1198                          */
1199                         mbuf_avail = cpy_len;
1200                 } else {
1201                         if (likely(cpy_len > MAX_BATCH_LEN ||
1202                                    copy_nb >= vq->size ||
1203                                    (hdr && cur == m) ||
1204                                    desc->len != desc_chunck_len)) {
1205                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1206                                                                    mbuf_offset),
1207                                            (void *)((uintptr_t)(desc_addr +
1208                                                                 desc_offset)),
1209                                            cpy_len);
1210                         } else {
1211                                 batch_copy[copy_nb].dst =
1212                                         rte_pktmbuf_mtod_offset(cur, void *,
1213                                                                 mbuf_offset);
1214                                 batch_copy[copy_nb].src =
1215                                         (void *)((uintptr_t)(desc_addr +
1216                                                              desc_offset));
1217                                 batch_copy[copy_nb].len = cpy_len;
1218                                 copy_nb++;
1219                         }
1220                 }
1221
1222                 mbuf_avail  -= cpy_len;
1223                 mbuf_offset += cpy_len;
1224                 desc_avail  -= cpy_len;
1225                 desc_chunck_len -= cpy_len;
1226                 desc_offset += cpy_len;
1227
1228                 /* This desc buffer is exhausted, get the next one */
1229                 if (desc_avail == 0) {
1230                         if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1231                                 break;
1232
1233                         if (unlikely(desc->next >= max_desc ||
1234                                      ++nr_desc > max_desc)) {
1235                                 error = -1;
1236                                 goto out;
1237                         }
1238                         desc = &descs[desc->next];
1239                         if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1240                                 error = -1;
1241                                 goto out;
1242                         }
1243
1244                         desc_chunck_len = desc->len;
1245                         desc_gaddr = desc->addr;
1246                         desc_addr = vhost_iova_to_vva(dev,
1247                                                         vq, desc_gaddr,
1248                                                         &desc_chunck_len,
1249                                                         VHOST_ACCESS_RO);
1250                         if (unlikely(!desc_addr)) {
1251                                 error = -1;
1252                                 goto out;
1253                         }
1254
1255                         rte_prefetch0((void *)(uintptr_t)desc_addr);
1256
1257                         desc_offset = 0;
1258                         desc_avail  = desc->len;
1259
1260                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1261                                         (uint32_t)desc_chunck_len, 0);
1262                 } else if (unlikely(desc_chunck_len == 0)) {
1263                         desc_chunck_len = desc_avail;
1264                         desc_gaddr += desc_offset;
1265                         desc_addr = vhost_iova_to_vva(dev, vq,
1266                                         desc_gaddr,
1267                                         &desc_chunck_len,
1268                                         VHOST_ACCESS_RO);
1269                         if (unlikely(!desc_addr)) {
1270                                 error = -1;
1271                                 goto out;
1272                         }
1273                         desc_offset = 0;
1274
1275                         PRINT_PACKET(dev, (uintptr_t)desc_addr,
1276                                         (uint32_t)desc_chunck_len, 0);
1277                 }
1278
1279                 /*
1280                  * This mbuf is full, get a new one
1281                  * to hold more data.
1282                  */
1283                 if (mbuf_avail == 0) {
1284                         cur = rte_pktmbuf_alloc(mbuf_pool);
1285                         if (unlikely(cur == NULL)) {
1286                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1287                                         "allocate memory for mbuf.\n");
1288                                 error = -1;
1289                                 goto out;
1290                         }
1291                         if (unlikely(dev->dequeue_zero_copy))
1292                                 rte_mbuf_refcnt_update(cur, 1);
1293
1294                         prev->next = cur;
1295                         prev->data_len = mbuf_offset;
1296                         m->nb_segs += 1;
1297                         m->pkt_len += mbuf_offset;
1298                         prev = cur;
1299
1300                         mbuf_offset = 0;
1301                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1302                 }
1303         }
1304
1305         prev->data_len = mbuf_offset;
1306         m->pkt_len    += mbuf_offset;
1307
1308         if (hdr)
1309                 vhost_dequeue_offload(hdr, m);
1310
1311 out:
1312         vq->batch_copy_nb_elems = copy_nb;
1313
1314         return error;
1315 }
1316
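/*
 * Dequeue-path helpers: return descriptors to the guest by filling used ring
 * entries (len is 0 since the host only read the buffers), then publish the
 * new used index and notify the guest if needed.
 */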
1317 static __rte_always_inline void
1318 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1319                  uint32_t used_idx, uint32_t desc_idx)
1320 {
1321         vq->used->ring[used_idx].id  = desc_idx;
1322         vq->used->ring[used_idx].len = 0;
1323         vhost_log_used_vring(dev, vq,
1324                         offsetof(struct vring_used, ring[used_idx]),
1325                         sizeof(vq->used->ring[used_idx]));
1326 }
1327
1328 static __rte_always_inline void
1329 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1330                 uint32_t count)
1331 {
1332         if (unlikely(count == 0))
1333                 return;
1334
1335         rte_smp_wmb();
1336         rte_smp_rmb();
1337
1338         vq->used->idx += count;
1339         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1340                         sizeof(vq->used->idx));
1341         vhost_vring_call(dev, vq);
1342 }
1343
1344 static __rte_always_inline struct zcopy_mbuf *
1345 get_zmbuf(struct vhost_virtqueue *vq)
1346 {
1347         uint16_t i;
1348         uint16_t last;
1349         int tries = 0;
1350
1351         /* search [last_zmbuf_idx, zmbuf_size) */
1352         i = vq->last_zmbuf_idx;
1353         last = vq->zmbuf_size;
1354
1355 again:
1356         for (; i < last; i++) {
1357                 if (vq->zmbufs[i].in_use == 0) {
1358                         vq->last_zmbuf_idx = i + 1;
1359                         vq->zmbufs[i].in_use = 1;
1360                         return &vq->zmbufs[i];
1361                 }
1362         }
1363
1364         tries++;
1365         if (tries == 1) {
1366                 /* search [0, last_zmbuf_idx) */
1367                 i = 0;
1368                 last = vq->last_zmbuf_idx;
1369                 goto again;
1370         }
1371
1372         return NULL;
1373 }
1374
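/*
 * A zero-copy mbuf chain still maps guest memory; it is only safe to recycle
 * its descriptor once every segment's refcount has dropped back to 1, i.e.
 * the application has released the reference it was given.
 */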
1375 static __rte_always_inline bool
1376 mbuf_is_consumed(struct rte_mbuf *m)
1377 {
1378         while (m) {
1379                 if (rte_mbuf_refcnt_read(m) > 1)
1380                         return false;
1381                 m = m->next;
1382         }
1383
1384         return true;
1385 }
1386
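/*
 * Zero-copy dequeue pointed the mbuf data buffer straight into guest memory;
 * before the mbuf goes back to its mempool, restore buf_addr/buf_iova to the
 * buffer that normally follows the mbuf header and private area.
 */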
1387 static __rte_always_inline void
1388 restore_mbuf(struct rte_mbuf *m)
1389 {
1390         uint32_t mbuf_size, priv_size;
1391
1392         while (m) {
1393                 priv_size = rte_pktmbuf_priv_size(m->pool);
1394                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1395                 /* start of buffer is after mbuf structure and priv data */
1396
1397                 m->buf_addr = (char *)m + mbuf_size;
1398                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1399                 m = m->next;
1400         }
1401 }
1402
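/*
 * Dequeue up to "count" packets that the guest has made available on the
 * given virtqueue (TX from the guest's point of view, hence an odd index)
 * and return them as mbufs allocated from "mbuf_pool".  Returns the number
 * of packets placed in "pkts"; a RARP packet may be injected at the head of
 * the burst when the vhost-user master requests it (e.g. after a live
 * migration).
 */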
1403 uint16_t
1404 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1405         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1406 {
1407         struct virtio_net *dev;
1408         struct rte_mbuf *rarp_mbuf = NULL;
1409         struct vhost_virtqueue *vq;
1410         uint32_t desc_indexes[MAX_PKT_BURST];
1411         uint32_t used_idx;
1412         uint32_t i = 0;
1413         uint16_t free_entries;
1414         uint16_t avail_idx;
1415
1416         dev = get_device(vid);
1417         if (!dev)
1418                 return 0;
1419
1420         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1421                 RTE_LOG(ERR, VHOST_DATA,
1422                         "(%d) %s: built-in vhost net backend is disabled.\n",
1423                         dev->vid, __func__);
1424                 return 0;
1425         }
1426
1427         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1428                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1429                         dev->vid, __func__, queue_id);
1430                 return 0;
1431         }
1432
1433         vq = dev->virtqueue[queue_id];
1434
1435         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1436                 return 0;
1437
1438         if (unlikely(vq->enabled == 0))
1439                 goto out_access_unlock;
1440
1441         vq->batch_copy_nb_elems = 0;
1442
1443         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1444                 vhost_user_iotlb_rd_lock(vq);
1445
1446         if (unlikely(vq->access_ok == 0))
1447                 if (unlikely(vring_translate(dev, vq) < 0))
1448                         goto out;
1449
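        /*
         * Zero-copy mode defers used-ring updates: a descriptor can only be
         * given back to the guest once the application has freed the mbuf
         * that still maps the guest buffer.  Walk the list of outstanding
         * zero-copy mbufs and reclaim every entry that has been consumed.
         */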
1450         if (unlikely(dev->dequeue_zero_copy)) {
1451                 struct zcopy_mbuf *zmbuf, *next;
1452                 int nr_updated = 0;
1453
1454                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1455                      zmbuf != NULL; zmbuf = next) {
1456                         next = TAILQ_NEXT(zmbuf, next);
1457
1458                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1459                                 used_idx = vq->last_used_idx++ & (vq->size - 1);
1460                                 update_used_ring(dev, vq, used_idx,
1461                                                  zmbuf->desc_idx);
1462                                 nr_updated += 1;
1463
1464                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1465                                 restore_mbuf(zmbuf->mbuf);
1466                                 rte_pktmbuf_free(zmbuf->mbuf);
1467                                 put_zmbuf(zmbuf);
1468                                 vq->nr_zmbuf -= 1;
1469                         }
1470                 }
1471
1472                 update_used_idx(dev, vq, nr_updated);
1473         }
1474
1475         /*
1476          * Construct a RARP broadcast packet and inject it into the "pkts"
1477          * array, so that it looks as if the guest itself had sent it.
1478          *
1479          * Check user_send_rarp() for more information.
1480          *
1481          * broadcast_rarp shares a cacheline in the virtio_net structure
1482          * with some fields that are accessed during enqueue, and
1483          * rte_atomic16_cmpset() causes a write when it uses cmpxchg. This
1484          * could result in false sharing between enqueue and dequeue.
1485          *
1486          * Prevent unnecessary false sharing by reading broadcast_rarp first
1487          * and only performing cmpset if the read indicates it is likely to
1488          * be set.
1489          */
1490
1491         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1492                         rte_atomic16_cmpset((volatile uint16_t *)
1493                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1494
1495                 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
1496                 if (rarp_mbuf == NULL) {
1497                         RTE_LOG(ERR, VHOST_DATA,
1498                                 "Failed to make RARP packet.\n");
1499                         goto out;
1500                 }
1501                 count -= 1;
1502         }
1503
1504         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1505                         vq->last_avail_idx;
1506         if (free_entries == 0)
1507                 goto out;
1508
1509         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1510
1511         /* Prefetch available and used ring */
1512         avail_idx = vq->last_avail_idx & (vq->size - 1);
1513         used_idx  = vq->last_used_idx  & (vq->size - 1);
1514         rte_prefetch0(&vq->avail->ring[avail_idx]);
1515         rte_prefetch0(&vq->used->ring[used_idx]);
1516
1517         count = RTE_MIN(count, MAX_PKT_BURST);
1518         count = RTE_MIN(count, free_entries);
1519         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1520                         dev->vid, count);
1521
1522         /* Retrieve all of the head indexes first to avoid caching issues. */
1523         for (i = 0; i < count; i++) {
1524                 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1525                 used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
1526                 desc_indexes[i] = vq->avail->ring[avail_idx];
1527
1528                 if (likely(dev->dequeue_zero_copy == 0))
1529                         update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1530         }
1531
1532         /* Prefetch the descriptor referenced by the first head index. */
1533         rte_prefetch0(&vq->desc[desc_indexes[0]]);
1534         for (i = 0; i < count; i++) {
1535                 struct vring_desc *desc, *idesc = NULL;
1536                 uint16_t sz, idx;
1537                 uint64_t dlen;
1538                 int err;
1539
1540                 if (likely(i + 1 < count))
1541                         rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1542
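                /*
                 * An INDIRECT descriptor carries no packet data itself; it
                 * points to a separate descriptor table in guest memory.
                 * Translate that table, and copy it if it turns out not to
                 * be contiguous in our address space.
                 */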
1543                 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1544                         dlen = vq->desc[desc_indexes[i]].len;
1545                         desc = (struct vring_desc *)(uintptr_t)
1546                                 vhost_iova_to_vva(dev, vq,
1547                                                 vq->desc[desc_indexes[i]].addr,
1548                                                 &dlen,
1549                                                 VHOST_ACCESS_RO);
1550                         if (unlikely(!desc))
1551                                 break;
1552
1553                         if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
1554                                 /*
1555                                  * The indirect desc table is not contiguous
1556                                  * in process VA space, so we have to copy it.
1557                                  */
1558                                 idesc = alloc_copy_ind_table(dev, vq,
1559                                                 &vq->desc[desc_indexes[i]]);
1560                                 if (unlikely(!idesc))
1561                                         break;
1562
1563                                 desc = idesc;
1564                         }
1565
1566                         rte_prefetch0(desc);
1567                         sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1568                         idx = 0;
1569                 } else {
1570                         desc = vq->desc;
1571                         sz = vq->size;
1572                         idx = desc_indexes[i];
1573                 }
1574
1575                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1576                 if (unlikely(pkts[i] == NULL)) {
1577                         RTE_LOG(ERR, VHOST_DATA,
1578                                 "Failed to allocate memory for mbuf.\n");
1579                         free_ind_table(idesc);
1580                         break;
1581                 }
1582
1583                 err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
1584                                         mbuf_pool);
1585                 if (unlikely(err)) {
1586                         rte_pktmbuf_free(pkts[i]);
1587                         free_ind_table(idesc);
1588                         break;
1589                 }
1590
1591                 if (unlikely(dev->dequeue_zero_copy)) {
1592                         struct zcopy_mbuf *zmbuf;
1593
1594                         zmbuf = get_zmbuf(vq);
1595                         if (!zmbuf) {
1596                                 rte_pktmbuf_free(pkts[i]);
1597                                 free_ind_table(idesc);
1598                                 break;
1599                         }
1600                         zmbuf->mbuf = pkts[i];
1601                         zmbuf->desc_idx = desc_indexes[i];
1602
1603                         /*
1604                          * Pin the mbuf by taking an extra reference; we
1605                          * check later whether we are the last user (i.e.
1606                          * the mbuf has been consumed), in which case the
1607                          * used ring can be updated safely.
1608                          */
1609                         rte_mbuf_refcnt_update(pkts[i], 1);
1610
1611                         vq->nr_zmbuf += 1;
1612                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1613                 }
1614
1615                 if (unlikely(!!idesc))
1616                         free_ind_table(idesc);
1617         }
1618         vq->last_avail_idx += i;
1619
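        /*
         * In the normal (copy) mode, flush the batched small copies queued
         * by copy_desc_to_mbuf() and only then expose the used entries to
         * the guest.
         */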
1620         if (likely(dev->dequeue_zero_copy == 0)) {
1621                 do_data_copy_dequeue(vq);
1622                 vq->last_used_idx += i;
1623                 update_used_idx(dev, vq, i);
1624         }
1625
1626 out:
1627         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1628                 vhost_user_iotlb_rd_unlock(vq);
1629
1630 out_access_unlock:
1631         rte_spinlock_unlock(&vq->access_lock);
1632
1633         if (unlikely(rarp_mbuf != NULL)) {
1634                 /*
1635                  * Inject it at the head of the "pkts" array, so that the
1636                  * switch's MAC learning table gets updated first.
1637                  */
1638                 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1639                 pkts[0] = rarp_mbuf;
1640                 i += 1;
1641         }
1642
1643         return i;
1644 }
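/*
 * Illustrative usage sketch (not part of the library): a host application
 * typically polls this function to drain the guest's TX queue and frees the
 * mbufs once they have been processed.  The helper name drain_guest_tx(),
 * the BURST_SZ value and the use of queue index 1 (the first TX queue as
 * seen from the guest) are assumptions for the example only.
 *
 *	#define BURST_SZ 32
 *
 *	static void
 *	drain_guest_tx(int vid, struct rte_mempool *pool)
 *	{
 *		struct rte_mbuf *pkts[BURST_SZ];
 *		uint16_t nb, i;
 *
 *		// queue index 1 is the guest TX ring of queue pair 0
 *		nb = rte_vhost_dequeue_burst(vid, 1, pool, pkts, BURST_SZ);
 *		for (i = 0; i < nb; i++) {
 *			// process or forward pkts[i], then release it
 *			rte_pktmbuf_free(pkts[i]);
 *		}
 *	}
 */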