vhost: fix possible dead loop in vector filling
lib/librte_vhost/virtio_net.c (dpdk.git)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/virtio_net.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
12 #include <rte_ip.h>
13 #include <rte_vhost.h>
14 #include <rte_tcp.h>
15 #include <rte_udp.h>
16 #include <rte_sctp.h>
17 #include <rte_arp.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
20
21 #include "iotlb.h"
22 #include "vhost.h"
23
24 #define MAX_PKT_BURST 32
25
26 #define MAX_BATCH_LEN 256
27
28 static __rte_always_inline bool
29 rxvq_is_mergeable(struct virtio_net *dev)
30 {
31         return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
32 }
33
34 static bool
35 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
36 {
37         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
38 }
39
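/*
 * Copy a batch of shadow used entries into the split used ring and log
 * the written region for dirty-page tracking.
 */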
40 static __rte_always_inline void
41 do_flush_shadow_used_ring_split(struct virtio_net *dev,
42                         struct vhost_virtqueue *vq,
43                         uint16_t to, uint16_t from, uint16_t size)
44 {
45         rte_memcpy(&vq->used->ring[to],
46                         &vq->shadow_used_split[from],
47                         size * sizeof(struct vring_used_elem));
48         vhost_log_cache_used_vring(dev, vq,
49                         offsetof(struct vring_used, ring[to]),
50                         size * sizeof(struct vring_used_elem));
51 }
52
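/*
 * Flush the whole shadow used ring into the split used ring, wrapping at
 * the ring boundary when needed, then publish the new used index to the
 * guest.
 */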
53 static __rte_always_inline void
54 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
55 {
56         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
57
58         if (used_idx + vq->shadow_used_idx <= vq->size) {
59                 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
60                                           vq->shadow_used_idx);
61         } else {
62                 uint16_t size;
63
64                 /* update the used ring interval [used_idx, vq->size) */
65                 size = vq->size - used_idx;
66                 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
67
68                 /* update the remaining used ring interval [0, shadow_used_idx - size) */
69                 do_flush_shadow_used_ring_split(dev, vq, 0, size,
70                                           vq->shadow_used_idx - size);
71         }
72         vq->last_used_idx += vq->shadow_used_idx;
73
74         rte_smp_wmb();
75
76         vhost_log_cache_sync(dev, vq);
77
78         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
79         vq->shadow_used_idx = 0;
80         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
81                 sizeof(vq->used->idx));
82 }
83
84 static __rte_always_inline void
85 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
86                          uint16_t desc_idx, uint32_t len)
87 {
88         uint16_t i = vq->shadow_used_idx++;
89
90         vq->shadow_used_split[i].id  = desc_idx;
91         vq->shadow_used_split[i].len = len;
92 }
93
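/*
 * Flush the shadow used entries into the packed descriptor ring. The flags
 * of the head descriptor are written after the rest of the chain, so the
 * guest never observes a partially updated batch.
 */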
94 static __rte_always_inline void
95 flush_shadow_used_ring_packed(struct virtio_net *dev,
96                         struct vhost_virtqueue *vq)
97 {
98         int i;
99         uint16_t used_idx = vq->last_used_idx;
100         uint16_t head_idx = vq->last_used_idx;
101         uint16_t head_flags = 0;
102
103         /* Split loop in two to save memory barriers */
104         for (i = 0; i < vq->shadow_used_idx; i++) {
105                 vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
106                 vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
107
108                 used_idx += vq->shadow_used_packed[i].count;
109                 if (used_idx >= vq->size)
110                         used_idx -= vq->size;
111         }
112
113         rte_smp_wmb();
114
115         for (i = 0; i < vq->shadow_used_idx; i++) {
116                 uint16_t flags;
117
118                 if (vq->shadow_used_packed[i].len)
119                         flags = VRING_DESC_F_WRITE;
120                 else
121                         flags = 0;
122
123                 if (vq->used_wrap_counter) {
124                         flags |= VRING_DESC_F_USED;
125                         flags |= VRING_DESC_F_AVAIL;
126                 } else {
127                         flags &= ~VRING_DESC_F_USED;
128                         flags &= ~VRING_DESC_F_AVAIL;
129                 }
130
131                 if (i > 0) {
132                         vq->desc_packed[vq->last_used_idx].flags = flags;
133
134                         vhost_log_cache_used_vring(dev, vq,
135                                         vq->last_used_idx *
136                                         sizeof(struct vring_packed_desc),
137                                         sizeof(struct vring_packed_desc));
138                 } else {
139                         head_idx = vq->last_used_idx;
140                         head_flags = flags;
141                 }
142
143                 vq->last_used_idx += vq->shadow_used_packed[i].count;
144                 if (vq->last_used_idx >= vq->size) {
145                         vq->used_wrap_counter ^= 1;
146                         vq->last_used_idx -= vq->size;
147                 }
148         }
149
150         vq->desc_packed[head_idx].flags = head_flags;
151
152         vhost_log_cache_used_vring(dev, vq,
153                                 head_idx *
154                                 sizeof(struct vring_packed_desc),
155                                 sizeof(struct vring_packed_desc));
156
157         vq->shadow_used_idx = 0;
158         vhost_log_cache_sync(dev, vq);
159 }
160
161 static __rte_always_inline void
162 update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
163                          uint16_t desc_idx, uint32_t len, uint16_t count)
164 {
165         uint16_t i = vq->shadow_used_idx++;
166
167         vq->shadow_used_packed[i].id  = desc_idx;
168         vq->shadow_used_packed[i].len = len;
169         vq->shadow_used_packed[i].count = count;
170 }
171
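/* Perform the copies batched during enqueue and log each destination. */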
172 static inline void
173 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
174 {
175         struct batch_copy_elem *elem = vq->batch_copy_elems;
176         uint16_t count = vq->batch_copy_nb_elems;
177         int i;
178
179         for (i = 0; i < count; i++) {
180                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
181                 vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
182                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
183         }
184
185         vq->batch_copy_nb_elems = 0;
186 }
187
188 static inline void
189 do_data_copy_dequeue(struct vhost_virtqueue *vq)
190 {
191         struct batch_copy_elem *elem = vq->batch_copy_elems;
192         uint16_t count = vq->batch_copy_nb_elems;
193         int i;
194
195         for (i = 0; i < count; i++)
196                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
197
198         vq->batch_copy_nb_elems = 0;
199 }
200
201 /* skip the write when the value is unchanged, to lessen cache issues */
202 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
203         if ((var) != (val))                     \
204                 (var) = (val);                  \
205 } while (0)
206
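/*
 * Translate the mbuf offload flags (L4 checksum, TSO/UFO) into the
 * virtio-net header prepended for the guest, and compute the IPv4 header
 * checksum when requested.
 */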
207 static __rte_always_inline void
208 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
209 {
210         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
211
212         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
213                 csum_l4 |= PKT_TX_TCP_CKSUM;
214
215         if (csum_l4) {
216                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
217                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
218
219                 switch (csum_l4) {
220                 case PKT_TX_TCP_CKSUM:
221                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
222                                                 cksum));
223                         break;
224                 case PKT_TX_UDP_CKSUM:
225                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
226                                                 dgram_cksum));
227                         break;
228                 case PKT_TX_SCTP_CKSUM:
229                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
230                                                 cksum));
231                         break;
232                 }
233         } else {
234                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
235                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
236                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
237         }
238
239         /* IP cksum verification cannot be bypassed, so calculate it here */
240         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
241                 struct ipv4_hdr *ipv4_hdr;
242
243                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
244                                                    m_buf->l2_len);
245                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
246         }
247
248         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
249                 if (m_buf->ol_flags & PKT_TX_IPV4)
250                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
251                 else
252                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
253                 net_hdr->gso_size = m_buf->tso_segsz;
254                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
255                                         + m_buf->l4_len;
256         } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
257                 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
258                 net_hdr->gso_size = m_buf->tso_segsz;
259                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
260                         m_buf->l4_len;
261         } else {
262                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
263                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
264                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
265         }
266 }
267
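/*
 * Map one guest IOVA range into host virtual address chunks and append
 * them to buf_vec; a single descriptor may produce several chunks when
 * its guest memory is not contiguous in process VA space.
 */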
268 static __rte_always_inline int
269 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
270                 struct buf_vector *buf_vec, uint16_t *vec_idx,
271                 uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
272 {
273         uint16_t vec_id = *vec_idx;
274
275         while (desc_len) {
276                 uint64_t desc_addr;
277                 uint64_t desc_chunck_len = desc_len;
278
279                 if (unlikely(vec_id >= BUF_VECTOR_MAX))
280                         return -1;
281
282                 desc_addr = vhost_iova_to_vva(dev, vq,
283                                 desc_iova,
284                                 &desc_chunck_len,
285                                 perm);
286                 if (unlikely(!desc_addr))
287                         return -1;
288
289                 buf_vec[vec_id].buf_iova = desc_iova;
290                 buf_vec[vec_id].buf_addr = desc_addr;
291                 buf_vec[vec_id].buf_len  = desc_chunck_len;
292
293                 desc_len -= desc_chunck_len;
294                 desc_iova += desc_chunck_len;
295                 vec_id++;
296         }
297         *vec_idx = vec_id;
298
299         return 0;
300 }
301
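/*
 * Walk the split-ring descriptor chain referenced by avail ring slot
 * avail_idx (including indirect tables) and fill buf_vec. The walk is
 * bounded by nr_descs so that a malformed chain cannot loop forever.
 */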
302 static __rte_always_inline int
303 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
304                          uint32_t avail_idx, uint16_t *vec_idx,
305                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
306                          uint32_t *desc_chain_len, uint8_t perm)
307 {
308         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
309         uint16_t vec_id = *vec_idx;
310         uint32_t len    = 0;
311         uint64_t dlen;
312         uint32_t nr_descs = vq->size;
313         struct vring_desc *descs = vq->desc;
314         struct vring_desc *idesc = NULL;
315
316         if (unlikely(idx >= vq->size))
317                 return -1;
318
319         *desc_chain_head = idx;
320
321         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
322                 dlen = vq->desc[idx].len;
323                 nr_descs = dlen / sizeof(struct vring_desc);
324                 if (unlikely(nr_descs > vq->size))
325                         return -1;
326
327                 descs = (struct vring_desc *)(uintptr_t)
328                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
329                                                 &dlen,
330                                                 VHOST_ACCESS_RO);
331                 if (unlikely(!descs))
332                         return -1;
333
334                 if (unlikely(dlen < vq->desc[idx].len)) {
335                         /*
336                          * The indirect desc table is not contiguous
337                          * in process VA space, so we have to copy it.
338                          */
339                         idesc = alloc_copy_ind_table(dev, vq,
340                                         vq->desc[idx].addr, vq->desc[idx].len);
341                         if (unlikely(!idesc))
342                                 return -1;
343
344                         descs = idesc;
345                 }
346
347                 idx = 0;
348         }
349
350         while (1) {
351                 if (unlikely(idx >= vq->size)) {
352                         free_ind_table(idesc);
353                         return -1;
354                 }
355
356                 if (unlikely(nr_descs-- == 0)) {
357                         free_ind_table(idesc);
358                         return -1;
359                 }
360
361                 len += descs[idx].len;
362
363                 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
364                                                 descs[idx].addr, descs[idx].len,
365                                                 perm))) {
366                         free_ind_table(idesc);
367                         return -1;
368                 }
369
370                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
371                         break;
372
373                 idx = descs[idx].next;
374         }
375
376         *desc_chain_len = len;
377         *vec_idx = vec_id;
378
379         if (unlikely(!!idesc))
380                 free_ind_table(idesc);
381
382         return 0;
383 }
384
385 /*
386  * Returns -1 on failure, 0 on success
387  */
388 static inline int
389 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
390                                 uint32_t size, struct buf_vector *buf_vec,
391                                 uint16_t *num_buffers, uint16_t avail_head,
392                                 uint16_t *nr_vec)
393 {
394         uint16_t cur_idx;
395         uint16_t vec_idx = 0;
396         uint16_t max_tries, tries = 0;
397
398         uint16_t head_idx = 0;
399         uint32_t len = 0;
400
401         *num_buffers = 0;
402         cur_idx  = vq->last_avail_idx;
403
404         if (rxvq_is_mergeable(dev))
405                 max_tries = vq->size - 1;
406         else
407                 max_tries = 1;
408
409         while (size > 0) {
410                 if (unlikely(cur_idx == avail_head))
411                         return -1;
412                 /*
413                  * If we have tried all available ring items and still
414                  * cannot get enough buffers, something abnormal has
415                  * happened.
416                  */
417                 if (unlikely(++tries > max_tries))
418                         return -1;
419
420                 if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
421                                                 &vec_idx, buf_vec,
422                                                 &head_idx, &len,
423                                                 VHOST_ACCESS_RW) < 0))
424                         return -1;
425                 len = RTE_MIN(len, size);
426                 update_shadow_used_ring_split(vq, head_idx, len);
427                 size -= len;
428
429                 cur_idx++;
430                 *num_buffers += 1;
431         }
432
433         *nr_vec = vec_idx;
434
435         return 0;
436 }
437
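/*
 * Fill buf_vec from an indirect descriptor table referenced by a packed
 * ring descriptor, copying the table first if it is not contiguous in
 * process VA space.
 */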
438 static __rte_always_inline int
439 fill_vec_buf_packed_indirect(struct virtio_net *dev,
440                         struct vhost_virtqueue *vq,
441                         struct vring_packed_desc *desc, uint16_t *vec_idx,
442                         struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
443 {
444         uint16_t i;
445         uint32_t nr_descs;
446         uint16_t vec_id = *vec_idx;
447         uint64_t dlen;
448         struct vring_packed_desc *descs, *idescs = NULL;
449
450         dlen = desc->len;
451         descs = (struct vring_packed_desc *)(uintptr_t)
452                 vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
453         if (unlikely(!descs))
454                 return -1;
455
456         if (unlikely(dlen < desc->len)) {
457                 /*
458                  * The indirect desc table is not contiguous
459                  * in process VA space, so we have to copy it.
460                  */
461                 idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len);
462                 if (unlikely(!idescs))
463                         return -1;
464
465                 descs = idescs;
466         }
467
468         nr_descs = desc->len / sizeof(struct vring_packed_desc);
469         if (unlikely(nr_descs >= vq->size)) {
470                 free_ind_table(idescs);
471                 return -1;
472         }
473
474         for (i = 0; i < nr_descs; i++) {
475                 if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
476                         free_ind_table(idescs);
477                         return -1;
478                 }
479
480                 *len += descs[i].len;
481                 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
482                                                 descs[i].addr, descs[i].len,
483                                                 perm)))
484                         return -1;
485         }
486         *vec_idx = vec_id;
487
488         if (unlikely(!!idescs))
489                 free_ind_table(idescs);
490
491         return 0;
492 }
493
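/*
 * Walk a packed-ring descriptor chain starting at avail_idx and fill
 * buf_vec, reporting how many descriptors the chain consumes.
 */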
494 static __rte_always_inline int
495 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
496                                 uint16_t avail_idx, uint16_t *desc_count,
497                                 struct buf_vector *buf_vec, uint16_t *vec_idx,
498                                 uint16_t *buf_id, uint32_t *len, uint8_t perm)
499 {
500         bool wrap_counter = vq->avail_wrap_counter;
501         struct vring_packed_desc *descs = vq->desc_packed;
502         uint16_t vec_id = *vec_idx;
503
504         if (avail_idx < vq->last_avail_idx)
505                 wrap_counter ^= 1;
506
507         if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
508                 return -1;
509
510         /*
511          * The ordering between desc flags and desc
512          * content reads needs to be enforced.
513          */
514         rte_smp_rmb();
515
516         *desc_count = 0;
517         *len = 0;
518
519         while (1) {
520                 if (unlikely(vec_id >= BUF_VECTOR_MAX))
521                         return -1;
522
523                 if (unlikely(*desc_count >= vq->size))
524                         return -1;
525
526                 *desc_count += 1;
527                 *buf_id = descs[avail_idx].id;
528
529                 if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
530                         if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
531                                                         &descs[avail_idx],
532                                                         &vec_id, buf_vec,
533                                                         len, perm) < 0))
534                                 return -1;
535                 } else {
536                         *len += descs[avail_idx].len;
537
538                         if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
539                                                         descs[avail_idx].addr,
540                                                         descs[avail_idx].len,
541                                                         perm)))
542                                 return -1;
543                 }
544
545                 if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
546                         break;
547
548                 if (++avail_idx >= vq->size) {
549                         avail_idx -= vq->size;
550                         wrap_counter ^= 1;
551                 }
552         }
553
554         *vec_idx = vec_id;
555
556         return 0;
557 }
558
559 /*
560  * Returns -1 on failure, 0 on success
561  */
562 static inline int
563 reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
564                                 uint32_t size, struct buf_vector *buf_vec,
565                                 uint16_t *nr_vec, uint16_t *num_buffers,
566                                 uint16_t *nr_descs)
567 {
568         uint16_t avail_idx;
569         uint16_t vec_idx = 0;
570         uint16_t max_tries, tries = 0;
571
572         uint16_t buf_id = 0;
573         uint32_t len = 0;
574         uint16_t desc_count;
575
576         *num_buffers = 0;
577         avail_idx = vq->last_avail_idx;
578
579         if (rxvq_is_mergeable(dev))
580                 max_tries = vq->size - 1;
581         else
582                 max_tries = 1;
583
584         while (size > 0) {
585                 /*
586                  * If we have tried all available ring items and still
587                  * cannot get enough buffers, something abnormal has
588                  * happened.
589                  */
590                 if (unlikely(++tries > max_tries))
591                         return -1;
592
593                 if (unlikely(fill_vec_buf_packed(dev, vq,
594                                                 avail_idx, &desc_count,
595                                                 buf_vec, &vec_idx,
596                                                 &buf_id, &len,
597                                                 VHOST_ACCESS_RW) < 0))
598                         return -1;
599
600                 len = RTE_MIN(len, size);
601                 update_shadow_used_ring_packed(vq, buf_id, len, desc_count);
602                 size -= len;
603
604                 avail_idx += desc_count;
605                 if (avail_idx >= vq->size)
606                         avail_idx -= vq->size;
607
608                 *nr_descs += desc_count;
609                 *num_buffers += 1;
610         }
611
612         *nr_vec = vec_idx;
613
614         return 0;
615 }
616
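/*
 * Copy an mbuf chain into the guest buffers described by buf_vec, writing
 * the virtio-net header first (split across buffers when it does not fit
 * in the first one).
 */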
617 static __rte_always_inline int
618 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
619                             struct rte_mbuf *m, struct buf_vector *buf_vec,
620                             uint16_t nr_vec, uint16_t num_buffers)
621 {
622         uint32_t vec_idx = 0;
623         uint32_t mbuf_offset, mbuf_avail;
624         uint32_t buf_offset, buf_avail;
625         uint64_t buf_addr, buf_iova, buf_len;
626         uint32_t cpy_len;
627         uint64_t hdr_addr;
628         struct rte_mbuf *hdr_mbuf;
629         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
630         struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
631         int error = 0;
632
633         if (unlikely(m == NULL)) {
634                 error = -1;
635                 goto out;
636         }
637
638         buf_addr = buf_vec[vec_idx].buf_addr;
639         buf_iova = buf_vec[vec_idx].buf_iova;
640         buf_len = buf_vec[vec_idx].buf_len;
641
642         if (nr_vec > 1)
643                 rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
644
645         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
646                 error = -1;
647                 goto out;
648         }
649
650         hdr_mbuf = m;
651         hdr_addr = buf_addr;
652         if (unlikely(buf_len < dev->vhost_hlen))
653                 hdr = &tmp_hdr;
654         else
655                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
656
657         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
658                 dev->vid, num_buffers);
659
660         if (unlikely(buf_len < dev->vhost_hlen)) {
661                 buf_offset = dev->vhost_hlen - buf_len;
662                 vec_idx++;
663                 buf_addr = buf_vec[vec_idx].buf_addr;
664                 buf_iova = buf_vec[vec_idx].buf_iova;
665                 buf_len = buf_vec[vec_idx].buf_len;
666                 buf_avail = buf_len - buf_offset;
667         } else {
668                 buf_offset = dev->vhost_hlen;
669                 buf_avail = buf_len - dev->vhost_hlen;
670         }
671
672         mbuf_avail  = rte_pktmbuf_data_len(m);
673         mbuf_offset = 0;
674         while (mbuf_avail != 0 || m->next != NULL) {
675                 /* done with current buf, get the next one */
676                 if (buf_avail == 0) {
677                         vec_idx++;
678                         if (unlikely(vec_idx >= nr_vec)) {
679                                 error = -1;
680                                 goto out;
681                         }
682
683                         buf_addr = buf_vec[vec_idx].buf_addr;
684                         buf_iova = buf_vec[vec_idx].buf_iova;
685                         buf_len = buf_vec[vec_idx].buf_len;
686
687                         /* Prefetch next buffer address. */
688                         if (vec_idx + 1 < nr_vec)
689                                 rte_prefetch0((void *)(uintptr_t)
690                                                 buf_vec[vec_idx + 1].buf_addr);
691                         buf_offset = 0;
692                         buf_avail  = buf_len;
693                 }
694
695                 /* done with current mbuf, get the next one */
696                 if (mbuf_avail == 0) {
697                         m = m->next;
698
699                         mbuf_offset = 0;
700                         mbuf_avail  = rte_pktmbuf_data_len(m);
701                 }
702
703                 if (hdr_addr) {
704                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
705                         if (rxvq_is_mergeable(dev))
706                                 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
707                                                 num_buffers);
708
709                         if (unlikely(hdr == &tmp_hdr)) {
710                                 uint64_t len;
711                                 uint64_t remain = dev->vhost_hlen;
712                                 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
713                                 uint64_t iova = buf_vec[0].buf_iova;
714                                 uint16_t hdr_vec_idx = 0;
715
716                                 while (remain) {
717                                         len = RTE_MIN(remain,
718                                                 buf_vec[hdr_vec_idx].buf_len);
719                                         dst = buf_vec[hdr_vec_idx].buf_addr;
720                                         rte_memcpy((void *)(uintptr_t)dst,
721                                                         (void *)(uintptr_t)src,
722                                                         len);
723
724                                         PRINT_PACKET(dev, (uintptr_t)dst,
725                                                         (uint32_t)len, 0);
726                                         vhost_log_cache_write(dev, vq,
727                                                         iova, len);
728
729                                         remain -= len;
730                                         iova += len;
731                                         src += len;
732                                         hdr_vec_idx++;
733                                 }
734                         } else {
735                                 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
736                                                 dev->vhost_hlen, 0);
737                                 vhost_log_cache_write(dev, vq,
738                                                 buf_vec[0].buf_iova,
739                                                 dev->vhost_hlen);
740                         }
741
742                         hdr_addr = 0;
743                 }
744
745                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
746
747                 if (likely(cpy_len > MAX_BATCH_LEN ||
748                                         vq->batch_copy_nb_elems >= vq->size)) {
749                         rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
750                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
751                                 cpy_len);
752                         vhost_log_cache_write(dev, vq, buf_iova + buf_offset,
753                                         cpy_len);
754                         PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
755                                 cpy_len, 0);
756                 } else {
757                         batch_copy[vq->batch_copy_nb_elems].dst =
758                                 (void *)((uintptr_t)(buf_addr + buf_offset));
759                         batch_copy[vq->batch_copy_nb_elems].src =
760                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
761                         batch_copy[vq->batch_copy_nb_elems].log_addr =
762                                 buf_iova + buf_offset;
763                         batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
764                         vq->batch_copy_nb_elems++;
765                 }
766
767                 mbuf_avail  -= cpy_len;
768                 mbuf_offset += cpy_len;
769                 buf_avail  -= cpy_len;
770                 buf_offset += cpy_len;
771         }
772
773 out:
774
775         return error;
776 }
777
778 static __rte_always_inline uint32_t
779 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
780         struct rte_mbuf **pkts, uint32_t count)
781 {
782         uint32_t pkt_idx = 0;
783         uint16_t num_buffers;
784         struct buf_vector buf_vec[BUF_VECTOR_MAX];
785         uint16_t avail_head;
786
787         avail_head = *((volatile uint16_t *)&vq->avail->idx);
788
789         /*
790          * The ordering between avail index and
791          * desc reads needs to be enforced.
792          */
793         rte_smp_rmb();
794
795         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
796
797         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
798                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
799                 uint16_t nr_vec = 0;
800
801                 if (unlikely(reserve_avail_buf_split(dev, vq,
802                                                 pkt_len, buf_vec, &num_buffers,
803                                                 avail_head, &nr_vec) < 0)) {
804                         VHOST_LOG_DEBUG(VHOST_DATA,
805                                 "(%d) failed to get enough desc from vring\n",
806                                 dev->vid);
807                         vq->shadow_used_idx -= num_buffers;
808                         break;
809                 }
810
811                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
812
813                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
814                         dev->vid, vq->last_avail_idx,
815                         vq->last_avail_idx + num_buffers);
816
817                 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
818                                                 buf_vec, nr_vec,
819                                                 num_buffers) < 0) {
820                         vq->shadow_used_idx -= num_buffers;
821                         break;
822                 }
823
824                 vq->last_avail_idx += num_buffers;
825         }
826
827         do_data_copy_enqueue(dev, vq);
828
829         if (likely(vq->shadow_used_idx)) {
830                 flush_shadow_used_ring_split(dev, vq);
831                 vhost_vring_call_split(dev, vq);
832         }
833
834         return pkt_idx;
835 }
836
837 static __rte_always_inline uint32_t
838 virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
839         struct rte_mbuf **pkts, uint32_t count)
840 {
841         uint32_t pkt_idx = 0;
842         uint16_t num_buffers;
843         struct buf_vector buf_vec[BUF_VECTOR_MAX];
844
845         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
846                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
847                 uint16_t nr_vec = 0;
848                 uint16_t nr_descs = 0;
849
850                 if (unlikely(reserve_avail_buf_packed(dev, vq,
851                                                 pkt_len, buf_vec, &nr_vec,
852                                                 &num_buffers, &nr_descs) < 0)) {
853                         VHOST_LOG_DEBUG(VHOST_DATA,
854                                 "(%d) failed to get enough desc from vring\n",
855                                 dev->vid);
856                         vq->shadow_used_idx -= num_buffers;
857                         break;
858                 }
859
860                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
861
862                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
863                         dev->vid, vq->last_avail_idx,
864                         vq->last_avail_idx + num_buffers);
865
866                 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
867                                                 buf_vec, nr_vec,
868                                                 num_buffers) < 0) {
869                         vq->shadow_used_idx -= num_buffers;
870                         break;
871                 }
872
873                 vq->last_avail_idx += nr_descs;
874                 if (vq->last_avail_idx >= vq->size) {
875                         vq->last_avail_idx -= vq->size;
876                         vq->avail_wrap_counter ^= 1;
877                 }
878         }
879
880         do_data_copy_enqueue(dev, vq);
881
882         if (likely(vq->shadow_used_idx)) {
883                 flush_shadow_used_ring_packed(dev, vq);
884                 vhost_vring_call_packed(dev, vq);
885         }
886
887         return pkt_idx;
888 }
889
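/*
 * Enqueue a burst of host mbufs into the RX virtqueue identified by
 * queue_id, after validating and locking it; dispatches to the split or
 * packed ring implementation.
 */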
890 static __rte_always_inline uint32_t
891 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
892         struct rte_mbuf **pkts, uint32_t count)
893 {
894         struct vhost_virtqueue *vq;
895         uint32_t nb_tx = 0;
896
897         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
898         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
899                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
900                         dev->vid, __func__, queue_id);
901                 return 0;
902         }
903
904         vq = dev->virtqueue[queue_id];
905
906         rte_spinlock_lock(&vq->access_lock);
907
908         if (unlikely(vq->enabled == 0))
909                 goto out_access_unlock;
910
911         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
912                 vhost_user_iotlb_rd_lock(vq);
913
914         if (unlikely(vq->access_ok == 0))
915                 if (unlikely(vring_translate(dev, vq) < 0))
916                         goto out;
917
918         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
919         if (count == 0)
920                 goto out;
921
922         if (vq_is_packed(dev))
923                 nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
924         else
925                 nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
926
927 out:
928         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
929                 vhost_user_iotlb_rd_unlock(vq);
930
931 out_access_unlock:
932         rte_spinlock_unlock(&vq->access_lock);
933
934         return nb_tx;
935 }
936
937 uint16_t
938 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
939         struct rte_mbuf **pkts, uint16_t count)
940 {
941         struct virtio_net *dev = get_device(vid);
942
943         if (!dev)
944                 return 0;
945
946         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
947                 RTE_LOG(ERR, VHOST_DATA,
948                         "(%d) %s: built-in vhost net backend is disabled.\n",
949                         dev->vid, __func__);
950                 return 0;
951         }
952
953         return virtio_dev_rx(dev, queue_id, pkts, count);
954 }
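/*
 * Usage sketch (illustrative only, not part of this file): a host
 * application that switches packets into a guest typically calls the API
 * above with the first RX queue (index 0) and keeps ownership of the
 * mbufs, since the data is copied into guest buffers. The variables vid,
 * pkts and nb_rx are assumed to come from the caller.
 *
 *	uint16_t i, nb_sent;
 *
 *	nb_sent = rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);
 *	// nb_sent may be smaller than nb_rx if the ring is full
 *	for (i = 0; i < nb_rx; i++)
 *		rte_pktmbuf_free(pkts[i]);
 */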
955
956 static inline bool
957 virtio_net_with_host_offload(struct virtio_net *dev)
958 {
959         if (dev->features &
960                         ((1ULL << VIRTIO_NET_F_CSUM) |
961                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
962                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
963                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
964                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
965                 return true;
966
967         return false;
968 }
969
970 static void
971 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
972 {
973         struct ipv4_hdr *ipv4_hdr;
974         struct ipv6_hdr *ipv6_hdr;
975         void *l3_hdr = NULL;
976         struct ether_hdr *eth_hdr;
977         uint16_t ethertype;
978
979         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
980
981         m->l2_len = sizeof(struct ether_hdr);
982         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
983
984         if (ethertype == ETHER_TYPE_VLAN) {
985                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
986
987                 m->l2_len += sizeof(struct vlan_hdr);
988                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
989         }
990
991         l3_hdr = (char *)eth_hdr + m->l2_len;
992
993         switch (ethertype) {
994         case ETHER_TYPE_IPv4:
995                 ipv4_hdr = l3_hdr;
996                 *l4_proto = ipv4_hdr->next_proto_id;
997                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
998                 *l4_hdr = (char *)l3_hdr + m->l3_len;
999                 m->ol_flags |= PKT_TX_IPV4;
1000                 break;
1001         case ETHER_TYPE_IPv6:
1002                 ipv6_hdr = l3_hdr;
1003                 *l4_proto = ipv6_hdr->proto;
1004                 m->l3_len = sizeof(struct ipv6_hdr);
1005                 *l4_hdr = (char *)l3_hdr + m->l3_len;
1006                 m->ol_flags |= PKT_TX_IPV6;
1007                 break;
1008         default:
1009                 m->l3_len = 0;
1010                 *l4_proto = 0;
1011                 *l4_hdr = NULL;
1012                 break;
1013         }
1014 }
1015
1016 static __rte_always_inline void
1017 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
1018 {
1019         uint16_t l4_proto = 0;
1020         void *l4_hdr = NULL;
1021         struct tcp_hdr *tcp_hdr = NULL;
1022
1023         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
1024                 return;
1025
1026         parse_ethernet(m, &l4_proto, &l4_hdr);
1027         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1028                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
1029                         switch (hdr->csum_offset) {
1030                         case (offsetof(struct tcp_hdr, cksum)):
1031                                 if (l4_proto == IPPROTO_TCP)
1032                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
1033                                 break;
1034                         case (offsetof(struct udp_hdr, dgram_cksum)):
1035                                 if (l4_proto == IPPROTO_UDP)
1036                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
1037                                 break;
1038                         case (offsetof(struct sctp_hdr, cksum)):
1039                                 if (l4_proto == IPPROTO_SCTP)
1040                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
1041                                 break;
1042                         default:
1043                                 break;
1044                         }
1045                 }
1046         }
1047
1048         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1049                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1050                 case VIRTIO_NET_HDR_GSO_TCPV4:
1051                 case VIRTIO_NET_HDR_GSO_TCPV6:
1052                         tcp_hdr = l4_hdr;
1053                         m->ol_flags |= PKT_TX_TCP_SEG;
1054                         m->tso_segsz = hdr->gso_size;
1055                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1056                         break;
1057                 case VIRTIO_NET_HDR_GSO_UDP:
1058                         m->ol_flags |= PKT_TX_UDP_SEG;
1059                         m->tso_segsz = hdr->gso_size;
1060                         m->l4_len = sizeof(struct udp_hdr);
1061                         break;
1062                 default:
1063                         RTE_LOG(WARNING, VHOST_DATA,
1064                                 "unsupported gso type %u.\n", hdr->gso_type);
1065                         break;
1066                 }
1067         }
1068 }
1069
1070 static __rte_always_inline void
1071 put_zmbuf(struct zcopy_mbuf *zmbuf)
1072 {
1073         zmbuf->in_use = 0;
1074 }
1075
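/*
 * Copy the guest buffers described by buf_vec into an mbuf chain,
 * extracting offload information from the virtio-net header; in dequeue
 * zero-copy mode the mbuf may reference guest memory directly instead of
 * copying it.
 */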
1076 static __rte_always_inline int
1077 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1078                   struct buf_vector *buf_vec, uint16_t nr_vec,
1079                   struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
1080 {
1081         uint32_t buf_avail, buf_offset;
1082         uint64_t buf_addr, buf_iova, buf_len;
1083         uint32_t mbuf_avail, mbuf_offset;
1084         uint32_t cpy_len;
1085         struct rte_mbuf *cur = m, *prev = m;
1086         struct virtio_net_hdr tmp_hdr;
1087         struct virtio_net_hdr *hdr = NULL;
1088         /* A counter to avoid a dead loop in the desc chain */
1089         uint16_t vec_idx = 0;
1090         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1091         int error = 0;
1092
1093         buf_addr = buf_vec[vec_idx].buf_addr;
1094         buf_iova = buf_vec[vec_idx].buf_iova;
1095         buf_len = buf_vec[vec_idx].buf_len;
1096
1097         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
1098                 error = -1;
1099                 goto out;
1100         }
1101
1102         if (likely(nr_vec > 1))
1103                 rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
1104
1105         if (virtio_net_with_host_offload(dev)) {
1106                 if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
1107                         uint64_t len;
1108                         uint64_t remain = sizeof(struct virtio_net_hdr);
1109                         uint64_t src;
1110                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1111                         uint16_t hdr_vec_idx = 0;
1112
1113                         /*
1114                          * No luck, the virtio-net header doesn't fit
1115                          * in a contiguous virtual area.
1116                          */
1117                         while (remain) {
1118                                 len = RTE_MIN(remain,
1119                                         buf_vec[hdr_vec_idx].buf_len);
1120                                 src = buf_vec[hdr_vec_idx].buf_addr;
1121                                 rte_memcpy((void *)(uintptr_t)dst,
1122                                                    (void *)(uintptr_t)src, len);
1123
1124                                 remain -= len;
1125                                 dst += len;
1126                                 hdr_vec_idx++;
1127                         }
1128
1129                         hdr = &tmp_hdr;
1130                 } else {
1131                         hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
1132                         rte_prefetch0(hdr);
1133                 }
1134         }
1135
1136         /*
1137          * A virtio driver normally uses at least 2 desc buffers
1138          * for Tx: the first for storing the header, and others
1139          * for storing the data.
1140          */
1141         if (unlikely(buf_len < dev->vhost_hlen)) {
1142                 buf_offset = dev->vhost_hlen - buf_len;
1143                 vec_idx++;
1144                 buf_addr = buf_vec[vec_idx].buf_addr;
1145                 buf_iova = buf_vec[vec_idx].buf_iova;
1146                 buf_len = buf_vec[vec_idx].buf_len;
1147                 buf_avail  = buf_len - buf_offset;
1148         } else if (buf_len == dev->vhost_hlen) {
1149                 if (unlikely(++vec_idx >= nr_vec))
1150                         goto out;
1151                 buf_addr = buf_vec[vec_idx].buf_addr;
1152                 buf_iova = buf_vec[vec_idx].buf_iova;
1153                 buf_len = buf_vec[vec_idx].buf_len;
1154
1155                 buf_offset = 0;
1156                 buf_avail = buf_len;
1157         } else {
1158                 buf_offset = dev->vhost_hlen;
1159                 buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
1160         }
1161
1162         rte_prefetch0((void *)(uintptr_t)
1163                         (buf_addr + buf_offset));
1164
1165         PRINT_PACKET(dev,
1166                         (uintptr_t)(buf_addr + buf_offset),
1167                         (uint32_t)buf_avail, 0);
1168
1169         mbuf_offset = 0;
1170         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1171         while (1) {
1172                 uint64_t hpa;
1173
1174                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1175
1176                 /*
1177                  * A desc buf might span two host physical pages that are not
1178                  * contiguous. In such a case (gpa_to_hpa returns 0), data will
1179                  * be copied even though zero copy is enabled.
1180                  */
1181                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1182                                         buf_iova + buf_offset, cpy_len)))) {
1183                         cur->data_len = cpy_len;
1184                         cur->data_off = 0;
1185                         cur->buf_addr =
1186                                 (void *)(uintptr_t)(buf_addr + buf_offset);
1187                         cur->buf_iova = hpa;
1188
1189                         /*
1190                          * In zero copy mode, one mbuf can only reference data
1191                          * for one desc buf, or part of one.
1192                          */
1193                         mbuf_avail = cpy_len;
1194                 } else {
1195                         if (likely(cpy_len > MAX_BATCH_LEN ||
1196                                    vq->batch_copy_nb_elems >= vq->size ||
1197                                    (hdr && cur == m))) {
1198                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1199                                                                    mbuf_offset),
1200                                            (void *)((uintptr_t)(buf_addr +
1201                                                            buf_offset)),
1202                                            cpy_len);
1203                         } else {
1204                                 batch_copy[vq->batch_copy_nb_elems].dst =
1205                                         rte_pktmbuf_mtod_offset(cur, void *,
1206                                                                 mbuf_offset);
1207                                 batch_copy[vq->batch_copy_nb_elems].src =
1208                                         (void *)((uintptr_t)(buf_addr +
1209                                                                 buf_offset));
1210                                 batch_copy[vq->batch_copy_nb_elems].len =
1211                                         cpy_len;
1212                                 vq->batch_copy_nb_elems++;
1213                         }
1214                 }
1215
1216                 mbuf_avail  -= cpy_len;
1217                 mbuf_offset += cpy_len;
1218                 buf_avail -= cpy_len;
1219                 buf_offset += cpy_len;
1220
1221                 /* This buf has reached its end, get the next one */
1222                 if (buf_avail == 0) {
1223                         if (++vec_idx >= nr_vec)
1224                                 break;
1225
1226                         buf_addr = buf_vec[vec_idx].buf_addr;
1227                         buf_iova = buf_vec[vec_idx].buf_iova;
1228                         buf_len = buf_vec[vec_idx].buf_len;
1229
1230                         /*
1231                          * Prefetch desc n + 1 buffer while
1232                          * desc n buffer is processed.
1233                          */
1234                         if (vec_idx + 1 < nr_vec)
1235                                 rte_prefetch0((void *)(uintptr_t)
1236                                                 buf_vec[vec_idx + 1].buf_addr);
1237
1238                         buf_offset = 0;
1239                         buf_avail  = buf_len;
1240
1241                         PRINT_PACKET(dev, (uintptr_t)buf_addr,
1242                                         (uint32_t)buf_avail, 0);
1243                 }
1244
1245                 /*
1246                  * This mbuf has reached its end, get a new one
1247                  * to hold more data.
1248                  */
1249                 if (mbuf_avail == 0) {
1250                         cur = rte_pktmbuf_alloc(mbuf_pool);
1251                         if (unlikely(cur == NULL)) {
1252                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1253                                         "allocate memory for mbuf.\n");
1254                                 error = -1;
1255                                 goto out;
1256                         }
1257                         if (unlikely(dev->dequeue_zero_copy))
1258                                 rte_mbuf_refcnt_update(cur, 1);
1259
1260                         prev->next = cur;
1261                         prev->data_len = mbuf_offset;
1262                         m->nb_segs += 1;
1263                         m->pkt_len += mbuf_offset;
1264                         prev = cur;
1265
1266                         mbuf_offset = 0;
1267                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1268                 }
1269         }
1270
1271         prev->data_len = mbuf_offset;
1272         m->pkt_len    += mbuf_offset;
1273
1274         if (hdr)
1275                 vhost_dequeue_offload(hdr, m);
1276
1277 out:
1278
1279         return error;
1280 }
1281
1282 static __rte_always_inline struct zcopy_mbuf *
1283 get_zmbuf(struct vhost_virtqueue *vq)
1284 {
1285         uint16_t i;
1286         uint16_t last;
1287         int tries = 0;
1288
1289         /* search [last_zmbuf_idx, zmbuf_size) */
1290         i = vq->last_zmbuf_idx;
1291         last = vq->zmbuf_size;
1292
1293 again:
1294         for (; i < last; i++) {
1295                 if (vq->zmbufs[i].in_use == 0) {
1296                         vq->last_zmbuf_idx = i + 1;
1297                         vq->zmbufs[i].in_use = 1;
1298                         return &vq->zmbufs[i];
1299                 }
1300         }
1301
1302         tries++;
1303         if (tries == 1) {
1304                 /* search [0, last_zmbuf_idx) */
1305                 i = 0;
1306                 last = vq->last_zmbuf_idx;
1307                 goto again;
1308         }
1309
1310         return NULL;
1311 }
1312
1313 static __rte_always_inline bool
1314 mbuf_is_consumed(struct rte_mbuf *m)
1315 {
1316         while (m) {
1317                 if (rte_mbuf_refcnt_read(m) > 1)
1318                         return false;
1319                 m = m->next;
1320         }
1321
1322         return true;
1323 }
1324
1325 static __rte_always_inline void
1326 restore_mbuf(struct rte_mbuf *m)
1327 {
1328         uint32_t mbuf_size, priv_size;
1329
1330         while (m) {
1331                 priv_size = rte_pktmbuf_priv_size(m->pool);
1332                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1333                 /* start of buffer is after mbuf structure and priv data */
1334
1335                 m->buf_addr = (char *)m + mbuf_size;
1336                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1337                 m = m->next;
1338         }
1339 }
1340
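/*
 * Dequeue up to count packets from a split ring: reclaim completed
 * zero-copy mbufs first, then copy each descriptor chain into a freshly
 * allocated mbuf.
 */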
1341 static __rte_always_inline uint16_t
1342 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1343         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1344 {
1345         uint16_t i;
1346         uint16_t free_entries;
1347
1348         if (unlikely(dev->dequeue_zero_copy)) {
1349                 struct zcopy_mbuf *zmbuf, *next;
1350
1351                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1352                      zmbuf != NULL; zmbuf = next) {
1353                         next = TAILQ_NEXT(zmbuf, next);
1354
1355                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1356                                 update_shadow_used_ring_split(vq,
1357                                                 zmbuf->desc_idx, 0);
1358                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1359                                 restore_mbuf(zmbuf->mbuf);
1360                                 rte_pktmbuf_free(zmbuf->mbuf);
1361                                 put_zmbuf(zmbuf);
1362                                 vq->nr_zmbuf -= 1;
1363                         }
1364                 }
1365
1366                 if (likely(vq->shadow_used_idx)) {
1367                         flush_shadow_used_ring_split(dev, vq);
1368                         vhost_vring_call_split(dev, vq);
1369                 }
1370         }
1371
1372         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1373                         vq->last_avail_idx;
1374         if (free_entries == 0)
1375                 return 0;
1376
1377         /*
1378          * The ordering between avail index and
1379          * desc reads needs to be enforced.
1380          */
1381         rte_smp_rmb();
1382
1383         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1384
1385         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1386
1387         count = RTE_MIN(count, MAX_PKT_BURST);
1388         count = RTE_MIN(count, free_entries);
1389         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1390                         dev->vid, count);
1391
1392         for (i = 0; i < count; i++) {
1393                 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1394                 uint16_t head_idx;
1395                 uint32_t dummy_len;
1396                 uint16_t nr_vec = 0;
1397                 int err;
1398
1399                 if (unlikely(fill_vec_buf_split(dev, vq,
1400                                                 vq->last_avail_idx + i,
1401                                                 &nr_vec, buf_vec,
1402                                                 &head_idx, &dummy_len,
1403                                                 VHOST_ACCESS_RO) < 0))
1404                         break;
1405
1406                 if (likely(dev->dequeue_zero_copy == 0))
1407                         update_shadow_used_ring_split(vq, head_idx, 0);
1408
1409                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
1410
1411                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1412                 if (unlikely(pkts[i] == NULL)) {
1413                         RTE_LOG(ERR, VHOST_DATA,
1414                                 "Failed to allocate memory for mbuf.\n");
1415                         break;
1416                 }
1417
1418                 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
1419                                 mbuf_pool);
1420                 if (unlikely(err)) {
1421                         rte_pktmbuf_free(pkts[i]);
1422                         break;
1423                 }
1424
1425                 if (unlikely(dev->dequeue_zero_copy)) {
1426                         struct zcopy_mbuf *zmbuf;
1427
1428                         zmbuf = get_zmbuf(vq);
1429                         if (!zmbuf) {
1430                                 rte_pktmbuf_free(pkts[i]);
1431                                 break;
1432                         }
1433                         zmbuf->mbuf = pkts[i];
1434                         zmbuf->desc_idx = head_idx;
1435
1436                         /*
1437                          * Pin the mbuf by bumping its refcnt; we will
1438                          * check later whether the application has freed
1439                          * it (i.e. we are the last user). Only then can
1440                          * the used ring be updated safely.
1441                          */
1442                         rte_mbuf_refcnt_update(pkts[i], 1);
1443
1444                         vq->nr_zmbuf += 1;
1445                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1446                 }
1447         }
1448         vq->last_avail_idx += i;
1449
1450         if (likely(dev->dequeue_zero_copy == 0)) {
1451                 do_data_copy_dequeue(vq);
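                     /*
                      * If the loop stopped early, the shadow ring may hold an
                      * entry for the descriptor that failed; trim it so only
                      * the i successfully dequeued buffers are marked as used.
                      */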
1452                 if (unlikely(i < count))
1453                         vq->shadow_used_idx = i;
1454                 if (likely(vq->shadow_used_idx)) {
1455                         flush_shadow_used_ring_split(dev, vq);
1456                         vhost_vring_call_split(dev, vq);
1457                 }
1458         }
1459
1460         return i;
1461 }
1462
1463 static __rte_always_inline uint16_t
1464 virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1465         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1466 {
1467         uint16_t i;
1468
1469         if (unlikely(dev->dequeue_zero_copy)) {
1470                 struct zcopy_mbuf *zmbuf, *next;
1471
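                     /*
                      * Same zero-copy reclaim pass as in the split ring path,
                      * except the whole descriptor chain length is returned to
                      * the packed used ring.
                      */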
1472                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1473                      zmbuf != NULL; zmbuf = next) {
1474                         next = TAILQ_NEXT(zmbuf, next);
1475
1476                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1477                                 update_shadow_used_ring_packed(vq,
1478                                                 zmbuf->desc_idx,
1479                                                 0,
1480                                                 zmbuf->desc_count);
1481
1482                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1483                                 restore_mbuf(zmbuf->mbuf);
1484                                 rte_pktmbuf_free(zmbuf->mbuf);
1485                                 put_zmbuf(zmbuf);
1486                                 vq->nr_zmbuf -= 1;
1487                         }
1488                 }
1489
1490                 if (likely(vq->shadow_used_idx)) {
1491                         flush_shadow_used_ring_packed(dev, vq);
1492                         vhost_vring_call_packed(dev, vq);
1493                 }
1494         }
1495
1496         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1497
1498         count = RTE_MIN(count, MAX_PKT_BURST);
1499         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1500                         dev->vid, count);
1501
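             /*
              * Same per-packet flow as the split ring path; fill_vec_buf_packed()
              * also returns the descriptor chain length so the avail index can
              * be advanced by the right amount below.
              */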
1502         for (i = 0; i < count; i++) {
1503                 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1504                 uint16_t buf_id;
1505                 uint32_t dummy_len;
1506                 uint16_t desc_count, nr_vec = 0;
1507                 int err;
1508
1509                 if (unlikely(fill_vec_buf_packed(dev, vq,
1510                                                 vq->last_avail_idx, &desc_count,
1511                                                 buf_vec, &nr_vec,
1512                                                 &buf_id, &dummy_len,
1513                                                 VHOST_ACCESS_RO) < 0))
1514                         break;
1515
1516                 if (likely(dev->dequeue_zero_copy == 0))
1517                         update_shadow_used_ring_packed(vq, buf_id, 0,
1518                                         desc_count);
1519
1520                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
1521
1522                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1523                 if (unlikely(pkts[i] == NULL)) {
1524                         RTE_LOG(ERR, VHOST_DATA,
1525                                 "Failed to allocate memory for mbuf.\n");
1526                         break;
1527                 }
1528
1529                 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
1530                                 mbuf_pool);
1531                 if (unlikely(err)) {
1532                         rte_pktmbuf_free(pkts[i]);
1533                         break;
1534                 }
1535
1536                 if (unlikely(dev->dequeue_zero_copy)) {
1537                         struct zcopy_mbuf *zmbuf;
1538
1539                         zmbuf = get_zmbuf(vq);
1540                         if (!zmbuf) {
1541                                 rte_pktmbuf_free(pkts[i]);
1542                                 break;
1543                         }
1544                         zmbuf->mbuf = pkts[i];
1545                         zmbuf->desc_idx = buf_id;
1546                         zmbuf->desc_count = desc_count;
1547
1548                         /*
1549                          * Pin the mbuf by bumping its refcnt; we will
1550                          * check later whether the application has freed
1551                          * it (i.e. we are the last user). Only then can
1552                          * the used ring be updated safely.
1553                          */
1554                         rte_mbuf_refcnt_update(pkts[i], 1);
1555
1556                         vq->nr_zmbuf += 1;
1557                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1558                 }
1559
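                     /*
                      * A buffer may span several descriptors; advance
                      * last_avail_idx by the whole chain and toggle the wrap
                      * counter when wrapping past the end of the ring.
                      */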
1560                 vq->last_avail_idx += desc_count;
1561                 if (vq->last_avail_idx >= vq->size) {
1562                         vq->last_avail_idx -= vq->size;
1563                         vq->avail_wrap_counter ^= 1;
1564                 }
1565         }
1566
1567         if (likely(dev->dequeue_zero_copy == 0)) {
1568                 do_data_copy_dequeue(vq);
1569                 if (unlikely(i < count))
1570                         vq->shadow_used_idx = i;
1571                 if (likely(vq->shadow_used_idx)) {
1572                         flush_shadow_used_ring_packed(dev, vq);
1573                         vhost_vring_call_packed(dev, vq);
1574                 }
1575         }
1576
1577         return i;
1578 }
1579
1580 uint16_t
1581 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1582         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1583 {
1584         struct virtio_net *dev;
1585         struct rte_mbuf *rarp_mbuf = NULL;
1586         struct vhost_virtqueue *vq;
1587
1588         dev = get_device(vid);
1589         if (!dev)
1590                 return 0;
1591
1592         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1593                 RTE_LOG(ERR, VHOST_DATA,
1594                         "(%d) %s: built-in vhost net backend is disabled.\n",
1595                         dev->vid, __func__);
1596                 return 0;
1597         }
1598
1599         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1600                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1601                         dev->vid, __func__, queue_id);
1602                 return 0;
1603         }
1604
1605         vq = dev->virtqueue[queue_id];
1606
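             /*
              * The access lock is also taken while handling vhost-user
              * messages that touch this ring; use trylock so the data path
              * never blocks on the control path.
              */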
1607         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1608                 return 0;
1609
1610         if (unlikely(vq->enabled == 0)) {
1611                 count = 0;
1612                 goto out_access_unlock;
1613         }
1614
1615         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1616                 vhost_user_iotlb_rd_lock(vq);
1617
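             /*
              * Ring addresses may have been invalidated (e.g. by an IOTLB
              * update); translate them again before touching the ring.
              */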
1618         if (unlikely(vq->access_ok == 0))
1619                 if (unlikely(vring_translate(dev, vq) < 0)) {
1620                         count = 0;
1621                         goto out;
1622                 }
1623
1624         /*
1625          * Construct a RARP broadcast packet and inject it into the "pkts"
1626          * array, so that it looks like the guest actually sent such a packet.
1627          *
1628          * Check user_send_rarp() for more information.
1629          *
1630          * broadcast_rarp shares a cacheline in the virtio_net structure
1631          * with some fields that are accessed during enqueue and
1632          * rte_atomic16_cmpset() performs a write when using cmpxchg. This could
1633          * result in false sharing between enqueue and dequeue.
1634          *
1635          * Prevent unnecessary false sharing by reading broadcast_rarp first
1636          * and only performing cmpset if the read indicates it is likely to
1637          * be set.
1638          */
1639         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1640                         rte_atomic16_cmpset((volatile uint16_t *)
1641                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1642
1643                 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
1644                 if (rarp_mbuf == NULL) {
1645                         RTE_LOG(ERR, VHOST_DATA,
1646                                 "Failed to make RARP packet.\n");
1647                         count = 0;
1648                         goto out;
1649                 }
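                     /* Reserve the head slot of pkts[] for the RARP mbuf. */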
1650                 count -= 1;
1651         }
1652
1653         if (vq_is_packed(dev))
1654                 count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
1655         else
1656                 count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
1657
1658 out:
1659         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1660                 vhost_user_iotlb_rd_unlock(vq);
1661
1662 out_access_unlock:
1663         rte_spinlock_unlock(&vq->access_lock);
1664
1665         if (unlikely(rarp_mbuf != NULL)) {
1666                 /*
1667                  * Inject it at the head of the "pkts" array, so that the
1668                  * switch's MAC learning table gets updated first.
1669                  */
1670                 memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
1671                 pkts[0] = rarp_mbuf;
1672                 count += 1;
1673         }
1674
1675         return count;
1676 }