1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2016 Intel Corporation
7 #include <linux/virtio_net.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
13 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
24 #define MAX_PKT_BURST 32
26 #define MAX_BATCH_LEN 256
28 static __rte_always_inline bool
29 rxvq_is_mergeable(struct virtio_net *dev)
31 return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
34 static __rte_always_inline bool
35 virtio_net_is_inorder(struct virtio_net *dev)
37 return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
41 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
43 return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
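/*
 * Flush the per-virtqueue batched-copy list: small copies queued in
 * vq->batch_copy_elems are performed here in one pass. The enqueue
 * variant also logs the written guest pages for live migration and
 * dumps the packet contents when debug printing is enabled.
 */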
47 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
49 struct batch_copy_elem *elem = vq->batch_copy_elems;
50 uint16_t count = vq->batch_copy_nb_elems;
53 for (i = 0; i < count; i++) {
54 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
55 vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
57 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
60 vq->batch_copy_nb_elems = 0;
64 do_data_copy_dequeue(struct vhost_virtqueue *vq)
66 struct batch_copy_elem *elem = vq->batch_copy_elems;
67 uint16_t count = vq->batch_copy_nb_elems;
70 for (i = 0; i < count; i++)
71 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
73 vq->batch_copy_nb_elems = 0;
76 static __rte_always_inline void
77 do_flush_shadow_used_ring_split(struct virtio_net *dev,
78 struct vhost_virtqueue *vq,
79 uint16_t to, uint16_t from, uint16_t size)
81 rte_memcpy(&vq->used->ring[to],
82 &vq->shadow_used_split[from],
83 size * sizeof(struct vring_used_elem));
84 vhost_log_cache_used_vring(dev, vq,
85 offsetof(struct vring_used, ring[to]),
86 size * sizeof(struct vring_used_elem));
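/*
 * Flush the shadow used ring of a split virtqueue into the vring's
 * used ring, wrapping at the end of the ring when needed, then
 * publish the new used index to the guest and log the update for
 * live migration.
 */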
89 static __rte_always_inline void
90 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
92 uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
94 if (used_idx + vq->shadow_used_idx <= vq->size) {
95 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
100 /* update used ring interval [used_idx, vq->size] */
101 size = vq->size - used_idx;
102 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
104 /* update the left half used ring interval [0, left_size] */
105 do_flush_shadow_used_ring_split(dev, vq, 0, size,
106 vq->shadow_used_idx - size);
108 vq->last_used_idx += vq->shadow_used_idx;
112 vhost_log_cache_sync(dev, vq);
114 *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
115 vq->shadow_used_idx = 0;
116 vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
117 sizeof(vq->used->idx));
120 static __rte_always_inline void
121 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
122 uint16_t desc_idx, uint32_t len)
124 uint16_t i = vq->shadow_used_idx++;
126 vq->shadow_used_split[i].id = desc_idx;
127 vq->shadow_used_split[i].len = len;
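/*
 * Write the shadowed used entries back into the packed descriptor
 * ring. The id/len fields are stored first and the flags second; the
 * head descriptor's flags are written last, so the guest never sees
 * a partially completed chain.
 */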
130 static __rte_always_inline void
131 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
132 struct vhost_virtqueue *vq)
135 uint16_t used_idx = vq->last_used_idx;
136 uint16_t head_idx = vq->last_used_idx;
137 uint16_t head_flags = 0;
139 /* Split loop in two to save memory barriers */
140 for (i = 0; i < vq->shadow_used_idx; i++) {
141 vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
142 vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
144 used_idx += vq->shadow_used_packed[i].count;
145 if (used_idx >= vq->size)
146 used_idx -= vq->size;
151 for (i = 0; i < vq->shadow_used_idx; i++) {
154 if (vq->shadow_used_packed[i].len)
155 flags = VRING_DESC_F_WRITE;
159 if (vq->used_wrap_counter) {
160 flags |= VRING_DESC_F_USED;
161 flags |= VRING_DESC_F_AVAIL;
163 flags &= ~VRING_DESC_F_USED;
164 flags &= ~VRING_DESC_F_AVAIL;
168 vq->desc_packed[vq->last_used_idx].flags = flags;
170 vhost_log_cache_used_vring(dev, vq,
172 sizeof(struct vring_packed_desc),
173 sizeof(struct vring_packed_desc));
175 head_idx = vq->last_used_idx;
179 vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
182 vq->desc_packed[head_idx].flags = head_flags;
184 vhost_log_cache_used_vring(dev, vq,
186 sizeof(struct vring_packed_desc),
187 sizeof(struct vring_packed_desc));
189 vq->shadow_used_idx = 0;
190 vhost_log_cache_sync(dev, vq);
193 static __rte_always_inline void
194 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
195 struct vhost_virtqueue *vq)
197 struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
199 vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
201 vq->desc_packed[vq->shadow_last_used_idx].flags = used_elem->flags;
203 vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
204 sizeof(struct vring_packed_desc),
205 sizeof(struct vring_packed_desc));
206 vq->shadow_used_idx = 0;
207 vhost_log_cache_sync(dev, vq);
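/*
 * Mark a full batch of PACKED_BATCH_SIZE descriptors as used in one
 * shot: any pending shadow entries are flushed first, then the ids,
 * lens and finally the flags of the whole batch are written back.
 */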
210 static __rte_always_inline void
211 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
212 struct vhost_virtqueue *vq,
219 if (vq->shadow_used_idx) {
220 do_data_copy_enqueue(dev, vq);
221 vhost_flush_enqueue_shadow_packed(dev, vq);
224 flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
226 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
227 vq->desc_packed[vq->last_used_idx + i].id = ids[i];
228 vq->desc_packed[vq->last_used_idx + i].len = lens[i];
233 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
234 vq->desc_packed[vq->last_used_idx + i].flags = flags;
236 vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
237 sizeof(struct vring_packed_desc),
238 sizeof(struct vring_packed_desc) *
240 vhost_log_cache_sync(dev, vq);
242 vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
245 static __rte_always_inline void
246 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
249 vq->shadow_used_packed[0].id = id;
251 if (!vq->shadow_used_idx) {
252 vq->shadow_last_used_idx = vq->last_used_idx;
253 vq->shadow_used_packed[0].flags =
254 PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
255 vq->shadow_used_packed[0].len = 0;
256 vq->shadow_used_packed[0].count = 1;
257 vq->shadow_used_idx++;
260 vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
263 static __rte_always_inline void
264 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
265 struct vhost_virtqueue *vq,
272 flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
274 if (!vq->shadow_used_idx) {
275 vq->shadow_last_used_idx = vq->last_used_idx;
276 vq->shadow_used_packed[0].id = ids[0];
277 vq->shadow_used_packed[0].len = 0;
278 vq->shadow_used_packed[0].count = 1;
279 vq->shadow_used_packed[0].flags = flags;
280 vq->shadow_used_idx++;
285 vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
286 vq->desc_packed[vq->last_used_idx + i].id = ids[i];
287 vq->desc_packed[vq->last_used_idx + i].len = 0;
291 vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
292 vq->desc_packed[vq->last_used_idx + i].flags = flags;
294 vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
295 sizeof(struct vring_packed_desc),
296 sizeof(struct vring_packed_desc) *
298 vhost_log_cache_sync(dev, vq);
300 vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
303 static __rte_always_inline void
304 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
310 flags = vq->desc_packed[vq->last_used_idx].flags;
311 if (vq->used_wrap_counter) {
312 flags |= VRING_DESC_F_USED;
313 flags |= VRING_DESC_F_AVAIL;
315 flags &= ~VRING_DESC_F_USED;
316 flags &= ~VRING_DESC_F_AVAIL;
319 if (!vq->shadow_used_idx) {
320 vq->shadow_last_used_idx = vq->last_used_idx;
322 vq->shadow_used_packed[0].id = buf_id;
323 vq->shadow_used_packed[0].len = 0;
324 vq->shadow_used_packed[0].flags = flags;
325 vq->shadow_used_idx++;
327 vq->desc_packed[vq->last_used_idx].id = buf_id;
328 vq->desc_packed[vq->last_used_idx].len = 0;
329 vq->desc_packed[vq->last_used_idx].flags = flags;
332 vq_inc_last_used_packed(vq, count);
335 static __rte_always_inline void
336 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
342 vq->shadow_used_packed[0].id = buf_id;
344 flags = vq->desc_packed[vq->last_used_idx].flags;
345 if (vq->used_wrap_counter) {
346 flags |= VRING_DESC_F_USED;
347 flags |= VRING_DESC_F_AVAIL;
349 flags &= ~VRING_DESC_F_USED;
350 flags &= ~VRING_DESC_F_AVAIL;
353 if (!vq->shadow_used_idx) {
354 vq->shadow_last_used_idx = vq->last_used_idx;
355 vq->shadow_used_packed[0].len = 0;
356 vq->shadow_used_packed[0].flags = flags;
357 vq->shadow_used_idx++;
360 vq_inc_last_used_packed(vq, count);
363 static __rte_always_inline void
364 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
365 struct vhost_virtqueue *vq,
369 uint16_t num_buffers)
372 for (i = 0; i < num_buffers; i++) {
373		/* align the enqueue shadow flush to the batch size */
374 if (!vq->shadow_used_idx)
375 vq->shadow_aligned_idx = vq->last_used_idx &
377 vq->shadow_used_packed[vq->shadow_used_idx].id = id[i];
378 vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
379 vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
380 vq->shadow_aligned_idx += count[i];
381 vq->shadow_used_idx++;
384 if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
385 do_data_copy_enqueue(dev, vq);
386 vhost_flush_enqueue_shadow_packed(dev, vq);
390 /* avoid the write when the value is already set, to lessen cache pressure */
391 #define ASSIGN_UNLESS_EQUAL(var, val) do { \
392 if ((var) != (val)) \
396 static __rte_always_inline void
397 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
399 uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
401 if (m_buf->ol_flags & PKT_TX_TCP_SEG)
402 csum_l4 |= PKT_TX_TCP_CKSUM;
405 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
406 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
409 case PKT_TX_TCP_CKSUM:
410 net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
413 case PKT_TX_UDP_CKSUM:
414 net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
417 case PKT_TX_SCTP_CKSUM:
418 net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
423 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
424 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
425 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
428	/* IP cksum offload cannot be passed in the virtio-net header, so compute it here */
429 if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
430 struct rte_ipv4_hdr *ipv4_hdr;
432 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
434 ipv4_hdr->hdr_checksum = 0;
435 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
438 if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
439 if (m_buf->ol_flags & PKT_TX_IPV4)
440 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
442 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
443 net_hdr->gso_size = m_buf->tso_segsz;
444 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
446 } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
447 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
448 net_hdr->gso_size = m_buf->tso_segsz;
449 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
452 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
453 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
454 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
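/*
 * Translate one guest descriptor (IOVA + length) into host virtual
 * address chunks and append them to the buf_vec[] scatter list.
 * Fails if the vector is already full or the translation fails.
 */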
458 static __rte_always_inline int
459 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
460 struct buf_vector *buf_vec, uint16_t *vec_idx,
461 uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
463 uint16_t vec_id = *vec_idx;
467 uint64_t desc_chunck_len = desc_len;
469 if (unlikely(vec_id >= BUF_VECTOR_MAX))
472 desc_addr = vhost_iova_to_vva(dev, vq,
476 if (unlikely(!desc_addr))
479 rte_prefetch0((void *)(uintptr_t)desc_addr);
481 buf_vec[vec_id].buf_iova = desc_iova;
482 buf_vec[vec_id].buf_addr = desc_addr;
483 buf_vec[vec_id].buf_len = desc_chunck_len;
485 desc_len -= desc_chunck_len;
486 desc_iova += desc_chunck_len;
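/*
 * Walk one descriptor chain of a split virtqueue, starting from the
 * avail ring entry at avail_idx, and fill buf_vec[] with the mapped
 * buffers. Indirect descriptor tables are followed (and copied if
 * they are not contiguous in the process VA space). The chain head
 * index and total length are returned through the output pointers.
 */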
494 static __rte_always_inline int
495 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
496 uint32_t avail_idx, uint16_t *vec_idx,
497 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
498 uint32_t *desc_chain_len, uint8_t perm)
500 uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
501 uint16_t vec_id = *vec_idx;
504 uint32_t nr_descs = vq->size;
506 struct vring_desc *descs = vq->desc;
507 struct vring_desc *idesc = NULL;
509 if (unlikely(idx >= vq->size))
512 *desc_chain_head = idx;
514 if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
515 dlen = vq->desc[idx].len;
516 nr_descs = dlen / sizeof(struct vring_desc);
517 if (unlikely(nr_descs > vq->size))
520 descs = (struct vring_desc *)(uintptr_t)
521 vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
524 if (unlikely(!descs))
527 if (unlikely(dlen < vq->desc[idx].len)) {
529 * The indirect desc table is not contiguous
530			 * in the process VA space, so we have to copy it.
532 idesc = vhost_alloc_copy_ind_table(dev, vq,
533 vq->desc[idx].addr, vq->desc[idx].len);
534 if (unlikely(!idesc))
544 if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
545 free_ind_table(idesc);
549 len += descs[idx].len;
551 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
552 descs[idx].addr, descs[idx].len,
554 free_ind_table(idesc);
558 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
561 idx = descs[idx].next;
564 *desc_chain_len = len;
567 if (unlikely(!!idesc))
568 free_ind_table(idesc);
574  * Returns -1 on failure, 0 on success
577 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
578 uint32_t size, struct buf_vector *buf_vec,
579 uint16_t *num_buffers, uint16_t avail_head,
583 uint16_t vec_idx = 0;
584 uint16_t max_tries, tries = 0;
586 uint16_t head_idx = 0;
590 cur_idx = vq->last_avail_idx;
592 if (rxvq_is_mergeable(dev))
593 max_tries = vq->size - 1;
598 if (unlikely(cur_idx == avail_head))
601		 * if we have tried all available ring items and still
602		 * cannot get enough buffers, something abnormal has happened
605 if (unlikely(++tries > max_tries))
608 if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
611 VHOST_ACCESS_RW) < 0))
613 len = RTE_MIN(len, size);
614 update_shadow_used_ring_split(vq, head_idx, len);
626 static __rte_always_inline int
627 fill_vec_buf_packed_indirect(struct virtio_net *dev,
628 struct vhost_virtqueue *vq,
629 struct vring_packed_desc *desc, uint16_t *vec_idx,
630 struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
634 uint16_t vec_id = *vec_idx;
636 struct vring_packed_desc *descs, *idescs = NULL;
639 descs = (struct vring_packed_desc *)(uintptr_t)
640 vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
641 if (unlikely(!descs))
644 if (unlikely(dlen < desc->len)) {
646 * The indirect desc table is not contiguous
647		 * in the process VA space, so we have to copy it.
649 idescs = vhost_alloc_copy_ind_table(dev,
650 vq, desc->addr, desc->len);
651 if (unlikely(!idescs))
657 nr_descs = desc->len / sizeof(struct vring_packed_desc);
658 if (unlikely(nr_descs >= vq->size)) {
659 free_ind_table(idescs);
663 for (i = 0; i < nr_descs; i++) {
664 if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
665 free_ind_table(idescs);
669 *len += descs[i].len;
670 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
671 descs[i].addr, descs[i].len,
677 if (unlikely(!!idescs))
678 free_ind_table(idescs);
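/*
 * Same as above, but for a packed virtqueue: collect the descriptors
 * of one available chain (direct or indirect) into buf_vec[], and
 * return the buffer id, the number of descriptors consumed and the
 * total chain length.
 */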
683 static __rte_always_inline int
684 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
685 uint16_t avail_idx, uint16_t *desc_count,
686 struct buf_vector *buf_vec, uint16_t *vec_idx,
687 uint16_t *buf_id, uint32_t *len, uint8_t perm)
689 bool wrap_counter = vq->avail_wrap_counter;
690 struct vring_packed_desc *descs = vq->desc_packed;
691 uint16_t vec_id = *vec_idx;
693 if (avail_idx < vq->last_avail_idx)
697 * Perform a load-acquire barrier in desc_is_avail to
698 * enforce the ordering between desc flags and desc
701 if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
708 if (unlikely(vec_id >= BUF_VECTOR_MAX))
711 if (unlikely(*desc_count >= vq->size))
715 *buf_id = descs[avail_idx].id;
717 if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
718 if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
724 *len += descs[avail_idx].len;
726 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
727 descs[avail_idx].addr,
728 descs[avail_idx].len,
733 if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
736 if (++avail_idx >= vq->size) {
737 avail_idx -= vq->size;
747 static __rte_noinline void
748 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
749 struct buf_vector *buf_vec,
750 struct virtio_net_hdr_mrg_rxbuf *hdr)
753 uint64_t remain = dev->vhost_hlen;
754 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
755 uint64_t iova = buf_vec->buf_iova;
758 len = RTE_MIN(remain,
760 dst = buf_vec->buf_addr;
761 rte_memcpy((void *)(uintptr_t)dst,
762 (void *)(uintptr_t)src,
765 PRINT_PACKET(dev, (uintptr_t)dst,
767 vhost_log_cache_write_iova(dev, vq,
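/*
 * Enqueue path copy routine: write the virtio-net header and then the
 * mbuf chain's payload into the guest buffers described by buf_vec[].
 * Copies no larger than MAX_BATCH_LEN are deferred to the batch-copy
 * list and performed later by do_data_copy_enqueue().
 */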
777 static __rte_always_inline int
778 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
779 struct rte_mbuf *m, struct buf_vector *buf_vec,
780 uint16_t nr_vec, uint16_t num_buffers)
782 uint32_t vec_idx = 0;
783 uint32_t mbuf_offset, mbuf_avail;
784 uint32_t buf_offset, buf_avail;
785 uint64_t buf_addr, buf_iova, buf_len;
788 struct rte_mbuf *hdr_mbuf;
789 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
790 struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
793 if (unlikely(m == NULL)) {
798 buf_addr = buf_vec[vec_idx].buf_addr;
799 buf_iova = buf_vec[vec_idx].buf_iova;
800 buf_len = buf_vec[vec_idx].buf_len;
802 if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
809 if (unlikely(buf_len < dev->vhost_hlen))
812 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
814 VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
815 dev->vid, num_buffers);
817 if (unlikely(buf_len < dev->vhost_hlen)) {
818 buf_offset = dev->vhost_hlen - buf_len;
820 buf_addr = buf_vec[vec_idx].buf_addr;
821 buf_iova = buf_vec[vec_idx].buf_iova;
822 buf_len = buf_vec[vec_idx].buf_len;
823 buf_avail = buf_len - buf_offset;
825 buf_offset = dev->vhost_hlen;
826 buf_avail = buf_len - dev->vhost_hlen;
829 mbuf_avail = rte_pktmbuf_data_len(m);
831 while (mbuf_avail != 0 || m->next != NULL) {
832 /* done with current buf, get the next one */
833 if (buf_avail == 0) {
835 if (unlikely(vec_idx >= nr_vec)) {
840 buf_addr = buf_vec[vec_idx].buf_addr;
841 buf_iova = buf_vec[vec_idx].buf_iova;
842 buf_len = buf_vec[vec_idx].buf_len;
848 /* done with current mbuf, get the next one */
849 if (mbuf_avail == 0) {
853 mbuf_avail = rte_pktmbuf_data_len(m);
857 virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
858 if (rxvq_is_mergeable(dev))
859 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
862 if (unlikely(hdr == &tmp_hdr)) {
863 copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
865 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
867 vhost_log_cache_write_iova(dev, vq,
875 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
877 if (likely(cpy_len > MAX_BATCH_LEN ||
878 vq->batch_copy_nb_elems >= vq->size)) {
879 rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
880 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
882 vhost_log_cache_write_iova(dev, vq,
883 buf_iova + buf_offset,
885 PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
888 batch_copy[vq->batch_copy_nb_elems].dst =
889 (void *)((uintptr_t)(buf_addr + buf_offset));
890 batch_copy[vq->batch_copy_nb_elems].src =
891 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
892 batch_copy[vq->batch_copy_nb_elems].log_addr =
893 buf_iova + buf_offset;
894 batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
895 vq->batch_copy_nb_elems++;
898 mbuf_avail -= cpy_len;
899 mbuf_offset += cpy_len;
900 buf_avail -= cpy_len;
901 buf_offset += cpy_len;
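/*
 * Reserve enough descriptors from a packed virtqueue for one mbuf
 * (possibly spanning several buffers when mergeable RX is negotiated),
 * copy the packet into them and record the used entries in the shadow
 * ring. *nr_descs is set to the number of descriptors consumed.
 */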
909 static __rte_always_inline int
910 vhost_enqueue_single_packed(struct virtio_net *dev,
911 struct vhost_virtqueue *vq,
912 struct rte_mbuf *pkt,
913 struct buf_vector *buf_vec,
917 uint16_t avail_idx = vq->last_avail_idx;
918 uint16_t max_tries, tries = 0;
922 uint32_t size = pkt->pkt_len + dev->vhost_hlen;
923 uint16_t num_buffers = 0;
924 uint32_t buffer_len[vq->size];
925 uint16_t buffer_buf_id[vq->size];
926 uint16_t buffer_desc_count[vq->size];
928 if (rxvq_is_mergeable(dev))
929 max_tries = vq->size - 1;
935		 * if we have tried all available ring items and still
936		 * cannot get enough buffers, something abnormal has happened
939 if (unlikely(++tries > max_tries))
942 if (unlikely(fill_vec_buf_packed(dev, vq,
943 avail_idx, &desc_count,
946 VHOST_ACCESS_RW) < 0))
949 len = RTE_MIN(len, size);
952 buffer_len[num_buffers] = len;
953 buffer_buf_id[num_buffers] = buf_id;
954 buffer_desc_count[num_buffers] = desc_count;
957 *nr_descs += desc_count;
958 avail_idx += desc_count;
959 if (avail_idx >= vq->size)
960 avail_idx -= vq->size;
963 if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
966 vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
967 buffer_desc_count, num_buffers);
972 static __rte_noinline uint32_t
973 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
974 struct rte_mbuf **pkts, uint32_t count)
976 uint32_t pkt_idx = 0;
977 uint16_t num_buffers;
978 struct buf_vector buf_vec[BUF_VECTOR_MAX];
981 avail_head = *((volatile uint16_t *)&vq->avail->idx);
984 * The ordering between avail index and
985 * desc reads needs to be enforced.
989 rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
991 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
992 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
995 if (unlikely(reserve_avail_buf_split(dev, vq,
996 pkt_len, buf_vec, &num_buffers,
997 avail_head, &nr_vec) < 0)) {
998 VHOST_LOG_DATA(DEBUG,
999 "(%d) failed to get enough desc from vring\n",
1001 vq->shadow_used_idx -= num_buffers;
1005 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1006 dev->vid, vq->last_avail_idx,
1007 vq->last_avail_idx + num_buffers);
1009 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1012 vq->shadow_used_idx -= num_buffers;
1016 vq->last_avail_idx += num_buffers;
1019 do_data_copy_enqueue(dev, vq);
1021 if (likely(vq->shadow_used_idx)) {
1022 flush_shadow_used_ring_split(dev, vq);
1023 vhost_vring_call_split(dev, vq);
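/*
 * Fast path for the packed ring: try to enqueue PACKED_BATCH_SIZE
 * single-mbuf packets at once. It bails out if the batch is not
 * aligned, any descriptor is unavailable or too small, or any mbuf
 * is segmented, in which case the caller falls back to the
 * single-packet path.
 */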
1029 static __rte_always_inline int
1030 virtio_dev_rx_batch_packed(struct virtio_net *dev,
1031 struct vhost_virtqueue *vq,
1032 struct rte_mbuf **pkts)
1034 bool wrap_counter = vq->avail_wrap_counter;
1035 struct vring_packed_desc *descs = vq->desc_packed;
1036 uint16_t avail_idx = vq->last_avail_idx;
1037 uint64_t desc_addrs[PACKED_BATCH_SIZE];
1038 struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1039 uint32_t buf_offset = dev->vhost_hlen;
1040 uint64_t lens[PACKED_BATCH_SIZE];
1041 uint16_t ids[PACKED_BATCH_SIZE];
1044 if (unlikely(avail_idx & PACKED_BATCH_MASK))
1047 if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1050 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1051 if (unlikely(pkts[i]->next != NULL))
1053 if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1060 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1061 lens[i] = descs[avail_idx + i].len;
1063 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1064 if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1068 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1069 desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1070 descs[avail_idx + i].addr,
1074 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1075 if (unlikely(lens[i] != descs[avail_idx + i].len))
1079 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1080 rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1081 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1082 (uintptr_t)desc_addrs[i];
1083 lens[i] = pkts[i]->pkt_len + dev->vhost_hlen;
1086 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1087 virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1089 vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1091 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1092 rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1093 rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1097 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1098 vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1101 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1102 ids[i] = descs[avail_idx + i].id;
1104 vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1109 static __rte_always_inline int16_t
1110 virtio_dev_rx_single_packed(struct virtio_net *dev,
1111 struct vhost_virtqueue *vq,
1112 struct rte_mbuf *pkt)
1114 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1115 uint16_t nr_descs = 0;
1118 if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1120 VHOST_LOG_DATA(DEBUG,
1121 "(%d) failed to get enough desc from vring\n",
1126 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1127 dev->vid, vq->last_avail_idx,
1128 vq->last_avail_idx + nr_descs);
1130 vq_inc_last_avail_packed(vq, nr_descs);
1135 static __rte_noinline uint32_t
1136 virtio_dev_rx_packed(struct virtio_net *dev,
1137 struct vhost_virtqueue *vq,
1138 struct rte_mbuf **pkts,
1141 uint32_t pkt_idx = 0;
1142 uint32_t remained = count;
1145 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1147 if (remained >= PACKED_BATCH_SIZE) {
1148 if (!virtio_dev_rx_batch_packed(dev, vq,
1150 pkt_idx += PACKED_BATCH_SIZE;
1151 remained -= PACKED_BATCH_SIZE;
1156 if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1161 } while (pkt_idx < count);
1163 if (vq->shadow_used_idx) {
1164 do_data_copy_enqueue(dev, vq);
1165 vhost_flush_enqueue_shadow_packed(dev, vq);
1169 vhost_vring_call_packed(dev, vq);
1174 static __rte_always_inline uint32_t
1175 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1176 struct rte_mbuf **pkts, uint32_t count)
1178 struct vhost_virtqueue *vq;
1181 VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1182 if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1183 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1184 dev->vid, __func__, queue_id);
1188 vq = dev->virtqueue[queue_id];
1190 rte_spinlock_lock(&vq->access_lock);
1192 if (unlikely(vq->enabled == 0))
1193 goto out_access_unlock;
1195 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1196 vhost_user_iotlb_rd_lock(vq);
1198 if (unlikely(vq->access_ok == 0))
1199 if (unlikely(vring_translate(dev, vq) < 0))
1202 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1206 if (vq_is_packed(dev))
1207 nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1209 nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1212 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1213 vhost_user_iotlb_rd_unlock(vq);
1216 rte_spinlock_unlock(&vq->access_lock);
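/*
 * Public entry point for the enqueue (guest RX) path. The packets are
 * copied into guest buffers, so the caller keeps ownership of the
 * mbufs.
 *
 * Illustrative usage sketch, not taken from this file (vid, rxq_id,
 * pkts and nb are hypothetical application variables):
 *
 *	nb_sent = rte_vhost_enqueue_burst(vid, rxq_id, pkts, nb);
 *	... account nb_sent in stats ...
 *	for (i = 0; i < nb; i++)
 *		rte_pktmbuf_free(pkts[i]);
 */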
1222 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1223 struct rte_mbuf **pkts, uint16_t count)
1225 struct virtio_net *dev = get_device(vid);
1230 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1232 "(%d) %s: built-in vhost net backend is disabled.\n",
1233 dev->vid, __func__);
1237 return virtio_dev_rx(dev, queue_id, pkts, count);
1241 virtio_net_with_host_offload(struct virtio_net *dev)
1244 ((1ULL << VIRTIO_NET_F_CSUM) |
1245 (1ULL << VIRTIO_NET_F_HOST_ECN) |
1246 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
1247 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
1248 (1ULL << VIRTIO_NET_F_HOST_UFO)))
1255 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
1257 struct rte_ipv4_hdr *ipv4_hdr;
1258 struct rte_ipv6_hdr *ipv6_hdr;
1259 void *l3_hdr = NULL;
1260 struct rte_ether_hdr *eth_hdr;
1263 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1265 m->l2_len = sizeof(struct rte_ether_hdr);
1266 ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
1268 if (ethertype == RTE_ETHER_TYPE_VLAN) {
1269 struct rte_vlan_hdr *vlan_hdr =
1270 (struct rte_vlan_hdr *)(eth_hdr + 1);
1272 m->l2_len += sizeof(struct rte_vlan_hdr);
1273 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
1276 l3_hdr = (char *)eth_hdr + m->l2_len;
1278 switch (ethertype) {
1279 case RTE_ETHER_TYPE_IPV4:
1281 *l4_proto = ipv4_hdr->next_proto_id;
1282 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
1283 *l4_hdr = (char *)l3_hdr + m->l3_len;
1284 m->ol_flags |= PKT_TX_IPV4;
1286 case RTE_ETHER_TYPE_IPV6:
1288 *l4_proto = ipv6_hdr->proto;
1289 m->l3_len = sizeof(struct rte_ipv6_hdr);
1290 *l4_hdr = (char *)l3_hdr + m->l3_len;
1291 m->ol_flags |= PKT_TX_IPV6;
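/*
 * Convert the offload requests found in the virtio-net header of a
 * packet received from the guest into mbuf ol_flags (L4 checksum,
 * TSO/UFO), filling in l4_len and tso_segsz as needed.
 */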
1301 static __rte_always_inline void
1302 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
1304 uint16_t l4_proto = 0;
1305 void *l4_hdr = NULL;
1306 struct rte_tcp_hdr *tcp_hdr = NULL;
1308 if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
1311 parse_ethernet(m, &l4_proto, &l4_hdr);
1312 if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1313 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
1314 switch (hdr->csum_offset) {
1315 case (offsetof(struct rte_tcp_hdr, cksum)):
1316 if (l4_proto == IPPROTO_TCP)
1317 m->ol_flags |= PKT_TX_TCP_CKSUM;
1319 case (offsetof(struct rte_udp_hdr, dgram_cksum)):
1320 if (l4_proto == IPPROTO_UDP)
1321 m->ol_flags |= PKT_TX_UDP_CKSUM;
1323 case (offsetof(struct rte_sctp_hdr, cksum)):
1324 if (l4_proto == IPPROTO_SCTP)
1325 m->ol_flags |= PKT_TX_SCTP_CKSUM;
1333 if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1334 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1335 case VIRTIO_NET_HDR_GSO_TCPV4:
1336 case VIRTIO_NET_HDR_GSO_TCPV6:
1338 m->ol_flags |= PKT_TX_TCP_SEG;
1339 m->tso_segsz = hdr->gso_size;
1340 m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1342 case VIRTIO_NET_HDR_GSO_UDP:
1343 m->ol_flags |= PKT_TX_UDP_SEG;
1344 m->tso_segsz = hdr->gso_size;
1345 m->l4_len = sizeof(struct rte_udp_hdr);
1348 VHOST_LOG_DATA(WARNING,
1349 "unsupported gso type %u.\n", hdr->gso_type);
1355 static __rte_noinline void
1356 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
1357 struct buf_vector *buf_vec)
1360 uint64_t remain = sizeof(struct virtio_net_hdr);
1362 uint64_t dst = (uint64_t)(uintptr_t)hdr;
1365 len = RTE_MIN(remain, buf_vec->buf_len);
1366 src = buf_vec->buf_addr;
1367 rte_memcpy((void *)(uintptr_t)dst,
1368 (void *)(uintptr_t)src, len);
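/*
 * Dequeue path copy routine: parse the virtio-net header (copying it
 * first if it is not contiguous), then move the guest buffers into an
 * mbuf chain, allocating additional mbufs from mbuf_pool as needed.
 * In dequeue zero-copy mode the mbuf is attached to the guest buffer
 * instead of copying, whenever gpa_to_hpa() yields a valid address.
 */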
1376 static __rte_always_inline int
1377 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1378 struct buf_vector *buf_vec, uint16_t nr_vec,
1379 struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
1381 uint32_t buf_avail, buf_offset;
1382 uint64_t buf_addr, buf_iova, buf_len;
1383 uint32_t mbuf_avail, mbuf_offset;
1385 struct rte_mbuf *cur = m, *prev = m;
1386 struct virtio_net_hdr tmp_hdr;
1387 struct virtio_net_hdr *hdr = NULL;
1388	/* A counter to avoid a dead loop in the desc chain */
1389 uint16_t vec_idx = 0;
1390 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1393 buf_addr = buf_vec[vec_idx].buf_addr;
1394 buf_iova = buf_vec[vec_idx].buf_iova;
1395 buf_len = buf_vec[vec_idx].buf_len;
1397 if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
1402 if (virtio_net_with_host_offload(dev)) {
1403 if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
1405 * No luck, the virtio-net header doesn't fit
1406 * in a contiguous virtual area.
1408 copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
1411 hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
1416 * A virtio driver normally uses at least 2 desc buffers
1417 * for Tx: the first for storing the header, and others
1418 * for storing the data.
1420 if (unlikely(buf_len < dev->vhost_hlen)) {
1421 buf_offset = dev->vhost_hlen - buf_len;
1423 buf_addr = buf_vec[vec_idx].buf_addr;
1424 buf_iova = buf_vec[vec_idx].buf_iova;
1425 buf_len = buf_vec[vec_idx].buf_len;
1426 buf_avail = buf_len - buf_offset;
1427 } else if (buf_len == dev->vhost_hlen) {
1428 if (unlikely(++vec_idx >= nr_vec))
1430 buf_addr = buf_vec[vec_idx].buf_addr;
1431 buf_iova = buf_vec[vec_idx].buf_iova;
1432 buf_len = buf_vec[vec_idx].buf_len;
1435 buf_avail = buf_len;
1437 buf_offset = dev->vhost_hlen;
1438 buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
1442 (uintptr_t)(buf_addr + buf_offset),
1443 (uint32_t)buf_avail, 0);
1446 mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
1450 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1453		 * A desc buf might span two host physical pages that are
1454		 * not contiguous. In that case (gpa_to_hpa returns 0), data
1455		 * will be copied even though zero copy is enabled.
1457 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1458 buf_iova + buf_offset, cpy_len)))) {
1459 cur->data_len = cpy_len;
1462 (void *)(uintptr_t)(buf_addr + buf_offset);
1463 cur->buf_iova = hpa;
1466 * In zero copy mode, one mbuf can only reference data
1467			 * for one desc buf, or part of one.
1469 mbuf_avail = cpy_len;
1471 if (likely(cpy_len > MAX_BATCH_LEN ||
1472 vq->batch_copy_nb_elems >= vq->size ||
1473 (hdr && cur == m))) {
1474 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1476 (void *)((uintptr_t)(buf_addr +
1480 batch_copy[vq->batch_copy_nb_elems].dst =
1481 rte_pktmbuf_mtod_offset(cur, void *,
1483 batch_copy[vq->batch_copy_nb_elems].src =
1484 (void *)((uintptr_t)(buf_addr +
1486 batch_copy[vq->batch_copy_nb_elems].len =
1488 vq->batch_copy_nb_elems++;
1492 mbuf_avail -= cpy_len;
1493 mbuf_offset += cpy_len;
1494 buf_avail -= cpy_len;
1495 buf_offset += cpy_len;
1497		/* This buf has reached its end, get the next one */
1498 if (buf_avail == 0) {
1499 if (++vec_idx >= nr_vec)
1502 buf_addr = buf_vec[vec_idx].buf_addr;
1503 buf_iova = buf_vec[vec_idx].buf_iova;
1504 buf_len = buf_vec[vec_idx].buf_len;
1507 buf_avail = buf_len;
1509 PRINT_PACKET(dev, (uintptr_t)buf_addr,
1510 (uint32_t)buf_avail, 0);
1514		 * This mbuf has reached its end, get a new one
1515 * to hold more data.
1517 if (mbuf_avail == 0) {
1518 cur = rte_pktmbuf_alloc(mbuf_pool);
1519 if (unlikely(cur == NULL)) {
1520 VHOST_LOG_DATA(ERR, "Failed to "
1521 "allocate memory for mbuf.\n");
1525 if (unlikely(dev->dequeue_zero_copy))
1526 rte_mbuf_refcnt_update(cur, 1);
1529 prev->data_len = mbuf_offset;
1531 m->pkt_len += mbuf_offset;
1535 mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1539 prev->data_len = mbuf_offset;
1540 m->pkt_len += mbuf_offset;
1543 vhost_dequeue_offload(hdr, m);
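/*
 * Pick a free zero-copy mbuf tracking slot, scanning circularly from
 * last_zmbuf_idx; fails if every slot is currently in use.
 */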
1550 static __rte_always_inline struct zcopy_mbuf *
1551 get_zmbuf(struct vhost_virtqueue *vq)
1557 /* search [last_zmbuf_idx, zmbuf_size) */
1558 i = vq->last_zmbuf_idx;
1559 last = vq->zmbuf_size;
1562 for (; i < last; i++) {
1563 if (vq->zmbufs[i].in_use == 0) {
1564 vq->last_zmbuf_idx = i + 1;
1565 vq->zmbufs[i].in_use = 1;
1566 return &vq->zmbufs[i];
1572 /* search [0, last_zmbuf_idx) */
1574 last = vq->last_zmbuf_idx;
1582 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
1588 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
1590 struct rte_mbuf_ext_shared_info *shinfo = NULL;
1591 uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
1596 /* Try to use pkt buffer to store shinfo to reduce the amount of memory
1597 * required, otherwise store shinfo in the new buffer.
1599 if (rte_pktmbuf_tailroom(pkt) >= sizeof(*shinfo))
1600 shinfo = rte_pktmbuf_mtod(pkt,
1601 struct rte_mbuf_ext_shared_info *);
1603 total_len += sizeof(*shinfo) + sizeof(uintptr_t);
1604 total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
1607 if (unlikely(total_len > UINT16_MAX))
1610 buf_len = total_len;
1611 buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
1612 if (unlikely(buf == NULL))
1615 /* Initialize shinfo */
1617 shinfo->free_cb = virtio_dev_extbuf_free;
1618 shinfo->fcb_opaque = buf;
1619 rte_mbuf_ext_refcnt_set(shinfo, 1);
1621 shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
1622 virtio_dev_extbuf_free, buf);
1623 if (unlikely(shinfo == NULL)) {
1625 VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
1630 iova = rte_malloc_virt2iova(buf);
1631 rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
1632 rte_pktmbuf_reset_headroom(pkt);
1638  * Allocate a pktmbuf that satisfies the host's buffer requirements
1639  * (large enough, or backed by an external/chained buffer when allowed).
1640 static __rte_always_inline struct rte_mbuf *
1641 virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
1644 struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
1646 if (unlikely(pkt == NULL)) {
1648 "Failed to allocate memory for mbuf.\n");
1652 if (rte_pktmbuf_tailroom(pkt) >= data_len)
1655 /* attach an external buffer if supported */
1656 if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
1659 /* check if chained buffers are allowed */
1660 if (!dev->linearbuf)
1663 /* Data doesn't fit into the buffer and the host supports
1664 * only linear buffers
1666 rte_pktmbuf_free(pkt);
1671 static __rte_noinline uint16_t
1672 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1673 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1676 uint16_t free_entries;
1678 if (unlikely(dev->dequeue_zero_copy)) {
1679 struct zcopy_mbuf *zmbuf, *next;
1681 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1682 zmbuf != NULL; zmbuf = next) {
1683 next = TAILQ_NEXT(zmbuf, next);
1685 if (mbuf_is_consumed(zmbuf->mbuf)) {
1686 update_shadow_used_ring_split(vq,
1687 zmbuf->desc_idx, 0);
1688 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1689 restore_mbuf(zmbuf->mbuf);
1690 rte_pktmbuf_free(zmbuf->mbuf);
1696 if (likely(vq->shadow_used_idx)) {
1697 flush_shadow_used_ring_split(dev, vq);
1698 vhost_vring_call_split(dev, vq);
1702 free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1704 if (free_entries == 0)
1708 * The ordering between avail index and
1709 * desc reads needs to be enforced.
1713 rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1715 VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1717 count = RTE_MIN(count, MAX_PKT_BURST);
1718 count = RTE_MIN(count, free_entries);
1719 VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
1722 for (i = 0; i < count; i++) {
1723 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1726 uint16_t nr_vec = 0;
1729 if (unlikely(fill_vec_buf_split(dev, vq,
1730 vq->last_avail_idx + i,
1732 &head_idx, &buf_len,
1733 VHOST_ACCESS_RO) < 0))
1736 if (likely(dev->dequeue_zero_copy == 0))
1737 update_shadow_used_ring_split(vq, head_idx, 0);
1739 pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
1740 if (unlikely(pkts[i] == NULL))
1743 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
1745 if (unlikely(err)) {
1746 rte_pktmbuf_free(pkts[i]);
1750 if (unlikely(dev->dequeue_zero_copy)) {
1751 struct zcopy_mbuf *zmbuf;
1753 zmbuf = get_zmbuf(vq);
1755 rte_pktmbuf_free(pkts[i]);
1758 zmbuf->mbuf = pkts[i];
1759 zmbuf->desc_idx = head_idx;
1762			 * Pin the mbuf by taking an extra reference; we will
1763			 * check later whether it has been freed by the
1764			 * application (i.e. we were the last user). Only then
1765			 * can we safely update the used ring.
1767 rte_mbuf_refcnt_update(pkts[i], 1);
1770 TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1773 vq->last_avail_idx += i;
1775 if (likely(dev->dequeue_zero_copy == 0)) {
1776 do_data_copy_dequeue(vq);
1777 if (unlikely(i < count))
1778 vq->shadow_used_idx = i;
1779 if (likely(vq->shadow_used_idx)) {
1780 flush_shadow_used_ring_split(dev, vq);
1781 vhost_vring_call_split(dev, vq);
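/*
 * Check whether the next PACKED_BATCH_SIZE descriptors can be
 * dequeued as a batch (all available, single-descriptor, large enough
 * for the allocated mbufs) and, if so, translate their addresses and
 * allocate one mbuf per descriptor. Fails if the batch path cannot be
 * used, so the caller falls back to the single-packet path.
 */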
1788 static __rte_always_inline int
1789 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
1790 struct vhost_virtqueue *vq,
1791 struct rte_mempool *mbuf_pool,
1792 struct rte_mbuf **pkts,
1794 uintptr_t *desc_addrs,
1797 bool wrap = vq->avail_wrap_counter;
1798 struct vring_packed_desc *descs = vq->desc_packed;
1799 struct virtio_net_hdr *hdr;
1800 uint64_t lens[PACKED_BATCH_SIZE];
1801 uint64_t buf_lens[PACKED_BATCH_SIZE];
1802 uint32_t buf_offset = dev->vhost_hlen;
1805 if (unlikely(avail_idx & PACKED_BATCH_MASK))
1807 if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1810 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1811 flags = descs[avail_idx + i].flags;
1812 if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
1813 (wrap == !!(flags & VRING_DESC_F_USED)) ||
1814 (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
1820 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1821 lens[i] = descs[avail_idx + i].len;
1823 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1824 desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1825 descs[avail_idx + i].addr,
1826 &lens[i], VHOST_ACCESS_RW);
1829 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1830 if (unlikely((lens[i] != descs[avail_idx + i].len)))
1834 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1835 pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, lens[i]);
1840 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1841 buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
1843 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1844 if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
1848 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1849 pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset;
1850 pkts[i]->data_len = pkts[i]->pkt_len;
1851 ids[i] = descs[avail_idx + i].id;
1854 if (virtio_net_with_host_offload(dev)) {
1855 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1856 hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
1857 vhost_dequeue_offload(hdr, pkts[i]);
1864 for (i = 0; i < PACKED_BATCH_SIZE; i++)
1865 rte_pktmbuf_free(pkts[i]);
1870 static __rte_always_inline int
1871 virtio_dev_tx_batch_packed(struct virtio_net *dev,
1872 struct vhost_virtqueue *vq,
1873 struct rte_mempool *mbuf_pool,
1874 struct rte_mbuf **pkts)
1876 uint16_t avail_idx = vq->last_avail_idx;
1877 uint32_t buf_offset = dev->vhost_hlen;
1878 uintptr_t desc_addrs[PACKED_BATCH_SIZE];
1879 uint16_t ids[PACKED_BATCH_SIZE];
1882 if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
1883 avail_idx, desc_addrs, ids))
1886 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1887 rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1889 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1890 rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1891 (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1894 if (virtio_net_is_inorder(dev))
1895 vhost_shadow_dequeue_batch_packed_inorder(vq,
1896 ids[PACKED_BATCH_SIZE - 1]);
1898 vhost_shadow_dequeue_batch_packed(dev, vq, ids);
1900 vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1905 static __rte_always_inline int
1906 vhost_dequeue_single_packed(struct virtio_net *dev,
1907 struct vhost_virtqueue *vq,
1908 struct rte_mempool *mbuf_pool,
1909 struct rte_mbuf **pkts,
1911 uint16_t *desc_count)
1913 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1915 uint16_t nr_vec = 0;
1918 if (unlikely(fill_vec_buf_packed(dev, vq,
1919 vq->last_avail_idx, desc_count,
1922 VHOST_ACCESS_RO) < 0))
1925 *pkts = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
1926 if (unlikely(*pkts == NULL)) {
1928 "Failed to allocate memory for mbuf.\n");
1932 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts,
1934 if (unlikely(err)) {
1935 rte_pktmbuf_free(*pkts);
1942 static __rte_always_inline int
1943 virtio_dev_tx_single_packed(struct virtio_net *dev,
1944 struct vhost_virtqueue *vq,
1945 struct rte_mempool *mbuf_pool,
1946 struct rte_mbuf **pkts)
1949 uint16_t buf_id, desc_count;
1951 if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
1955 if (virtio_net_is_inorder(dev))
1956 vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
1959 vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count);
1961 vq_inc_last_avail_packed(vq, desc_count);
1966 static __rte_always_inline int
1967 virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev,
1968 struct vhost_virtqueue *vq,
1969 struct rte_mempool *mbuf_pool,
1970 struct rte_mbuf **pkts)
1972 struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE];
1973 uintptr_t desc_addrs[PACKED_BATCH_SIZE];
1974 uint16_t ids[PACKED_BATCH_SIZE];
1977 uint16_t avail_idx = vq->last_avail_idx;
1979 if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
1980 avail_idx, desc_addrs, ids))
1983 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1984 zmbufs[i] = get_zmbuf(vq);
1986 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1991 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1992 zmbufs[i]->mbuf = pkts[i];
1993 zmbufs[i]->desc_idx = ids[i];
1994 zmbufs[i]->desc_count = 1;
1997 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1998 rte_mbuf_refcnt_update(pkts[i], 1);
2000 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2001 TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next);
2003 vq->nr_zmbuf += PACKED_BATCH_SIZE;
2004 vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
2009 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2010 rte_pktmbuf_free(pkts[i]);
2015 static __rte_always_inline int
2016 virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev,
2017 struct vhost_virtqueue *vq,
2018 struct rte_mempool *mbuf_pool,
2019 struct rte_mbuf **pkts)
2021 uint16_t buf_id, desc_count;
2022 struct zcopy_mbuf *zmbuf;
2024 if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
2028 zmbuf = get_zmbuf(vq);
2030 rte_pktmbuf_free(*pkts);
2033 zmbuf->mbuf = *pkts;
2034 zmbuf->desc_idx = buf_id;
2035 zmbuf->desc_count = desc_count;
2037 rte_mbuf_refcnt_update(*pkts, 1);
2040 TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
2042 vq_inc_last_avail_packed(vq, desc_count);
2046 static __rte_always_inline void
2047 free_zmbuf(struct vhost_virtqueue *vq)
2049 struct zcopy_mbuf *next = NULL;
2050 struct zcopy_mbuf *zmbuf;
2052 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
2053 zmbuf != NULL; zmbuf = next) {
2054 next = TAILQ_NEXT(zmbuf, next);
2056 uint16_t last_used_idx = vq->last_used_idx;
2058 if (mbuf_is_consumed(zmbuf->mbuf)) {
2060 flags = vq->desc_packed[last_used_idx].flags;
2061 if (vq->used_wrap_counter) {
2062 flags |= VRING_DESC_F_USED;
2063 flags |= VRING_DESC_F_AVAIL;
2065 flags &= ~VRING_DESC_F_USED;
2066 flags &= ~VRING_DESC_F_AVAIL;
2069 vq->desc_packed[last_used_idx].id = zmbuf->desc_idx;
2070 vq->desc_packed[last_used_idx].len = 0;
2073 vq->desc_packed[last_used_idx].flags = flags;
2075 vq_inc_last_used_packed(vq, zmbuf->desc_count);
2077 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
2078 restore_mbuf(zmbuf->mbuf);
2079 rte_pktmbuf_free(zmbuf->mbuf);
2086 static __rte_noinline uint16_t
2087 virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
2088 struct vhost_virtqueue *vq,
2089 struct rte_mempool *mbuf_pool,
2090 struct rte_mbuf **pkts,
2093 uint32_t pkt_idx = 0;
2094 uint32_t remained = count;
2099 if (remained >= PACKED_BATCH_SIZE) {
2100 if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq,
2101 mbuf_pool, &pkts[pkt_idx])) {
2102 pkt_idx += PACKED_BATCH_SIZE;
2103 remained -= PACKED_BATCH_SIZE;
2108 if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool,
2117 vhost_vring_call_packed(dev, vq);
2122 static __rte_noinline uint16_t
2123 virtio_dev_tx_packed(struct virtio_net *dev,
2124 struct vhost_virtqueue *vq,
2125 struct rte_mempool *mbuf_pool,
2126 struct rte_mbuf **pkts,
2129 uint32_t pkt_idx = 0;
2130 uint32_t remained = count;
2133 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
2135 if (remained >= PACKED_BATCH_SIZE) {
2136 if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool,
2138 pkt_idx += PACKED_BATCH_SIZE;
2139 remained -= PACKED_BATCH_SIZE;
2144 if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
2152 if (vq->shadow_used_idx) {
2153 do_data_copy_dequeue(vq);
2155 vhost_flush_dequeue_shadow_packed(dev, vq);
2156 vhost_vring_call_packed(dev, vq);
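/*
 * Public entry point for the dequeue (guest TX) path. Freshly
 * allocated mbufs (from mbuf_pool) are returned in pkts[]; ownership
 * passes to the caller. A RARP broadcast may be injected at the head
 * of the burst right after live migration (see the comment below).
 *
 * Illustrative usage sketch, not taken from this file (vid, txq_id
 * and mbuf_pool are hypothetical application variables):
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t n = rte_vhost_dequeue_burst(vid, txq_id, mbuf_pool,
 *			pkts, MAX_PKT_BURST);
 */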
2163 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
2164 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
2166 struct virtio_net *dev;
2167 struct rte_mbuf *rarp_mbuf = NULL;
2168 struct vhost_virtqueue *vq;
2170 dev = get_device(vid);
2174 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2176 "(%d) %s: built-in vhost net backend is disabled.\n",
2177 dev->vid, __func__);
2181 if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
2183 "(%d) %s: invalid virtqueue idx %d.\n",
2184 dev->vid, __func__, queue_id);
2188 vq = dev->virtqueue[queue_id];
2190 if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
2193 if (unlikely(vq->enabled == 0)) {
2195 goto out_access_unlock;
2198 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2199 vhost_user_iotlb_rd_lock(vq);
2201 if (unlikely(vq->access_ok == 0))
2202 if (unlikely(vring_translate(dev, vq) < 0)) {
2208	 * Construct a RARP broadcast packet, and inject it into the "pkts"
2209	 * array, to make it look like the guest actually sent such a packet.
2211 * Check user_send_rarp() for more information.
2213 * broadcast_rarp shares a cacheline in the virtio_net structure
2214 * with some fields that are accessed during enqueue and
2215 * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
2216 * result in false sharing between enqueue and dequeue.
2218 * Prevent unnecessary false sharing by reading broadcast_rarp first
2219 * and only performing cmpset if the read indicates it is likely to
2222 if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
2223 rte_atomic16_cmpset((volatile uint16_t *)
2224 &dev->broadcast_rarp.cnt, 1, 0))) {
2226 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
2227 if (rarp_mbuf == NULL) {
2228 VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
2235 if (vq_is_packed(dev)) {
2236 if (unlikely(dev->dequeue_zero_copy))
2237 count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool,
2240 count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts,
2243 count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
2246 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2247 vhost_user_iotlb_rd_unlock(vq);
2250 rte_spinlock_unlock(&vq->access_lock);
2252 if (unlikely(rarp_mbuf != NULL)) {
2254		 * Inject it at the head of the "pkts" array, so that the switch's
2255		 * MAC learning table gets updated first.
2257 memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
2258 pkts[0] = rarp_mbuf;