1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2016 Intel Corporation
7 #include <linux/virtio_net.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
13 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
20 #include <rte_vhost_async.h>
25 #define MAX_BATCH_LEN 256
27 #define VHOST_ASYNC_BATCH_THRESHOLD 32
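/*
 * Copies of at most MAX_BATCH_LEN bytes are deferred into the per-virtqueue
 * batch_copy_elems array and flushed in one pass; larger copies (or copies
 * arriving when the array is full) are performed immediately.
 * VHOST_ASYNC_BATCH_THRESHOLD is the number of packets buffered before a
 * transfer is handed to the async DMA channel.
 */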
29 static __rte_always_inline bool
30 rxvq_is_mergeable(struct virtio_net *dev)
32 return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
35 static __rte_always_inline bool
36 virtio_net_is_inorder(struct virtio_net *dev)
38 return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
42 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
44 return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
48 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
50 struct batch_copy_elem *elem = vq->batch_copy_elems;
51 uint16_t count = vq->batch_copy_nb_elems;
54 for (i = 0; i < count; i++) {
55 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
56 vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
58 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
61 vq->batch_copy_nb_elems = 0;
65 do_data_copy_dequeue(struct vhost_virtqueue *vq)
67 struct batch_copy_elem *elem = vq->batch_copy_elems;
68 uint16_t count = vq->batch_copy_nb_elems;
71 for (i = 0; i < count; i++)
72 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
74 vq->batch_copy_nb_elems = 0;
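/*
 * Minimal sketch of how a copy is deferred into the batch (mirroring
 * copy_mbuf_to_desc() below):
 *
 *	batch_copy[vq->batch_copy_nb_elems].dst = dst;
 *	batch_copy[vq->batch_copy_nb_elems].src = src;
 *	batch_copy[vq->batch_copy_nb_elems].len = len;
 *	vq->batch_copy_nb_elems++;
 *
 * The enqueue flush also logs each write for dirty-page tracking; the
 * dequeue flush does not, since guest memory is only read there.
 */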
77 static __rte_always_inline void
78 do_flush_shadow_used_ring_split(struct virtio_net *dev,
79 struct vhost_virtqueue *vq,
80 uint16_t to, uint16_t from, uint16_t size)
82 rte_memcpy(&vq->used->ring[to],
83 &vq->shadow_used_split[from],
84 size * sizeof(struct vring_used_elem));
85 vhost_log_cache_used_vring(dev, vq,
86 offsetof(struct vring_used, ring[to]),
87 size * sizeof(struct vring_used_elem));
90 static __rte_always_inline void
91 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
93 uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
95 if (used_idx + vq->shadow_used_idx <= vq->size) {
96 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
101 /* update the used ring interval [used_idx, vq->size) */
102 size = vq->size - used_idx;
103 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
105 /* update the remaining used ring interval [0, shadow_used_idx - size) */
106 do_flush_shadow_used_ring_split(dev, vq, 0, size,
107 vq->shadow_used_idx - size);
109 vq->last_used_idx += vq->shadow_used_idx;
111 vhost_log_cache_sync(dev, vq);
113 __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
115 vq->shadow_used_idx = 0;
116 vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
117 sizeof(vq->used->idx));
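/*
 * Worked example: with vq->size = 256, last_used_idx = 250 and
 * shadow_used_idx = 10, the first call copies shadow entries [0, 6) into
 * used ring slots [250, 256) and the second copies entries [6, 10) into
 * slots [0, 4). The used->idx update is release-ordered so the guest can
 * never observe the new index before the ring contents.
 */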
120 static __rte_always_inline void
121 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
122 uint16_t desc_idx, uint32_t len)
124 uint16_t i = vq->shadow_used_idx++;
126 vq->shadow_used_split[i].id = desc_idx;
127 vq->shadow_used_split[i].len = len;
130 static __rte_always_inline void
131 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
132 struct vhost_virtqueue *vq)
135 uint16_t used_idx = vq->last_used_idx;
136 uint16_t head_idx = vq->last_used_idx;
137 uint16_t head_flags = 0;
139 /* Split loop in two to save memory barriers */
140 for (i = 0; i < vq->shadow_used_idx; i++) {
141 vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
142 vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
144 used_idx += vq->shadow_used_packed[i].count;
145 if (used_idx >= vq->size)
146 used_idx -= vq->size;
149 /* The ordering for storing desc flags needs to be enforced. */
150 rte_atomic_thread_fence(__ATOMIC_RELEASE);
152 for (i = 0; i < vq->shadow_used_idx; i++) {
155 if (vq->shadow_used_packed[i].len)
156 flags = VRING_DESC_F_WRITE;
160 if (vq->used_wrap_counter) {
161 flags |= VRING_DESC_F_USED;
162 flags |= VRING_DESC_F_AVAIL;
164 flags &= ~VRING_DESC_F_USED;
165 flags &= ~VRING_DESC_F_AVAIL;
169 vq->desc_packed[vq->last_used_idx].flags = flags;
171 vhost_log_cache_used_vring(dev, vq,
173 sizeof(struct vring_packed_desc),
174 sizeof(struct vring_packed_desc));
176 head_idx = vq->last_used_idx;
180 vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
183 vq->desc_packed[head_idx].flags = head_flags;
185 vhost_log_cache_used_vring(dev, vq,
187 sizeof(struct vring_packed_desc),
188 sizeof(struct vring_packed_desc));
190 vq->shadow_used_idx = 0;
191 vhost_log_cache_sync(dev, vq);
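/*
 * Packed ring descriptor state is encoded in the AVAIL/USED flag pair
 * relative to the wrap counter: a descriptor is "used" when both bits match
 * the device's used_wrap_counter, hence
 *
 *	used_wrap_counter == 1  ->  flags |=  (AVAIL | USED)
 *	used_wrap_counter == 0  ->  flags &= ~(AVAIL | USED)
 *
 * The head descriptor's flags are written last, behind the release fence,
 * because that single store publishes the whole batch to the guest.
 */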
194 static __rte_always_inline void
195 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
196 struct vhost_virtqueue *vq)
198 struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
200 vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
201 /* desc flags are the synchronization point for the virtio packed vring */
202 __atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
203 used_elem->flags, __ATOMIC_RELEASE);
205 vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
206 sizeof(struct vring_packed_desc),
207 sizeof(struct vring_packed_desc));
208 vq->shadow_used_idx = 0;
209 vhost_log_cache_sync(dev, vq);
212 static __rte_always_inline void
213 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
214 struct vhost_virtqueue *vq,
220 uint16_t last_used_idx = vq->last_used_idx;
221 struct vring_packed_desc *desc_base = &vq->desc_packed[last_used_idx];
223 if (vq->shadow_used_idx) {
224 do_data_copy_enqueue(dev, vq);
225 vhost_flush_enqueue_shadow_packed(dev, vq);
228 flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
230 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
231 desc_base[i].id = ids[i];
232 desc_base[i].len = lens[i];
235 rte_atomic_thread_fence(__ATOMIC_RELEASE);
237 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
238 desc_base[i].flags = flags;
241 vhost_log_cache_used_vring(dev, vq, last_used_idx *
242 sizeof(struct vring_packed_desc),
243 sizeof(struct vring_packed_desc) *
245 vhost_log_cache_sync(dev, vq);
247 vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
250 static __rte_always_inline void
251 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
254 vq->shadow_used_packed[0].id = id;
256 if (!vq->shadow_used_idx) {
257 vq->shadow_last_used_idx = vq->last_used_idx;
258 vq->shadow_used_packed[0].flags =
259 PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
260 vq->shadow_used_packed[0].len = 0;
261 vq->shadow_used_packed[0].count = 1;
262 vq->shadow_used_idx++;
265 vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
268 static __rte_always_inline void
269 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
270 struct vhost_virtqueue *vq,
277 flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
279 if (!vq->shadow_used_idx) {
280 vq->shadow_last_used_idx = vq->last_used_idx;
281 vq->shadow_used_packed[0].id = ids[0];
282 vq->shadow_used_packed[0].len = 0;
283 vq->shadow_used_packed[0].count = 1;
284 vq->shadow_used_packed[0].flags = flags;
285 vq->shadow_used_idx++;
290 vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
291 vq->desc_packed[vq->last_used_idx + i].id = ids[i];
292 vq->desc_packed[vq->last_used_idx + i].len = 0;
295 rte_atomic_thread_fence(__ATOMIC_RELEASE);
296 vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
297 vq->desc_packed[vq->last_used_idx + i].flags = flags;
299 vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
300 sizeof(struct vring_packed_desc),
301 sizeof(struct vring_packed_desc) *
303 vhost_log_cache_sync(dev, vq);
305 vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
308 static __rte_always_inline void
309 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
315 flags = vq->desc_packed[vq->last_used_idx].flags;
316 if (vq->used_wrap_counter) {
317 flags |= VRING_DESC_F_USED;
318 flags |= VRING_DESC_F_AVAIL;
320 flags &= ~VRING_DESC_F_USED;
321 flags &= ~VRING_DESC_F_AVAIL;
324 if (!vq->shadow_used_idx) {
325 vq->shadow_last_used_idx = vq->last_used_idx;
327 vq->shadow_used_packed[0].id = buf_id;
328 vq->shadow_used_packed[0].len = 0;
329 vq->shadow_used_packed[0].flags = flags;
330 vq->shadow_used_idx++;
332 vq->desc_packed[vq->last_used_idx].id = buf_id;
333 vq->desc_packed[vq->last_used_idx].len = 0;
334 vq->desc_packed[vq->last_used_idx].flags = flags;
337 vq_inc_last_used_packed(vq, count);
340 static __rte_always_inline void
341 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
347 vq->shadow_used_packed[0].id = buf_id;
349 flags = vq->desc_packed[vq->last_used_idx].flags;
350 if (vq->used_wrap_counter) {
351 flags |= VRING_DESC_F_USED;
352 flags |= VRING_DESC_F_AVAIL;
354 flags &= ~VRING_DESC_F_USED;
355 flags &= ~VRING_DESC_F_AVAIL;
358 if (!vq->shadow_used_idx) {
359 vq->shadow_last_used_idx = vq->last_used_idx;
360 vq->shadow_used_packed[0].len = 0;
361 vq->shadow_used_packed[0].flags = flags;
362 vq->shadow_used_idx++;
365 vq_inc_last_used_packed(vq, count);
368 static __rte_always_inline void
369 vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
373 uint16_t num_buffers)
377 for (i = 0; i < num_buffers; i++) {
378 /* keep the enqueue shadow flush aligned with the batch size */
379 if (!vq->shadow_used_idx)
380 vq->shadow_aligned_idx = vq->last_used_idx &
382 vq->shadow_used_packed[vq->shadow_used_idx].id = id[i];
383 vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
384 vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
385 vq->shadow_aligned_idx += count[i];
386 vq->shadow_used_idx++;
390 static __rte_always_inline void
391 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
392 struct vhost_virtqueue *vq,
396 uint16_t num_buffers)
398 vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
400 if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
401 do_data_copy_enqueue(dev, vq);
402 vhost_flush_enqueue_shadow_packed(dev, vq);
406 /* skip the write when the value is already set, to lessen cache issues */
407 #define ASSIGN_UNLESS_EQUAL(var, val) do { \
408 if ((var) != (val)) \
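/*
 * Usage sketch: the virtio-net header in guest memory usually still holds
 * zeroes from the previous round, so e.g.
 *
 *	ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
 *
 * skips the store, and the cache-line dirtying it would cause, whenever the
 * field already has the desired value.
 */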
412 static __rte_always_inline void
413 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
415 uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
417 if (m_buf->ol_flags & PKT_TX_TCP_SEG)
418 csum_l4 |= PKT_TX_TCP_CKSUM;
421 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
422 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
425 case PKT_TX_TCP_CKSUM:
426 net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
429 case PKT_TX_UDP_CKSUM:
430 net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
433 case PKT_TX_SCTP_CKSUM:
434 net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
439 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
440 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
441 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
444 /* IP cksum verification cannot be bypassed, so calculate it here */
445 if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
446 struct rte_ipv4_hdr *ipv4_hdr;
448 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
450 ipv4_hdr->hdr_checksum = 0;
451 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
454 if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
455 if (m_buf->ol_flags & PKT_TX_IPV4)
456 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
458 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
459 net_hdr->gso_size = m_buf->tso_segsz;
460 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
462 } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
463 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
464 net_hdr->gso_size = m_buf->tso_segsz;
465 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
468 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
469 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
470 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
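/*
 * Example: for a TCP/IPv4 packet with l2_len = 14 and l3_len = 20 the
 * header gets csum_start = 34 and csum_offset = offsetof(struct
 * rte_tcp_hdr, cksum) = 16, telling the other end to finalize the checksum
 * at byte 50 of the frame.
 */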
474 static __rte_always_inline int
475 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
476 struct buf_vector *buf_vec, uint16_t *vec_idx,
477 uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
479 uint16_t vec_id = *vec_idx;
483 uint64_t desc_chunck_len = desc_len;
485 if (unlikely(vec_id >= BUF_VECTOR_MAX))
488 desc_addr = vhost_iova_to_vva(dev, vq,
492 if (unlikely(!desc_addr))
495 rte_prefetch0((void *)(uintptr_t)desc_addr);
497 buf_vec[vec_id].buf_iova = desc_iova;
498 buf_vec[vec_id].buf_addr = desc_addr;
499 buf_vec[vec_id].buf_len = desc_chunck_len;
501 desc_len -= desc_chunck_len;
502 desc_iova += desc_chunck_len;
510 static __rte_always_inline int
511 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
512 uint32_t avail_idx, uint16_t *vec_idx,
513 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
514 uint32_t *desc_chain_len, uint8_t perm)
516 uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
517 uint16_t vec_id = *vec_idx;
520 uint32_t nr_descs = vq->size;
522 struct vring_desc *descs = vq->desc;
523 struct vring_desc *idesc = NULL;
525 if (unlikely(idx >= vq->size))
528 *desc_chain_head = idx;
530 if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
531 dlen = vq->desc[idx].len;
532 nr_descs = dlen / sizeof(struct vring_desc);
533 if (unlikely(nr_descs > vq->size))
536 descs = (struct vring_desc *)(uintptr_t)
537 vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
540 if (unlikely(!descs))
543 if (unlikely(dlen < vq->desc[idx].len)) {
545 * The indirect desc table is not contiguous
546 * in the process VA space, so we have to copy it.
548 idesc = vhost_alloc_copy_ind_table(dev, vq,
549 vq->desc[idx].addr, vq->desc[idx].len);
550 if (unlikely(!idesc))
560 if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
561 free_ind_table(idesc);
565 dlen = descs[idx].len;
568 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
569 descs[idx].addr, dlen,
571 free_ind_table(idesc);
575 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
578 idx = descs[idx].next;
581 *desc_chain_len = len;
584 if (unlikely(!!idesc))
585 free_ind_table(idesc);
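/*
 * The nr_descs/cnt guards above bound the chain walk to the number of
 * descriptors (the ring size, or the indirect table size), so a malicious
 * or buggy driver cannot trap vhost in an endless descriptor loop.
 */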
591 * Returns -1 on failure, 0 on success
594 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
595 uint32_t size, struct buf_vector *buf_vec,
596 uint16_t *num_buffers, uint16_t avail_head,
600 uint16_t vec_idx = 0;
601 uint16_t max_tries, tries = 0;
603 uint16_t head_idx = 0;
607 cur_idx = vq->last_avail_idx;
609 if (rxvq_is_mergeable(dev))
610 max_tries = vq->size - 1;
615 if (unlikely(cur_idx == avail_head))
618 * If we tried all available ring items and still can't
619 * get enough buffers, something abnormal happened
622 if (unlikely(++tries > max_tries))
625 if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
628 VHOST_ACCESS_RW) < 0))
630 len = RTE_MIN(len, size);
631 update_shadow_used_ring_split(vq, head_idx, len);
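/*
 * With VIRTIO_NET_F_MRG_RXBUF negotiated, one packet may span several
 * descriptor chains, so up to vq->size - 1 chains are tried and each chain
 * consumed is recorded in the shadow used ring. Without the feature the
 * packet must fit into a single chain.
 */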
643 static __rte_always_inline int
644 fill_vec_buf_packed_indirect(struct virtio_net *dev,
645 struct vhost_virtqueue *vq,
646 struct vring_packed_desc *desc, uint16_t *vec_idx,
647 struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
651 uint16_t vec_id = *vec_idx;
653 struct vring_packed_desc *descs, *idescs = NULL;
656 descs = (struct vring_packed_desc *)(uintptr_t)
657 vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
658 if (unlikely(!descs))
661 if (unlikely(dlen < desc->len)) {
663 * The indirect desc table is not contiguous
664 * in the process VA space, so we have to copy it.
666 idescs = vhost_alloc_copy_ind_table(dev,
667 vq, desc->addr, desc->len);
668 if (unlikely(!idescs))
674 nr_descs = desc->len / sizeof(struct vring_packed_desc);
675 if (unlikely(nr_descs >= vq->size)) {
676 free_ind_table(idescs);
680 for (i = 0; i < nr_descs; i++) {
681 if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
682 free_ind_table(idescs);
688 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
695 if (unlikely(!!idescs))
696 free_ind_table(idescs);
701 static __rte_always_inline int
702 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
703 uint16_t avail_idx, uint16_t *desc_count,
704 struct buf_vector *buf_vec, uint16_t *vec_idx,
705 uint16_t *buf_id, uint32_t *len, uint8_t perm)
707 bool wrap_counter = vq->avail_wrap_counter;
708 struct vring_packed_desc *descs = vq->desc_packed;
709 uint16_t vec_id = *vec_idx;
712 if (avail_idx < vq->last_avail_idx)
716 * Perform a load-acquire barrier in desc_is_avail to
717 * enforce the ordering between desc flags and desc
720 if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
727 if (unlikely(vec_id >= BUF_VECTOR_MAX))
730 if (unlikely(*desc_count >= vq->size))
734 *buf_id = descs[avail_idx].id;
736 if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
737 if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
743 dlen = descs[avail_idx].len;
746 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
747 descs[avail_idx].addr,
753 if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
756 if (++avail_idx >= vq->size) {
757 avail_idx -= vq->size;
767 static __rte_noinline void
768 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
769 struct buf_vector *buf_vec,
770 struct virtio_net_hdr_mrg_rxbuf *hdr)
773 uint64_t remain = dev->vhost_hlen;
774 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
775 uint64_t iova = buf_vec->buf_iova;
778 len = RTE_MIN(remain,
780 dst = buf_vec->buf_addr;
781 rte_memcpy((void *)(uintptr_t)dst,
782 (void *)(uintptr_t)src,
785 PRINT_PACKET(dev, (uintptr_t)dst,
787 vhost_log_cache_write_iova(dev, vq,
797 static __rte_always_inline int
798 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
799 struct rte_mbuf *m, struct buf_vector *buf_vec,
800 uint16_t nr_vec, uint16_t num_buffers)
802 uint32_t vec_idx = 0;
803 uint32_t mbuf_offset, mbuf_avail;
804 uint32_t buf_offset, buf_avail;
805 uint64_t buf_addr, buf_iova, buf_len;
808 struct rte_mbuf *hdr_mbuf;
809 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
810 struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
813 if (unlikely(m == NULL)) {
818 buf_addr = buf_vec[vec_idx].buf_addr;
819 buf_iova = buf_vec[vec_idx].buf_iova;
820 buf_len = buf_vec[vec_idx].buf_len;
822 if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
829 if (unlikely(buf_len < dev->vhost_hlen)) {
830 memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
833 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
835 VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
836 dev->vid, num_buffers);
838 if (unlikely(buf_len < dev->vhost_hlen)) {
839 buf_offset = dev->vhost_hlen - buf_len;
841 buf_addr = buf_vec[vec_idx].buf_addr;
842 buf_iova = buf_vec[vec_idx].buf_iova;
843 buf_len = buf_vec[vec_idx].buf_len;
844 buf_avail = buf_len - buf_offset;
846 buf_offset = dev->vhost_hlen;
847 buf_avail = buf_len - dev->vhost_hlen;
850 mbuf_avail = rte_pktmbuf_data_len(m);
852 while (mbuf_avail != 0 || m->next != NULL) {
853 /* done with current buf, get the next one */
854 if (buf_avail == 0) {
856 if (unlikely(vec_idx >= nr_vec)) {
861 buf_addr = buf_vec[vec_idx].buf_addr;
862 buf_iova = buf_vec[vec_idx].buf_iova;
863 buf_len = buf_vec[vec_idx].buf_len;
869 /* done with current mbuf, get the next one */
870 if (mbuf_avail == 0) {
874 mbuf_avail = rte_pktmbuf_data_len(m);
878 virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
879 if (rxvq_is_mergeable(dev))
880 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
883 if (unlikely(hdr == &tmp_hdr)) {
884 copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
886 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
888 vhost_log_cache_write_iova(dev, vq,
896 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
898 if (likely(cpy_len > MAX_BATCH_LEN ||
899 vq->batch_copy_nb_elems >= vq->size)) {
900 rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
901 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
903 vhost_log_cache_write_iova(dev, vq,
904 buf_iova + buf_offset,
906 PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
909 batch_copy[vq->batch_copy_nb_elems].dst =
910 (void *)((uintptr_t)(buf_addr + buf_offset));
911 batch_copy[vq->batch_copy_nb_elems].src =
912 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
913 batch_copy[vq->batch_copy_nb_elems].log_addr =
914 buf_iova + buf_offset;
915 batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
916 vq->batch_copy_nb_elems++;
919 mbuf_avail -= cpy_len;
920 mbuf_offset += cpy_len;
921 buf_avail -= cpy_len;
922 buf_offset += cpy_len;
930 static __rte_always_inline void
931 async_fill_vec(struct iovec *v, void *base, size_t len)
937 static __rte_always_inline void
938 async_fill_iter(struct rte_vhost_iov_iter *it, size_t count,
939 struct iovec *vec, unsigned long nr_seg)
946 it->nr_segs = nr_seg;
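/*
 * async_fill_vec()/async_fill_iter() build the scatter-gather lists handed
 * to the DMA engine: one iovec per contiguous host-physical chunk, grouped
 * into a source (mbuf) iterator and a destination (guest buffer) iterator
 * for each packet.
 */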
953 static __rte_always_inline void
954 async_fill_desc(struct rte_vhost_async_desc *desc,
955 struct rte_vhost_iov_iter *src, struct rte_vhost_iov_iter *dst)
961 static __rte_always_inline int
962 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
963 struct rte_mbuf *m, struct buf_vector *buf_vec,
964 uint16_t nr_vec, uint16_t num_buffers,
965 struct iovec *src_iovec, struct iovec *dst_iovec,
966 struct rte_vhost_iov_iter *src_it,
967 struct rte_vhost_iov_iter *dst_it)
969 uint32_t vec_idx = 0;
970 uint32_t mbuf_offset, mbuf_avail;
971 uint32_t buf_offset, buf_avail;
972 uint64_t buf_addr, buf_iova, buf_len;
973 uint32_t cpy_len, cpy_threshold;
975 struct rte_mbuf *hdr_mbuf;
976 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
977 struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
985 if (unlikely(m == NULL)) {
990 cpy_threshold = vq->async_threshold;
992 buf_addr = buf_vec[vec_idx].buf_addr;
993 buf_iova = buf_vec[vec_idx].buf_iova;
994 buf_len = buf_vec[vec_idx].buf_len;
996 if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
1002 hdr_addr = buf_addr;
1003 if (unlikely(buf_len < dev->vhost_hlen)) {
1004 memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1007 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1009 VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1010 dev->vid, num_buffers);
1012 if (unlikely(buf_len < dev->vhost_hlen)) {
1013 buf_offset = dev->vhost_hlen - buf_len;
1015 buf_addr = buf_vec[vec_idx].buf_addr;
1016 buf_iova = buf_vec[vec_idx].buf_iova;
1017 buf_len = buf_vec[vec_idx].buf_len;
1018 buf_avail = buf_len - buf_offset;
1020 buf_offset = dev->vhost_hlen;
1021 buf_avail = buf_len - dev->vhost_hlen;
1024 mbuf_avail = rte_pktmbuf_data_len(m);
1027 while (mbuf_avail != 0 || m->next != NULL) {
1028 /* done with current buf, get the next one */
1029 if (buf_avail == 0) {
1031 if (unlikely(vec_idx >= nr_vec)) {
1036 buf_addr = buf_vec[vec_idx].buf_addr;
1037 buf_iova = buf_vec[vec_idx].buf_iova;
1038 buf_len = buf_vec[vec_idx].buf_len;
1041 buf_avail = buf_len;
1044 /* done with current mbuf, get the next one */
1045 if (mbuf_avail == 0) {
1049 mbuf_avail = rte_pktmbuf_data_len(m);
1053 virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1054 if (rxvq_is_mergeable(dev))
1055 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1058 if (unlikely(hdr == &tmp_hdr)) {
1059 copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1061 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1062 dev->vhost_hlen, 0);
1063 vhost_log_cache_write_iova(dev, vq,
1064 buf_vec[0].buf_iova,
1071 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1073 while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
1074 hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1075 buf_iova + buf_offset,
1076 cpy_len, &mapped_len);
1078 if (unlikely(!hpa || mapped_len < cpy_threshold))
1081 async_fill_vec(src_iovec + tvec_idx,
1082 (void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1083 mbuf_offset), (size_t)mapped_len);
1085 async_fill_vec(dst_iovec + tvec_idx,
1086 hpa, (size_t)mapped_len);
1088 tlen += (uint32_t)mapped_len;
1089 cpy_len -= (uint32_t)mapped_len;
1090 mbuf_avail -= (uint32_t)mapped_len;
1091 mbuf_offset += (uint32_t)mapped_len;
1092 buf_avail -= (uint32_t)mapped_len;
1093 buf_offset += (uint32_t)mapped_len;
1097 if (likely(cpy_len)) {
1098 if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
1100 (void *)((uintptr_t)(buf_addr + buf_offset)),
1101 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
1105 (uintptr_t)(buf_addr + buf_offset),
1108 batch_copy[vq->batch_copy_nb_elems].dst =
1109 (void *)((uintptr_t)(buf_addr + buf_offset));
1110 batch_copy[vq->batch_copy_nb_elems].src =
1111 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
1112 batch_copy[vq->batch_copy_nb_elems].log_addr =
1113 buf_iova + buf_offset;
1114 batch_copy[vq->batch_copy_nb_elems].len =
1116 vq->batch_copy_nb_elems++;
1119 mbuf_avail -= cpy_len;
1120 mbuf_offset += cpy_len;
1121 buf_avail -= cpy_len;
1122 buf_offset += cpy_len;
1129 async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
1130 async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
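/*
 * The async enqueue path is hybrid: chunks of at least vq->async_threshold
 * bytes whose guest pages map to contiguous host-physical memory are
 * described in the iovecs above and offloaded to the DMA engine, while
 * short or non-mappable remainders fall back to the same CPU batch copy
 * used by the synchronous path.
 */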
1138 static __rte_always_inline int
1139 vhost_enqueue_single_packed(struct virtio_net *dev,
1140 struct vhost_virtqueue *vq,
1141 struct rte_mbuf *pkt,
1142 struct buf_vector *buf_vec,
1145 uint16_t nr_vec = 0;
1146 uint16_t avail_idx = vq->last_avail_idx;
1147 uint16_t max_tries, tries = 0;
1148 uint16_t buf_id = 0;
1150 uint16_t desc_count;
1151 uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1152 uint16_t num_buffers = 0;
1153 uint32_t buffer_len[vq->size];
1154 uint16_t buffer_buf_id[vq->size];
1155 uint16_t buffer_desc_count[vq->size];
1157 if (rxvq_is_mergeable(dev))
1158 max_tries = vq->size - 1;
1164 * If we tried all available ring items and still can't
1165 * get enough buffers, something abnormal happened
1168 if (unlikely(++tries > max_tries))
1171 if (unlikely(fill_vec_buf_packed(dev, vq,
1172 avail_idx, &desc_count,
1175 VHOST_ACCESS_RW) < 0))
1178 len = RTE_MIN(len, size);
1181 buffer_len[num_buffers] = len;
1182 buffer_buf_id[num_buffers] = buf_id;
1183 buffer_desc_count[num_buffers] = desc_count;
1186 *nr_descs += desc_count;
1187 avail_idx += desc_count;
1188 if (avail_idx >= vq->size)
1189 avail_idx -= vq->size;
1192 if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1195 vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1196 buffer_desc_count, num_buffers);
1201 static __rte_noinline uint32_t
1202 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1203 struct rte_mbuf **pkts, uint32_t count)
1205 uint32_t pkt_idx = 0;
1206 uint16_t num_buffers;
1207 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1208 uint16_t avail_head;
1211 * The ordering between avail index and
1212 * desc reads needs to be enforced.
1214 avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1216 rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1218 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1219 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1220 uint16_t nr_vec = 0;
1222 if (unlikely(reserve_avail_buf_split(dev, vq,
1223 pkt_len, buf_vec, &num_buffers,
1224 avail_head, &nr_vec) < 0)) {
1225 VHOST_LOG_DATA(DEBUG,
1226 "(%d) failed to get enough desc from vring\n",
1228 vq->shadow_used_idx -= num_buffers;
1232 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1233 dev->vid, vq->last_avail_idx,
1234 vq->last_avail_idx + num_buffers);
1236 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1239 vq->shadow_used_idx -= num_buffers;
1243 vq->last_avail_idx += num_buffers;
1246 do_data_copy_enqueue(dev, vq);
1248 if (likely(vq->shadow_used_idx)) {
1249 flush_shadow_used_ring_split(dev, vq);
1250 vhost_vring_call_split(dev, vq);
1256 static __rte_always_inline int
1257 virtio_dev_rx_batch_packed(struct virtio_net *dev,
1258 struct vhost_virtqueue *vq,
1259 struct rte_mbuf **pkts)
1261 bool wrap_counter = vq->avail_wrap_counter;
1262 struct vring_packed_desc *descs = vq->desc_packed;
1263 uint16_t avail_idx = vq->last_avail_idx;
1264 uint64_t desc_addrs[PACKED_BATCH_SIZE];
1265 struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1266 uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1267 uint64_t lens[PACKED_BATCH_SIZE];
1268 uint16_t ids[PACKED_BATCH_SIZE];
1271 if (unlikely(avail_idx & PACKED_BATCH_MASK))
1274 if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1277 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1278 if (unlikely(pkts[i]->next != NULL))
1280 if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1285 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1286 lens[i] = descs[avail_idx + i].len;
1288 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1289 if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1293 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1294 desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1295 descs[avail_idx + i].addr,
1299 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1300 if (unlikely(!desc_addrs[i]))
1302 if (unlikely(lens[i] != descs[avail_idx + i].len))
1306 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1307 rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1308 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1309 (uintptr_t)desc_addrs[i];
1310 lens[i] = pkts[i]->pkt_len +
1311 sizeof(struct virtio_net_hdr_mrg_rxbuf);
1314 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1315 virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1317 vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1319 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1320 rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1321 rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1325 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1326 vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1329 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1330 ids[i] = descs[avail_idx + i].id;
1332 vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
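/*
 * The batch fast path only applies when PACKED_BATCH_SIZE descriptors can
 * be handled as one unit: the avail index is batch-aligned, the batch does
 * not wrap the ring, every descriptor is available, and every mbuf is a
 * single segment that fits its descriptor. Otherwise the caller falls back
 * to virtio_dev_rx_single_packed().
 */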
1337 static __rte_always_inline int16_t
1338 virtio_dev_rx_single_packed(struct virtio_net *dev,
1339 struct vhost_virtqueue *vq,
1340 struct rte_mbuf *pkt)
1342 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1343 uint16_t nr_descs = 0;
1345 if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1347 VHOST_LOG_DATA(DEBUG,
1348 "(%d) failed to get enough desc from vring\n",
1353 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1354 dev->vid, vq->last_avail_idx,
1355 vq->last_avail_idx + nr_descs);
1357 vq_inc_last_avail_packed(vq, nr_descs);
1362 static __rte_noinline uint32_t
1363 virtio_dev_rx_packed(struct virtio_net *dev,
1364 struct vhost_virtqueue *__rte_restrict vq,
1365 struct rte_mbuf **__rte_restrict pkts,
1368 uint32_t pkt_idx = 0;
1371 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1373 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1374 if (!virtio_dev_rx_batch_packed(dev, vq,
1376 pkt_idx += PACKED_BATCH_SIZE;
1381 if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1385 } while (pkt_idx < count);
1387 if (vq->shadow_used_idx) {
1388 do_data_copy_enqueue(dev, vq);
1389 vhost_flush_enqueue_shadow_packed(dev, vq);
1393 vhost_vring_call_packed(dev, vq);
1398 static __rte_always_inline uint32_t
1399 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1400 struct rte_mbuf **pkts, uint32_t count)
1402 struct vhost_virtqueue *vq;
1405 VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1406 if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1407 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1408 dev->vid, __func__, queue_id);
1412 vq = dev->virtqueue[queue_id];
1414 rte_spinlock_lock(&vq->access_lock);
1416 if (unlikely(!vq->enabled))
1417 goto out_access_unlock;
1419 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1420 vhost_user_iotlb_rd_lock(vq);
1422 if (unlikely(!vq->access_ok))
1423 if (unlikely(vring_translate(dev, vq) < 0))
1426 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1430 if (vq_is_packed(dev))
1431 nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1433 nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1436 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1437 vhost_user_iotlb_rd_unlock(vq);
1440 rte_spinlock_unlock(&vq->access_lock);
1446 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1447 struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1449 struct virtio_net *dev = get_device(vid);
1454 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1456 "(%d) %s: built-in vhost net backend is disabled.\n",
1457 dev->vid, __func__);
1461 return virtio_dev_rx(dev, queue_id, pkts, count);
1464 static __rte_always_inline uint16_t
1465 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1466 uint16_t vq_size, uint16_t n_inflight)
1468 return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1469 (vq_size - n_inflight + pkts_idx) & (vq_size - 1);
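/*
 * Example with vq_size = 256: pkts_idx = 10 and n_inflight = 30 gives a
 * start index of (256 - 30 + 10) & 255 = 236, the oldest in-flight packet;
 * with pkts_idx = 100 it is simply 100 - 30 = 70.
 */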
1472 static __rte_always_inline void
1473 store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1474 uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1476 uint16_t elem_size = sizeof(struct vring_used_elem);
1478 if (d_idx + count <= ring_size) {
1479 rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1481 uint16_t size = ring_size - d_idx;
1483 rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1484 rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
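/*
 * Like the used ring itself, the destination is a circular buffer, so a
 * run crossing the end of the ring is stored with two copies: the first
 * fills the tail slots [d_idx, ring_size), the second wraps to slot 0.
 */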
1488 static __rte_always_inline void
1489 store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1490 struct vring_used_elem_packed *d_ring,
1491 uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1493 uint16_t elem_size = sizeof(struct vring_used_elem_packed);
1495 if (d_idx + count <= ring_size) {
1496 rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1498 uint16_t size = ring_size - d_idx;
1500 rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1501 rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1505 static __rte_noinline uint32_t
1506 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1507 struct vhost_virtqueue *vq, uint16_t queue_id,
1508 struct rte_mbuf **pkts, uint32_t count,
1509 struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1511 uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1512 uint16_t num_buffers;
1513 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1514 uint16_t avail_head;
1516 struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1517 struct iovec *vec_pool = vq->vec_pool;
1518 struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1519 struct iovec *src_iovec = vec_pool;
1520 struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1521 uint16_t slot_idx = 0;
1522 uint16_t segs_await = 0;
1523 uint16_t iovec_idx = 0, it_idx = 0;
1524 struct async_inflight_info *pkts_info = vq->async_pkts_info;
1525 uint32_t n_pkts = 0, pkt_err = 0;
1526 uint32_t num_async_pkts = 0, num_done_pkts = 0;
1529 uint16_t last_avail_idx;
1530 } async_pkts_log[MAX_PKT_BURST];
1533 * The ordering between avail index and desc reads needs to be enforced.
1535 avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1537 rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1539 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1540 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1541 uint16_t nr_vec = 0;
1543 if (unlikely(reserve_avail_buf_split(dev, vq,
1544 pkt_len, buf_vec, &num_buffers,
1545 avail_head, &nr_vec) < 0)) {
1546 VHOST_LOG_DATA(DEBUG,
1547 "(%d) failed to get enough desc from vring\n",
1549 vq->shadow_used_idx -= num_buffers;
1553 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1554 dev->vid, vq->last_avail_idx,
1555 vq->last_avail_idx + num_buffers);
1557 if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
1558 &src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1559 &it_pool[it_idx], &it_pool[it_idx + 1]) < 0) {
1560 vq->shadow_used_idx -= num_buffers;
1564 slot_idx = (vq->async_pkts_idx + num_async_pkts) &
1566 if (it_pool[it_idx].count) {
1569 async_fill_desc(&tdes[pkt_burst_idx++],
1570 &it_pool[it_idx], &it_pool[it_idx + 1]);
1571 pkts_info[slot_idx].descs = num_buffers;
1572 pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1573 async_pkts_log[num_async_pkts].pkt_idx = pkt_idx;
1574 async_pkts_log[num_async_pkts++].last_avail_idx =
1577 iovec_idx += it_pool[it_idx].nr_segs;
1580 segs_await += it_pool[it_idx].nr_segs;
1583 * recover the shadow used ring and keep DMA-occupied descriptors
1586 from = vq->shadow_used_idx - num_buffers;
1587 to = vq->async_desc_idx_split & (vq->size - 1);
1589 store_dma_desc_info_split(vq->shadow_used_split,
1590 vq->async_descs_split, vq->size, from, to, num_buffers);
1592 vq->async_desc_idx_split += num_buffers;
1593 vq->shadow_used_idx -= num_buffers;
1595 comp_pkts[num_done_pkts++] = pkts[pkt_idx];
1597 vq->last_avail_idx += num_buffers;
1600 * conditions that trigger an async device transfer:
1601 * - the number of buffered packets reaches the transfer threshold
1602 * - fewer unused async iovecs remain than a packet may require
1604 if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1605 ((VHOST_MAX_ASYNC_VEC >> 1) - segs_await <
1607 n_pkts = vq->async_ops.transfer_data(dev->vid,
1608 queue_id, tdes, 0, pkt_burst_idx);
1613 vq->async_pkts_inflight_n += n_pkts;
1615 if (unlikely(n_pkts < pkt_burst_idx)) {
1617 * log the number of error packets here; actual error
1618 * processing happens when the application polls for completions
1621 pkt_err = pkt_burst_idx - n_pkts;
1630 if (pkt_burst_idx) {
1631 n_pkts = vq->async_ops.transfer_data(dev->vid,
1632 queue_id, tdes, 0, pkt_burst_idx);
1633 vq->async_pkts_inflight_n += n_pkts;
1635 if (unlikely(n_pkts < pkt_burst_idx))
1636 pkt_err = pkt_burst_idx - n_pkts;
1639 do_data_copy_enqueue(dev, vq);
1641 if (unlikely(pkt_err)) {
1642 uint16_t num_descs = 0;
1644 num_async_pkts -= pkt_err;
1645 /* calculate the sum of descriptors of DMA-error packets. */
1646 while (pkt_err-- > 0) {
1647 num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1650 vq->async_desc_idx_split -= num_descs;
1651 /* recover shadow used ring and available ring */
1652 vq->shadow_used_idx -= (vq->last_avail_idx -
1653 async_pkts_log[num_async_pkts].last_avail_idx -
1655 vq->last_avail_idx =
1656 async_pkts_log[num_async_pkts].last_avail_idx;
1657 pkt_idx = async_pkts_log[num_async_pkts].pkt_idx;
1658 num_done_pkts = pkt_idx - num_async_pkts;
1661 vq->async_pkts_idx += num_async_pkts;
1662 *comp_count = num_done_pkts;
1664 if (likely(vq->shadow_used_idx)) {
1665 flush_shadow_used_ring_split(dev, vq);
1666 vhost_vring_call_split(dev, vq);
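/*
 * On a partial transfer_data() failure the tail of the submission is
 * rolled back: async_pkts_log[] records, per async packet, its pkt_idx and
 * the last_avail_idx value before its buffers were reserved, which is
 * enough to rewind last_avail_idx, drop the matching shadow used entries
 * and report only the packets the DMA engine actually accepted.
 */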
1672 static __rte_always_inline void
1673 vhost_update_used_packed(struct vhost_virtqueue *vq,
1674 struct vring_used_elem_packed *shadow_ring,
1678 uint16_t used_idx = vq->last_used_idx;
1679 uint16_t head_idx = vq->last_used_idx;
1680 uint16_t head_flags = 0;
1685 /* Split loop in two to save memory barriers */
1686 for (i = 0; i < count; i++) {
1687 vq->desc_packed[used_idx].id = shadow_ring[i].id;
1688 vq->desc_packed[used_idx].len = shadow_ring[i].len;
1690 used_idx += shadow_ring[i].count;
1691 if (used_idx >= vq->size)
1692 used_idx -= vq->size;
1695 /* The ordering for storing desc flags needs to be enforced. */
1696 rte_atomic_thread_fence(__ATOMIC_RELEASE);
1698 for (i = 0; i < count; i++) {
1701 if (vq->shadow_used_packed[i].len)
1702 flags = VRING_DESC_F_WRITE;
1706 if (vq->used_wrap_counter) {
1707 flags |= VRING_DESC_F_USED;
1708 flags |= VRING_DESC_F_AVAIL;
1710 flags &= ~VRING_DESC_F_USED;
1711 flags &= ~VRING_DESC_F_AVAIL;
1715 vq->desc_packed[vq->last_used_idx].flags = flags;
1717 head_idx = vq->last_used_idx;
1721 vq_inc_last_used_packed(vq, shadow_ring[i].count);
1724 vq->desc_packed[head_idx].flags = head_flags;
1727 static __rte_always_inline int
1728 vhost_enqueue_async_single_packed(struct virtio_net *dev,
1729 struct vhost_virtqueue *vq,
1730 struct rte_mbuf *pkt,
1731 struct buf_vector *buf_vec,
1733 uint16_t *nr_buffers,
1734 struct vring_packed_desc *async_descs,
1735 struct iovec *src_iovec, struct iovec *dst_iovec,
1736 struct rte_vhost_iov_iter *src_it,
1737 struct rte_vhost_iov_iter *dst_it)
1739 uint16_t nr_vec = 0;
1740 uint16_t avail_idx = vq->last_avail_idx;
1741 uint16_t max_tries, tries = 0;
1742 uint16_t buf_id = 0;
1744 uint16_t desc_count = 0;
1745 uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1746 uint32_t buffer_len[vq->size];
1747 uint16_t buffer_buf_id[vq->size];
1748 uint16_t buffer_desc_count[vq->size];
1750 if (rxvq_is_mergeable(dev))
1751 max_tries = vq->size - 1;
1757 * If we tried all available ring items and still can't
1758 * get enough buffers, something abnormal happened
1761 if (unlikely(++tries > max_tries))
1764 if (unlikely(fill_vec_buf_packed(dev, vq, avail_idx, &desc_count, buf_vec, &nr_vec,
1765 &buf_id, &len, VHOST_ACCESS_RW) < 0))
1768 len = RTE_MIN(len, size);
1771 buffer_len[*nr_buffers] = len;
1772 buffer_buf_id[*nr_buffers] = buf_id;
1773 buffer_desc_count[*nr_buffers] = desc_count;
1776 *nr_descs += desc_count;
1777 avail_idx += desc_count;
1778 if (avail_idx >= vq->size)
1779 avail_idx -= vq->size;
1782 if (async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, src_iovec, dst_iovec,
1783 src_it, dst_it) < 0)
1785 /* store descriptors for DMA */
1786 if (avail_idx >= *nr_descs) {
1787 rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1788 *nr_descs * sizeof(struct vring_packed_desc));
1790 uint16_t nr_copy = vq->size - vq->last_avail_idx;
1792 rte_memcpy(async_descs, &vq->desc_packed[vq->last_avail_idx],
1793 nr_copy * sizeof(struct vring_packed_desc));
1794 rte_memcpy(async_descs + nr_copy, vq->desc_packed,
1795 (*nr_descs - nr_copy) * sizeof(struct vring_packed_desc));
1798 vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1803 static __rte_always_inline int16_t
1804 virtio_dev_rx_async_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1805 struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
1806 struct vring_packed_desc *async_descs,
1807 struct iovec *src_iovec, struct iovec *dst_iovec,
1808 struct rte_vhost_iov_iter *src_it, struct rte_vhost_iov_iter *dst_it)
1810 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1812 if (unlikely(vhost_enqueue_async_single_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
1813 async_descs, src_iovec, dst_iovec,
1814 src_it, dst_it) < 0)) {
1815 VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
1819 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1820 dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1825 static __rte_always_inline void
1826 dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *async_descs,
1827 uint16_t async_descs_idx, uint16_t slot_idx, uint32_t nr_err,
1828 uint32_t *pkt_idx, uint32_t *num_async_pkts, uint32_t *num_done_pkts)
1830 uint16_t descs_err = 0;
1831 uint16_t buffers_err = 0;
1832 struct async_inflight_info *pkts_info = vq->async_pkts_info;
1834 *num_async_pkts -= nr_err;
1836 /* calculate the sum of buffers and descs of DMA-error packets. */
1837 while (nr_err-- > 0) {
1838 descs_err += pkts_info[slot_idx % vq->size].descs;
1839 buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1843 vq->async_buffer_idx_packed -= buffers_err;
1845 if (vq->last_avail_idx >= descs_err) {
1846 vq->last_avail_idx -= descs_err;
1848 rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1849 &async_descs[async_descs_idx - descs_err],
1850 descs_err * sizeof(struct vring_packed_desc));
1854 vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1855 nr_copy = vq->size - vq->last_avail_idx;
1856 rte_memcpy(&vq->desc_packed[vq->last_avail_idx],
1857 &async_descs[async_descs_idx - descs_err],
1858 nr_copy * sizeof(struct vring_packed_desc));
1859 descs_err -= nr_copy;
1860 rte_memcpy(&vq->desc_packed[0], &async_descs[async_descs_idx - descs_err],
1861 descs_err * sizeof(struct vring_packed_desc));
1862 vq->avail_wrap_counter ^= 1;
1865 *num_done_pkts = *pkt_idx - *num_async_pkts;
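/*
 * Rolling back the packed ring is trickier than the split case: the
 * descriptors already rewritten for DMA must be restored from the
 * async_descs[] backup, and when last_avail_idx has to rewind past slot 0
 * the restore is done in two chunks and avail_wrap_counter is flipped back.
 */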
1868 static __rte_noinline uint32_t
1869 virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
1870 struct vhost_virtqueue *vq, uint16_t queue_id,
1871 struct rte_mbuf **pkts, uint32_t count,
1872 struct rte_mbuf **comp_pkts, uint32_t *comp_count)
1874 uint32_t pkt_idx = 0, pkt_burst_idx = 0;
1875 uint16_t async_descs_idx = 0;
1876 uint16_t num_buffers;
1879 struct rte_vhost_iov_iter *it_pool = vq->it_pool;
1880 struct iovec *vec_pool = vq->vec_pool;
1881 struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
1882 struct iovec *src_iovec = vec_pool;
1883 struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
1884 uint16_t slot_idx = 0;
1885 uint16_t segs_await = 0;
1886 uint16_t iovec_idx = 0, it_idx = 0;
1887 struct async_inflight_info *pkts_info = vq->async_pkts_info;
1888 uint32_t n_pkts = 0, pkt_err = 0;
1889 uint32_t num_async_pkts = 0, num_done_pkts = 0;
1890 struct vring_packed_desc async_descs[vq->size];
1892 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1894 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1898 if (unlikely(virtio_dev_rx_async_single_packed(dev, vq, pkts[pkt_idx],
1899 &num_descs, &num_buffers,
1900 &async_descs[async_descs_idx],
1901 &src_iovec[iovec_idx], &dst_iovec[iovec_idx],
1902 &it_pool[it_idx], &it_pool[it_idx + 1]) < 0))
1905 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1906 dev->vid, vq->last_avail_idx,
1907 vq->last_avail_idx + num_descs);
1909 slot_idx = (vq->async_pkts_idx + num_async_pkts) % vq->size;
1910 if (it_pool[it_idx].count) {
1913 async_descs_idx += num_descs;
1914 async_fill_desc(&tdes[pkt_burst_idx++],
1915 &it_pool[it_idx], &it_pool[it_idx + 1]);
1916 pkts_info[slot_idx].descs = num_descs;
1917 pkts_info[slot_idx].nr_buffers = num_buffers;
1918 pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1920 iovec_idx += it_pool[it_idx].nr_segs;
1923 segs_await += it_pool[it_idx].nr_segs;
1926 * recover the shadow used ring and keep DMA-occupied descriptors
1929 from = vq->shadow_used_idx - num_buffers;
1930 to = vq->async_buffer_idx_packed % vq->size;
1931 store_dma_desc_info_packed(vq->shadow_used_packed,
1932 vq->async_buffers_packed, vq->size, from, to, num_buffers);
1934 vq->async_buffer_idx_packed += num_buffers;
1935 vq->shadow_used_idx -= num_buffers;
1937 comp_pkts[num_done_pkts++] = pkts[pkt_idx];
1940 vq_inc_last_avail_packed(vq, num_descs);
1943 * conditions that trigger an async device transfer:
1944 * - the number of buffered packets reaches the transfer threshold
1945 * - fewer unused async iovecs remain than a packet may require
1947 if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD ||
1948 ((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) {
1949 n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
1950 tdes, 0, pkt_burst_idx);
1954 vq->async_pkts_inflight_n += n_pkts;
1956 if (unlikely(n_pkts < pkt_burst_idx)) {
1958 * log the number of error packets here; actual error
1959 * processing happens when the application polls for completions
1962 pkt_err = pkt_burst_idx - n_pkts;
1972 if (pkt_burst_idx) {
1973 n_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
1974 vq->async_pkts_inflight_n += n_pkts;
1976 if (unlikely(n_pkts < pkt_burst_idx))
1977 pkt_err = pkt_burst_idx - n_pkts;
1980 do_data_copy_enqueue(dev, vq);
1982 if (unlikely(pkt_err))
1983 dma_error_handler_packed(vq, async_descs, async_descs_idx, slot_idx, pkt_err,
1984 &pkt_idx, &num_async_pkts, &num_done_pkts);
1985 vq->async_pkts_idx += num_async_pkts;
1986 *comp_count = num_done_pkts;
1988 if (likely(vq->shadow_used_idx)) {
1989 vhost_flush_enqueue_shadow_packed(dev, vq);
1990 vhost_vring_call_packed(dev, vq);
1996 static __rte_always_inline void
1997 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
1999 uint16_t nr_left = n_descs;
2004 from = vq->last_async_desc_idx_split & (vq->size - 1);
2005 nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
2006 to = vq->last_used_idx & (vq->size - 1);
2008 if (to + nr_copy <= vq->size) {
2009 rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2010 nr_copy * sizeof(struct vring_used_elem));
2012 uint16_t size = vq->size - to;
2014 rte_memcpy(&vq->used->ring[to], &vq->async_descs_split[from],
2015 size * sizeof(struct vring_used_elem));
2016 rte_memcpy(&vq->used->ring[0], &vq->async_descs_split[from + size],
2017 (nr_copy - size) * sizeof(struct vring_used_elem));
2020 vq->last_async_desc_idx_split += nr_copy;
2021 vq->last_used_idx += nr_copy;
2023 } while (nr_left > 0);
2026 static __rte_always_inline void
2027 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
2030 uint16_t nr_left = n_buffers;
2034 from = vq->last_async_buffer_idx_packed % vq->size;
2035 to = (from + nr_left) % vq->size;
2037 vhost_update_used_packed(vq, vq->async_buffers_packed + from, to - from);
2038 vq->last_async_buffer_idx_packed += nr_left;
2041 vhost_update_used_packed(vq, vq->async_buffers_packed + from,
2043 vq->last_async_buffer_idx_packed += vq->size - from;
2044 nr_left -= vq->size - from;
2046 } while (nr_left > 0);
2049 uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
2050 struct rte_mbuf **pkts, uint16_t count)
2052 struct virtio_net *dev = get_device(vid);
2053 struct vhost_virtqueue *vq;
2054 uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0;
2055 uint16_t start_idx, pkts_idx, vq_size;
2056 struct async_inflight_info *pkts_info;
2062 VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2063 if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2064 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2065 dev->vid, __func__, queue_id);
2069 vq = dev->virtqueue[queue_id];
2071 if (unlikely(!vq->async_registered)) {
2072 VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2073 dev->vid, __func__, queue_id);
2077 rte_spinlock_lock(&vq->access_lock);
2079 pkts_idx = vq->async_pkts_idx % vq->size;
2080 pkts_info = vq->async_pkts_info;
2082 start_idx = virtio_dev_rx_async_get_info_idx(pkts_idx,
2083 vq_size, vq->async_pkts_inflight_n);
2085 if (count > vq->async_last_pkts_n)
2086 n_pkts_cpl = vq->async_ops.check_completed_copies(vid,
2087 queue_id, 0, count - vq->async_last_pkts_n);
2088 n_pkts_cpl += vq->async_last_pkts_n;
2090 n_pkts_put = RTE_MIN(count, n_pkts_cpl);
2091 if (unlikely(n_pkts_put == 0)) {
2092 vq->async_last_pkts_n = n_pkts_cpl;
2096 if (vq_is_packed(dev)) {
2097 for (i = 0; i < n_pkts_put; i++) {
2098 from = (start_idx + i) & (vq_size - 1);
2099 n_buffers += pkts_info[from].nr_buffers;
2100 pkts[i] = pkts_info[from].mbuf;
2103 for (i = 0; i < n_pkts_put; i++) {
2104 from = (start_idx + i) & (vq_size - 1);
2105 n_descs += pkts_info[from].descs;
2106 pkts[i] = pkts_info[from].mbuf;
2110 vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
2111 vq->async_pkts_inflight_n -= n_pkts_put;
2113 if (likely(vq->enabled && vq->access_ok)) {
2114 if (vq_is_packed(dev)) {
2115 write_back_completed_descs_packed(vq, n_buffers);
2117 vhost_vring_call_packed(dev, vq);
2119 write_back_completed_descs_split(vq, n_descs);
2121 __atomic_add_fetch(&vq->used->idx, n_descs,
2123 vhost_vring_call_split(dev, vq);
2126 if (vq_is_packed(dev))
2127 vq->last_async_buffer_idx_packed += n_buffers;
2129 vq->last_async_desc_idx_split += n_descs;
2133 rte_spinlock_unlock(&vq->access_lock);
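/*
 * Typical caller loop (a sketch; pkts/done/cpl are the application's own
 * arrays and error handling is omitted):
 *
 *	n = rte_vhost_submit_enqueue_burst(vid, qid, pkts, nb, done, &n_done);
 *	...
 *	n_cpl = rte_vhost_poll_enqueue_completed(vid, qid, cpl, MAX_PKT_BURST);
 *	for (i = 0; i < n_cpl; i++)
 *		rte_pktmbuf_free(cpl[i]);
 *
 * Completed mbufs come back in submission order; only then may the
 * application reuse or free them.
 */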
2138 static __rte_always_inline uint32_t
2139 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2140 struct rte_mbuf **pkts, uint32_t count,
2141 struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2143 struct vhost_virtqueue *vq;
2146 VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2147 if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2148 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2149 dev->vid, __func__, queue_id);
2153 vq = dev->virtqueue[queue_id];
2155 rte_spinlock_lock(&vq->access_lock);
2157 if (unlikely(!vq->enabled || !vq->async_registered))
2158 goto out_access_unlock;
2160 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2161 vhost_user_iotlb_rd_lock(vq);
2163 if (unlikely(!vq->access_ok))
2164 if (unlikely(vring_translate(dev, vq) < 0))
2167 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2171 if (vq_is_packed(dev))
2172 nb_tx = virtio_dev_rx_async_submit_packed(dev,
2173 vq, queue_id, pkts, count, comp_pkts,
2176 nb_tx = virtio_dev_rx_async_submit_split(dev,
2177 vq, queue_id, pkts, count, comp_pkts,
2181 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2182 vhost_user_iotlb_rd_unlock(vq);
2185 rte_spinlock_unlock(&vq->access_lock);
2191 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2192 struct rte_mbuf **pkts, uint16_t count,
2193 struct rte_mbuf **comp_pkts, uint32_t *comp_count)
2195 struct virtio_net *dev = get_device(vid);
2201 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2203 "(%d) %s: built-in vhost net backend is disabled.\n",
2204 dev->vid, __func__);
2208 return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, comp_pkts,
2213 virtio_net_with_host_offload(struct virtio_net *dev)
2216 ((1ULL << VIRTIO_NET_F_CSUM) |
2217 (1ULL << VIRTIO_NET_F_HOST_ECN) |
2218 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2219 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2220 (1ULL << VIRTIO_NET_F_HOST_UFO)))
2227 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
2229 struct rte_ipv4_hdr *ipv4_hdr;
2230 struct rte_ipv6_hdr *ipv6_hdr;
2231 void *l3_hdr = NULL;
2232 struct rte_ether_hdr *eth_hdr;
2235 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2237 m->l2_len = sizeof(struct rte_ether_hdr);
2238 ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2240 if (ethertype == RTE_ETHER_TYPE_VLAN) {
2241 struct rte_vlan_hdr *vlan_hdr =
2242 (struct rte_vlan_hdr *)(eth_hdr + 1);
2244 m->l2_len += sizeof(struct rte_vlan_hdr);
2245 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2248 l3_hdr = (char *)eth_hdr + m->l2_len;
2250 switch (ethertype) {
2251 case RTE_ETHER_TYPE_IPV4:
2253 *l4_proto = ipv4_hdr->next_proto_id;
2254 m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2255 *l4_hdr = (char *)l3_hdr + m->l3_len;
2256 m->ol_flags |= PKT_TX_IPV4;
2258 case RTE_ETHER_TYPE_IPV6:
2260 *l4_proto = ipv6_hdr->proto;
2261 m->l3_len = sizeof(struct rte_ipv6_hdr);
2262 *l4_hdr = (char *)l3_hdr + m->l3_len;
2263 m->ol_flags |= PKT_TX_IPV6;
2273 static __rte_always_inline void
2274 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
2276 uint16_t l4_proto = 0;
2277 void *l4_hdr = NULL;
2278 struct rte_tcp_hdr *tcp_hdr = NULL;
2280 if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2283 parse_ethernet(m, &l4_proto, &l4_hdr);
2284 if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2285 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2286 switch (hdr->csum_offset) {
2287 case (offsetof(struct rte_tcp_hdr, cksum)):
2288 if (l4_proto == IPPROTO_TCP)
2289 m->ol_flags |= PKT_TX_TCP_CKSUM;
2291 case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2292 if (l4_proto == IPPROTO_UDP)
2293 m->ol_flags |= PKT_TX_UDP_CKSUM;
2295 case (offsetof(struct rte_sctp_hdr, cksum)):
2296 if (l4_proto == IPPROTO_SCTP)
2297 m->ol_flags |= PKT_TX_SCTP_CKSUM;
2305 if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2306 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2307 case VIRTIO_NET_HDR_GSO_TCPV4:
2308 case VIRTIO_NET_HDR_GSO_TCPV6:
2310 m->ol_flags |= PKT_TX_TCP_SEG;
2311 m->tso_segsz = hdr->gso_size;
2312 m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
2314 case VIRTIO_NET_HDR_GSO_UDP:
2315 m->ol_flags |= PKT_TX_UDP_SEG;
2316 m->tso_segsz = hdr->gso_size;
2317 m->l4_len = sizeof(struct rte_udp_hdr);
2320 VHOST_LOG_DATA(WARNING,
2321 "unsupported gso type %u.\n", hdr->gso_type);
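/*
 * Example: a guest TSO/IPv4 packet arrives with gso_type =
 * VIRTIO_NET_HDR_GSO_TCPV4 and csum_offset pointing at the TCP checksum;
 * the code above translates that into ol_flags = PKT_TX_TCP_SEG |
 * PKT_TX_TCP_CKSUM (plus PKT_TX_IPV4 from parse_ethernet()), with
 * tso_segsz and l4_len taken from the header.
 */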
2327 static __rte_noinline void
2328 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2329 struct buf_vector *buf_vec)
2332 uint64_t remain = sizeof(struct virtio_net_hdr);
2334 uint64_t dst = (uint64_t)(uintptr_t)hdr;
2337 len = RTE_MIN(remain, buf_vec->buf_len);
2338 src = buf_vec->buf_addr;
2339 rte_memcpy((void *)(uintptr_t)dst,
2340 (void *)(uintptr_t)src, len);
2348 static __rte_always_inline int
2349 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2350 struct buf_vector *buf_vec, uint16_t nr_vec,
2351 struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
2353 uint32_t buf_avail, buf_offset;
2354 uint64_t buf_addr, buf_len;
2355 uint32_t mbuf_avail, mbuf_offset;
2357 struct rte_mbuf *cur = m, *prev = m;
2358 struct virtio_net_hdr tmp_hdr;
2359 struct virtio_net_hdr *hdr = NULL;
2360 /* A counter to avoid an endless desc chain loop */
2361 uint16_t vec_idx = 0;
2362 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
2365 buf_addr = buf_vec[vec_idx].buf_addr;
2366 buf_len = buf_vec[vec_idx].buf_len;
2368 if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
2373 if (virtio_net_with_host_offload(dev)) {
2374 if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2376 * No luck, the virtio-net header doesn't fit
2377 * in a contiguous virtual area.
2379 copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2382 hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2387 * A virtio driver normally uses at least 2 desc buffers
2388 * for Tx: the first for storing the header, and others
2389 * for storing the data.
2391 if (unlikely(buf_len < dev->vhost_hlen)) {
2392 buf_offset = dev->vhost_hlen - buf_len;
2394 buf_addr = buf_vec[vec_idx].buf_addr;
2395 buf_len = buf_vec[vec_idx].buf_len;
2396 buf_avail = buf_len - buf_offset;
2397 } else if (buf_len == dev->vhost_hlen) {
2398 if (unlikely(++vec_idx >= nr_vec))
2399 goto out;
2400 buf_addr = buf_vec[vec_idx].buf_addr;
2401 buf_len = buf_vec[vec_idx].buf_len;
2403 buf_offset = 0;
2404 buf_avail = buf_len;
2405 } else {
2406 buf_offset = dev->vhost_hlen;
2407 buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2408 }
2410 PRINT_PACKET(dev,
2411 (uintptr_t)(buf_addr + buf_offset),
2412 (uint32_t)buf_avail, 0);
2414 mbuf_offset = 0;
2415 mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
2416 while (1) {
2417 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2419 if (likely(cpy_len > MAX_BATCH_LEN ||
2420 vq->batch_copy_nb_elems >= vq->size ||
2421 (hdr && cur == m))) {
2422 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
2423 mbuf_offset),
2424 (void *)((uintptr_t)(buf_addr +
2425 buf_offset)), cpy_len);
2426 } else {
2427 batch_copy[vq->batch_copy_nb_elems].dst =
2428 rte_pktmbuf_mtod_offset(cur, void *,
2429 mbuf_offset);
2430 batch_copy[vq->batch_copy_nb_elems].src =
2431 (void *)((uintptr_t)(buf_addr + buf_offset));
2432 batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2433 vq->batch_copy_nb_elems++;
2434 }
2436 mbuf_avail -= cpy_len;
2437 mbuf_offset += cpy_len;
2438 buf_avail -= cpy_len;
2439 buf_offset += cpy_len;
2441 /* This buf reached its end, get the next one */
2442 if (buf_avail == 0) {
2443 if (++vec_idx >= nr_vec)
2444 break;
2446 buf_addr = buf_vec[vec_idx].buf_addr;
2447 buf_len = buf_vec[vec_idx].buf_len;
2449 buf_offset = 0;
2450 buf_avail = buf_len;
2452 PRINT_PACKET(dev, (uintptr_t)buf_addr,
2453 (uint32_t)buf_avail, 0);
2457 * This mbuf reached its end, get a new one
2458 * to hold more data.
2460 if (mbuf_avail == 0) {
2461 cur = rte_pktmbuf_alloc(mbuf_pool);
2462 if (unlikely(cur == NULL)) {
2463 VHOST_LOG_DATA(ERR, "Failed to "
2464 "allocate memory for mbuf.\n");
2465 error = -1;
2466 goto out;
2467 }
2469 prev->next = cur;
2470 prev->data_len = mbuf_offset;
2471 m->nb_segs += 1;
2472 m->pkt_len += mbuf_offset;
2473 prev = cur;
2475 mbuf_offset = 0;
2476 mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2480 prev->data_len = mbuf_offset;
2481 m->pkt_len += mbuf_offset;
2483 if (hdr)
2484 vhost_dequeue_offload(hdr, m);
2486 out:
2488 return error;
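/* [Editor's note] Illustrative sketch, not part of the original source:
 * small copies above are not executed immediately; they are queued into
 * vq->batch_copy_elems and drained later by do_data_copy_dequeue(), which
 * keeps the descriptor walk and the byte copying in separate tight loops.
 * The deferral policy, reduced to hypothetical types:
 */
struct example_copy_elem { void *dst; const void *src; uint32_t len; };

static inline void
example_copy_or_defer(struct example_copy_elem *q, uint16_t *nb, uint16_t cap,
		void *dst, const void *src, uint32_t len, uint32_t threshold)
{
	if (len > threshold || *nb >= cap) {
		/* large copy, or queue full: do it inline right away */
		rte_memcpy(dst, src, len);
	} else {
		/* small copy: remember it and batch it with its neighbours */
		q[*nb].dst = dst;
		q[*nb].src = src;
		q[*nb].len = len;
		(*nb)++;
	}
}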
2491 static void
2492 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2494 rte_free(opaque);
2497 static int
2498 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2500 struct rte_mbuf_ext_shared_info *shinfo = NULL;
2501 uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2502 uint16_t buf_len;
2503 rte_iova_t iova;
2504 void *buf;
2506 total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2507 total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2509 if (unlikely(total_len > UINT16_MAX))
2510 return -1;
2512 buf_len = total_len;
2513 buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2514 if (unlikely(buf == NULL))
2515 return -1;
2517 /* Initialize shinfo */
2518 shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2519 virtio_dev_extbuf_free, buf);
2520 if (unlikely(shinfo == NULL)) {
2521 rte_free(buf);
2522 VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2523 return -1;
2524 }
2526 iova = rte_malloc_virt2iova(buf);
2527 rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2528 rte_pktmbuf_reset_headroom(pkt);
2530 return 0;
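/* [Editor's note] Illustrative sketch, not part of the original source:
 * the helper above carves one rte_malloc() region into payload plus the
 * rte_mbuf_ext_shared_info that tracks its refcount, then attaches it as
 * the mbuf's external buffer. The same pattern in minimal form, reusing
 * virtio_dev_extbuf_free() from above (buf_len must cover the payload
 * plus sizeof(*shinfo)):
 */
static inline int
example_attach_extbuf(struct rte_mbuf *m, uint16_t buf_len)
{
	struct rte_mbuf_ext_shared_info *shinfo;
	void *buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);

	if (buf == NULL)
		return -1;

	/* places shinfo at the buffer tail and shrinks buf_len accordingly */
	shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
			virtio_dev_extbuf_free, buf);
	if (shinfo == NULL) {
		rte_free(buf);
		return -1;
	}

	rte_pktmbuf_attach_extbuf(m, buf, rte_malloc_virt2iova(buf),
			buf_len, shinfo);
	rte_pktmbuf_reset_headroom(m);
	return 0;
}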
2533 static __rte_always_inline int
2534 virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2535 uint32_t data_len)
2537 if (rte_pktmbuf_tailroom(pkt) >= data_len)
2538 return 0;
2540 /* attach an external buffer if supported */
2541 if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2542 return 0;
2544 /* check if chained buffers are allowed */
2545 if (!dev->linearbuf)
2546 return 0;
2548 return -1;
2552 * Allocate a host-supported pktmbuf.
2554 static __rte_always_inline struct rte_mbuf *
2555 virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
2556 uint32_t data_len)
2558 struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
2560 if (unlikely(pkt == NULL)) {
2561 VHOST_LOG_DATA(ERR,
2562 "Failed to allocate memory for mbuf.\n");
2563 return NULL;
2564 }
2566 if (virtio_dev_pktmbuf_prep(dev, pkt, data_len)) {
2567 /* Data doesn't fit into the buffer and the host supports
2568 * only linear buffers
2569 */
2570 rte_pktmbuf_free(pkt);
2571 return NULL;
2572 }
2574 return pkt;
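/* [Editor's note] Illustrative sketch, not part of the original source:
 * virtio_dev_pktmbuf_prep() above tries three strategies in order, and
 * virtio_dev_pktmbuf_alloc() simply wraps it around rte_pktmbuf_alloc().
 * The decision ladder, spelled out with hypothetical inputs:
 */
static inline int
example_prep_policy(bool fits_tailroom, bool extbuf_ok, bool linearbuf_only)
{
	if (fits_tailroom)
		return 0;	/* use the mbuf's own data room as-is */
	if (extbuf_ok)
		return 1;	/* attach one large external buffer */
	if (!linearbuf_only)
		return 2;	/* chain extra mbufs while copying */
	return -1;		/* no fit: the caller drops the packet */
}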
2577 static __rte_noinline uint16_t
2578 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2579 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
2581 uint16_t i;
2582 uint16_t free_entries;
2583 uint16_t dropped = 0;
2584 static bool allocerr_warned;
2587 * The ordering between avail index and
2588 * desc reads needs to be enforced.
2590 free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2591 vq->last_avail_idx;
2592 if (free_entries == 0)
2593 return 0;
2595 rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2597 VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2599 count = RTE_MIN(count, MAX_PKT_BURST);
2600 count = RTE_MIN(count, free_entries);
2601 VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2602 dev->vid, count);
2604 for (i = 0; i < count; i++) {
2605 struct buf_vector buf_vec[BUF_VECTOR_MAX];
2606 uint16_t head_idx;
2607 uint32_t buf_len;
2608 uint16_t nr_vec = 0;
2609 int err;
2611 if (unlikely(fill_vec_buf_split(dev, vq,
2612 vq->last_avail_idx + i,
2613 &nr_vec, buf_vec,
2614 &head_idx, &buf_len,
2615 VHOST_ACCESS_RO) < 0))
2616 break;
2618 update_shadow_used_ring_split(vq, head_idx, 0);
2620 pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
2621 if (unlikely(pkts[i] == NULL)) {
2623 * mbuf allocation fails for jumbo packets when external
2624 * buffer allocation is not allowed and linear buffer
2625 * is required. Drop this packet.
2627 if (!allocerr_warned) {
2628 VHOST_LOG_DATA(ERR,
2629 "Failed mbuf alloc of size %d from %s on %s.\n",
2630 buf_len, mbuf_pool->name, dev->ifname);
2631 allocerr_warned = true;
2632 }
2633 dropped += 1;
2634 i++;
2635 break;
2636 }
2638 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2639 mbuf_pool);
2640 if (unlikely(err)) {
2641 rte_pktmbuf_free(pkts[i]);
2642 if (!allocerr_warned) {
2643 VHOST_LOG_DATA(ERR,
2644 "Failed to copy desc to mbuf on %s.\n",
2645 dev->ifname);
2646 allocerr_warned = true;
2647 }
2648 dropped += 1;
2649 i++;
2650 break;
2651 }
2654 vq->last_avail_idx += i;
2656 do_data_copy_dequeue(vq);
2657 if (unlikely(i < count))
2658 vq->shadow_used_idx = i;
2659 if (likely(vq->shadow_used_idx)) {
2660 flush_shadow_used_ring_split(dev, vq);
2661 vhost_vring_call_split(dev, vq);
2664 return (i - dropped);
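/* [Editor's note] Illustrative sketch, not part of the original source:
 * the __ATOMIC_ACQUIRE load of avail->idx above pairs with the guest's
 * release store, so once the new index is observed, the ring entries
 * published before it are visible as well. The pattern in isolation:
 */
static inline uint16_t
example_avail_entries(const uint16_t *avail_idx, uint16_t last_avail_idx)
{
	/* acquire: reads of ring entries cannot be hoisted above this load */
	uint16_t cur = __atomic_load_n(avail_idx, __ATOMIC_ACQUIRE);

	/* unsigned 16-bit subtraction handles index wrap-around for free */
	return (uint16_t)(cur - last_avail_idx);
}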
2667 static __rte_always_inline int
2668 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2669 struct vhost_virtqueue *vq,
2670 struct rte_mbuf **pkts,
2671 uint16_t avail_idx,
2672 uintptr_t *desc_addrs,
2673 uint16_t *ids)
2675 bool wrap = vq->avail_wrap_counter;
2676 struct vring_packed_desc *descs = vq->desc_packed;
2677 uint64_t lens[PACKED_BATCH_SIZE];
2678 uint64_t buf_lens[PACKED_BATCH_SIZE];
2679 uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2680 uint16_t flags, i;
2682 if (unlikely(avail_idx & PACKED_BATCH_MASK))
2683 return -1;
2684 if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
2685 return -1;
2687 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2688 flags = descs[avail_idx + i].flags;
2689 if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
2690 (wrap == !!(flags & VRING_DESC_F_USED)) ||
2691 (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
2692 return -1;
2695 rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
2697 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2698 lens[i] = descs[avail_idx + i].len;
2700 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2701 desc_addrs[i] = vhost_iova_to_vva(dev, vq,
2702 descs[avail_idx + i].addr,
2703 &lens[i], VHOST_ACCESS_RW);
2706 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2707 if (unlikely(!desc_addrs[i]))
2708 return -1;
2709 if (unlikely((lens[i] != descs[avail_idx + i].len)))
2710 return -1;
2713 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2714 if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
2715 goto err;
2718 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2719 buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
2721 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2722 if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
2723 goto err;
2726 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2727 pkts[i]->pkt_len = lens[i] - buf_offset;
2728 pkts[i]->data_len = pkts[i]->pkt_len;
2729 ids[i] = descs[avail_idx + i].id;
2732 return 0;
2734 err:
2735 return -1;
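/* [Editor's note] Illustrative sketch, not part of the original source:
 * a packed-ring descriptor is available when its AVAIL flag matches the
 * driver's wrap counter and its USED flag does not; the batch loop above
 * rejects the whole batch if any descriptor fails this test (or demands
 * single-descriptor handling). As a single-descriptor predicate:
 */
static inline bool
example_packed_desc_is_avail(uint16_t flags, bool wrap_counter)
{
	bool avail = !!(flags & VRING_DESC_F_AVAIL);
	bool used = !!(flags & VRING_DESC_F_USED);

	return (avail == wrap_counter) && (used != wrap_counter);
}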
2738 static __rte_always_inline int
2739 virtio_dev_tx_batch_packed(struct virtio_net *dev,
2740 struct vhost_virtqueue *vq,
2741 struct rte_mbuf **pkts)
2743 uint16_t avail_idx = vq->last_avail_idx;
2744 uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2745 struct virtio_net_hdr *hdr;
2746 uintptr_t desc_addrs[PACKED_BATCH_SIZE];
2747 uint16_t ids[PACKED_BATCH_SIZE];
2748 uint16_t i;
2750 if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
2751 desc_addrs, ids))
2752 return -1;
2754 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2755 rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
2757 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2758 rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
2759 (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
2760 pkts[i]->pkt_len);
2762 if (virtio_net_with_host_offload(dev)) {
2763 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2764 hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
2765 vhost_dequeue_offload(hdr, pkts[i]);
2769 if (virtio_net_is_inorder(dev))
2770 vhost_shadow_dequeue_batch_packed_inorder(vq,
2771 ids[PACKED_BATCH_SIZE - 1]);
2772 else
2773 vhost_shadow_dequeue_batch_packed(dev, vq, ids);
2775 vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
2777 return 0;
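/* [Editor's note] Illustrative sketch, not part of the original source:
 * vhost_for_each_try_unroll above is a fixed-trip-count loop the compiler
 * can fully unroll; PACKED_BATCH_SIZE is sized so one batch of 16-byte
 * packed descriptors fills a cache line. A plain-C approximation of such
 * a batch copy, with EXAMPLE_BATCH standing in for the real macro:
 */
#define EXAMPLE_BATCH 4

static inline void
example_copy_batch(void *dst[EXAMPLE_BATCH], const void *src[EXAMPLE_BATCH],
		const uint32_t len[EXAMPLE_BATCH])
{
	uint16_t i;

	/* trip count is a compile-time constant, so this can unroll fully */
	for (i = 0; i < EXAMPLE_BATCH; i++)
		rte_memcpy(dst[i], src[i], len[i]);
}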
2780 static __rte_always_inline int
2781 vhost_dequeue_single_packed(struct virtio_net *dev,
2782 struct vhost_virtqueue *vq,
2783 struct rte_mempool *mbuf_pool,
2784 struct rte_mbuf *pkts,
2785 uint16_t *buf_id,
2786 uint16_t *desc_count)
2788 struct buf_vector buf_vec[BUF_VECTOR_MAX];
2789 uint32_t buf_len;
2790 uint16_t nr_vec = 0;
2791 int err;
2792 static bool allocerr_warned;
2794 if (unlikely(fill_vec_buf_packed(dev, vq,
2795 vq->last_avail_idx, desc_count,
2796 buf_vec, &nr_vec,
2797 buf_id, &buf_len,
2798 VHOST_ACCESS_RO) < 0))
2799 return -1;
2801 if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
2802 if (!allocerr_warned) {
2804 "Failed mbuf alloc of size %d from %s on %s.\n",
2805 buf_len, mbuf_pool->name, dev->ifname);
2806 allocerr_warned = true;
2807 }
2808 return -1;
2809 }
2811 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
2812 mbuf_pool);
2813 if (unlikely(err)) {
2814 if (!allocerr_warned) {
2816 "Failed to copy desc to mbuf on %s.\n",
2818 allocerr_warned = true;
2819 }
2820 return -1;
2821 }
2823 return 0;
2826 static __rte_always_inline int
2827 virtio_dev_tx_single_packed(struct virtio_net *dev,
2828 struct vhost_virtqueue *vq,
2829 struct rte_mempool *mbuf_pool,
2830 struct rte_mbuf *pkts)
2833 uint16_t buf_id, desc_count = 0;
2834 int ret;
2836 ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
2837 &desc_count);
2839 if (likely(desc_count > 0)) {
2840 if (virtio_net_is_inorder(dev))
2841 vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
2842 desc_count);
2843 else
2844 vhost_shadow_dequeue_single_packed(vq, buf_id,
2845 desc_count);
2847 vq_inc_last_avail_packed(vq, desc_count);
2848 }
2850 return ret;
2853 static __rte_noinline uint16_t
2854 virtio_dev_tx_packed(struct virtio_net *dev,
2855 struct vhost_virtqueue *__rte_restrict vq,
2856 struct rte_mempool *mbuf_pool,
2857 struct rte_mbuf **__rte_restrict pkts,
2858 uint32_t count)
2860 uint32_t pkt_idx = 0;
2862 if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2863 return 0;
2865 do {
2866 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
2868 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
2869 if (!virtio_dev_tx_batch_packed(dev, vq,
2870 &pkts[pkt_idx])) {
2871 pkt_idx += PACKED_BATCH_SIZE;
2872 continue;
2873 }
2874 }
2876 if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
2877 pkts[pkt_idx]))
2878 break;
2879 pkt_idx++;
2880 } while (pkt_idx < count);
2882 if (pkt_idx != count)
2883 rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
2885 if (vq->shadow_used_idx) {
2886 do_data_copy_dequeue(vq);
2888 vhost_flush_dequeue_shadow_packed(dev, vq);
2889 vhost_vring_call_packed(dev, vq);
2890 }
2892 return pkt_idx;
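/* [Editor's note] Illustrative sketch, not part of the original source:
 * the loop above prefers the unrolled batch path and falls back to
 * single-descriptor processing when fewer than PACKED_BATCH_SIZE packets
 * remain or the batch preconditions fail. The control flow in isolation,
 * with hypothetical callbacks:
 */
static inline uint32_t
example_drain_ring(uint32_t count, uint32_t batch_size,
		int (*try_batch)(void *), int (*do_single)(void *), void *ctx)
{
	uint32_t done = 0;

	do {
		if (count - done >= batch_size && try_batch(ctx) == 0) {
			done += batch_size;	/* whole batch consumed */
			continue;
		}
		if (do_single(ctx) != 0)
			break;			/* ring empty (or error) */
		done++;
	} while (done < count);

	return done;
}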
2895 uint16_t
2896 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
2897 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
2899 struct virtio_net *dev;
2900 struct rte_mbuf *rarp_mbuf = NULL;
2901 struct vhost_virtqueue *vq;
2902 int16_t success = 1;
2904 dev = get_device(vid);
2905 if (!dev)
2906 return 0;
2908 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2910 "(%d) %s: built-in vhost net backend is disabled.\n",
2911 dev->vid, __func__);
2912 return 0;
2913 }
2915 if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
2917 "(%d) %s: invalid virtqueue idx %d.\n",
2918 dev->vid, __func__, queue_id);
2919 return 0;
2920 }
2922 vq = dev->virtqueue[queue_id];
2924 if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
2925 return 0;
2927 if (unlikely(!vq->enabled)) {
2928 count = 0;
2929 goto out_access_unlock;
2930 }
2932 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2933 vhost_user_iotlb_rd_lock(vq);
2935 if (unlikely(!vq->access_ok))
2936 if (unlikely(vring_translate(dev, vq) < 0)) {
2937 count = 0;
2938 goto out;
2939 }
2942 * Construct a RARP broadcast packet, and inject it to the "pkts"
2943 * array, so that it looks like the guest actually sent such a packet.
2945 * Check user_send_rarp() for more information.
2947 * broadcast_rarp shares a cacheline in the virtio_net structure
2948 * with some fields that are accessed during enqueue and
2949 * __atomic_compare_exchange_n causes a write if it performs the
2950 * compare and exchange. This could result in false sharing between
2951 * enqueue and dequeue.
2953 * Prevent unnecessary false sharing by reading broadcast_rarp first
2954 * and only performing compare and exchange if the read indicates it
2955 * is likely to be set.
2957 if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
2958 __atomic_compare_exchange_n(&dev->broadcast_rarp,
2959 &success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
2961 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
2962 if (rarp_mbuf == NULL) {
2963 VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
2964 count = 0;
2965 goto out;
2966 }
2967 count -= 1;
2968 }
2970 if (vq_is_packed(dev))
2971 count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
2973 count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
2975 out:
2976 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2977 vhost_user_iotlb_rd_unlock(vq);
2979 out_access_unlock:
2980 rte_spinlock_unlock(&vq->access_lock);
2982 if (unlikely(rarp_mbuf != NULL)) {
2984 * Inject it at the head of the "pkts" array, so that the switch's
2985 * MAC learning table gets updated first.
2987 memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
2988 pkts[0] = rarp_mbuf;
2989 count += 1;
2990 }
2992 return count;
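/* [Editor's note] Illustrative sketch, not part of the original source:
 * typical application-side use of rte_vhost_dequeue_burst(). Guest TX
 * vrings sit at odd indices (cf. is_valid_virt_queue_idx()), so the first
 * queue pair's TX ring is index 1:
 */
static inline uint16_t
example_poll_guest_tx(int vid, struct rte_mempool *mp,
		struct rte_mbuf **pkts, uint16_t burst)
{
	uint16_t n = rte_vhost_dequeue_burst(vid, 1, mp, pkts, burst);

	/* On return, pkts[0..n-1] hold guest-sent frames with guest offload
	 * requests already translated into PKT_TX_* ol_flags; forward them,
	 * or drop them with rte_pktmbuf_free_bulk(pkts, n). */
	return n;
}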