1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2016 Intel Corporation
7 #include <linux/virtio_net.h>
10 #include <rte_memcpy.h>
11 #include <rte_ether.h>
13 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19 #include <rte_malloc.h>
24 #define MAX_PKT_BURST 32
26 #define MAX_BATCH_LEN 256
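/*
 * MAX_PKT_BURST bounds the number of packets handled per burst call.
 * MAX_BATCH_LEN is the copy-length threshold: copies no longer than this
 * are deferred into vq->batch_copy_elems and performed together later by
 * do_data_copy_enqueue()/do_data_copy_dequeue().
 */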
29 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
31 return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
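/*
 * Virtqueue indices alternate between RX and TX from the guest's point of
 * view: even indices are the guest's RX queues (the host enqueue path,
 * is_tx == 0) and odd indices are the guest's TX queues (the host dequeue
 * path, is_tx == 1). The XOR in the check above accepts an index only when
 * its parity matches is_tx.
 */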
34 static __rte_always_inline struct vring_desc *
35 alloc_copy_ind_table(struct virtio_net *dev, struct vhost_virtqueue *vq,
36 struct vring_desc *desc)
38 struct vring_desc *idesc;
40 uint64_t len, remain = desc->len;
41 uint64_t desc_addr = desc->addr;
43 idesc = rte_malloc(__func__, desc->len, 0);
47 dst = (uint64_t)(uintptr_t)idesc;
51 src = vhost_iova_to_vva(dev, vq, desc_addr, &len,
53 if (unlikely(!src || !len)) {
58 rte_memcpy((void *)(uintptr_t)dst, (void *)(uintptr_t)src, len);
68 static __rte_always_inline void
69 free_ind_table(struct vring_desc *idesc)
74 static __rte_always_inline void
75 do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
76 uint16_t to, uint16_t from, uint16_t size)
78 rte_memcpy(&vq->used->ring[to],
79 &vq->shadow_used_ring[from],
80 size * sizeof(struct vring_used_elem));
81 vhost_log_used_vring(dev, vq,
82 offsetof(struct vring_used, ring[to]),
83 size * sizeof(struct vring_used_elem));
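/*
 * The shadow used ring batches used-ring updates on the mergeable RX path:
 * entries are first accumulated in vq->shadow_used_ring and then written to
 * the real used ring in at most two contiguous copies (to handle ring
 * wrap-around), which reduces the number of writes to the used ring shared
 * with the guest.
 */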
86 static __rte_always_inline void
87 flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
89 uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
91 if (used_idx + vq->shadow_used_idx <= vq->size) {
92 do_flush_shadow_used_ring(dev, vq, used_idx, 0,
97 /* update used ring interval [used_idx, vq->size] */
98 size = vq->size - used_idx;
99 do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);
/* update the remaining used ring interval [0, left_size] */
102 do_flush_shadow_used_ring(dev, vq, 0, size,
103 vq->shadow_used_idx - size);
105 vq->last_used_idx += vq->shadow_used_idx;
109 *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
110 vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
111 sizeof(vq->used->idx));
114 static __rte_always_inline void
115 update_shadow_used_ring(struct vhost_virtqueue *vq,
116 uint16_t desc_idx, uint16_t len)
118 uint16_t i = vq->shadow_used_idx++;
120 vq->shadow_used_ring[i].id = desc_idx;
121 vq->shadow_used_ring[i].len = len;
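/*
 * Flush the per-virtqueue batch-copy list: small copies recorded while
 * walking the descriptor chains are performed here in one pass. The enqueue
 * variant also logs the written guest ranges for live migration.
 */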
125 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
127 struct batch_copy_elem *elem = vq->batch_copy_elems;
128 uint16_t count = vq->batch_copy_nb_elems;
131 for (i = 0; i < count; i++) {
132 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
133 vhost_log_write(dev, elem[i].log_addr, elem[i].len);
134 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
139 do_data_copy_dequeue(struct vhost_virtqueue *vq)
141 struct batch_copy_elem *elem = vq->batch_copy_elems;
142 uint16_t count = vq->batch_copy_nb_elems;
145 for (i = 0; i < count; i++)
146 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
/* Avoid write operations when they are not necessary, to lessen cache issues. */
#define ASSIGN_UNLESS_EQUAL(var, val) do { \
	if ((var) != (val)) \
		(var) = (val); \
} while (0)
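/*
 * A minimal illustration (hypothetical helper, not part of the original
 * code): the virtio-net header written below lives in guest memory, so
 * clearing a field that is already zero would needlessly dirty a shared
 * cacheline; the macro only stores when the value actually changes.
 */
static __rte_always_inline void
example_clear_offload_hdr(struct virtio_net_hdr *net_hdr)
{
	ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
	ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
	ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
}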
156 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
158 uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
160 if (m_buf->ol_flags & PKT_TX_TCP_SEG)
161 csum_l4 |= PKT_TX_TCP_CKSUM;
164 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
165 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
168 case PKT_TX_TCP_CKSUM:
169 net_hdr->csum_offset = (offsetof(struct tcp_hdr,
172 case PKT_TX_UDP_CKSUM:
173 net_hdr->csum_offset = (offsetof(struct udp_hdr,
176 case PKT_TX_SCTP_CKSUM:
177 net_hdr->csum_offset = (offsetof(struct sctp_hdr,
182 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
183 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
184 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
/* IP cksum verification cannot be bypassed, so calculate it here. */
188 if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
189 struct ipv4_hdr *ipv4_hdr;
191 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
193 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
196 if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
197 if (m_buf->ol_flags & PKT_TX_IPV4)
198 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
200 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
201 net_hdr->gso_size = m_buf->tso_segsz;
202 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
204 } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
205 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
206 net_hdr->gso_size = m_buf->tso_segsz;
207 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
210 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
211 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
212 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
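/*
 * Copy a single mbuf chain into one descriptor chain (the non-mergeable
 * path): the virtio-net header is written first, then the packet data,
 * with every write to guest memory logged for live migration.
 */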
216 static __rte_always_inline int
217 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
218 struct vring_desc *descs, struct rte_mbuf *m,
219 uint16_t desc_idx, uint32_t size)
221 uint32_t desc_avail, desc_offset;
222 uint32_t mbuf_avail, mbuf_offset;
224 uint64_t desc_chunck_len;
225 struct vring_desc *desc;
226 uint64_t desc_addr, desc_gaddr;
/* A counter to avoid a dead loop in the desc chain */
228 uint16_t nr_desc = 1;
229 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
230 uint16_t copy_nb = vq->batch_copy_nb_elems;
233 desc = &descs[desc_idx];
234 desc_chunck_len = desc->len;
235 desc_gaddr = desc->addr;
236 desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
237 &desc_chunck_len, VHOST_ACCESS_RW);
 * The check of 'desc_addr' is placed outside the 'unlikely' macro to avoid
 * a performance issue with some versions of gcc (4.8.4 and 5.3.0), which
 * otherwise store the offset on the stack instead of in a register.
243 if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr) {
248 rte_prefetch0((void *)(uintptr_t)desc_addr);
250 if (likely(desc_chunck_len >= dev->vhost_hlen)) {
251 virtio_enqueue_offload(m,
252 (struct virtio_net_hdr *)(uintptr_t)desc_addr);
253 PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
254 vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
256 struct virtio_net_hdr vnet_hdr;
257 uint64_t remain = dev->vhost_hlen;
259 uint64_t src = (uint64_t)(uintptr_t)&vnet_hdr, dst;
260 uint64_t guest_addr = desc_gaddr;
262 virtio_enqueue_offload(m, &vnet_hdr);
266 dst = vhost_iova_to_vva(dev, vq, guest_addr,
267 &len, VHOST_ACCESS_RW);
268 if (unlikely(!dst || !len)) {
273 rte_memcpy((void *)(uintptr_t)dst,
274 (void *)(uintptr_t)src, len);
276 PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
277 vhost_log_write(dev, guest_addr, len);
284 desc_avail = desc->len - dev->vhost_hlen;
285 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
286 desc_chunck_len = desc_avail;
287 desc_gaddr = desc->addr + dev->vhost_hlen;
288 desc_addr = vhost_iova_to_vva(dev,
292 if (unlikely(!desc_addr)) {
299 desc_offset = dev->vhost_hlen;
300 desc_chunck_len -= dev->vhost_hlen;
303 mbuf_avail = rte_pktmbuf_data_len(m);
305 while (mbuf_avail != 0 || m->next != NULL) {
306 /* done with current mbuf, fetch next */
307 if (mbuf_avail == 0) {
311 mbuf_avail = rte_pktmbuf_data_len(m);
314 /* done with current desc buf, fetch next */
315 if (desc_avail == 0) {
316 if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
/* Not enough room in the vring buffer */
321 if (unlikely(desc->next >= size || ++nr_desc > size)) {
326 desc = &descs[desc->next];
327 desc_chunck_len = desc->len;
328 desc_gaddr = desc->addr;
329 desc_addr = vhost_iova_to_vva(dev, vq, desc_gaddr,
332 if (unlikely(!desc_addr)) {
338 desc_avail = desc->len;
339 } else if (unlikely(desc_chunck_len == 0)) {
340 desc_chunck_len = desc_avail;
341 desc_gaddr += desc_offset;
342 desc_addr = vhost_iova_to_vva(dev,
344 &desc_chunck_len, VHOST_ACCESS_RW);
345 if (unlikely(!desc_addr)) {
352 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
353 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
354 rte_memcpy((void *)((uintptr_t)(desc_addr +
356 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
358 vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
359 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
362 batch_copy[copy_nb].dst =
363 (void *)((uintptr_t)(desc_addr + desc_offset));
364 batch_copy[copy_nb].src =
365 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
366 batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
367 batch_copy[copy_nb].len = cpy_len;
371 mbuf_avail -= cpy_len;
372 mbuf_offset += cpy_len;
373 desc_avail -= cpy_len;
374 desc_offset += cpy_len;
375 desc_chunck_len -= cpy_len;
379 vq->batch_copy_nb_elems = copy_nb;
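/*
 * Note on the chunked copies above: vhost_iova_to_vva() may return a
 * host-virtual region shorter than the requested descriptor length (the
 * guest buffer is not guaranteed to be contiguous in process VA space), so
 * desc_chunck_len tracks the bytes left in the current contiguous chunk and
 * the address is re-translated whenever it reaches zero.
 */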
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. The
 * number of packets successfully added to the RX queue is returned. This
 * function handles scattered (multi-segment) mbufs, but it does not support
 * the mergeable RX buffers feature.
391 static __rte_always_inline uint32_t
392 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
393 struct rte_mbuf **pkts, uint32_t count)
395 struct vhost_virtqueue *vq;
396 uint16_t avail_idx, free_entries, start_idx;
397 uint16_t desc_indexes[MAX_PKT_BURST];
398 struct vring_desc *descs;
402 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
403 if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
404 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
405 dev->vid, __func__, queue_id);
409 vq = dev->virtqueue[queue_id];
411 rte_spinlock_lock(&vq->access_lock);
413 if (unlikely(vq->enabled == 0))
414 goto out_access_unlock;
416 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
417 vhost_user_iotlb_rd_lock(vq);
419 if (unlikely(vq->access_ok == 0)) {
420 if (unlikely(vring_translate(dev, vq) < 0)) {
426 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
427 start_idx = vq->last_used_idx;
428 free_entries = avail_idx - start_idx;
429 count = RTE_MIN(count, free_entries);
430 count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
434 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
435 dev->vid, start_idx, start_idx + count);
437 vq->batch_copy_nb_elems = 0;
439 /* Retrieve all of the desc indexes first to avoid caching issues. */
440 rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
441 for (i = 0; i < count; i++) {
442 used_idx = (start_idx + i) & (vq->size - 1);
443 desc_indexes[i] = vq->avail->ring[used_idx];
444 vq->used->ring[used_idx].id = desc_indexes[i];
445 vq->used->ring[used_idx].len = pkts[i]->pkt_len +
447 vhost_log_used_vring(dev, vq,
448 offsetof(struct vring_used, ring[used_idx]),
449 sizeof(vq->used->ring[used_idx]));
452 rte_prefetch0(&vq->desc[desc_indexes[0]]);
453 for (i = 0; i < count; i++) {
454 struct vring_desc *idesc = NULL;
455 uint16_t desc_idx = desc_indexes[i];
458 if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
459 uint64_t dlen = vq->desc[desc_idx].len;
460 descs = (struct vring_desc *)(uintptr_t)
461 vhost_iova_to_vva(dev,
462 vq, vq->desc[desc_idx].addr,
463 &dlen, VHOST_ACCESS_RO);
464 if (unlikely(!descs)) {
469 if (unlikely(dlen < vq->desc[desc_idx].len)) {
 * The indirect desc table is not contiguous
 * in process VA space, so we have to copy it.
474 idesc = alloc_copy_ind_table(dev, vq,
475 &vq->desc[desc_idx]);
476 if (unlikely(!idesc))
483 sz = vq->desc[desc_idx].len / sizeof(*descs);
489 err = copy_mbuf_to_desc(dev, vq, descs, pkts[i], desc_idx, sz);
492 free_ind_table(idesc);
497 rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
499 if (unlikely(!!idesc))
500 free_ind_table(idesc);
503 do_data_copy_enqueue(dev, vq);
507 *(volatile uint16_t *)&vq->used->idx += count;
508 vq->last_used_idx += count;
509 vhost_log_used_vring(dev, vq,
510 offsetof(struct vring_used, idx),
511 sizeof(vq->used->idx));
513 vhost_vring_call(dev, vq);
515 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
516 vhost_user_iotlb_rd_unlock(vq);
519 rte_spinlock_unlock(&vq->access_lock);
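/*
 * Collect the descriptor chain that starts at the given avail ring entry
 * into buf_vec[], following indirect tables and VRING_DESC_F_NEXT links,
 * and report the chain head index and total buffer length to the caller.
 */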
524 static __rte_always_inline int
525 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
526 uint32_t avail_idx, uint32_t *vec_idx,
527 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
528 uint16_t *desc_chain_len)
530 uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
531 uint32_t vec_id = *vec_idx;
534 struct vring_desc *descs = vq->desc;
535 struct vring_desc *idesc = NULL;
537 *desc_chain_head = idx;
539 if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
540 dlen = vq->desc[idx].len;
541 descs = (struct vring_desc *)(uintptr_t)
542 vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
545 if (unlikely(!descs))
548 if (unlikely(dlen < vq->desc[idx].len)) {
 * The indirect desc table is not contiguous
 * in process VA space, so we have to copy it.
553 idesc = alloc_copy_ind_table(dev, vq, &vq->desc[idx]);
554 if (unlikely(!idesc))
564 if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size)) {
565 free_ind_table(idesc);
569 len += descs[idx].len;
570 buf_vec[vec_id].buf_addr = descs[idx].addr;
571 buf_vec[vec_id].buf_len = descs[idx].len;
572 buf_vec[vec_id].desc_idx = idx;
575 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
578 idx = descs[idx].next;
581 *desc_chain_len = len;
584 if (unlikely(!!idesc))
585 free_ind_table(idesc);
 * Returns -1 on failure, 0 on success.
594 reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
595 uint32_t size, struct buf_vector *buf_vec,
596 uint16_t *num_buffers, uint16_t avail_head)
599 uint32_t vec_idx = 0;
602 uint16_t head_idx = 0;
606 cur_idx = vq->last_avail_idx;
609 if (unlikely(cur_idx == avail_head))
612 if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
613 &head_idx, &len) < 0))
615 len = RTE_MIN(len, size);
616 update_shadow_used_ring(vq, head_idx, len);
 * If we have tried all available ring items and still
 * cannot get enough buffers, it means something abnormal
628 if (unlikely(tries >= vq->size))
635 static __rte_always_inline int
636 copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
637 struct rte_mbuf *m, struct buf_vector *buf_vec,
638 uint16_t num_buffers)
640 uint32_t vec_idx = 0;
641 uint64_t desc_addr, desc_gaddr;
642 uint32_t mbuf_offset, mbuf_avail;
643 uint32_t desc_offset, desc_avail;
645 uint64_t desc_chunck_len;
646 uint64_t hdr_addr, hdr_phys_addr;
647 struct rte_mbuf *hdr_mbuf;
648 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
649 struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
650 uint16_t copy_nb = vq->batch_copy_nb_elems;
653 if (unlikely(m == NULL)) {
658 desc_chunck_len = buf_vec[vec_idx].buf_len;
659 desc_gaddr = buf_vec[vec_idx].buf_addr;
660 desc_addr = vhost_iova_to_vva(dev, vq,
664 if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
670 hdr_addr = desc_addr;
671 if (unlikely(desc_chunck_len < dev->vhost_hlen))
674 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
675 hdr_phys_addr = desc_gaddr;
676 rte_prefetch0((void *)(uintptr_t)hdr_addr);
678 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
679 dev->vid, num_buffers);
681 desc_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
682 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
683 desc_chunck_len = desc_avail;
684 desc_gaddr += dev->vhost_hlen;
685 desc_addr = vhost_iova_to_vva(dev, vq,
689 if (unlikely(!desc_addr)) {
696 desc_offset = dev->vhost_hlen;
697 desc_chunck_len -= dev->vhost_hlen;
701 mbuf_avail = rte_pktmbuf_data_len(m);
703 while (mbuf_avail != 0 || m->next != NULL) {
704 /* done with current desc buf, get the next one */
705 if (desc_avail == 0) {
707 desc_chunck_len = buf_vec[vec_idx].buf_len;
708 desc_gaddr = buf_vec[vec_idx].buf_addr;
710 vhost_iova_to_vva(dev, vq,
714 if (unlikely(!desc_addr)) {
719 /* Prefetch buffer address. */
720 rte_prefetch0((void *)(uintptr_t)desc_addr);
722 desc_avail = buf_vec[vec_idx].buf_len;
723 } else if (unlikely(desc_chunck_len == 0)) {
724 desc_chunck_len = desc_avail;
725 desc_gaddr += desc_offset;
726 desc_addr = vhost_iova_to_vva(dev, vq,
728 &desc_chunck_len, VHOST_ACCESS_RW);
729 if (unlikely(!desc_addr)) {
736 /* done with current mbuf, get the next one */
737 if (mbuf_avail == 0) {
741 mbuf_avail = rte_pktmbuf_data_len(m);
745 virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
746 ASSIGN_UNLESS_EQUAL(hdr->num_buffers, num_buffers);
748 if (unlikely(hdr == &tmp_hdr)) {
750 uint64_t remain = dev->vhost_hlen;
751 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
752 uint64_t guest_addr = hdr_phys_addr;
756 dst = vhost_iova_to_vva(dev, vq,
759 if (unlikely(!dst || !len)) {
764 rte_memcpy((void *)(uintptr_t)dst,
765 (void *)(uintptr_t)src,
768 PRINT_PACKET(dev, (uintptr_t)dst,
770 vhost_log_write(dev, guest_addr, len);
777 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
779 vhost_log_write(dev, hdr_phys_addr,
786 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
788 if (likely(cpy_len > MAX_BATCH_LEN || copy_nb >= vq->size)) {
789 rte_memcpy((void *)((uintptr_t)(desc_addr +
791 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
793 vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
794 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
797 batch_copy[copy_nb].dst =
798 (void *)((uintptr_t)(desc_addr + desc_offset));
799 batch_copy[copy_nb].src =
800 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
801 batch_copy[copy_nb].log_addr = desc_gaddr + desc_offset;
802 batch_copy[copy_nb].len = cpy_len;
806 mbuf_avail -= cpy_len;
807 mbuf_offset += cpy_len;
808 desc_avail -= cpy_len;
809 desc_offset += cpy_len;
810 desc_chunck_len -= cpy_len;
814 vq->batch_copy_nb_elems = copy_nb;
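/*
 * Mergeable RX path: each packet may be spread across several descriptor
 * chains reserved via reserve_avail_buf_mergeable(), and the number of
 * buffers actually used is written into the virtio_net_hdr_mrg_rxbuf header
 * so the guest can reassemble the packet.
 */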
819 static __rte_always_inline uint32_t
820 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
821 struct rte_mbuf **pkts, uint32_t count)
823 struct vhost_virtqueue *vq;
824 uint32_t pkt_idx = 0;
825 uint16_t num_buffers;
826 struct buf_vector buf_vec[BUF_VECTOR_MAX];
829 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
830 if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
831 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
832 dev->vid, __func__, queue_id);
836 vq = dev->virtqueue[queue_id];
838 rte_spinlock_lock(&vq->access_lock);
840 if (unlikely(vq->enabled == 0))
841 goto out_access_unlock;
843 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
844 vhost_user_iotlb_rd_lock(vq);
846 if (unlikely(vq->access_ok == 0))
847 if (unlikely(vring_translate(dev, vq) < 0))
850 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
854 vq->batch_copy_nb_elems = 0;
856 rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
858 vq->shadow_used_idx = 0;
859 avail_head = *((volatile uint16_t *)&vq->avail->idx);
860 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
861 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
863 if (unlikely(reserve_avail_buf_mergeable(dev, vq,
864 pkt_len, buf_vec, &num_buffers,
866 VHOST_LOG_DEBUG(VHOST_DATA,
867 "(%d) failed to get enough desc from vring\n",
869 vq->shadow_used_idx -= num_buffers;
873 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
874 dev->vid, vq->last_avail_idx,
875 vq->last_avail_idx + num_buffers);
877 if (copy_mbuf_to_desc_mergeable(dev, vq, pkts[pkt_idx],
878 buf_vec, num_buffers) < 0) {
879 vq->shadow_used_idx -= num_buffers;
883 vq->last_avail_idx += num_buffers;
886 do_data_copy_enqueue(dev, vq);
888 if (likely(vq->shadow_used_idx)) {
889 flush_shadow_used_ring(dev, vq);
890 vhost_vring_call(dev, vq);
894 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
895 vhost_user_iotlb_rd_unlock(vq);
898 rte_spinlock_unlock(&vq->access_lock);
904 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
905 struct rte_mbuf **pkts, uint16_t count)
907 struct virtio_net *dev = get_device(vid);
912 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
913 RTE_LOG(ERR, VHOST_DATA,
914 "(%d) %s: built-in vhost net backend is disabled.\n",
919 if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
920 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
922 return virtio_dev_rx(dev, queue_id, pkts, count);
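/*
 * Illustrative usage sketch (not part of the original file; the helper name
 * is an assumption): a forwarding application enqueues a burst into the
 * guest's RX virtqueue (queue index 0). The data is copied into guest
 * buffers, so the caller still owns, and must free, all of its mbufs.
 */
static inline uint16_t
example_enqueue_to_guest(int vid, struct rte_mbuf **pkts, uint16_t count)
{
	uint16_t nb_enq, i;

	nb_enq = rte_vhost_enqueue_burst(vid, 0, pkts, count);

	/* Free the mbufs regardless of how many were accepted. */
	for (i = 0; i < count; i++)
		rte_pktmbuf_free(pkts[i]);

	return nb_enq;
}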
926 virtio_net_with_host_offload(struct virtio_net *dev)
929 ((1ULL << VIRTIO_NET_F_CSUM) |
930 (1ULL << VIRTIO_NET_F_HOST_ECN) |
931 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
932 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
933 (1ULL << VIRTIO_NET_F_HOST_UFO)))
940 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
942 struct ipv4_hdr *ipv4_hdr;
943 struct ipv6_hdr *ipv6_hdr;
945 struct ether_hdr *eth_hdr;
948 eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
950 m->l2_len = sizeof(struct ether_hdr);
951 ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
953 if (ethertype == ETHER_TYPE_VLAN) {
954 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
956 m->l2_len += sizeof(struct vlan_hdr);
957 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
960 l3_hdr = (char *)eth_hdr + m->l2_len;
963 case ETHER_TYPE_IPv4:
965 *l4_proto = ipv4_hdr->next_proto_id;
966 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
967 *l4_hdr = (char *)l3_hdr + m->l3_len;
968 m->ol_flags |= PKT_TX_IPV4;
970 case ETHER_TYPE_IPv6:
972 *l4_proto = ipv6_hdr->proto;
973 m->l3_len = sizeof(struct ipv6_hdr);
974 *l4_hdr = (char *)l3_hdr + m->l3_len;
975 m->ol_flags |= PKT_TX_IPV6;
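/*
 * Translate the virtio-net header received from the guest into mbuf offload
 * metadata: NEEDS_CSUM turns into the matching PKT_TX_*_CKSUM flag, and a
 * GSO type turns into PKT_TX_TCP_SEG/PKT_TX_UDP_SEG with tso_segsz and
 * l4_len filled in, so a later transmit path can honour the requested
 * offloads.
 */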
985 static __rte_always_inline void
986 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
988 uint16_t l4_proto = 0;
990 struct tcp_hdr *tcp_hdr = NULL;
992 if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
995 parse_ethernet(m, &l4_proto, &l4_hdr);
996 if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
997 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
998 switch (hdr->csum_offset) {
999 case (offsetof(struct tcp_hdr, cksum)):
1000 if (l4_proto == IPPROTO_TCP)
1001 m->ol_flags |= PKT_TX_TCP_CKSUM;
1003 case (offsetof(struct udp_hdr, dgram_cksum)):
1004 if (l4_proto == IPPROTO_UDP)
1005 m->ol_flags |= PKT_TX_UDP_CKSUM;
1007 case (offsetof(struct sctp_hdr, cksum)):
1008 if (l4_proto == IPPROTO_SCTP)
1009 m->ol_flags |= PKT_TX_SCTP_CKSUM;
1017 if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1018 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1019 case VIRTIO_NET_HDR_GSO_TCPV4:
1020 case VIRTIO_NET_HDR_GSO_TCPV6:
1022 m->ol_flags |= PKT_TX_TCP_SEG;
1023 m->tso_segsz = hdr->gso_size;
1024 m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1026 case VIRTIO_NET_HDR_GSO_UDP:
1027 m->ol_flags |= PKT_TX_UDP_SEG;
1028 m->tso_segsz = hdr->gso_size;
1029 m->l4_len = sizeof(struct udp_hdr);
1032 RTE_LOG(WARNING, VHOST_DATA,
1033 "unsupported gso type %u.\n", hdr->gso_type);
1039 static __rte_always_inline void
1040 put_zmbuf(struct zcopy_mbuf *zmbuf)
1045 static __rte_always_inline int
1046 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1047 struct vring_desc *descs, uint16_t max_desc,
1048 struct rte_mbuf *m, uint16_t desc_idx,
1049 struct rte_mempool *mbuf_pool)
1051 struct vring_desc *desc;
1052 uint64_t desc_addr, desc_gaddr;
1053 uint32_t desc_avail, desc_offset;
1054 uint32_t mbuf_avail, mbuf_offset;
1056 uint64_t desc_chunck_len;
1057 struct rte_mbuf *cur = m, *prev = m;
1058 struct virtio_net_hdr tmp_hdr;
1059 struct virtio_net_hdr *hdr = NULL;
/* A counter to avoid a dead loop in the desc chain */
1061 uint32_t nr_desc = 1;
1062 struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1063 uint16_t copy_nb = vq->batch_copy_nb_elems;
1066 desc = &descs[desc_idx];
1067 if (unlikely((desc->len < dev->vhost_hlen)) ||
1068 (desc->flags & VRING_DESC_F_INDIRECT)) {
1073 desc_chunck_len = desc->len;
1074 desc_gaddr = desc->addr;
1075 desc_addr = vhost_iova_to_vva(dev,
1079 if (unlikely(!desc_addr)) {
1084 if (virtio_net_with_host_offload(dev)) {
1085 if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
1086 uint64_t len = desc_chunck_len;
1087 uint64_t remain = sizeof(struct virtio_net_hdr);
1088 uint64_t src = desc_addr;
1089 uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1090 uint64_t guest_addr = desc_gaddr;
1093 * No luck, the virtio-net header doesn't fit
1094 * in a contiguous virtual area.
1098 src = vhost_iova_to_vva(dev, vq,
1101 if (unlikely(!src || !len)) {
1106 rte_memcpy((void *)(uintptr_t)dst,
1107 (void *)(uintptr_t)src, len);
1116 hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
 * A virtio driver normally uses at least 2 desc buffers
 * for Tx: the first for storing the header, and the others
 * for storing the data.
1126 if (likely((desc->len == dev->vhost_hlen) &&
1127 (desc->flags & VRING_DESC_F_NEXT) != 0)) {
1128 desc = &descs[desc->next];
1129 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1134 desc_chunck_len = desc->len;
1135 desc_gaddr = desc->addr;
1136 desc_addr = vhost_iova_to_vva(dev,
1140 if (unlikely(!desc_addr)) {
1146 desc_avail = desc->len;
1149 desc_avail = desc->len - dev->vhost_hlen;
1151 if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
1152 desc_chunck_len = desc_avail;
1153 desc_gaddr += dev->vhost_hlen;
1154 desc_addr = vhost_iova_to_vva(dev,
1158 if (unlikely(!desc_addr)) {
1165 desc_offset = dev->vhost_hlen;
1166 desc_chunck_len -= dev->vhost_hlen;
1170 rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
1172 PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
1173 (uint32_t)desc_chunck_len, 0);
1176 mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
1180 cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
 * A desc buf might span two host physical pages that are
 * not contiguous. In that case (gpa_to_hpa returns 0), the data
 * will be copied even though zero copy is enabled.
1187 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1188 desc_gaddr + desc_offset, cpy_len)))) {
1189 cur->data_len = cpy_len;
1191 cur->buf_addr = (void *)(uintptr_t)(desc_addr
1193 cur->buf_iova = hpa;
 * In zero copy mode, one mbuf can only reference data
 * for one desc buf, or part of one.
1199 mbuf_avail = cpy_len;
1201 if (likely(cpy_len > MAX_BATCH_LEN ||
1202 copy_nb >= vq->size ||
1203 (hdr && cur == m) ||
1204 desc->len != desc_chunck_len)) {
1205 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1207 (void *)((uintptr_t)(desc_addr +
1211 batch_copy[copy_nb].dst =
1212 rte_pktmbuf_mtod_offset(cur, void *,
1214 batch_copy[copy_nb].src =
1215 (void *)((uintptr_t)(desc_addr +
1217 batch_copy[copy_nb].len = cpy_len;
1222 mbuf_avail -= cpy_len;
1223 mbuf_offset += cpy_len;
1224 desc_avail -= cpy_len;
1225 desc_chunck_len -= cpy_len;
1226 desc_offset += cpy_len;
/* This desc reaches its end, get the next one */
1229 if (desc_avail == 0) {
1230 if ((desc->flags & VRING_DESC_F_NEXT) == 0)
1233 if (unlikely(desc->next >= max_desc ||
1234 ++nr_desc > max_desc)) {
1238 desc = &descs[desc->next];
1239 if (unlikely(desc->flags & VRING_DESC_F_INDIRECT)) {
1244 desc_chunck_len = desc->len;
1245 desc_gaddr = desc->addr;
1246 desc_addr = vhost_iova_to_vva(dev,
1250 if (unlikely(!desc_addr)) {
1255 rte_prefetch0((void *)(uintptr_t)desc_addr);
1258 desc_avail = desc->len;
1260 PRINT_PACKET(dev, (uintptr_t)desc_addr,
1261 (uint32_t)desc_chunck_len, 0);
1262 } else if (unlikely(desc_chunck_len == 0)) {
1263 desc_chunck_len = desc_avail;
1264 desc_gaddr += desc_offset;
1265 desc_addr = vhost_iova_to_vva(dev, vq,
1269 if (unlikely(!desc_addr)) {
1275 PRINT_PACKET(dev, (uintptr_t)desc_addr,
1276 (uint32_t)desc_chunck_len, 0);
 * This mbuf reaches its end, get a new one
 * to hold more data.
1283 if (mbuf_avail == 0) {
1284 cur = rte_pktmbuf_alloc(mbuf_pool);
1285 if (unlikely(cur == NULL)) {
1286 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1287 "allocate memory for mbuf.\n");
1291 if (unlikely(dev->dequeue_zero_copy))
1292 rte_mbuf_refcnt_update(cur, 1);
1295 prev->data_len = mbuf_offset;
1297 m->pkt_len += mbuf_offset;
1301 mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1305 prev->data_len = mbuf_offset;
1306 m->pkt_len += mbuf_offset;
1309 vhost_dequeue_offload(hdr, m);
1312 vq->batch_copy_nb_elems = copy_nb;
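/*
 * In dequeue zero-copy mode the code above attaches the mbuf directly to
 * the guest buffer (buf_addr/buf_iova point into guest memory) instead of
 * copying, and bumps the mbuf reference count; the corresponding used ring
 * entry is only returned to the guest later, once mbuf_is_consumed() shows
 * the application has released the mbuf.
 */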
1317 static __rte_always_inline void
1318 update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
1319 uint32_t used_idx, uint32_t desc_idx)
1321 vq->used->ring[used_idx].id = desc_idx;
1322 vq->used->ring[used_idx].len = 0;
1323 vhost_log_used_vring(dev, vq,
1324 offsetof(struct vring_used, ring[used_idx]),
1325 sizeof(vq->used->ring[used_idx]));
1328 static __rte_always_inline void
1329 update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
1332 if (unlikely(count == 0))
1338 vq->used->idx += count;
1339 vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
1340 sizeof(vq->used->idx));
1341 vhost_vring_call(dev, vq);
1344 static __rte_always_inline struct zcopy_mbuf *
1345 get_zmbuf(struct vhost_virtqueue *vq)
1351 /* search [last_zmbuf_idx, zmbuf_size) */
1352 i = vq->last_zmbuf_idx;
1353 last = vq->zmbuf_size;
1356 for (; i < last; i++) {
1357 if (vq->zmbufs[i].in_use == 0) {
1358 vq->last_zmbuf_idx = i + 1;
1359 vq->zmbufs[i].in_use = 1;
1360 return &vq->zmbufs[i];
1366 /* search [0, last_zmbuf_idx) */
1368 last = vq->last_zmbuf_idx;
1375 static __rte_always_inline bool
1376 mbuf_is_consumed(struct rte_mbuf *m)
1379 if (rte_mbuf_refcnt_read(m) > 1)
1387 static __rte_always_inline void
1388 restore_mbuf(struct rte_mbuf *m)
1390 uint32_t mbuf_size, priv_size;
1393 priv_size = rte_pktmbuf_priv_size(m->pool);
1394 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1395 /* start of buffer is after mbuf structure and priv data */
1397 m->buf_addr = (char *)m + mbuf_size;
1398 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
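/*
 * Illustrative usage sketch (not part of the original file; the helper name
 * is an assumption): an application drains the guest's TX virtqueue (queue
 * index 1) and takes ownership of the returned mbufs, which it must
 * eventually transmit or free.
 */
static inline void
example_drain_guest_tx(int vid, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t nb_rx, i;

	nb_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, MAX_PKT_BURST);

	/* Here the packets are simply freed; a switch would forward them. */
	for (i = 0; i < nb_rx; i++)
		rte_pktmbuf_free(pkts[i]);
}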
1404 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1405 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1407 struct virtio_net *dev;
1408 struct rte_mbuf *rarp_mbuf = NULL;
1409 struct vhost_virtqueue *vq;
1410 uint32_t desc_indexes[MAX_PKT_BURST];
1413 uint16_t free_entries;
1416 dev = get_device(vid);
1420 if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1421 RTE_LOG(ERR, VHOST_DATA,
1422 "(%d) %s: built-in vhost net backend is disabled.\n",
1423 dev->vid, __func__);
1427 if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1428 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1429 dev->vid, __func__, queue_id);
1433 vq = dev->virtqueue[queue_id];
1435 if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1438 if (unlikely(vq->enabled == 0))
1439 goto out_access_unlock;
1441 vq->batch_copy_nb_elems = 0;
1443 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1444 vhost_user_iotlb_rd_lock(vq);
1446 if (unlikely(vq->access_ok == 0))
1447 if (unlikely(vring_translate(dev, vq) < 0))
1450 if (unlikely(dev->dequeue_zero_copy)) {
1451 struct zcopy_mbuf *zmbuf, *next;
1454 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1455 zmbuf != NULL; zmbuf = next) {
1456 next = TAILQ_NEXT(zmbuf, next);
1458 if (mbuf_is_consumed(zmbuf->mbuf)) {
1459 used_idx = vq->last_used_idx++ & (vq->size - 1);
1460 update_used_ring(dev, vq, used_idx,
1464 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1465 restore_mbuf(zmbuf->mbuf);
1466 rte_pktmbuf_free(zmbuf->mbuf);
1472 update_used_idx(dev, vq, nr_updated);
 * Construct a RARP broadcast packet and inject it into the "pkts"
 * array, so it looks like the guest actually sent such a packet.
1479 * Check user_send_rarp() for more information.
1481 * broadcast_rarp shares a cacheline in the virtio_net structure
1482 * with some fields that are accessed during enqueue and
1483 * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
1484 * result in false sharing between enqueue and dequeue.
1486 * Prevent unnecessary false sharing by reading broadcast_rarp first
 * and only performing cmpset if the read indicates it is likely to be set.
1491 if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1492 rte_atomic16_cmpset((volatile uint16_t *)
1493 &dev->broadcast_rarp.cnt, 1, 0))) {
1495 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
1496 if (rarp_mbuf == NULL) {
1497 RTE_LOG(ERR, VHOST_DATA,
1498 "Failed to make RARP packet.\n");
1504 free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1506 if (free_entries == 0)
1509 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1511 /* Prefetch available and used ring */
1512 avail_idx = vq->last_avail_idx & (vq->size - 1);
1513 used_idx = vq->last_used_idx & (vq->size - 1);
1514 rte_prefetch0(&vq->avail->ring[avail_idx]);
1515 rte_prefetch0(&vq->used->ring[used_idx]);
1517 count = RTE_MIN(count, MAX_PKT_BURST);
1518 count = RTE_MIN(count, free_entries);
1519 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1522 /* Retrieve all of the head indexes first to avoid caching issues. */
1523 for (i = 0; i < count; i++) {
1524 avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
1525 used_idx = (vq->last_used_idx + i) & (vq->size - 1);
1526 desc_indexes[i] = vq->avail->ring[avail_idx];
1528 if (likely(dev->dequeue_zero_copy == 0))
1529 update_used_ring(dev, vq, used_idx, desc_indexes[i]);
1532 /* Prefetch descriptor index. */
1533 rte_prefetch0(&vq->desc[desc_indexes[0]]);
1534 for (i = 0; i < count; i++) {
1535 struct vring_desc *desc, *idesc = NULL;
1540 if (likely(i + 1 < count))
1541 rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
1543 if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
1544 dlen = vq->desc[desc_indexes[i]].len;
1545 desc = (struct vring_desc *)(uintptr_t)
1546 vhost_iova_to_vva(dev, vq,
1547 vq->desc[desc_indexes[i]].addr,
1550 if (unlikely(!desc))
1553 if (unlikely(dlen < vq->desc[desc_indexes[i]].len)) {
 * The indirect desc table is not contiguous
 * in process VA space, so we have to copy it.
1558 idesc = alloc_copy_ind_table(dev, vq,
1559 &vq->desc[desc_indexes[i]]);
1560 if (unlikely(!idesc))
1566 rte_prefetch0(desc);
1567 sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
1572 idx = desc_indexes[i];
1575 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1576 if (unlikely(pkts[i] == NULL)) {
1577 RTE_LOG(ERR, VHOST_DATA,
1578 "Failed to allocate memory for mbuf.\n");
1579 free_ind_table(idesc);
1583 err = copy_desc_to_mbuf(dev, vq, desc, sz, pkts[i], idx,
1585 if (unlikely(err)) {
1586 rte_pktmbuf_free(pkts[i]);
1587 free_ind_table(idesc);
1591 if (unlikely(dev->dequeue_zero_copy)) {
1592 struct zcopy_mbuf *zmbuf;
1594 zmbuf = get_zmbuf(vq);
1596 rte_pktmbuf_free(pkts[i]);
1597 free_ind_table(idesc);
1600 zmbuf->mbuf = pkts[i];
1601 zmbuf->desc_idx = desc_indexes[i];
 * Pin the mbuf by taking an extra reference; we will check later
 * whether the mbuf has been freed (i.e. we were the last user).
 * Only then can the used ring be updated safely.
1609 rte_mbuf_refcnt_update(pkts[i], 1);
1612 TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1615 if (unlikely(!!idesc))
1616 free_ind_table(idesc);
1618 vq->last_avail_idx += i;
1620 if (likely(dev->dequeue_zero_copy == 0)) {
1621 do_data_copy_dequeue(vq);
1622 vq->last_used_idx += i;
1623 update_used_idx(dev, vq, i);
1627 if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1628 vhost_user_iotlb_rd_unlock(vq);
1631 rte_spinlock_unlock(&vq->access_lock);
1633 if (unlikely(rarp_mbuf != NULL)) {
 * Inject it at the head of the "pkts" array, so that the switch's MAC
 * learning table gets updated first.
1638 memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
1639 pkts[0] = rarp_mbuf;