/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_virtio_net.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>

#include "vhost-net.h"
#define MAX_PKT_BURST 32
#define VHOST_LOG_PAGE 4096
static inline void __attribute__((always_inline))
vhost_log_page(uint8_t *log_base, uint64_t page)
{
	log_base[page / 8] |= 1 << (page % 8);
}
static inline void __attribute__((always_inline))
vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
{
	uint64_t page;

	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
		   !dev->log_base || !len))
		return;

	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
		return;

	/* To make sure guest memory updates are committed before logging */
	rte_smp_wmb();

	page = addr / VHOST_LOG_PAGE;
	while (page * VHOST_LOG_PAGE < addr + len) {
		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
		page += 1;
	}
}
static inline void __attribute__((always_inline))
vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		     uint64_t offset, uint64_t len)
{
	vhost_log_write(dev, vq->log_guest_addr + offset, len);
}
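/*
 * Worked example (added for clarity, not in the original source): the dirty
 * log is a bitmap with one bit per VHOST_LOG_PAGE (4 KiB) page of guest
 * memory.  For a write at guest physical address 0x12345 of length 1:
 *
 *	page = 0x12345 / 4096 = 18
 *	log_base[18 / 8] |= 1 << (18 % 8);   ->   log_base[2] |= 0x04;
 *
 * so byte 2, bit 2 of the log is set and the migration code knows it must
 * re-send that page.  The numbers above are illustrative only.
 */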
static inline int __attribute__((always_inline))
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
{
	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
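/*
 * Example of the index layout checked above (illustrative, assuming the
 * usual VIRTIO_RXQ = 0 / VIRTIO_TXQ = 1 convention): with qp_nb = 2 queue
 * pairs the valid virtqueue indexes are 0..3, where even indexes (0, 2) are
 * RX rings and odd indexes (1, 3) are TX rings.  A call with is_tx = 1 and
 * idx = 2 therefore fails the (is_tx ^ (idx & 1)) == 0 test.
 */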
static void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
	memset(net_hdr, 0, sizeof(struct virtio_net_hdr));

	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;

		switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
		case PKT_TX_TCP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct tcp_hdr,
						cksum));
			break;
		case PKT_TX_UDP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct udp_hdr,
						dgram_cksum));
			break;
		case PKT_TX_SCTP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct sctp_hdr,
						cksum));
			break;
		}
	}

	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
		if (m_buf->ol_flags & PKT_TX_IPV4)
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		net_hdr->gso_size = m_buf->tso_segsz;
		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
			+ m_buf->l4_len;
	}
}
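/*
 * Example of the mapping performed above (illustrative values only,
 * assuming the mbuf also requests TCP checksum offload): for a TSO'd
 * IPv4/TCP mbuf with l2_len = 14, l3_len = 20, l4_len = 20 and
 * tso_segsz = 1448, the guest-visible header becomes
 *
 *	flags      = VIRTIO_NET_HDR_F_NEEDS_CSUM
 *	csum_start = 34, csum_offset = offsetof(struct tcp_hdr, cksum) = 16
 *	gso_type   = VIRTIO_NET_HDR_GSO_TCPV4
 *	gso_size   = 1448, hdr_len = 54
 *
 * i.e. the checksum and segmentation work is handed over to the guest's
 * virtio driver instead of being done here.
 */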
/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that are successfully
 * added to the RX queue. This function works when the mbuf is scattered, but
 * it doesn't support the mergeable feature.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff, *first_buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_addr = 0;
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t head_idx, packet_success = 0;
	uint16_t avail_idx, res_cur_idx;
	uint16_t res_base_idx, res_end_idx;
	uint16_t free_entries;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
	/*
	 * As many data cores may want access to available buffers,
	 * they need to be reserved.
	 */
	do {
		res_base_idx = vq->last_used_idx_res;
		avail_idx = *((volatile uint16_t *)&vq->avail->idx);

		free_entries = (avail_idx - res_base_idx);
		/* check that we have enough buffers */
		if (unlikely(count > free_entries))
			count = free_entries;

		if (count == 0)
			return 0;

		res_end_idx = res_base_idx + count;
		/* vq->last_used_idx_res is atomically updated. */
		/* TODO: Allow to disable cmpset if no concurrency in application. */
		success = rte_atomic16_cmpset(&vq->last_used_idx_res,
				res_base_idx, res_end_idx);
	} while (unlikely(success == 0));
	res_cur_idx = res_base_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
			dev->device_fh, res_cur_idx, res_end_idx);

	/* Prefetch available ring to retrieve indexes. */
	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
					(vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	while (res_cur_idx != res_end_idx) {
		uint32_t offset = 0, vb_offset = 0;
		uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
		uint8_t hdr = 0, uncompleted_pkt = 0;
		uint16_t idx;

		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		first_buff = buff;

		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/* Copy virtio_hdr to packet and increment buffer address */
		buff_hdr_addr = buff_addr;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if ((desc->flags & VRING_DESC_F_NEXT) &&
			(desc->len == vq->vhost_hlen)) {
			desc = &vq->desc[desc->next];
			/* Buffer address translation. */
			buff_addr = gpa_to_vva(dev, desc->addr);
		} else {
			vb_offset += vq->vhost_hlen;
			hdr = 1;
		}

		pkt_len = rte_pktmbuf_pkt_len(buff);
		data_len = rte_pktmbuf_data_len(buff);
		len_to_cpy = RTE_MIN(data_len,
			hdr ? desc->len - vq->vhost_hlen : desc->len);
		while (total_copied < pkt_len) {
			/* Copy mbuf data to buffer */
			rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
				rte_pktmbuf_mtod_offset(buff, const void *, offset),
				len_to_cpy);
			PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
				len_to_cpy, 0);

			offset += len_to_cpy;
			vb_offset += len_to_cpy;
			total_copied += len_to_cpy;

			/* The whole packet completes */
			if (total_copied == pkt_len)
				break;

			/* The current segment completes */
			if (offset == data_len) {
				buff = buff->next;
				offset = 0;
				data_len = rte_pktmbuf_data_len(buff);
			}

			/* The current vring descriptor is done */
			if (vb_offset == desc->len) {
				if (desc->flags & VRING_DESC_F_NEXT) {
					desc = &vq->desc[desc->next];
					buff_addr = gpa_to_vva(dev, desc->addr);
					vb_offset = 0;
				} else {
					/* Room in vring buffer is not enough */
					uncompleted_pkt = 1;
					break;
				}
			}

			len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
		}
		/* Update used ring with desc information */
		idx = res_cur_idx & (vq->size - 1);
		vq->used->ring[idx].id = head[packet_success];

		/* Drop the packet if it is uncompleted */
		if (unlikely(uncompleted_pkt == 1))
			vq->used->ring[idx].len = vq->vhost_hlen;
		else
			vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;

		vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[idx]),
			sizeof(vq->used->ring[idx]));

		res_cur_idx++;
		packet_success++;

		if (unlikely(uncompleted_pkt == 1))
			continue;

		virtio_enqueue_offload(first_buff, &virtio_hdr.hdr);

		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (res_cur_idx < res_end_idx) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	/* Wait until it's our turn to add our buffer to the used ring. */
	while (unlikely(vq->last_used_idx != res_base_idx))
		rte_pause();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx = res_end_idx;
	vhost_log_used_vring(dev, vq,
		offsetof(struct vring_used, idx),
		sizeof(vq->used->idx));

	/* flush used->idx update before we read avail->flags. */
	rte_mb();

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
	return count;
}
static inline uint32_t __attribute__((always_inline))
copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
	uint16_t res_base_idx, uint16_t res_end_idx,
	struct rte_mbuf *pkt)
{
	uint32_t vec_idx = 0;
	uint32_t entry_success = 0;
	struct vhost_virtqueue *vq;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
		{0, 0, 0, 0, 0, 0}, 0};
	uint16_t cur_idx = res_base_idx;
	uint64_t vb_addr = 0;
	uint64_t vb_hdr_addr = 0;
	uint32_t seg_offset = 0;
	uint32_t vb_offset = 0;
	uint32_t seg_avail;
	uint32_t vb_avail;
	uint32_t cpy_len, entry_len;
	uint16_t idx;

	if (pkt == NULL)
		return 0;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
		"End Index %d\n",
		dev->device_fh, cur_idx, res_end_idx);

	/*
	 * Convert from gpa to vva
	 * (guest physical addr -> vhost virtual addr)
	 */
	vq = dev->virtqueue[queue_id];

	vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
	vb_hdr_addr = vb_addr;

	/* Prefetch buffer address. */
	rte_prefetch0((void *)(uintptr_t)vb_addr);

	virtio_hdr.num_buffers = res_end_idx - res_base_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
		dev->device_fh, virtio_hdr.num_buffers);

	virtio_enqueue_offload(pkt, &virtio_hdr.hdr);

	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
		(const void *)&virtio_hdr, vq->vhost_hlen);

	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);

	seg_avail = rte_pktmbuf_data_len(pkt);
	vb_offset = vq->vhost_hlen;
	vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;

	entry_len = vq->vhost_hlen;
	if (vb_avail == 0) {
		uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;

		if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
			idx = cur_idx & (vq->size - 1);

			/* Update used ring with desc information */
			vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
			vq->used->ring[idx].len = entry_len;

			vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[idx]),
				sizeof(vq->used->ring[idx]));

			entry_len = 0;
			cur_idx++;
			entry_success++;
		}

		vec_idx++;
		vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);
		vb_offset = 0;
		vb_avail = vq->buf_vec[vec_idx].buf_len;
	}

	cpy_len = RTE_MIN(vb_avail, seg_avail);
	while (cpy_len > 0) {
		/* Copy mbuf data to vring buffer */
		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
			rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
			cpy_len);

		PRINT_PACKET(dev,
			(uintptr_t)(vb_addr + vb_offset),
			cpy_len, 0);

		seg_offset += cpy_len;
		vb_offset += cpy_len;
		seg_avail -= cpy_len;
		vb_avail -= cpy_len;
		entry_len += cpy_len;

		if (seg_avail != 0) {
			/*
			 * The virtio buffer in this vring entry has been
			 * used up, but the mbuf segment is not finished yet.
			 */
			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
				VRING_DESC_F_NEXT) == 0) {
				/* Update used ring with desc information */
				idx = cur_idx & (vq->size - 1);
				vq->used->ring[idx].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[idx].len = entry_len;
				vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));
				entry_len = 0;
				cur_idx++;
				entry_success++;
			}

			vec_idx++;
			vb_addr = gpa_to_vva(dev,
				vq->buf_vec[vec_idx].buf_addr);
			vb_offset = 0;
			vb_avail = vq->buf_vec[vec_idx].buf_len;
			cpy_len = RTE_MIN(vb_avail, seg_avail);
		} else {
			/*
			 * The current mbuf segment is complete; check
			 * whether the whole packet is complete as well.
			 */
			pkt = pkt->next;
			if (pkt != NULL) {
				/*
				 * There are more segments.
				 */
				if (vb_avail == 0) {
					/*
					 * The current buffer from the vring is
					 * used up; fetch the next buffer
					 * from buf_vec.
					 */
					uint32_t desc_idx =
						vq->buf_vec[vec_idx].desc_idx;

					if ((vq->desc[desc_idx].flags &
						VRING_DESC_F_NEXT) == 0) {
						idx = cur_idx & (vq->size - 1);
						/*
						 * Update used ring with the
						 * descriptor information
						 */
						vq->used->ring[idx].id
							= desc_idx;
						vq->used->ring[idx].len
							= entry_len;
						vhost_log_used_vring(dev, vq,
							offsetof(struct vring_used, ring[idx]),
							sizeof(vq->used->ring[idx]));
						entry_len = 0;
						cur_idx++;
						entry_success++;
					}

					/* Get next buffer from buf_vec. */
					vec_idx++;
					vb_addr = gpa_to_vva(dev,
						vq->buf_vec[vec_idx].buf_addr);
					vb_avail =
						vq->buf_vec[vec_idx].buf_len;
					vb_offset = 0;
				}

				seg_offset = 0;
				seg_avail = rte_pktmbuf_data_len(pkt);
				cpy_len = RTE_MIN(vb_avail, seg_avail);
			} else {
				/*
				 * This whole packet completes.
				 */
				/* Update used ring with desc information */
				idx = cur_idx & (vq->size - 1);
				vq->used->ring[idx].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[idx].len = entry_len;
				vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));
				entry_success++;
				break;
			}
		}
	}

	return entry_success;
}
static inline void __attribute__((always_inline))
update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
	uint32_t *secure_len, uint32_t *vec_idx)
{
	uint16_t wrapped_idx = id & (vq->size - 1);
	uint32_t idx = vq->avail->ring[wrapped_idx];
	uint8_t next_desc;
	uint32_t len = *secure_len;
	uint32_t vec_id = *vec_idx;

	do {
		next_desc = 0;
		len += vq->desc[idx].len;
		vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
		vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
		vq->buf_vec[vec_id].desc_idx = idx;
		vec_id++;

		if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
			idx = vq->desc[idx].next;
			next_desc = 1;
		}
	} while (next_desc);

	*secure_len = len;
	*vec_idx = vec_id;
}
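/*
 * Illustrative example of what the helper above produces (hypothetical
 * descriptor sizes, not taken from this file): if the available entry at
 * index `id` points to a two-descriptor chain of 1518 and 2048 bytes,
 * update_secure_len() appends two entries to vq->buf_vec, advances *vec_idx
 * by 2 and grows *secure_len by 3566, so the caller can keep walking the
 * avail ring until *secure_len covers the packet plus the virtio header.
 */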
/*
 * This function works for mergeable RX.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	uint32_t pkt_idx = 0, entry_success = 0;
	uint16_t avail_idx;
	uint16_t res_base_idx, res_cur_idx;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
		dev->device_fh);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
	if (count == 0)
		return 0;
	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;

		do {
			/*
			 * As many data cores may want access to available
			 * buffers, they need to be reserved.
			 */
			uint32_t secure_len = 0;
			uint32_t vec_idx = 0;

			res_base_idx = vq->last_used_idx_res;
			res_cur_idx = res_base_idx;

			do {
				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
				if (unlikely(res_cur_idx == avail_idx))
					goto merge_rx_exit;

				update_secure_len(vq, res_cur_idx,
					&secure_len, &vec_idx);
				res_cur_idx++;
			} while (pkt_len > secure_len);

			/* vq->last_used_idx_res is atomically updated. */
			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
					res_base_idx, res_cur_idx);
		} while (success == 0);

		entry_success = copy_from_mbuf_to_vring(dev, queue_id,
			res_base_idx, res_cur_idx, pkts[pkt_idx]);
		rte_compiler_barrier();

		/*
		 * Wait until it's our turn to add our buffer
		 * to the used ring.
		 */
		while (unlikely(vq->last_used_idx != res_base_idx))
			rte_pause();

		*(volatile uint16_t *)&vq->used->idx += entry_success;
		vq->last_used_idx = res_cur_idx;
	}

merge_rx_exit:
	if (likely(pkt_idx)) {
		/* flush used->idx update before we read avail->flags. */
		rte_mb();

		/* Kick the guest if necessary. */
		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
			eventfd_write(vq->callfd, (eventfd_t)1);
	}

	return pkt_idx;
}
uint16_t
rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count)
{
	if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
	else
		return virtio_dev_rx(dev, queue_id, pkts, count);
}
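/*
 * Usage sketch (added for illustration, not part of the original file): a
 * vhost switch core typically pairs this call with an RX burst from a
 * physical port.  The port id, queue ids and free-on-shortfall policy below
 * are assumptions made for the example only.
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t nb_rx, nb_enq;
 *
 *	nb_rx = rte_eth_rx_burst(port_id, 0, pkts, MAX_PKT_BURST);
 *	nb_enq = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, nb_rx);
 *	while (nb_enq < nb_rx)
 *		rte_pktmbuf_free(pkts[nb_enq++]);
 *
 * The caller keeps ownership of the mbufs: the data has already been copied
 * into the guest's descriptors by the time this function returns.
 */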
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
	struct ipv4_hdr *ipv4_hdr;
	struct ipv6_hdr *ipv6_hdr;
	void *l3_hdr = NULL;
	struct ether_hdr *eth_hdr;
	uint16_t ethertype;

	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	m->l2_len = sizeof(struct ether_hdr);
	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);

	if (ethertype == ETHER_TYPE_VLAN) {
		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

		m->l2_len += sizeof(struct vlan_hdr);
		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
	}

	l3_hdr = (char *)eth_hdr + m->l2_len;

	switch (ethertype) {
	case ETHER_TYPE_IPv4:
		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
		*l4_proto = ipv4_hdr->next_proto_id;
		m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV4;
		break;
	case ETHER_TYPE_IPv6:
		ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
		*l4_proto = ipv6_hdr->proto;
		m->l3_len = sizeof(struct ipv6_hdr);
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV6;
		break;
	default:
		m->l3_len = 0;
		*l4_proto = 0;
		break;
	}
}
static inline void __attribute__((always_inline))
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
	uint16_t l4_proto = 0;
	void *l4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;

	parse_ethernet(m, &l4_proto, &l4_hdr);
	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
			switch (hdr->csum_offset) {
			case (offsetof(struct tcp_hdr, cksum)):
				if (l4_proto == IPPROTO_TCP)
					m->ol_flags |= PKT_TX_TCP_CKSUM;
				break;
			case (offsetof(struct udp_hdr, dgram_cksum)):
				if (l4_proto == IPPROTO_UDP)
					m->ol_flags |= PKT_TX_UDP_CKSUM;
				break;
			case (offsetof(struct sctp_hdr, cksum)):
				if (l4_proto == IPPROTO_SCTP)
					m->ol_flags |= PKT_TX_SCTP_CKSUM;
				break;
			default:
				break;
			}
		}
	}

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
		case VIRTIO_NET_HDR_GSO_TCPV6:
			tcp_hdr = (struct tcp_hdr *)l4_hdr;
			m->ol_flags |= PKT_TX_TCP_SEG;
			m->tso_segsz = hdr->gso_size;
			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
			break;
		default:
			RTE_LOG(WARNING, VHOST_DATA,
				"unsupported gso type %u.\n", hdr->gso_type);
			break;
		}
	}
}
uint16_t
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct rte_mbuf *m, *prev;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t vb_addr = 0;
	uint64_t vb_net_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i;
	uint16_t free_entries, entry_success = 0;
	uint16_t avail_idx;
	struct virtio_net_hdr *hdr = NULL;

	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx == avail_idx)
		return 0;

	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
		dev->device_fh);
	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

	/* Get the number of free entries in the ring */
	free_entries = (avail_idx - vq->last_used_idx);

	free_entries = RTE_MIN(free_entries, count);
	/* Limit to MAX_PKT_BURST. */
	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
			dev->device_fh, free_entries);
	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[entry_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
	while (entry_success < free_entries) {
		uint32_t vb_avail, vb_offset;
		uint32_t seg_avail, seg_offset;
		uint32_t cpy_len;
		uint32_t seg_num = 0;
		struct rte_mbuf *cur;
		uint8_t alloc_err = 0;

		desc = &vq->desc[head[entry_success]];

		vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
		hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);

		/* Discard first buffer as it is the virtio header */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			vb_offset = 0;
			vb_avail = desc->len;
		} else {
			vb_offset = vq->vhost_hlen;
			vb_avail = desc->len - vb_offset;
		}

		/* Buffer address translation. */
		vb_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);

		used_idx = vq->last_used_idx & (vq->size - 1);

		if (entry_success < (free_entries - 1)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[entry_success+1]]);
			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
		}

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = head[entry_success];
		vq->used->ring[used_idx].len = 0;
		vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[used_idx]),
				sizeof(vq->used->ring[used_idx]));

		/* Allocate an mbuf and populate the structure. */
		m = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(m == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			break;
		}
		seg_offset = 0;
		seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
		cpy_len = RTE_MIN(vb_avail, seg_avail);

		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);

		seg_num++;
		cur = m;
		prev = m;
		while (cpy_len != 0) {
			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
				(void *)((uintptr_t)(vb_addr + vb_offset)),
				cpy_len);

			seg_offset += cpy_len;
			vb_offset += cpy_len;
			vb_avail -= cpy_len;
			seg_avail -= cpy_len;

			if (vb_avail != 0) {
				/*
				 * The current mbuf segment has reached its
				 * end, while the virtio buffer in the TX vring
				 * still has more data to be copied.
				 */
				cur->data_len = seg_offset;
				m->pkt_len += seg_offset;
				/* Allocate an mbuf and populate the structure. */
				cur = rte_pktmbuf_alloc(mbuf_pool);
				if (unlikely(cur == NULL)) {
					RTE_LOG(ERR, VHOST_DATA, "Failed to "
						"allocate memory for mbuf.\n");
					rte_pktmbuf_free(m);
					alloc_err = 1;
					break;
				}

				seg_num++;
				prev->next = cur;
				prev = cur;
				seg_offset = 0;
				seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
			} else {
				if (desc->flags & VRING_DESC_F_NEXT) {
					/*
					 * There are more virtio buffers in
					 * the same vring entry that need to
					 * be copied.
					 */
					if (seg_avail == 0) {
						/*
						 * The current segment hasn't
						 * room to accommodate more
						 * data.
						 */
						cur->data_len = seg_offset;
						m->pkt_len += seg_offset;
						/*
						 * Allocate an mbuf and
						 * populate the structure.
						 */
						cur = rte_pktmbuf_alloc(mbuf_pool);
						if (unlikely(cur == NULL)) {
							RTE_LOG(ERR, VHOST_DATA,
								"Failed to allocate memory for mbuf\n");
							rte_pktmbuf_free(m);
							alloc_err = 1;
							break;
						}
						seg_num++;
						prev->next = cur;
						prev = cur;
						seg_offset = 0;
						seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
					}

					desc = &vq->desc[desc->next];

					/* Buffer address translation. */
					vb_addr = gpa_to_vva(dev, desc->addr);
					/* Prefetch buffer address. */
					rte_prefetch0((void *)(uintptr_t)vb_addr);
					vb_offset = 0;
					vb_avail = desc->len;

					PRINT_PACKET(dev, (uintptr_t)vb_addr,
						desc->len, 0);
				} else {
					/* The whole packet completes. */
					cur->data_len = seg_offset;
					m->pkt_len += seg_offset;
					vb_avail = 0;
				}
			}

			cpy_len = RTE_MIN(vb_avail, seg_avail);
		}
		if (unlikely(alloc_err == 1))
			break;

		m->nb_segs = seg_num;
		if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
			vhost_dequeue_offload(hdr, m);

		pkts[entry_success] = m;
		vq->last_used_idx++;
		entry_success++;
	}

	rte_compiler_barrier();
	vq->used->idx += entry_success;
	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
			sizeof(vq->used->idx));
	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
	return entry_success;
}
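/*
 * Usage sketch (added for illustration, not part of the original file): the
 * switching core drains the guest's TX ring and forwards the packets to a
 * physical port.  The port id, queue ids and mempool name are assumptions
 * made for the example only.
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t nb_deq, nb_tx;
 *
 *	nb_deq = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
 *			pkts, MAX_PKT_BURST);
 *	nb_tx = rte_eth_tx_burst(port_id, 0, pkts, nb_deq);
 *	while (nb_tx < nb_deq)
 *		rte_pktmbuf_free(pkts[nb_tx++]);
 *
 * Each dequeued packet is a freshly allocated mbuf chain copied out of the
 * guest's descriptors, so the caller is responsible for freeing whatever it
 * does not transmit.
 */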