/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_virtio_net.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>

#include "vhost-net.h"

#define MAX_PKT_BURST 32
#define VHOST_LOG_PAGE 4096
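
/*
 * Dirty page logging for live migration: when VHOST_F_LOG_ALL is negotiated
 * and a log base has been supplied, every guest page the host writes is
 * marked in a bitmap, one bit per VHOST_LOG_PAGE-sized page, so the
 * migration code can track pages dirtied by vhost.
 */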
static inline void __attribute__((always_inline))
vhost_log_page(uint8_t *log_base, uint64_t page)
{
	log_base[page / 8] |= 1 << (page % 8);
}

static inline void __attribute__((always_inline))
vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
{
	uint64_t page;

	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
		   !dev->log_base || !len))
		return;

	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
		return;

	/* To make sure guest memory updates are committed before logging */
	rte_smp_wmb();

	page = addr / VHOST_LOG_PAGE;
	while (page * VHOST_LOG_PAGE < addr + len) {
		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
		page += 1;
	}
}

static inline void __attribute__((always_inline))
vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		     uint64_t offset, uint64_t len)
{
	vhost_log_write(dev, vq->log_guest_addr + offset, len);
}
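
/*
 * Virtqueues come in RX/TX pairs: even indexes are RX queues (the host
 * enqueue path) and odd indexes are TX queues (the host dequeue path).
 * A queue index is therefore valid only if its parity matches the requested
 * direction and it falls within the configured number of queue pairs.
 */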
static inline int __attribute__((always_inline))
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
{
	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
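
/*
 * Translate the offload requests carried in the mbuf (PKT_TX_* flags) into
 * the virtio_net_hdr checksum and GSO fields that the guest expects.
 */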
static void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
	memset(net_hdr, 0, sizeof(struct virtio_net_hdr));

	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;

		switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
		case PKT_TX_TCP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct tcp_hdr,
						cksum));
			break;
		case PKT_TX_UDP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct udp_hdr,
						dgram_cksum));
			break;
		case PKT_TX_SCTP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct sctp_hdr,
						cksum));
			break;
		}
	}

	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
		if (m_buf->ol_flags & PKT_TX_IPV4)
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		net_hdr->gso_size = m_buf->tso_segsz;
		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
			+ m_buf->l4_len;
	}
}

/**
 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that are successfully
 * added to the RX queue. This function works when the mbuf is scattered, but
 * it doesn't support the mergeable feature.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc, *hdr_desc;
	struct rte_mbuf *buff, *first_buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_addr = 0;
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t head_idx, packet_success = 0;
	uint16_t avail_idx, res_cur_idx;
	uint16_t res_base_idx, res_end_idx;
	uint16_t free_entries;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
	/*
	 * As many data cores may want access to available buffers,
	 * they need to be reserved.
	 */
	do {
		res_base_idx = vq->last_used_idx_res;
		avail_idx = *((volatile uint16_t *)&vq->avail->idx);

		free_entries = (avail_idx - res_base_idx);
		/* Check that we have enough buffers. */
		if (unlikely(count > free_entries))
			count = free_entries;

		if (count == 0)
			return 0;

		res_end_idx = res_base_idx + count;
		/* vq->last_used_idx_res is atomically updated. */
		/* TODO: Allow to disable cmpset if no concurrency in application. */
		success = rte_atomic16_cmpset(&vq->last_used_idx_res,
				res_base_idx, res_end_idx);
	} while (unlikely(success == 0));
	res_cur_idx = res_base_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
			dev->device_fh, res_cur_idx, res_end_idx);
	/* Prefetch available ring to retrieve indexes. */
	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
					(vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	while (res_cur_idx != res_end_idx) {
		uint32_t offset = 0, vb_offset = 0;
		uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
		uint8_t hdr = 0, uncompleted_pkt = 0;
		uint16_t idx;

		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		first_buff = buff;

		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/* Copy virtio_hdr to packet and increment buffer address */
		buff_hdr_addr = buff_addr;
		hdr_desc = desc;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if ((desc->flags & VRING_DESC_F_NEXT) &&
			(desc->len == vq->vhost_hlen)) {
			desc = &vq->desc[desc->next];
			/* Buffer address translation. */
			buff_addr = gpa_to_vva(dev, desc->addr);
		} else {
			vb_offset += vq->vhost_hlen;
			hdr = 1;
		}

		pkt_len = rte_pktmbuf_pkt_len(buff);
		data_len = rte_pktmbuf_data_len(buff);
		len_to_cpy = RTE_MIN(data_len,
			hdr ? desc->len - vq->vhost_hlen : desc->len);
		while (total_copied < pkt_len) {
			/* Copy mbuf data to buffer */
			rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
				rte_pktmbuf_mtod_offset(buff, const void *, offset),
				len_to_cpy);
			vhost_log_write(dev, desc->addr + vb_offset, len_to_cpy);
			PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
				len_to_cpy, 0);

			offset += len_to_cpy;
			vb_offset += len_to_cpy;
			total_copied += len_to_cpy;

			/* The whole packet completes */
			if (total_copied == pkt_len)
				break;

			/* The current segment completes */
			if (offset == data_len) {
				buff = buff->next;
				offset = 0;
				data_len = rte_pktmbuf_data_len(buff);
			}

			/* The current vring descriptor done */
			if (vb_offset == desc->len) {
				if (desc->flags & VRING_DESC_F_NEXT) {
					desc = &vq->desc[desc->next];
					buff_addr = gpa_to_vva(dev, desc->addr);
					vb_offset = 0;
				} else {
					/* Room in vring buffer is not enough */
					uncompleted_pkt = 1;
					break;
				}
			}
			len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
		}

		/* Update used ring with desc information */
		idx = res_cur_idx & (vq->size - 1);
		vq->used->ring[idx].id = head[packet_success];

		/* Drop the packet if it is uncompleted */
		if (unlikely(uncompleted_pkt == 1))
			vq->used->ring[idx].len = vq->vhost_hlen;
		else
			vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;

		vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[idx]),
			sizeof(vq->used->ring[idx]));

		res_cur_idx++;
		packet_success++;

		if (unlikely(uncompleted_pkt == 1))
			continue;

		virtio_enqueue_offload(first_buff, &virtio_hdr.hdr);

		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);
		vhost_log_write(dev, hdr_desc->addr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (res_cur_idx < res_end_idx) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}
	rte_compiler_barrier();

	/* Wait until it's our turn to add our buffer to the used ring. */
	while (unlikely(vq->last_used_idx != res_base_idx))
		rte_pause();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx = res_end_idx;
	vhost_log_used_vring(dev, vq,
		offsetof(struct vring_used, idx),
		sizeof(vq->used->idx));

	/* flush used->idx update before we read avail->flags. */
	rte_mb();

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return count;
}
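
/*
 * Copy one mbuf chain into the guest buffers previously collected in
 * vq->buf_vec for the mergeable-RX path, writing one used-ring entry per
 * consumed descriptor chain. Returns the number of used entries filled.
 */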
static inline uint32_t __attribute__((always_inline))
copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
	uint16_t res_base_idx, uint16_t res_end_idx,
	struct rte_mbuf *pkt)
{
	uint32_t vec_idx = 0;
	uint32_t entry_success = 0;
	struct vhost_virtqueue *vq;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
		{0, 0, 0, 0, 0, 0}, 0};
	uint16_t cur_idx = res_base_idx;
	uint64_t vb_addr = 0;
	uint64_t vb_hdr_addr = 0;
	uint32_t seg_offset = 0;
	uint32_t vb_offset = 0;
	uint32_t seg_avail;
	uint32_t vb_avail;
	uint32_t cpy_len, entry_len;
	uint16_t idx;

	if (pkt == NULL)
		return 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
		"End Index %d\n",
		dev->device_fh, cur_idx, res_end_idx);
	/*
	 * Convert from gpa to vva
	 * (guest physical addr -> vhost virtual addr)
	 */
	vq = dev->virtqueue[queue_id];

	vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
	vb_hdr_addr = vb_addr;

	/* Prefetch buffer address. */
	rte_prefetch0((void *)(uintptr_t)vb_addr);

	virtio_hdr.num_buffers = res_end_idx - res_base_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
		dev->device_fh, virtio_hdr.num_buffers);

	virtio_enqueue_offload(pkt, &virtio_hdr.hdr);

	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
		(const void *)&virtio_hdr, vq->vhost_hlen);
	vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, vq->vhost_hlen);

	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);

	seg_avail = rte_pktmbuf_data_len(pkt);
	vb_offset = vq->vhost_hlen;
	vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;

	entry_len = vq->vhost_hlen;
	if (vb_avail == 0) {
		uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;

		if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
			idx = cur_idx & (vq->size - 1);

			/* Update used ring with desc information */
			vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
			vq->used->ring[idx].len = entry_len;

			vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));

			entry_len = 0;
			cur_idx++;
			entry_success++;
		}

		vec_idx++;
		vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);
		vb_offset = 0;
		vb_avail = vq->buf_vec[vec_idx].buf_len;
	}

	cpy_len = RTE_MIN(vb_avail, seg_avail);

	while (cpy_len > 0) {
		/* Copy mbuf data to vring buffer */
		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
			rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
			cpy_len);
		vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr + vb_offset,
			cpy_len);

		PRINT_PACKET(dev,
			(uintptr_t)(vb_addr + vb_offset),
			cpy_len, 0);

		seg_offset += cpy_len;
		vb_offset += cpy_len;
		seg_avail -= cpy_len;
		vb_avail -= cpy_len;
		entry_len += cpy_len;

		if (seg_avail != 0) {
			/*
			 * The virtio buffer in this vring entry reaches
			 * its end, but the mbuf segment is not complete.
			 */
			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
				VRING_DESC_F_NEXT) == 0) {
				/* Update used ring with desc information */
				idx = cur_idx & (vq->size - 1);
				vq->used->ring[idx].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[idx].len = entry_len;
				vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));
				entry_len = 0;
				cur_idx++;
				entry_success++;
			}

			vec_idx++;
			vb_addr = gpa_to_vva(dev,
				vq->buf_vec[vec_idx].buf_addr);
			vb_offset = 0;
			vb_avail = vq->buf_vec[vec_idx].buf_len;
			cpy_len = RTE_MIN(vb_avail, seg_avail);
		} else {
			/*
			 * The current segment is complete; check whether
			 * the whole packet is complete or not.
			 */
			pkt = pkt->next;
			if (pkt != NULL) {
				/*
				 * There are more segments.
				 */
				if (vb_avail == 0) {
					/*
					 * The current buffer from the vring is
					 * used up; fetch the next buffer
					 * from buf_vec.
					 */
					uint32_t desc_idx =
						vq->buf_vec[vec_idx].desc_idx;

					if ((vq->desc[desc_idx].flags &
						VRING_DESC_F_NEXT) == 0) {
						idx = cur_idx & (vq->size - 1);
						/*
						 * Update used ring with the
						 * descriptor information
						 */
						vq->used->ring[idx].id
							= desc_idx;
						vq->used->ring[idx].len
							= entry_len;
						vhost_log_used_vring(dev, vq,
							offsetof(struct vring_used, ring[idx]),
							sizeof(vq->used->ring[idx]));
						entry_success++;
						entry_len = 0;
						cur_idx++;
					}

					/* Get next buffer from buf_vec. */
					vec_idx++;
					vb_addr = gpa_to_vva(dev,
						vq->buf_vec[vec_idx].buf_addr);
					vb_avail =
						vq->buf_vec[vec_idx].buf_len;
					vb_offset = 0;
				}

				seg_offset = 0;
				seg_avail = rte_pktmbuf_data_len(pkt);
				cpy_len = RTE_MIN(vb_avail, seg_avail);
			} else {
				/*
				 * This whole packet completes.
				 */
				/* Update used ring with desc information */
				idx = cur_idx & (vq->size - 1);
				vq->used->ring[idx].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[idx].len = entry_len;
				vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));
				entry_success++;
				break;
			}
		}
	}

	return entry_success;
}
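
/*
 * Walk the descriptor chain that starts at available-ring slot 'id',
 * append each descriptor to vq->buf_vec and accumulate the total buffer
 * space into *secure_len so the caller knows whether the packet fits.
 */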
static inline void __attribute__((always_inline))
update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
	uint32_t *secure_len, uint32_t *vec_idx)
{
	uint16_t wrapped_idx = id & (vq->size - 1);
	uint32_t idx = vq->avail->ring[wrapped_idx];
	uint8_t next_desc;
	uint32_t len = *secure_len;
	uint32_t vec_id = *vec_idx;

	do {
		next_desc = 0;
		len += vq->desc[idx].len;
		vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
		vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
		vq->buf_vec[vec_id].desc_idx = idx;
		vec_id++;

		if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
			idx = vq->desc[idx].next;
			next_desc = 1;
		}
	} while (next_desc);

	*secure_len = len;
	*vec_idx = vec_id;
}

/*
 * This function works for mergeable RX.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	uint32_t pkt_idx = 0, entry_success = 0;
	uint16_t avail_idx;
	uint16_t res_base_idx, res_cur_idx;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
		dev->device_fh);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
	if (count == 0)
		return 0;

	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;

		do {
			/*
			 * As many data cores may want access to available
			 * buffers, they need to be reserved.
			 */
			uint32_t secure_len = 0;
			uint32_t vec_idx = 0;

			res_base_idx = vq->last_used_idx_res;
			res_cur_idx = res_base_idx;

			do {
				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
				if (unlikely(res_cur_idx == avail_idx))
					goto merge_rx_exit;

				update_secure_len(vq, res_cur_idx,
					&secure_len, &vec_idx);
				res_cur_idx++;
			} while (pkt_len > secure_len);

			/* vq->last_used_idx_res is atomically updated. */
			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
					res_base_idx, res_cur_idx);
		} while (success == 0);
		entry_success = copy_from_mbuf_to_vring(dev, queue_id,
			res_base_idx, res_cur_idx, pkts[pkt_idx]);

		rte_compiler_barrier();

		/*
		 * Wait until it's our turn to add our buffer
		 * to the used ring.
		 */
		while (unlikely(vq->last_used_idx != res_base_idx))
			rte_pause();

		*(volatile uint16_t *)&vq->used->idx += entry_success;
		vq->last_used_idx = res_cur_idx;
	}

merge_rx_exit:
	if (likely(pkt_idx)) {
		/* flush used->idx update before we read avail->flags. */
		rte_mb();

		/* Kick the guest if necessary. */
		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
			eventfd_write(vq->callfd, (eventfd_t)1);
	}

	return pkt_idx;
}
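
/*
 * Public enqueue API: place a burst of host mbufs into the guest RX
 * virtqueue, dispatching to the mergeable or non-mergeable path depending
 * on whether VIRTIO_NET_F_MRG_RXBUF was negotiated. Returns the number of
 * packets taken from the burst.
 */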
uint16_t
rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count)
{
	if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
	else
		return virtio_dev_rx(dev, queue_id, pkts, count);
}
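
/*
 * Parse the Ethernet (and optional VLAN) and IP headers of an mbuf to fill
 * in m->l2_len/l3_len and report the L4 protocol and header location.
 */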
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
	struct ipv4_hdr *ipv4_hdr;
	struct ipv6_hdr *ipv6_hdr;
	void *l3_hdr = NULL;
	struct ether_hdr *eth_hdr;
	uint16_t ethertype;

	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	m->l2_len = sizeof(struct ether_hdr);
	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);

	if (ethertype == ETHER_TYPE_VLAN) {
		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

		m->l2_len += sizeof(struct vlan_hdr);
		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
	}

	l3_hdr = (char *)eth_hdr + m->l2_len;

	switch (ethertype) {
	case ETHER_TYPE_IPv4:
		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
		*l4_proto = ipv4_hdr->next_proto_id;
		m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV4;
		break;
	case ETHER_TYPE_IPv6:
		ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
		*l4_proto = ipv6_hdr->proto;
		m->l3_len = sizeof(struct ipv6_hdr);
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV6;
		break;
	default:
		m->l3_len = 0;
		*l4_proto = 0;
		break;
	}
}
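
/*
 * Convert the checksum/GSO hints in the guest-provided virtio_net_hdr into
 * mbuf offload flags (PKT_TX_*) so a later transmit path can finish the
 * work the guest deferred.
 */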
static inline void __attribute__((always_inline))
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
	uint16_t l4_proto = 0;
	void *l4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;

	parse_ethernet(m, &l4_proto, &l4_hdr);
	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
			switch (hdr->csum_offset) {
			case (offsetof(struct tcp_hdr, cksum)):
				if (l4_proto == IPPROTO_TCP)
					m->ol_flags |= PKT_TX_TCP_CKSUM;
				break;
			case (offsetof(struct udp_hdr, dgram_cksum)):
				if (l4_proto == IPPROTO_UDP)
					m->ol_flags |= PKT_TX_UDP_CKSUM;
				break;
			case (offsetof(struct sctp_hdr, cksum)):
				if (l4_proto == IPPROTO_SCTP)
					m->ol_flags |= PKT_TX_SCTP_CKSUM;
				break;
			default:
				break;
			}
		}
	}

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
		case VIRTIO_NET_HDR_GSO_TCPV6:
			tcp_hdr = (struct tcp_hdr *)l4_hdr;
			m->ol_flags |= PKT_TX_TCP_SEG;
			m->tso_segsz = hdr->gso_size;
			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
			break;
		default:
			RTE_LOG(WARNING, VHOST_DATA,
				"unsupported gso type %u.\n", hdr->gso_type);
			break;
		}
	}
}
uint16_t
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct rte_mbuf *m, *prev;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t vb_addr = 0;
	uint64_t vb_net_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i;
	uint16_t free_entries, entry_success = 0;
	uint16_t avail_idx;
	struct virtio_net_hdr *hdr = NULL;

	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx == avail_idx)
		return 0;

	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
		dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

	/* Get the number of free entries in the ring. */
	free_entries = (avail_idx - vq->last_used_idx);

	free_entries = RTE_MIN(free_entries, count);
	/* Limit to MAX_PKT_BURST. */
	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
			dev->device_fh, free_entries);
	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[entry_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
	while (entry_success < free_entries) {
		uint32_t vb_avail, vb_offset;
		uint32_t seg_avail, seg_offset;
		uint32_t cpy_len;
		uint32_t seg_num = 0;
		struct rte_mbuf *cur;
		uint8_t alloc_err = 0;

		desc = &vq->desc[head[entry_success]];

		vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
		hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);

		/* Discard first buffer as it is the virtio header */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			vb_offset = 0;
			vb_avail = desc->len;
		} else {
			vb_offset = vq->vhost_hlen;
			vb_avail = desc->len - vb_offset;
		}

		/* Buffer address translation. */
		vb_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);

		used_idx = vq->last_used_idx & (vq->size - 1);

		if (entry_success < (free_entries - 1)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[entry_success+1]]);
			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
		}

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = head[entry_success];
		vq->used->ring[used_idx].len = 0;
		vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[used_idx]),
				sizeof(vq->used->ring[used_idx]));

		/* Allocate an mbuf and populate the structure. */
		m = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(m == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			break;
		}
		seg_offset = 0;
		seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
		cpy_len = RTE_MIN(vb_avail, seg_avail);

		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);

		seg_num++;
		cur = m;
		prev = m;
		while (cpy_len != 0) {
			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
				(void *)((uintptr_t)(vb_addr + vb_offset)),
				cpy_len);

			seg_offset += cpy_len;
			vb_offset += cpy_len;
			vb_avail -= cpy_len;
			seg_avail -= cpy_len;

			if (vb_avail != 0) {
				/*
				 * The mbuf segment reaches its end, while
				 * the virtio buffer in the TX vring has
				 * more data to be copied.
				 */
				cur->data_len = seg_offset;
				m->pkt_len += seg_offset;
				/* Allocate mbuf and populate the structure. */
				cur = rte_pktmbuf_alloc(mbuf_pool);
				if (unlikely(cur == NULL)) {
					RTE_LOG(ERR, VHOST_DATA, "Failed to "
						"allocate memory for mbuf.\n");
					rte_pktmbuf_free(m);
					alloc_err = 1;
					break;
				}

				seg_num++;
				prev->next = cur;
				prev = cur;
				seg_offset = 0;
				seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
			} else {
				if (desc->flags & VRING_DESC_F_NEXT) {
					/*
					 * There are more virtio buffers in
					 * the same vring entry to be copied.
					 */
					if (seg_avail == 0) {
						/*
						 * The current segment hasn't
						 * room to accommodate more
						 * data.
						 */
						cur->data_len = seg_offset;
						m->pkt_len += seg_offset;
						/*
						 * Allocate an mbuf and
						 * populate the structure.
						 */
						cur = rte_pktmbuf_alloc(mbuf_pool);
						if (unlikely(cur == NULL)) {
							RTE_LOG(ERR, VHOST_DATA,
								"Failed to allocate memory for mbuf.\n");
							rte_pktmbuf_free(m);
							alloc_err = 1;
							break;
						}
						seg_num++;
						prev->next = cur;
						prev = cur;
						seg_offset = 0;
						seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
					}

					desc = &vq->desc[desc->next];

					/* Buffer address translation. */
					vb_addr = gpa_to_vva(dev, desc->addr);
					/* Prefetch buffer address. */
					rte_prefetch0((void *)(uintptr_t)vb_addr);
					vb_offset = 0;
					vb_avail = desc->len;

					PRINT_PACKET(dev, (uintptr_t)vb_addr,
						desc->len, 0);
				} else {
					/* The whole packet completes. */
					cur->data_len = seg_offset;
					m->pkt_len += seg_offset;
					vb_avail = 0;
				}
			}

			cpy_len = RTE_MIN(vb_avail, seg_avail);
		}

		if (unlikely(alloc_err == 1))
			break;

		m->nb_segs = seg_num;
		if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
			vhost_dequeue_offload(hdr, m);

		pkts[entry_success] = m;
		vq->last_used_idx++;
		entry_success++;
	}

	rte_compiler_barrier();
	vq->used->idx += entry_success;
	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
			sizeof(vq->used->idx));
	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
	return entry_success;
}
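
/*
 * Illustrative usage sketch (an assumption for documentation purposes, not
 * code from this file): a forwarding core typically calls the two public
 * APIs above from its polling loop. "mbuf_pool" is an application-provided
 * mempool; VIRTIO_TXQ/VIRTIO_RXQ address queue pair 0.
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t n;
 *
 *	n = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
 *				    pkts, MAX_PKT_BURST);
 *	n = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, n);
 */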