/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdint.h>

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/eventfd.h>
#include <sys/param.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_mempool.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "virtio-net.h"
#include "vhost-net-cdev.h"
#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works when mergeable is disabled.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_addr = 0;
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t avail_idx, res_cur_idx;
	uint16_t res_base_idx, res_end_idx;
	uint16_t free_entries;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
	if (unlikely(queue_id != VIRTIO_RXQ)) {
		LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
		return 0;
	}

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
	/* As many data cores may want access to available buffers, they need to be reserved. */
	do {
		res_base_idx = vq->last_used_idx_res;
		avail_idx = *((volatile uint16_t *)&vq->avail->idx);

		free_entries = (avail_idx - res_base_idx);
		/* Check that we have enough buffers. */
		if (unlikely(count > free_entries))
			count = free_entries;

		if (count == 0)
			return 0;

		res_end_idx = res_base_idx + count;
		/* vq->last_used_idx_res is atomically updated. */
		success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
				res_end_idx);
	} while (unlikely(success == 0));
	res_cur_idx = res_base_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
	/* Prefetch available ring to retrieve indexes. */
	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (res_cur_idx != res_end_idx) {
		/* Get descriptor from available ring. */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];

		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr). */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/* Copy virtio_hdr to packet and increment buffer address. */
		buff_hdr_addr = buff_addr;
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			/* Buffer address translation. */
			buff_addr = gpa_to_vva(dev, desc->addr);
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			buff_addr += vq->vhost_hlen;
			desc->len = packet_len;
		}
		/* Update used ring with desc information. */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;

		/* Copy mbuf data to buffer. */
		rte_memcpy((void *)(uintptr_t)buff_addr,
			rte_pktmbuf_mtod(buff, const void *),
			rte_pktmbuf_data_len(buff));
		PRINT_PACKET(dev, (uintptr_t)buff_addr,
			rte_pktmbuf_data_len(buff), 0);

		res_cur_idx++;
		packet_success++;

		/* Copy the virtio header into the buffer reserved for it. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (res_cur_idx < res_end_idx) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}
	rte_compiler_barrier();

	/* Wait until it's our turn to add our buffer to the used ring. */
	while (unlikely(vq->last_used_idx != res_base_idx))
		rte_pause();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx = res_end_idx;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
	return count;
}
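/*
 * This helper copies a single mbuf chain into the guest buffers that were
 * previously gathered into vq->buf_vec by the mergeable RX path, filling the
 * used ring as vring entries are consumed. It returns the number of used ring
 * entries completed for this packet.
 */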
static inline uint32_t __attribute__((always_inline))
copy_from_mbuf_to_vring(struct virtio_net *dev,
	uint16_t res_base_idx, uint16_t res_end_idx,
	struct rte_mbuf *pkt)
{
	uint32_t vec_idx = 0;
	uint32_t entry_success = 0;
	struct vhost_virtqueue *vq;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
		{0, 0, 0, 0, 0, 0}, 0};
	uint16_t cur_idx = res_base_idx;
	uint64_t vb_addr = 0;
	uint64_t vb_hdr_addr = 0;
	uint32_t seg_offset = 0;
	uint32_t vb_offset = 0;
	uint32_t seg_avail;
	uint32_t vb_avail;
	uint32_t cpy_len, entry_len;

	if (pkt == NULL)
		return 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
		"End Index %d\n",
		dev->device_fh, cur_idx, res_end_idx);

	/*
	 * Convert from gpa to vva
	 * (guest physical addr -> vhost virtual addr)
	 */
	vq = dev->virtqueue[VIRTIO_RXQ];
	vb_addr =
		gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
	vb_hdr_addr = vb_addr;

	/* Prefetch buffer address. */
	rte_prefetch0((void *)(uintptr_t)vb_addr);

	virtio_hdr.num_buffers = res_end_idx - res_base_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
		dev->device_fh, virtio_hdr.num_buffers);

	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
		(const void *)&virtio_hdr, vq->vhost_hlen);

	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);

	seg_avail = rte_pktmbuf_data_len(pkt);
	vb_offset = vq->vhost_hlen;
	vb_avail =
		vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;

	entry_len = vq->vhost_hlen;

	if (vb_avail == 0) {
		/* The first vring buffer only holds the header; move on. */
		uint32_t desc_idx =
			vq->buf_vec[vec_idx].desc_idx;
		vq->desc[desc_idx].len = vq->vhost_hlen;

		if ((vq->desc[desc_idx].flags
			& VRING_DESC_F_NEXT) == 0) {
			/* Update used ring with desc information */
			vq->used->ring[cur_idx & (vq->size - 1)].id
				= vq->buf_vec[vec_idx].desc_idx;
			vq->used->ring[cur_idx & (vq->size - 1)].len
				= entry_len;

			entry_len = 0;
			cur_idx++;
			entry_success++;
		}

		vec_idx++;
		vb_addr =
			gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);
		vb_offset = 0;
		vb_avail = vq->buf_vec[vec_idx].buf_len;
	}

	cpy_len = RTE_MIN(vb_avail, seg_avail);
	while (cpy_len > 0) {
		/* Copy mbuf data to vring buffer */
		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
			(const void *)(rte_pktmbuf_mtod(pkt, char *) + seg_offset),
			cpy_len);

		PRINT_PACKET(dev,
			(uintptr_t)(vb_addr + vb_offset),
			cpy_len, 0);

		seg_offset += cpy_len;
		vb_offset += cpy_len;
		seg_avail -= cpy_len;
		vb_avail -= cpy_len;
		entry_len += cpy_len;

		if (seg_avail != 0) {
			/*
			 * The virtio buffer in this vring entry reaches
			 * its end, but the mbuf segment is not finished yet.
			 */
			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
				VRING_DESC_F_NEXT) == 0) {
				/* Update used ring with desc information */
				vq->used->ring[cur_idx & (vq->size - 1)].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[cur_idx & (vq->size - 1)].len
					= entry_len;
				entry_len = 0;
				cur_idx++;
				entry_success++;
			}

			/* Move to the next guest buffer gathered in buf_vec. */
			vec_idx++;
			vb_addr = gpa_to_vva(dev,
				vq->buf_vec[vec_idx].buf_addr);
			vb_offset = 0;
			vb_avail = vq->buf_vec[vec_idx].buf_len;
			cpy_len = RTE_MIN(vb_avail, seg_avail);
		} else {
			/*
			 * This mbuf segment is complete; check whether
			 * the whole packet is complete as well.
			 */
			pkt = pkt->next;
			if (pkt != NULL) {
				/* There are more segments. */
				if (vb_avail == 0) {
					/*
					 * The current buffer from the vring is
					 * used up; fetch the next buffer from buf_vec.
					 */
					uint32_t desc_idx =
						vq->buf_vec[vec_idx].desc_idx;
					vq->desc[desc_idx].len = vb_offset;

					if ((vq->desc[desc_idx].flags &
						VRING_DESC_F_NEXT) == 0) {
						uint16_t wrapped_idx =
							cur_idx & (vq->size - 1);
						/* Update used ring with the descriptor information */
						vq->used->ring[wrapped_idx].id = desc_idx;
						vq->used->ring[wrapped_idx].len = entry_len;
						entry_success++;
						entry_len = 0;
						cur_idx++;
					}

					/* Get next buffer from buf_vec. */
					vec_idx++;
					vb_addr = gpa_to_vva(dev,
						vq->buf_vec[vec_idx].buf_addr);
					vb_avail =
						vq->buf_vec[vec_idx].buf_len;
					vb_offset = 0;
				}
				seg_offset = 0;
				seg_avail = rte_pktmbuf_data_len(pkt);
				cpy_len = RTE_MIN(vb_avail, seg_avail);
			} else {
				/*
				 * This whole packet completes.
				 */
				uint32_t desc_idx =
					vq->buf_vec[vec_idx].desc_idx;
				vq->desc[desc_idx].len = vb_offset;

				/* Remaining chained descriptors carry no data. */
				while (vq->desc[desc_idx].flags &
					VRING_DESC_F_NEXT) {
					desc_idx = vq->desc[desc_idx].next;
					vq->desc[desc_idx].len = 0;
				}

				/* Update used ring with desc information */
				vq->used->ring[cur_idx & (vq->size - 1)].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[cur_idx & (vq->size - 1)].len
					= entry_len;
				entry_len = 0;
				cur_idx++;
				entry_success++;
				/* Force the copy loop to terminate. */
				seg_avail = 0;
				cpy_len = RTE_MIN(vb_avail, seg_avail);
			}
		}
	}

	return entry_success;
}
/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works for mergeable RX.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	uint32_t pkt_idx = 0, entry_success = 0;
	uint16_t avail_idx, res_cur_idx;
	uint16_t res_base_idx, res_end_idx;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
		dev->device_fh);
	if (unlikely(queue_id != VIRTIO_RXQ)) {
		LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
	}

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
	if (count == 0)
		return 0;

	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
		uint32_t secure_len = 0;
		uint16_t need_cnt;
		uint32_t vec_idx = 0;
		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
		uint16_t i, id;

		do {
			/*
			 * As many data cores may want access to available
			 * buffers, they need to be reserved.
			 */
			res_base_idx = vq->last_used_idx_res;
			res_cur_idx = res_base_idx;

			do {
				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
				if (unlikely(res_cur_idx == avail_idx)) {
					LOG_DEBUG(VHOST_DATA,
						"(%"PRIu64") Failed "
						"to get enough desc from "
						"vring\n", dev->device_fh);
					return pkt_idx;
				} else {
					uint16_t wrapped_idx =
						(res_cur_idx) & (vq->size - 1);
					uint32_t idx =
						vq->avail->ring[wrapped_idx];
					uint8_t next_desc;

					/* Sum the chain's length until it can hold the packet. */
					do {
						next_desc = 0;
						secure_len += vq->desc[idx].len;
						if (vq->desc[idx].flags &
							VRING_DESC_F_NEXT) {
							idx = vq->desc[idx].next;
							next_desc = 1;
						}
					} while (next_desc);

					res_cur_idx++;
				}
			} while (pkt_len > secure_len);

			/* vq->last_used_idx_res is atomically updated. */
			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
					res_base_idx, res_cur_idx);
		} while (success == 0);

		id = res_base_idx;
		need_cnt = res_cur_idx - res_base_idx;

		/* Gather the reserved descriptor chains into buf_vec. */
		for (i = 0; i < need_cnt; i++, id++) {
			uint16_t wrapped_idx = id & (vq->size - 1);
			uint32_t idx = vq->avail->ring[wrapped_idx];
			uint8_t next_desc;

			do {
				next_desc = 0;
				vq->buf_vec[vec_idx].buf_addr =
					vq->desc[idx].addr;
				vq->buf_vec[vec_idx].buf_len =
					vq->desc[idx].len;
				vq->buf_vec[vec_idx].desc_idx = idx;
				vec_idx++;

				if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
					idx = vq->desc[idx].next;
					next_desc = 1;
				}
			} while (next_desc);
		}

		res_end_idx = res_cur_idx;

		entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
			res_end_idx, pkts[pkt_idx]);

		rte_compiler_barrier();

		/*
		 * Wait until it's our turn to add our buffer
		 * to the used ring.
		 */
		while (unlikely(vq->last_used_idx != res_base_idx))
			rte_pause();

		*(volatile uint16_t *)&vq->used->idx += entry_success;
		vq->last_used_idx = res_end_idx;

		/* Kick the guest if necessary. */
		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
			eventfd_write((int)vq->kickfd, 1);
	}

	return count;
}
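/*
 * Enqueue a burst of host packets into the guest's RX virtqueue. The
 * mergeable-buffer path is used when the guest negotiated
 * VIRTIO_NET_F_MRG_RXBUF; otherwise the simple path is used. Returns the
 * number of packets copied into the virtqueue.
 */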
uint16_t
rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint16_t count)
{
	if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
	else
		return virtio_dev_rx(dev, queue_id, pkts, count);
}
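/*
 * Dequeue a burst of packets from the guest's TX virtqueue. Each packet is
 * copied into one or more mbufs allocated from mbuf_pool and handed back to
 * the caller through pkts. Returns the number of packets dequeued.
 */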
uint16_t
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct rte_mbuf *m, *prev;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t vb_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i;
	uint16_t free_entries, entry_success = 0;
	uint16_t avail_idx;

	if (unlikely(queue_id != VIRTIO_TXQ)) {
		LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
		return 0;
	}

	vq = dev->virtqueue[VIRTIO_TXQ];
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx == avail_idx)
		return 0;

	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
		dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

	/* Get the number of free entries in the ring. */
	free_entries = (avail_idx - vq->last_used_idx);

	free_entries = RTE_MIN(free_entries, count);
	/* Limit to MAX_PKT_BURST. */
	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
			dev->device_fh, free_entries);
	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[entry_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
	while (entry_success < free_entries) {
		uint32_t vb_avail, vb_offset;
		uint32_t seg_avail, seg_offset;
		uint32_t cpy_len;
		uint32_t seg_num = 0;
		struct rte_mbuf *cur;
		uint8_t alloc_err = 0;

		desc = &vq->desc[head[entry_success]];

		/* Discard first buffer as it is the virtio header */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		vb_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);

		used_idx = vq->last_used_idx & (vq->size - 1);

		if (entry_success < (free_entries - 1)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[entry_success+1]]);
			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
		}

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = head[entry_success];
		vq->used->ring[used_idx].len = 0;

		vb_offset = 0;
		vb_avail = desc->len;
		/* Allocate an mbuf and populate the structure. */
		m = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(m == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			return entry_success;
		}
		seg_offset = 0;
		seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
		cpy_len = RTE_MIN(vb_avail, seg_avail);

		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);

		seg_num++;
		cur = m;
		prev = m;
		while (cpy_len != 0) {
			rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
				(void *)((uintptr_t)(vb_addr + vb_offset)),
				cpy_len);

			seg_offset += cpy_len;
			vb_offset += cpy_len;
			vb_avail -= cpy_len;
			seg_avail -= cpy_len;

			if (vb_avail != 0) {
				/*
				 * The segment reaches its end, while the
				 * virtio buffer in the TX vring has more
				 * data to be copied.
				 */
				cur->data_len = seg_offset;
				m->pkt_len += seg_offset;
				/* Allocate mbuf and populate the structure. */
				cur = rte_pktmbuf_alloc(mbuf_pool);
				if (unlikely(cur == NULL)) {
					RTE_LOG(ERR, VHOST_DATA, "Failed to "
						"allocate memory for mbuf.\n");
					rte_pktmbuf_free(m);
					alloc_err = 1;
					break;
				}
				seg_num++;
				prev->next = cur;
				prev = cur;
				seg_offset = 0;
				seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
			} else {
				if (desc->flags & VRING_DESC_F_NEXT) {
					/*
					 * There are more virtio buffers in the
					 * same vring entry that need to be copied.
					 */
					if (seg_avail == 0) {
						/*
						 * The current segment has no
						 * room to accommodate more data.
						 */
						cur->data_len = seg_offset;
						m->pkt_len += seg_offset;
						/* Allocate an mbuf and populate the structure. */
						cur = rte_pktmbuf_alloc(mbuf_pool);
						if (unlikely(cur == NULL)) {
							RTE_LOG(ERR, VHOST_DATA,
								"Failed to allocate memory for mbuf.\n");
							rte_pktmbuf_free(m);
							alloc_err = 1;
							break;
						}
						seg_num++;
						prev->next = cur;
						prev = cur;
						seg_offset = 0;
						seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
					}

					desc = &vq->desc[desc->next];

					/* Buffer address translation. */
					vb_addr = gpa_to_vva(dev, desc->addr);
					/* Prefetch buffer address. */
					rte_prefetch0((void *)(uintptr_t)vb_addr);
					vb_offset = 0;
					vb_avail = desc->len;

					PRINT_PACKET(dev, (uintptr_t)vb_addr,
						desc->len, 0);
				} else {
					/* The whole packet completes. */
					cur->data_len = seg_offset;
					m->pkt_len += seg_offset;
					vb_avail = 0;
				}
			}

			cpy_len = RTE_MIN(vb_avail, seg_avail);
		}
		if (unlikely(alloc_err == 1))
			break;

		m->nb_segs = seg_num;

		pkts[entry_success] = m;
		vq->last_used_idx++;
		entry_success++;
	}

	rte_compiler_barrier();
	vq->used->idx += entry_success;
	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
	return entry_success;
}
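/*
 * Minimal usage sketch, not part of the library itself: one possible
 * iteration of an application data core that bridges a physical port and a
 * virtio device with the two burst APIs above. The function name, the use of
 * RX/TX queue 0, and the port_id and mbuf_pool arguments are assumptions
 * supplied by the application, not by this library.
 */
static inline void
example_switch_iteration(struct virtio_net *dev, struct rte_mempool *mbuf_pool,
	uint8_t port_id)
{
	struct rte_mbuf *rx_pkts[MAX_PKT_BURST];
	struct rte_mbuf *tx_pkts[MAX_PKT_BURST];
	uint16_t nb_rx, nb_tx, sent, i;

	/* Packets received from the NIC are copied into the guest's RX ring. */
	nb_rx = rte_eth_rx_burst(port_id, 0, rx_pkts, MAX_PKT_BURST);
	if (nb_rx) {
		rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, rx_pkts, nb_rx);
		/* The enqueue path copies the data, so the mbufs can be freed. */
		for (i = 0; i < nb_rx; i++)
			rte_pktmbuf_free(rx_pkts[i]);
	}

	/* Packets the guest transmitted are copied into fresh mbufs... */
	nb_tx = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
			tx_pkts, MAX_PKT_BURST);
	/* ...and forwarded out of the physical port; free any that were not sent. */
	sent = rte_eth_tx_burst(port_id, 0, tx_pkts, nb_tx);
	for (i = sent; i < nb_tx; i++)
		rte_pktmbuf_free(tx_pkts[i]);
}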