4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
57 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
60 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
61 * be received from the physical port or from another virtio device. A packet
62 count is returned to indicate the number of packets that were successfully
63 * added to the RX queue. This function works when mergeable is disabled.
/*
 * virtio_dev_rx: enqueue up to 'count' host mbufs onto the guest's RX
 * virtqueue (non-mergeable path) and return a packet count.
 * A contiguous range of ring slots is reserved with an atomic cmpset on
 * vq->last_used_idx_res so that multiple data cores can RX into the
 * same device concurrently; each mbuf is then copied into the guest
 * buffer addressed by its descriptor.
 *
 * NOTE(review): this extract is missing interior lines (the embedded
 * original line numbers are discontinuous), so early-return paths, the
 * 'do {' opener of the reservation loop, closing braces and the final
 * return statement are not visible here — confirm against the full file.
 */
65 static inline uint32_t __attribute__((always_inline))
66 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count)
68 struct vhost_virtqueue *vq;
69 struct vring_desc *desc;
70 struct rte_mbuf *buff;
71 /* The virtio_hdr is initialised to 0 (no offloads are advertised). */
72 struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
73 uint64_t buff_addr = 0;
74 uint64_t buff_hdr_addr = 0;
75 uint32_t head[MAX_PKT_BURST], packet_len = 0;
76 uint32_t head_idx, packet_success = 0;
77 uint16_t avail_idx, res_cur_idx;
78 uint16_t res_base_idx, res_end_idx;
79 uint16_t free_entries;
82 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
/* Only the single RX queue (VIRTIO_RXQ) is supported in this version. */
83 if (unlikely(queue_id != VIRTIO_RXQ)) {
84 LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
88 vq = dev->virtqueue[VIRTIO_RXQ];
/* Clamp the burst to the size of the head[] index cache below. */
89 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
91 /* As many data cores may want access to available buffers, they need to be reserved. */
93 res_base_idx = vq->last_used_idx_res;
/* Volatile read: avail->idx is written by the guest. */
94 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
96 free_entries = (avail_idx - res_base_idx);
97 /* Check that enough free entries exist to enqueue the whole burst. */
98 if (unlikely(count > free_entries))
104 res_end_idx = res_base_idx + count;
105 /* vq->last_used_idx_res is atomically updated (CAS retry loop). */
106 success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
108 } while (unlikely(success == 0));
109 res_cur_idx = res_base_idx;
110 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
112 /* Prefetch available ring to retrieve indexes. */
113 rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
115 /* Retrieve all of the head indexes first to avoid caching issues. */
116 for (head_idx = 0; head_idx < count; head_idx++)
117 head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
119 /* Prefetch descriptor index. */
120 rte_prefetch0(&vq->desc[head[packet_success]]);
/* Copy one mbuf per reserved slot into the guest buffers. */
122 while (res_cur_idx != res_end_idx) {
123 /* Get descriptor from available ring */
124 desc = &vq->desc[head[packet_success]];
126 buff = pkts[packet_success];
128 /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
129 buff_addr = gpa_to_vva(dev, desc->addr);
130 /* Prefetch buffer address. */
131 rte_prefetch0((void*)(uintptr_t)buff_addr);
133 /* Copy virtio_hdr to packet and increment buffer address */
134 buff_hdr_addr = buff_addr;
135 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
138 * If the descriptors are chained the header and data are
139 * placed in separate buffers.
141 if (desc->flags & VRING_DESC_F_NEXT) {
142 desc->len = vq->vhost_hlen;
143 desc = &vq->desc[desc->next];
144 /* Buffer address translation. */
145 buff_addr = gpa_to_vva(dev, desc->addr);
146 desc->len = rte_pktmbuf_data_len(buff);
/* Single-descriptor case: header and data share one buffer. */
148 buff_addr += vq->vhost_hlen;
149 desc->len = packet_len;
152 /* Update used ring with desc information */
153 vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
154 vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
156 /* Copy mbuf data to buffer */
157 rte_memcpy((void *)(uintptr_t)buff_addr,
158 rte_pktmbuf_mtod(buff, const void *),
159 rte_pktmbuf_data_len(buff));
160 PRINT_PACKET(dev, (uintptr_t)buff_addr,
161 rte_pktmbuf_data_len(buff), 0);
/* Write the (all-zero) virtio header into the first buffer. */
166 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
167 (const void *)&virtio_hdr, vq->vhost_hlen);
169 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
171 if (res_cur_idx < res_end_idx) {
172 /* Prefetch descriptor index. */
173 rte_prefetch0(&vq->desc[head[packet_success]]);
/* Ensure the used-ring writes above are emitted before used->idx. */
177 rte_compiler_barrier();
179 /* Wait until it's our turn to add our buffer to the used ring.
 * (Spin-wait serializes cores so used->idx advances in reservation order.) */
180 while (unlikely(vq->last_used_idx != res_base_idx))
183 *(volatile uint16_t *)&vq->used->idx += count;
184 vq->last_used_idx = res_end_idx;
186 /* Kick the guest via its eventfd if it did not suppress interrupts. */
187 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
188 eventfd_write((int)vq->kickfd, 1);
/*
 * copy_from_mbuf_to_vring: copy one (possibly multi-segment) mbuf into
 * the chain of guest buffers previously gathered in vq->buf_vec[] for
 * the used-ring slots [res_base_idx, res_end_idx) — the mergeable-RX
 * copy helper.  The mergeable header (with num_buffers) is written into
 * the first guest buffer; used-ring entries are filled in as each
 * descriptor chain is consumed.  Returns the number of used-ring
 * entries completed (entry_success).
 *
 * NOTE(review): several statements in this extract are truncated — the
 * lvalues for some gpa_to_vva() translations and buf_len reads, and the
 * closing braces of the copy loop, fall on lines missing from this
 * sampled view.  Verify against the full file before editing.
 */
192 static inline uint32_t __attribute__((always_inline))
193 copy_from_mbuf_to_vring(struct virtio_net *dev,
194 uint16_t res_base_idx, uint16_t res_end_idx,
195 struct rte_mbuf *pkt)
197 uint32_t vec_idx = 0;
198 uint32_t entry_success = 0;
199 struct vhost_virtqueue *vq;
200 /* The virtio_hdr is initialised to 0. */
201 struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
202 {0, 0, 0, 0, 0, 0}, 0};
203 uint16_t cur_idx = res_base_idx;
204 uint64_t vb_addr = 0;
205 uint64_t vb_hdr_addr = 0;
206 uint32_t seg_offset = 0;
207 uint32_t vb_offset = 0;
210 uint32_t cpy_len, entry_len;
215 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
217 dev->device_fh, cur_idx, res_end_idx);
220 * Convert from gpa to vva
221 * (guest physical addr -> vhost virtual addr)
223 vq = dev->virtqueue[VIRTIO_RXQ];
225 gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
226 vb_hdr_addr = vb_addr;
228 /* Prefetch buffer address. */
229 rte_prefetch0((void *)(uintptr_t)vb_addr);
/* Mergeable header: tell the guest how many buffers this packet spans. */
231 virtio_hdr.num_buffers = res_end_idx - res_base_idx;
233 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
234 dev->device_fh, virtio_hdr.num_buffers);
236 rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
237 (const void *)&virtio_hdr, vq->vhost_hlen);
239 PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
/* Start copying after the header: seg_* track the mbuf segment,
 * vb_* track the current guest buffer. */
241 seg_avail = rte_pktmbuf_data_len(pkt);
242 vb_offset = vq->vhost_hlen;
244 vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
246 entry_len = vq->vhost_hlen;
250 vq->buf_vec[vec_idx].desc_idx;
251 vq->desc[desc_idx].len = vq->vhost_hlen;
253 if ((vq->desc[desc_idx].flags
254 & VRING_DESC_F_NEXT) == 0) {
255 /* Update used ring with desc information */
256 vq->used->ring[cur_idx & (vq->size - 1)].id
257 = vq->buf_vec[vec_idx].desc_idx;
258 vq->used->ring[cur_idx & (vq->size - 1)].len
268 gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
270 /* Prefetch buffer address. */
271 rte_prefetch0((void *)(uintptr_t)vb_addr);
273 vb_avail = vq->buf_vec[vec_idx].buf_len;
276 cpy_len = RTE_MIN(vb_avail, seg_avail);
/* Main copy loop: move min(segment remainder, buffer remainder)
 * each iteration until the whole packet is placed. */
278 while (cpy_len > 0) {
279 /* Copy mbuf data to vring buffer */
280 rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
281 (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
285 (uintptr_t)(vb_addr + vb_offset),
288 seg_offset += cpy_len;
289 vb_offset += cpy_len;
290 seg_avail -= cpy_len;
292 entry_len += cpy_len;
294 if (seg_avail != 0) {
296 * The virtio buffer in this vring
297 * entry reached its end, but the mbuf
298 * segment is not finished: advance to
 * the next guest buffer in buf_vec.
300 if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
301 VRING_DESC_F_NEXT) == 0) {
302 /* Update used ring with desc information */
303 vq->used->ring[cur_idx & (vq->size - 1)].id
304 = vq->buf_vec[vec_idx].desc_idx;
305 vq->used->ring[cur_idx & (vq->size - 1)].len
313 vb_addr = gpa_to_vva(dev,
314 vq->buf_vec[vec_idx].buf_addr);
316 vb_avail = vq->buf_vec[vec_idx].buf_len;
317 cpy_len = RTE_MIN(vb_avail, seg_avail);
320 * This current segment is complete; check
321 * whether the whole packet is complete.
326 * There are more mbuf segments.
330 * The current buffer from the vring is
331 * used up; fetch the next buffer.
335 vq->buf_vec[vec_idx].desc_idx;
336 vq->desc[desc_idx].len = vb_offset;
338 if ((vq->desc[desc_idx].flags &
339 VRING_DESC_F_NEXT) == 0) {
340 uint16_t wrapped_idx =
341 cur_idx & (vq->size - 1);
343 * Update used ring with the
344 * descriptor information
346 vq->used->ring[wrapped_idx].id
348 vq->used->ring[wrapped_idx].len
355 /* Get next buffer from buf_vec. */
357 vb_addr = gpa_to_vva(dev,
358 vq->buf_vec[vec_idx].buf_addr);
360 vq->buf_vec[vec_idx].buf_len;
365 seg_avail = rte_pktmbuf_data_len(pkt);
366 cpy_len = RTE_MIN(vb_avail, seg_avail);
369 * The whole packet is complete: record the
 * final descriptor length and zero out any
 * unused trailing descriptors in the chain.
372 vq->buf_vec[vec_idx].desc_idx;
373 vq->desc[desc_idx].len = vb_offset;
375 while (vq->desc[desc_idx].flags &
377 desc_idx = vq->desc[desc_idx].next;
378 vq->desc[desc_idx].len = 0;
381 /* Update used ring with desc information */
382 vq->used->ring[cur_idx & (vq->size - 1)].id
383 = vq->buf_vec[vec_idx].desc_idx;
384 vq->used->ring[cur_idx & (vq->size - 1)].len
390 cpy_len = RTE_MIN(vb_avail, seg_avail);
395 return entry_success;
399 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
400 * be received from the physical port or from another virtio device. A packet
401 count is returned to indicate the number of packets that were successfully
402 * added to the RX queue. This function works for mergeable RX.
/*
 * virtio_dev_merge_rx: mergeable-RX enqueue path.  For each mbuf it
 * walks the available ring, accumulating descriptor-chain capacity
 * (secure_len) until the packet (payload + virtio header) fits, then
 * CAS-reserves that slot range on vq->last_used_idx_res, snapshots the
 * guest buffer addresses/lengths into vq->buf_vec[], and delegates the
 * copy to copy_from_mbuf_to_vring().  used->idx is advanced in
 * reservation order, serialized by the spin-wait on last_used_idx.
 *
 * NOTE(review): the 'do {' openers of the nested retry loops, the
 * failure early-returns, closing braces and the final return are on
 * lines missing from this sampled extract — confirm in the full file.
 */
404 static inline uint32_t __attribute__((always_inline))
405 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts,
408 struct vhost_virtqueue *vq;
409 uint32_t pkt_idx = 0, entry_success = 0;
410 uint16_t avail_idx, res_cur_idx;
411 uint16_t res_base_idx, res_end_idx;
414 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
/* Only the single RX queue (VIRTIO_RXQ) is supported in this version. */
416 if (unlikely(queue_id != VIRTIO_RXQ)) {
417 LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
420 vq = dev->virtqueue[VIRTIO_RXQ];
421 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
426 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
427 uint32_t secure_len = 0;
429 uint32_t vec_idx = 0;
/* Total bytes this packet needs in guest memory, header included. */
430 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
435 * As many data cores may want access to available
436 * buffers, they need to be reserved.
438 res_base_idx = vq->last_used_idx_res;
439 res_cur_idx = res_base_idx;
/* Volatile read: avail->idx is written by the guest. */
442 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
443 if (unlikely(res_cur_idx == avail_idx)) {
444 LOG_DEBUG(VHOST_DATA,
445 "(%"PRIu64") Failed "
446 "to get enough desc from "
/* Walk one descriptor chain, adding its capacity to secure_len. */
451 uint16_t wrapped_idx =
452 (res_cur_idx) & (vq->size - 1);
454 vq->avail->ring[wrapped_idx];
459 secure_len += vq->desc[idx].len;
460 if (vq->desc[idx].flags &
462 idx = vq->desc[idx].next;
/* Keep claiming chains until the packet fits. */
469 } while (pkt_len > secure_len);
471 /* vq->last_used_idx_res is atomically updated (CAS retry loop). */
472 success = rte_atomic16_cmpset(&vq->last_used_idx_res,
475 } while (success == 0);
478 need_cnt = res_cur_idx - res_base_idx;
/* Snapshot each reserved chain into buf_vec for the copy helper. */
480 for (i = 0; i < need_cnt; i++, id++) {
481 uint16_t wrapped_idx = id & (vq->size - 1);
482 uint32_t idx = vq->avail->ring[wrapped_idx];
486 vq->buf_vec[vec_idx].buf_addr =
488 vq->buf_vec[vec_idx].buf_len =
490 vq->buf_vec[vec_idx].desc_idx = idx;
493 if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
494 idx = vq->desc[idx].next;
500 res_end_idx = res_cur_idx;
502 entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
503 res_end_idx, pkts[pkt_idx]);
/* Ensure used-ring writes are emitted before used->idx is bumped. */
505 rte_compiler_barrier();
508 * Wait until it's our turn to add our buffer
 * (serializes used->idx updates across data cores).
511 while (unlikely(vq->last_used_idx != res_base_idx))
514 *(volatile uint16_t *)&vq->used->idx += entry_success;
515 vq->last_used_idx = res_end_idx;
517 /* Kick the guest via its eventfd if it did not suppress interrupts. */
518 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
519 eventfd_write((int)vq->kickfd, 1);
525 /* This function works for TX packets with mergeable feature enabled. */
/*
 * virtio_dev_merge_tx: dequeue up to 'count' packets from the guest's
 * TX virtqueue into freshly allocated mbufs and return how many were
 * placed in pkts[].  The first descriptor of each chain (the virtio
 * header) is skipped; payload descriptors are copied across, chaining
 * additional mbufs from mbuf_pool when a packet exceeds one mbuf's
 * data room.  Runs on a single core per queue: last_used_idx is read
 * and advanced without reservation, unlike the RX paths.
 *
 * NOTE(review): this sampled extract is missing interior lines —
 * mbuf-chain linking (cur->next / prev updates), the alloc_err unwind
 * path, loop-variable increments and closing braces are not visible
 * here.  Confirm against the full file before modifying.
 */
526 static inline uint16_t __attribute__((always_inline))
527 virtio_dev_merge_tx(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
529 struct rte_mbuf *m, *prev;
530 struct vhost_virtqueue *vq;
531 struct vring_desc *desc;
532 uint64_t vb_addr = 0;
533 uint32_t head[MAX_PKT_BURST];
536 uint16_t free_entries, entry_success = 0;
/* Only the single TX queue (VIRTIO_TXQ) is supported in this version. */
539 if (unlikely(queue_id != VIRTIO_TXQ)) {
540 LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
544 vq = dev->virtqueue[VIRTIO_TXQ];
/* Volatile read: avail->idx is written by the guest. */
545 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
547 /* If there are no available buffers then return. */
548 if (vq->last_used_idx == avail_idx)
551 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
554 /* Prefetch available ring to retrieve head indexes. */
555 rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
557 /* Get the number of free entries in the ring. */
558 free_entries = (avail_idx - vq->last_used_idx);
/* Clamp to the caller's budget, then to the head[] cache size. */
560 free_entries = RTE_MIN(free_entries, count);
561 /* Limit to MAX_PKT_BURST. */
562 free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
564 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
565 dev->device_fh, free_entries);
566 /* Retrieve all of the head indexes first to avoid caching issues. */
567 for (i = 0; i < free_entries; i++)
568 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
570 /* Prefetch descriptor index. */
571 rte_prefetch0(&vq->desc[head[entry_success]]);
572 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
/* Drain one descriptor chain per iteration into an mbuf chain. */
574 while (entry_success < free_entries) {
575 uint32_t vb_avail, vb_offset;
576 uint32_t seg_avail, seg_offset;
578 uint32_t seg_num = 0;
579 struct rte_mbuf *cur;
580 uint8_t alloc_err = 0;
582 desc = &vq->desc[head[entry_success]];
584 /* Discard first buffer as it is the virtio header */
585 desc = &vq->desc[desc->next];
587 /* Buffer address translation. */
588 vb_addr = gpa_to_vva(dev, desc->addr);
589 /* Prefetch buffer address. */
590 rte_prefetch0((void *)(uintptr_t)vb_addr);
592 used_idx = vq->last_used_idx & (vq->size - 1);
594 if (entry_success < (free_entries - 1)) {
595 /* Prefetch descriptor index. */
596 rte_prefetch0(&vq->desc[head[entry_success+1]]);
597 rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
600 /* Update used index buffer information. */
601 vq->used->ring[used_idx].id = head[entry_success];
602 vq->used->ring[used_idx].len = 0;
605 vb_avail = desc->len;
606 /* Allocate an mbuf and populate the structure. */
607 m = rte_pktmbuf_alloc(mbuf_pool);
608 if (unlikely(m == NULL)) {
609 RTE_LOG(ERR, VHOST_DATA,
610 "Failed to allocate memory for mbuf.\n");
611 return entry_success;
614 seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
615 cpy_len = RTE_MIN(vb_avail, seg_avail);
617 PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
/* Inner copy loop: guest buffer -> current mbuf segment. */
622 while (cpy_len != 0) {
623 rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
624 (void *)((uintptr_t)(vb_addr + vb_offset)),
627 seg_offset += cpy_len;
628 vb_offset += cpy_len;
630 seg_avail -= cpy_len;
634 * The segment reaches its end while the
635 * virtio buffer in the TX vring still has
636 * more data to be copied.
638 cur->data_len = seg_offset;
639 m->pkt_len += seg_offset;
640 /* Allocate mbuf and populate the structure. */
641 cur = rte_pktmbuf_alloc(mbuf_pool);
642 if (unlikely(cur == NULL)) {
643 RTE_LOG(ERR, VHOST_DATA, "Failed to "
644 "allocate memory for mbuf.\n");
654 seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
656 if (desc->flags & VRING_DESC_F_NEXT) {
658 * There are more virtio buffers in the
659 * same vring entry that need to be copied.
661 if (seg_avail == 0) {
663 * The current segment has no
664 * room to accommodate more
 * data; start a new mbuf segment.
667 cur->data_len = seg_offset;
668 m->pkt_len += seg_offset;
670 * Allocate an mbuf and
671 * populate the structure.
673 cur = rte_pktmbuf_alloc(mbuf_pool);
674 if (unlikely(cur == NULL)) {
688 seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
/* Advance to the next descriptor in the chain. */
691 desc = &vq->desc[desc->next];
693 /* Buffer address translation. */
694 vb_addr = gpa_to_vva(dev, desc->addr);
695 /* Prefetch buffer address. */
696 rte_prefetch0((void *)(uintptr_t)vb_addr);
698 vb_avail = desc->len;
700 PRINT_PACKET(dev, (uintptr_t)vb_addr,
703 /* The whole packet completes. */
704 cur->data_len = seg_offset;
705 m->pkt_len += seg_offset;
710 cpy_len = RTE_MIN(vb_avail, seg_avail);
/* An mbuf allocation failed mid-packet: stop the burst. */
713 if (unlikely(alloc_err == 1))
716 m->nb_segs = seg_num;
718 pkts[entry_success] = m;
/* Publish all consumed chains to the guest in one idx update. */
723 rte_compiler_barrier();
724 vq->used->idx += entry_success;
725 /* Kick guest if required. */
726 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
727 eventfd_write((int)vq->kickfd, 1);
728 return entry_success;