vhost: add guest offload setting
dpdk.git: lib/librte_vhost/vhost_rxtx.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_virtio_net.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>

#include "vhost-net.h"

#define MAX_PKT_BURST 32

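/*
 * A virtio-net device lays out its virtqueues as RX/TX pairs: even indices
 * are guest RX queues, odd indices are guest TX queues. Check that the
 * requested index has the expected parity and falls within the number of
 * queue pairs the device has declared.
 */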
static bool
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
{
        return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}

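/*
 * Translate the TX offload requests carried in the mbuf (L4 checksum
 * offload, TSO) into the virtio_net_hdr that precedes the packet in the
 * guest's receive buffer, so the guest can complete the work the host
 * skipped.
 */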
static void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
        memset(net_hdr, 0, sizeof(struct virtio_net_hdr));

        if (m_buf->ol_flags & PKT_TX_L4_MASK) {
                net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;

                switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
                case PKT_TX_TCP_CKSUM:
                        net_hdr->csum_offset = (offsetof(struct tcp_hdr,
                                                cksum));
                        break;
                case PKT_TX_UDP_CKSUM:
                        net_hdr->csum_offset = (offsetof(struct udp_hdr,
                                                dgram_cksum));
                        break;
                case PKT_TX_SCTP_CKSUM:
                        net_hdr->csum_offset = (offsetof(struct sctp_hdr,
                                                cksum));
                        break;
                }
        }

        if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
                if (m_buf->ol_flags & PKT_TX_IPV4)
                        net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                else
                        net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                net_hdr->gso_size = m_buf->tso_segsz;
                net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
                                        + m_buf->l4_len;
        }

        return;
}

/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works when the mbuf is scattered, but
 * it doesn't support the mergeable feature.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
        struct rte_mbuf **pkts, uint32_t count)
{
        struct vhost_virtqueue *vq;
        struct vring_desc *desc;
        struct rte_mbuf *buff, *first_buff;
        /* The virtio_hdr is initialised to 0. */
        struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
        uint64_t buff_addr = 0;
        uint64_t buff_hdr_addr = 0;
        uint32_t head[MAX_PKT_BURST];
        uint32_t head_idx, packet_success = 0;
        uint16_t avail_idx, res_cur_idx;
        uint16_t res_base_idx, res_end_idx;
        uint16_t free_entries;
        uint8_t success = 0;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
        if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
                RTE_LOG(ERR, VHOST_DATA,
                        "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
                        __func__, dev->device_fh, queue_id);
                return 0;
        }

        vq = dev->virtqueue[queue_id];
        if (unlikely(vq->enabled == 0))
                return 0;

        count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

        /*
         * As many data cores may want access to available buffers,
         * they need to be reserved.
         */
        do {
                res_base_idx = vq->last_used_idx_res;
                avail_idx = *((volatile uint16_t *)&vq->avail->idx);

                free_entries = (avail_idx - res_base_idx);
                /* Check that we have enough buffers. */
                if (unlikely(count > free_entries))
                        count = free_entries;

                if (count == 0)
                        return 0;

                res_end_idx = res_base_idx + count;
                /* vq->last_used_idx_res is atomically updated. */
                /* TODO: Allow to disable cmpset if no concurrency in application. */
                success = rte_atomic16_cmpset(&vq->last_used_idx_res,
                                res_base_idx, res_end_idx);
        } while (unlikely(success == 0));
        res_cur_idx = res_base_idx;
        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
                        dev->device_fh, res_cur_idx, res_end_idx);

        /* Prefetch available ring to retrieve indexes. */
        rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

        /* Retrieve all of the head indexes first to avoid caching issues. */
        for (head_idx = 0; head_idx < count; head_idx++)
                head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
                                        (vq->size - 1)];

        /* Prefetch descriptor index. */
        rte_prefetch0(&vq->desc[head[packet_success]]);

        while (res_cur_idx != res_end_idx) {
                uint32_t offset = 0, vb_offset = 0;
                uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
                uint8_t hdr = 0, uncompleted_pkt = 0;

                /* Get descriptor from available ring */
                desc = &vq->desc[head[packet_success]];

                buff = pkts[packet_success];
                first_buff = buff;

                /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
                buff_addr = gpa_to_vva(dev, desc->addr);
                /* Prefetch buffer address. */
                rte_prefetch0((void *)(uintptr_t)buff_addr);

                /* Copy virtio_hdr to packet and increment buffer address */
                buff_hdr_addr = buff_addr;

                /*
                 * If the descriptors are chained, the header and data are
                 * placed in separate buffers.
                 */
                if ((desc->flags & VRING_DESC_F_NEXT) &&
                        (desc->len == vq->vhost_hlen)) {
                        desc = &vq->desc[desc->next];
                        /* Buffer address translation. */
                        buff_addr = gpa_to_vva(dev, desc->addr);
                } else {
                        vb_offset += vq->vhost_hlen;
                        hdr = 1;
                }

                pkt_len = rte_pktmbuf_pkt_len(buff);
                data_len = rte_pktmbuf_data_len(buff);
                len_to_cpy = RTE_MIN(data_len,
                        hdr ? desc->len - vq->vhost_hlen : desc->len);
                while (total_copied < pkt_len) {
                        /* Copy mbuf data to buffer */
                        rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
                                rte_pktmbuf_mtod_offset(buff, const void *, offset),
                                len_to_cpy);
                        PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
                                len_to_cpy, 0);

                        offset += len_to_cpy;
                        vb_offset += len_to_cpy;
                        total_copied += len_to_cpy;

                        /* The whole packet completes */
                        if (total_copied == pkt_len)
                                break;

                        /* The current segment completes */
                        if (offset == data_len) {
                                buff = buff->next;
                                offset = 0;
                                data_len = rte_pktmbuf_data_len(buff);
                        }

                        /* The current vring descriptor is done */
                        if (vb_offset == desc->len) {
                                if (desc->flags & VRING_DESC_F_NEXT) {
                                        desc = &vq->desc[desc->next];
                                        buff_addr = gpa_to_vva(dev, desc->addr);
                                        vb_offset = 0;
                                } else {
                                        /* There is not enough room in the vring buffer */
                                        uncompleted_pkt = 1;
                                        break;
                                }
                        }
                        len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
                }

                /* Update used ring with desc information */
                vq->used->ring[res_cur_idx & (vq->size - 1)].id =
                                                        head[packet_success];

                /* Drop the packet if it is uncompleted */
                if (unlikely(uncompleted_pkt == 1))
                        vq->used->ring[res_cur_idx & (vq->size - 1)].len =
                                                        vq->vhost_hlen;
                else
                        vq->used->ring[res_cur_idx & (vq->size - 1)].len =
                                                        pkt_len + vq->vhost_hlen;

                res_cur_idx++;
                packet_success++;

                if (unlikely(uncompleted_pkt == 1))
                        continue;

                virtio_enqueue_offload(first_buff, &virtio_hdr.hdr);

                rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
                        (const void *)&virtio_hdr, vq->vhost_hlen);

                PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

                if (res_cur_idx < res_end_idx) {
                        /* Prefetch descriptor index. */
                        rte_prefetch0(&vq->desc[head[packet_success]]);
                }
        }

        rte_compiler_barrier();

        /* Wait until it's our turn to add our buffer to the used ring. */
        while (unlikely(vq->last_used_idx != res_base_idx))
                rte_pause();

        *(volatile uint16_t *)&vq->used->idx += count;
        vq->last_used_idx = res_end_idx;

        /* flush used->idx update before we read avail->flags. */
        rte_mb();

        /* Kick the guest if necessary. */
        if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
                eventfd_write(vq->callfd, (eventfd_t)1);
        return count;
}

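/*
 * Copy one (possibly multi-segment) mbuf into the vring buffers that were
 * reserved for it in vq->buf_vec, filling a used-ring entry each time a
 * descriptor chain is consumed. Returns the number of used-ring entries
 * written. Only used by the mergeable RX path.
 */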
static inline uint32_t __attribute__((always_inline))
copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
                        uint16_t res_base_idx, uint16_t res_end_idx,
                        struct rte_mbuf *pkt)
{
        uint32_t vec_idx = 0;
        uint32_t entry_success = 0;
        struct vhost_virtqueue *vq;
        /* The virtio_hdr is initialised to 0. */
        struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
                {0, 0, 0, 0, 0, 0}, 0};
        uint16_t cur_idx = res_base_idx;
        uint64_t vb_addr = 0;
        uint64_t vb_hdr_addr = 0;
        uint32_t seg_offset = 0;
        uint32_t vb_offset = 0;
        uint32_t seg_avail;
        uint32_t vb_avail;
        uint32_t cpy_len, entry_len;

        if (pkt == NULL)
                return 0;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
                "End Index %d\n",
                dev->device_fh, cur_idx, res_end_idx);

        /*
         * Convert from gpa to vva
         * (guest physical addr -> vhost virtual addr)
         */
        vq = dev->virtqueue[queue_id];

        vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
        vb_hdr_addr = vb_addr;

        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)vb_addr);

        virtio_hdr.num_buffers = res_end_idx - res_base_idx;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
                dev->device_fh, virtio_hdr.num_buffers);

        virtio_enqueue_offload(pkt, &virtio_hdr.hdr);

        rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
                (const void *)&virtio_hdr, vq->vhost_hlen);

        PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);

        seg_avail = rte_pktmbuf_data_len(pkt);
        vb_offset = vq->vhost_hlen;
        vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;

        entry_len = vq->vhost_hlen;

        if (vb_avail == 0) {
                uint32_t desc_idx =
                        vq->buf_vec[vec_idx].desc_idx;

                if ((vq->desc[desc_idx].flags
                        & VRING_DESC_F_NEXT) == 0) {
                        /* Update used ring with desc information */
                        vq->used->ring[cur_idx & (vq->size - 1)].id
                                = vq->buf_vec[vec_idx].desc_idx;
                        vq->used->ring[cur_idx & (vq->size - 1)].len
                                = entry_len;

                        entry_len = 0;
                        cur_idx++;
                        entry_success++;
                }

                vec_idx++;
                vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);

                /* Prefetch buffer address. */
                rte_prefetch0((void *)(uintptr_t)vb_addr);
                vb_offset = 0;
                vb_avail = vq->buf_vec[vec_idx].buf_len;
        }

        cpy_len = RTE_MIN(vb_avail, seg_avail);

        while (cpy_len > 0) {
                /* Copy mbuf data to vring buffer */
                rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
                        rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
                        cpy_len);

                PRINT_PACKET(dev,
                        (uintptr_t)(vb_addr + vb_offset),
                        cpy_len, 0);

                seg_offset += cpy_len;
                vb_offset += cpy_len;
                seg_avail -= cpy_len;
                vb_avail -= cpy_len;
                entry_len += cpy_len;

                if (seg_avail != 0) {
                        /*
                         * The virtio buffer in this vring entry has
                         * reached its end, but the mbuf segment has
                         * not been copied completely yet.
                         */
                        if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
                                VRING_DESC_F_NEXT) == 0) {
                                /* Update used ring with desc information */
                                vq->used->ring[cur_idx & (vq->size - 1)].id
                                        = vq->buf_vec[vec_idx].desc_idx;
                                vq->used->ring[cur_idx & (vq->size - 1)].len
                                        = entry_len;
                                entry_len = 0;
                                cur_idx++;
                                entry_success++;
                        }

                        vec_idx++;
                        vb_addr = gpa_to_vva(dev,
                                vq->buf_vec[vec_idx].buf_addr);
                        vb_offset = 0;
                        vb_avail = vq->buf_vec[vec_idx].buf_len;
                        cpy_len = RTE_MIN(vb_avail, seg_avail);
                } else {
                        /*
                         * The current segment is complete; check whether
                         * the whole packet is complete as well.
                         */
                        pkt = pkt->next;
                        if (pkt != NULL) {
                                /* There are more segments. */
                                if (vb_avail == 0) {
                                        /*
                                         * The current vring buffer is used
                                         * up; fetch the next buffer from
                                         * buf_vec.
                                         */
                                        uint32_t desc_idx =
                                                vq->buf_vec[vec_idx].desc_idx;

                                        if ((vq->desc[desc_idx].flags &
                                                VRING_DESC_F_NEXT) == 0) {
                                                uint16_t wrapped_idx =
                                                        cur_idx & (vq->size - 1);
                                                /*
                                                 * Update used ring with the
                                                 * descriptor information
                                                 */
                                                vq->used->ring[wrapped_idx].id
                                                        = desc_idx;
                                                vq->used->ring[wrapped_idx].len
                                                        = entry_len;
                                                entry_success++;
                                                entry_len = 0;
                                                cur_idx++;
                                        }

                                        /* Get next buffer from buf_vec. */
                                        vec_idx++;
                                        vb_addr = gpa_to_vva(dev,
                                                vq->buf_vec[vec_idx].buf_addr);
                                        vb_avail =
                                                vq->buf_vec[vec_idx].buf_len;
                                        vb_offset = 0;
                                }

                                seg_offset = 0;
                                seg_avail = rte_pktmbuf_data_len(pkt);
                                cpy_len = RTE_MIN(vb_avail, seg_avail);
                        } else {
                                /* The whole packet is complete. */
                                /* Update used ring with desc information */
                                vq->used->ring[cur_idx & (vq->size - 1)].id
                                        = vq->buf_vec[vec_idx].desc_idx;
                                vq->used->ring[cur_idx & (vq->size - 1)].len
                                        = entry_len;
                                entry_success++;
                                break;
                        }
                }
        }

        return entry_success;
}

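/*
 * Walk the descriptor chain that starts at available-ring entry 'id',
 * accumulate the total buffer length it provides into *secure_len, and
 * record each descriptor in vq->buf_vec, advancing *vec_idx as we go.
 */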
static inline void __attribute__((always_inline))
update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
        uint32_t *secure_len, uint32_t *vec_idx)
{
        uint16_t wrapped_idx = id & (vq->size - 1);
        uint32_t idx = vq->avail->ring[wrapped_idx];
        uint8_t next_desc;
        uint32_t len = *secure_len;
        uint32_t vec_id = *vec_idx;

        do {
                next_desc = 0;
                len += vq->desc[idx].len;
                vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
                vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
                vq->buf_vec[vec_id].desc_idx = idx;
                vec_id++;

                if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
                        idx = vq->desc[idx].next;
                        next_desc = 1;
                }
        } while (next_desc);

        *secure_len = len;
        *vec_idx = vec_id;
}

/*
 * This function works for mergeable RX: a single packet may be spread
 * across several vring entries, and the number of entries used is
 * reported to the guest in the virtio header's num_buffers field.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
        struct rte_mbuf **pkts, uint32_t count)
{
        struct vhost_virtqueue *vq;
        uint32_t pkt_idx = 0, entry_success = 0;
        uint16_t avail_idx;
        uint16_t res_base_idx, res_cur_idx;
        uint8_t success = 0;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
                dev->device_fh);
        if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
                RTE_LOG(ERR, VHOST_DATA,
                        "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
                        __func__, dev->device_fh, queue_id);
                return 0;
        }

        vq = dev->virtqueue[queue_id];
        if (unlikely(vq->enabled == 0))
                return 0;

        count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);

        if (count == 0)
                return 0;

        for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
                uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;

                do {
                        /*
                         * As many data cores may want access to available
                         * buffers, they need to be reserved.
                         */
                        uint32_t secure_len = 0;
                        uint32_t vec_idx = 0;

                        res_base_idx = vq->last_used_idx_res;
                        res_cur_idx = res_base_idx;

                        do {
                                avail_idx = *((volatile uint16_t *)&vq->avail->idx);
                                if (unlikely(res_cur_idx == avail_idx))
                                        goto merge_rx_exit;

                                update_secure_len(vq, res_cur_idx,
                                                  &secure_len, &vec_idx);
                                res_cur_idx++;
                        } while (pkt_len > secure_len);

                        /* vq->last_used_idx_res is atomically updated. */
                        success = rte_atomic16_cmpset(&vq->last_used_idx_res,
                                                        res_base_idx,
                                                        res_cur_idx);
                } while (success == 0);

                entry_success = copy_from_mbuf_to_vring(dev, queue_id,
                        res_base_idx, res_cur_idx, pkts[pkt_idx]);

                rte_compiler_barrier();

                /*
                 * Wait until it's our turn to add our buffer
                 * to the used ring.
                 */
                while (unlikely(vq->last_used_idx != res_base_idx))
                        rte_pause();

                *(volatile uint16_t *)&vq->used->idx += entry_success;
                vq->last_used_idx = res_cur_idx;
        }

merge_rx_exit:
        if (likely(pkt_idx)) {
                /* flush used->idx update before we read avail->flags. */
                rte_mb();

                /* Kick the guest if necessary. */
                if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
                        eventfd_write(vq->callfd, (eventfd_t)1);
        }

        return pkt_idx;
}

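/*
 * Public enqueue API: push a burst of host mbufs into the guest's RX
 * virtqueue. Dispatches to the mergeable or non-mergeable path depending
 * on whether VIRTIO_NET_F_MRG_RXBUF was negotiated for this device.
 */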
uint16_t
rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
        struct rte_mbuf **pkts, uint16_t count)
{
        if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
                return virtio_dev_merge_rx(dev, queue_id, pkts, count);
        else
                return virtio_dev_rx(dev, queue_id, pkts, count);
}

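/*
 * Parse the Ethernet (and optional VLAN) header plus the IP header of a
 * dequeued packet to fill in m->l2_len and m->l3_len, report the L4
 * protocol and hand back a pointer to the L4 header through *l4_hdr.
 * Used below when restoring guest-requested offloads.
 */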
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
        struct ipv4_hdr *ipv4_hdr;
        struct ipv6_hdr *ipv6_hdr;
        void *l3_hdr = NULL;
        struct ether_hdr *eth_hdr;
        uint16_t ethertype;

        eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        m->l2_len = sizeof(struct ether_hdr);
        ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);

        if (ethertype == ETHER_TYPE_VLAN) {
                struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

                m->l2_len += sizeof(struct vlan_hdr);
                ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
        }

        l3_hdr = (char *)eth_hdr + m->l2_len;

        switch (ethertype) {
        case ETHER_TYPE_IPv4:
                ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
                *l4_proto = ipv4_hdr->next_proto_id;
                m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
                *l4_hdr = (char *)l3_hdr + m->l3_len;
                m->ol_flags |= PKT_TX_IPV4;
                break;
        case ETHER_TYPE_IPv6:
                ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
                *l4_proto = ipv6_hdr->proto;
                m->l3_len = sizeof(struct ipv6_hdr);
                *l4_hdr = (char *)l3_hdr + m->l3_len;
                m->ol_flags |= PKT_TX_IPV6;
                break;
        default:
                m->l3_len = 0;
                *l4_proto = 0;
                break;
        }
}

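/*
 * Reverse of virtio_enqueue_offload(): translate the virtio_net_hdr the
 * guest attached to a transmitted packet back into mbuf offload flags
 * (checksum requests and TSO), so the work can be completed later, for
 * example by a physical NIC.
 */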
static inline void __attribute__((always_inline))
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
        uint16_t l4_proto = 0;
        void *l4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;

        parse_ethernet(m, &l4_proto, &l4_hdr);
        if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                if (hdr->csum_start == (m->l2_len + m->l3_len)) {
                        switch (hdr->csum_offset) {
                        case (offsetof(struct tcp_hdr, cksum)):
                                if (l4_proto == IPPROTO_TCP)
                                        m->ol_flags |= PKT_TX_TCP_CKSUM;
                                break;
                        case (offsetof(struct udp_hdr, dgram_cksum)):
                                if (l4_proto == IPPROTO_UDP)
                                        m->ol_flags |= PKT_TX_UDP_CKSUM;
                                break;
                        case (offsetof(struct sctp_hdr, cksum)):
                                if (l4_proto == IPPROTO_SCTP)
                                        m->ol_flags |= PKT_TX_SCTP_CKSUM;
                                break;
                        default:
                                break;
                        }
                }
        }

        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                case VIRTIO_NET_HDR_GSO_TCPV4:
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        tcp_hdr = (struct tcp_hdr *)l4_hdr;
                        m->ol_flags |= PKT_TX_TCP_SEG;
                        m->tso_segsz = hdr->gso_size;
                        m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
                        break;
                default:
                        RTE_LOG(WARNING, VHOST_DATA,
                                "unsupported gso type %u.\n", hdr->gso_type);
                        break;
                }
        }
}

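/*
 * Public dequeue API: pull a burst of packets transmitted by the guest
 * from its TX virtqueue, copy them into newly allocated mbufs from
 * mbuf_pool, and restore any guest-requested offloads on those mbufs.
 */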
uint16_t
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
        struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
        struct rte_mbuf *m, *prev;
        struct vhost_virtqueue *vq;
        struct vring_desc *desc;
        uint64_t vb_addr = 0;
        uint64_t vb_net_hdr_addr = 0;
        uint32_t head[MAX_PKT_BURST];
        uint32_t used_idx;
        uint32_t i;
        uint16_t free_entries, entry_success = 0;
        uint16_t avail_idx;
        struct virtio_net_hdr *hdr = NULL;

        if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
                RTE_LOG(ERR, VHOST_DATA,
                        "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
                        __func__, dev->device_fh, queue_id);
                return 0;
        }

        vq = dev->virtqueue[queue_id];
        if (unlikely(vq->enabled == 0))
                return 0;

        avail_idx = *((volatile uint16_t *)&vq->avail->idx);

        /* If there are no available buffers then return. */
        if (vq->last_used_idx == avail_idx)
                return 0;

        LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
                dev->device_fh);

        /* Prefetch available ring to retrieve head indexes. */
        rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

        /* Get the number of free entries in the ring. */
        free_entries = (avail_idx - vq->last_used_idx);

        free_entries = RTE_MIN(free_entries, count);
        /* Limit to MAX_PKT_BURST. */
        free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
                        dev->device_fh, free_entries);
        /* Retrieve all of the head indexes first to avoid caching issues. */
        for (i = 0; i < free_entries; i++)
                head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

        /* Prefetch descriptor index. */
        rte_prefetch0(&vq->desc[head[entry_success]]);
        rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

        while (entry_success < free_entries) {
                uint32_t vb_avail, vb_offset;
                uint32_t seg_avail, seg_offset;
                uint32_t cpy_len;
                uint32_t seg_num = 0;
                struct rte_mbuf *cur;
                uint8_t alloc_err = 0;

                desc = &vq->desc[head[entry_success]];

                vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
                hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);

                /* Discard first buffer as it is the virtio header */
                if (desc->flags & VRING_DESC_F_NEXT) {
                        desc = &vq->desc[desc->next];
                        vb_offset = 0;
                        vb_avail = desc->len;
                } else {
                        vb_offset = vq->vhost_hlen;
                        vb_avail = desc->len - vb_offset;
                }

                /* Buffer address translation. */
                vb_addr = gpa_to_vva(dev, desc->addr);
                /* Prefetch buffer address. */
                rte_prefetch0((void *)(uintptr_t)vb_addr);

                used_idx = vq->last_used_idx & (vq->size - 1);

                if (entry_success < (free_entries - 1)) {
                        /* Prefetch descriptor index. */
                        rte_prefetch0(&vq->desc[head[entry_success+1]]);
                        rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
                }

                /* Update used index buffer information. */
                vq->used->ring[used_idx].id = head[entry_success];
                vq->used->ring[used_idx].len = 0;

                /* Allocate an mbuf and populate the structure. */
                m = rte_pktmbuf_alloc(mbuf_pool);
                if (unlikely(m == NULL)) {
                        RTE_LOG(ERR, VHOST_DATA,
                                "Failed to allocate memory for mbuf.\n");
                        break;
                }
                seg_offset = 0;
                seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
                cpy_len = RTE_MIN(vb_avail, seg_avail);

                PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);

                seg_num++;
                cur = m;
                prev = m;
                while (cpy_len != 0) {
                        rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
                                (void *)((uintptr_t)(vb_addr + vb_offset)),
                                cpy_len);

                        seg_offset += cpy_len;
                        vb_offset += cpy_len;
                        vb_avail -= cpy_len;
                        seg_avail -= cpy_len;

                        if (vb_avail != 0) {
                                /*
                                 * The current mbuf segment is full, while
                                 * the virtio buffer in the TX vring still
                                 * has more data to be copied.
                                 */
                                cur->data_len = seg_offset;
                                m->pkt_len += seg_offset;
                                /* Allocate mbuf and populate the structure. */
                                cur = rte_pktmbuf_alloc(mbuf_pool);
                                if (unlikely(cur == NULL)) {
                                        RTE_LOG(ERR, VHOST_DATA, "Failed to "
                                                "allocate memory for mbuf.\n");
                                        rte_pktmbuf_free(m);
                                        alloc_err = 1;
                                        break;
                                }

                                seg_num++;
                                prev->next = cur;
                                prev = cur;
                                seg_offset = 0;
                                seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
                        } else {
                                if (desc->flags & VRING_DESC_F_NEXT) {
                                        /*
                                         * There are more virtio buffers in
                                         * the same vring entry that need to
                                         * be copied.
                                         */
                                        if (seg_avail == 0) {
                                                /*
                                                 * The current segment has no
                                                 * room to accommodate more
                                                 * data.
                                                 */
                                                cur->data_len = seg_offset;
                                                m->pkt_len += seg_offset;
                                                /*
                                                 * Allocate an mbuf and
                                                 * populate the structure.
                                                 */
                                                cur = rte_pktmbuf_alloc(mbuf_pool);
                                                if (unlikely(cur == NULL)) {
                                                        RTE_LOG(ERR,
                                                                VHOST_DATA,
                                                                "Failed to "
                                                                "allocate memory "
                                                                "for mbuf\n");
                                                        rte_pktmbuf_free(m);
                                                        alloc_err = 1;
                                                        break;
                                                }
                                                seg_num++;
                                                prev->next = cur;
                                                prev = cur;
                                                seg_offset = 0;
                                                seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
                                        }

                                        desc = &vq->desc[desc->next];

                                        /* Buffer address translation. */
                                        vb_addr = gpa_to_vva(dev, desc->addr);
                                        /* Prefetch buffer address. */
                                        rte_prefetch0((void *)(uintptr_t)vb_addr);
                                        vb_offset = 0;
                                        vb_avail = desc->len;

                                        PRINT_PACKET(dev, (uintptr_t)vb_addr,
                                                desc->len, 0);
                                } else {
                                        /* The whole packet is complete. */
                                        cur->data_len = seg_offset;
                                        m->pkt_len += seg_offset;
                                        vb_avail = 0;
                                }
                        }

                        cpy_len = RTE_MIN(vb_avail, seg_avail);
                }

                if (unlikely(alloc_err == 1))
                        break;

                m->nb_segs = seg_num;
                if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
                        vhost_dequeue_offload(hdr, m);

                pkts[entry_success] = m;
                vq->last_used_idx++;
                entry_success++;
        }

        rte_compiler_barrier();
        vq->used->idx += entry_success;
        /* Kick guest if required. */
        if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
                eventfd_write(vq->callfd, (eventfd_t)1);
        return entry_success;
}
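
/*
 * Illustrative usage (a minimal sketch, not part of this file): a typical
 * vhost switching loop pairs these calls with a physical port. It assumes a
 * 'dev' handle obtained from the application's new_device() callback, a
 * configured ethdev 'port_id' and an 'mbuf_pool' created by the application;
 * VIRTIO_RXQ/VIRTIO_TXQ are the first queue pair.
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t nb;
 *
 *	// Guest TX -> physical port: dequeued mbufs are owned by the caller.
 *	nb = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
 *				     pkts, MAX_PKT_BURST);
 *	nb = rte_eth_tx_burst(port_id, 0, pkts, nb);
 *	// Free any mbufs the port did not accept.
 *
 *	// Physical port -> guest RX: data is copied into the guest's
 *	// buffers, so the caller still frees its own mbufs afterwards.
 *	nb = rte_eth_rx_burst(port_id, 0, pkts, MAX_PKT_BURST);
 *	nb = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, nb);
 */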