vhost: log vring desc buffer changes
[dpdk.git] lib/librte_vhost/vhost_rxtx.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_virtio_net.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46
47 #include "vhost-net.h"
48
49 #define MAX_PKT_BURST 32
50 #define VHOST_LOG_PAGE  4096
51
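/*
 * Dirty-page logging: when the VHOST_F_LOG_ALL feature is negotiated
 * (typically during live migration), every write into guest memory is
 * recorded in a bitmap at dev->log_base, one bit per VHOST_LOG_PAGE
 * sized page of guest physical memory.
 */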
52 static inline void __attribute__((always_inline))
53 vhost_log_page(uint8_t *log_base, uint64_t page)
54 {
55         log_base[page / 8] |= 1 << (page % 8);
56 }
57
58 static inline void __attribute__((always_inline))
59 vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
60 {
61         uint64_t page;
62
63         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
64                    !dev->log_base || !len))
65                 return;
66
67         if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
68                 return;
69
70         /* To make sure guest memory updates are committed before logging */
71         rte_smp_wmb();
72
73         page = addr / VHOST_LOG_PAGE;
74         while (page * VHOST_LOG_PAGE < addr + len) {
75                 vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
76                 page += 1;
77         }
78 }
79
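/*
 * Log a write to the used vring.  The offset is relative to the start
 * of the used ring, whose guest physical address is vq->log_guest_addr.
 */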
80 static inline void __attribute__((always_inline))
81 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
82                      uint64_t offset, uint64_t len)
83 {
84         vhost_log_write(dev, vq->log_guest_addr + offset, len);
85 }
86
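/*
 * A virtqueue index is valid when its parity matches the requested
 * direction (RX virtqueues are even, TX virtqueues are odd) and it
 * falls within the enabled queue pairs.
 */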
87 static bool
88 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
89 {
90         return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
91 }
92
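/*
 * Translate the mbuf TX offload requests (L4 checksum, TSO) into the
 * virtio_net_hdr fields the guest expects.
 */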
93 static void
94 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
95 {
96         memset(net_hdr, 0, sizeof(struct virtio_net_hdr));
97
98         if (m_buf->ol_flags & PKT_TX_L4_MASK) {
99                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
100                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
101
102                 switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
103                 case PKT_TX_TCP_CKSUM:
104                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
105                                                 cksum));
106                         break;
107                 case PKT_TX_UDP_CKSUM:
108                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
109                                                 dgram_cksum));
110                         break;
111                 case PKT_TX_SCTP_CKSUM:
112                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
113                                                 cksum));
114                         break;
115                 }
116         }
117
118         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
119                 if (m_buf->ol_flags & PKT_TX_IPV4)
120                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
121                 else
122                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
123                 net_hdr->gso_size = m_buf->tso_segsz;
124                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
125                                         + m_buf->l4_len;
126         }
127
128         return;
129 }
130
131 /**
132  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
133  * be received from the physical port or from another virtio device. A packet
134  * count is returned to indicate the number of packets that were successfully
135  * added to the RX queue. This function handles scattered (multi-segment)
136  * mbufs, but it does not support the mergeable buffers feature.
137  */
138 static inline uint32_t __attribute__((always_inline))
139 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
140         struct rte_mbuf **pkts, uint32_t count)
141 {
142         struct vhost_virtqueue *vq;
143         struct vring_desc *desc, *hdr_desc;
144         struct rte_mbuf *buff, *first_buff;
145         /* The virtio_hdr is initialised to 0. */
146         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
147         uint64_t buff_addr = 0;
148         uint64_t buff_hdr_addr = 0;
149         uint32_t head[MAX_PKT_BURST];
150         uint32_t head_idx, packet_success = 0;
151         uint16_t avail_idx, res_cur_idx;
152         uint16_t res_base_idx, res_end_idx;
153         uint16_t free_entries;
154         uint8_t success = 0;
155
156         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
157         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
158                 RTE_LOG(ERR, VHOST_DATA,
159                         "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
160                         __func__, dev->device_fh, queue_id);
161                 return 0;
162         }
163
164         vq = dev->virtqueue[queue_id];
165         if (unlikely(vq->enabled == 0))
166                 return 0;
167
168         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
169
170         /*
171          * As many data cores may want to access the available buffers,
172          * they need to be reserved.
173          */
174         do {
175                 res_base_idx = vq->last_used_idx_res;
176                 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
177
178                 free_entries = (avail_idx - res_base_idx);
179                 /* Check that we have enough buffers. */
180                 if (unlikely(count > free_entries))
181                         count = free_entries;
182
183                 if (count == 0)
184                         return 0;
185
186                 res_end_idx = res_base_idx + count;
187                 /* vq->last_used_idx_res is atomically updated. */
188                 /* TODO: allow disabling cmpset if the application has no concurrent enqueuers. */
189                 success = rte_atomic16_cmpset(&vq->last_used_idx_res,
190                                 res_base_idx, res_end_idx);
191         } while (unlikely(success == 0));
192         res_cur_idx = res_base_idx;
193         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
194                         dev->device_fh, res_cur_idx, res_end_idx);
195
196         /* Prefetch available ring to retrieve indexes. */
197         rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
198
199         /* Retrieve all of the head indexes first to avoid caching issues. */
200         for (head_idx = 0; head_idx < count; head_idx++)
201                 head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
202                                         (vq->size - 1)];
203
204                 /* Prefetch descriptor index. */
205         rte_prefetch0(&vq->desc[head[packet_success]]);
206
207         while (res_cur_idx != res_end_idx) {
208                 uint32_t offset = 0, vb_offset = 0;
209                 uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
210                 uint8_t hdr = 0, uncompleted_pkt = 0;
211                 uint16_t idx;
212
213                 /* Get descriptor from available ring */
214                 desc = &vq->desc[head[packet_success]];
215
216                 buff = pkts[packet_success];
217                 first_buff = buff;
218
219                 /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
220                 buff_addr = gpa_to_vva(dev, desc->addr);
221                 /* Prefetch buffer address. */
222                 rte_prefetch0((void *)(uintptr_t)buff_addr);
223
224                 /* Copy virtio_hdr to packet and increment buffer address */
225                 buff_hdr_addr = buff_addr;
226                 hdr_desc = desc;
227
228                 /*
229                  * If the descriptors are chained, the header and data are
230                  * placed in separate buffers.
231                  */
232                 if ((desc->flags & VRING_DESC_F_NEXT) &&
233                         (desc->len == vq->vhost_hlen)) {
234                         desc = &vq->desc[desc->next];
235                         /* Buffer address translation. */
236                         buff_addr = gpa_to_vva(dev, desc->addr);
237                 } else {
238                         vb_offset += vq->vhost_hlen;
239                         hdr = 1;
240                 }
241
242                 pkt_len = rte_pktmbuf_pkt_len(buff);
243                 data_len = rte_pktmbuf_data_len(buff);
244                 len_to_cpy = RTE_MIN(data_len,
245                         hdr ? desc->len - vq->vhost_hlen : desc->len);
246                 while (total_copied < pkt_len) {
247                         /* Copy mbuf data to buffer */
248                         rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
249                                 rte_pktmbuf_mtod_offset(buff, const void *, offset),
250                                 len_to_cpy);
251                         vhost_log_write(dev, desc->addr + vb_offset, len_to_cpy);
252                         PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
253                                 len_to_cpy, 0);
254
255                         offset += len_to_cpy;
256                         vb_offset += len_to_cpy;
257                         total_copied += len_to_cpy;
258
259                         /* The whole packet completes */
260                         if (total_copied == pkt_len)
261                                 break;
262
263                         /* The current segment completes */
264                         if (offset == data_len) {
265                                 buff = buff->next;
266                                 offset = 0;
267                                 data_len = rte_pktmbuf_data_len(buff);
268                         }
269
270                 /* The current vring descriptor is fully used */
271                         if (vb_offset == desc->len) {
272                                 if (desc->flags & VRING_DESC_F_NEXT) {
273                                         desc = &vq->desc[desc->next];
274                                         buff_addr = gpa_to_vva(dev, desc->addr);
275                                         vb_offset = 0;
276                                 } else {
277                                         /* Not enough room left in the vring buffer */
278                                         uncompleted_pkt = 1;
279                                         break;
280                                 }
281                         }
282                         len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
283                 }
284
285                 /* Update used ring with desc information */
286                 idx = res_cur_idx & (vq->size - 1);
287                 vq->used->ring[idx].id = head[packet_success];
288
289                 /* Drop the packet if it is incomplete */
290                 if (unlikely(uncompleted_pkt == 1))
291                         vq->used->ring[idx].len = vq->vhost_hlen;
292                 else
293                         vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;
294
295                 vhost_log_used_vring(dev, vq,
296                         offsetof(struct vring_used, ring[idx]),
297                         sizeof(vq->used->ring[idx]));
298
299                 res_cur_idx++;
300                 packet_success++;
301
302                 if (unlikely(uncompleted_pkt == 1))
303                         continue;
304
305                 virtio_enqueue_offload(first_buff, &virtio_hdr.hdr);
306
307                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
308                         (const void *)&virtio_hdr, vq->vhost_hlen);
309                 vhost_log_write(dev, hdr_desc->addr, vq->vhost_hlen);
310
311                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
312
313                 if (res_cur_idx < res_end_idx) {
314                         /* Prefetch descriptor index. */
315                         rte_prefetch0(&vq->desc[head[packet_success]]);
316                 }
317         }
318
319         rte_compiler_barrier();
320
321         /* Wait until it's our turn to add our buffer to the used ring. */
322         while (unlikely(vq->last_used_idx != res_base_idx))
323                 rte_pause();
324
325         *(volatile uint16_t *)&vq->used->idx += count;
326         vq->last_used_idx = res_end_idx;
327         vhost_log_used_vring(dev, vq,
328                 offsetof(struct vring_used, idx),
329                 sizeof(vq->used->idx));
330
331         /* flush used->idx update before we read avail->flags. */
332         rte_mb();
333
334         /* Kick the guest if necessary. */
335         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
336                 eventfd_write(vq->callfd, (eventfd_t)1);
337         return count;
338 }
339
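/*
 * Copy one mbuf chain into the guest buffers previously collected in
 * vq->buf_vec for the reserved range [res_base_idx, res_end_idx).
 * Used ring entries are filled as descriptor chains are consumed;
 * the number of used entries written is returned.
 */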
340 static inline uint32_t __attribute__((always_inline))
341 copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
342                         uint16_t res_base_idx, uint16_t res_end_idx,
343                         struct rte_mbuf *pkt)
344 {
345         uint32_t vec_idx = 0;
346         uint32_t entry_success = 0;
347         struct vhost_virtqueue *vq;
348         /* The virtio_hdr is initialised to 0. */
349         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
350                 {0, 0, 0, 0, 0, 0}, 0};
351         uint16_t cur_idx = res_base_idx;
352         uint64_t vb_addr = 0;
353         uint64_t vb_hdr_addr = 0;
354         uint32_t seg_offset = 0;
355         uint32_t vb_offset = 0;
356         uint32_t seg_avail;
357         uint32_t vb_avail;
358         uint32_t cpy_len, entry_len;
359         uint16_t idx;
360
361         if (pkt == NULL)
362                 return 0;
363
364         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
365                 "End Index %d\n",
366                 dev->device_fh, cur_idx, res_end_idx);
367
368         /*
369          * Convert from gpa to vva
370          * (guest physical addr -> vhost virtual addr)
371          */
372         vq = dev->virtqueue[queue_id];
373
374         vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
375         vb_hdr_addr = vb_addr;
376
377         /* Prefetch buffer address. */
378         rte_prefetch0((void *)(uintptr_t)vb_addr);
379
380         virtio_hdr.num_buffers = res_end_idx - res_base_idx;
381
382         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
383                 dev->device_fh, virtio_hdr.num_buffers);
384
385         virtio_enqueue_offload(pkt, &virtio_hdr.hdr);
386
387         rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
388                 (const void *)&virtio_hdr, vq->vhost_hlen);
389         vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, vq->vhost_hlen);
390
391         PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
392
393         seg_avail = rte_pktmbuf_data_len(pkt);
394         vb_offset = vq->vhost_hlen;
395         vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
396
397         entry_len = vq->vhost_hlen;
398
399         if (vb_avail == 0) {
400                 uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;
401
402                 if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
403                         idx = cur_idx & (vq->size - 1);
404
405                         /* Update used ring with desc information */
406                         vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
407                         vq->used->ring[idx].len = entry_len;
408
409                         vhost_log_used_vring(dev, vq,
410                                         offsetof(struct vring_used, ring[idx]),
411                                         sizeof(vq->used->ring[idx]));
412
413                         entry_len = 0;
414                         cur_idx++;
415                         entry_success++;
416                 }
417
418                 vec_idx++;
419                 vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
420
421                 /* Prefetch buffer address. */
422                 rte_prefetch0((void *)(uintptr_t)vb_addr);
423                 vb_offset = 0;
424                 vb_avail = vq->buf_vec[vec_idx].buf_len;
425         }
426
427         cpy_len = RTE_MIN(vb_avail, seg_avail);
428
429         while (cpy_len > 0) {
430                 /* Copy mbuf data to vring buffer */
431                 rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
432                         rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
433                         cpy_len);
434                 vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr + vb_offset,
435                         cpy_len);
436
437                 PRINT_PACKET(dev,
438                         (uintptr_t)(vb_addr + vb_offset),
439                         cpy_len, 0);
440
441                 seg_offset += cpy_len;
442                 vb_offset += cpy_len;
443                 seg_avail -= cpy_len;
444                 vb_avail -= cpy_len;
445                 entry_len += cpy_len;
446
447                 if (seg_avail != 0) {
448                         /*
449                          * The virtio buffer in this vring
450                          * entry reach to its end.
451                          * But the segment doesn't complete.
452                          */
453                         if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
454                                 VRING_DESC_F_NEXT) == 0) {
455                                 /* Update used ring with desc information */
456                                 idx = cur_idx & (vq->size - 1);
457                                 vq->used->ring[idx].id
458                                         = vq->buf_vec[vec_idx].desc_idx;
459                                 vq->used->ring[idx].len = entry_len;
460                                 vhost_log_used_vring(dev, vq,
461                                         offsetof(struct vring_used, ring[idx]),
462                                         sizeof(vq->used->ring[idx]));
463                                 entry_len = 0;
464                                 cur_idx++;
465                                 entry_success++;
466                         }
467
468                         vec_idx++;
469                         vb_addr = gpa_to_vva(dev,
470                                 vq->buf_vec[vec_idx].buf_addr);
471                         vb_offset = 0;
472                         vb_avail = vq->buf_vec[vec_idx].buf_len;
473                         cpy_len = RTE_MIN(vb_avail, seg_avail);
474                 } else {
475                         /*
476                          * The current segment is complete; continue to
477                          * check whether the whole packet is complete.
478                          */
479                         pkt = pkt->next;
480                         if (pkt != NULL) {
481                                 /*
482                                  * There are more segments.
483                                  */
484                                 if (vb_avail == 0) {
485                                         /*
486                                          * The current buffer from the vring
487                                          * is used up; fetch the next buffer
488                                          * from buf_vec.
489                                          */
490                                         uint32_t desc_idx =
491                                                 vq->buf_vec[vec_idx].desc_idx;
492
493                                         if ((vq->desc[desc_idx].flags &
494                                                 VRING_DESC_F_NEXT) == 0) {
495                                                 idx = cur_idx & (vq->size - 1);
496                                                 /*
497                                                  * Update used ring with the
498                                                  * descriptor information
499                                                  */
500                                                 vq->used->ring[idx].id
501                                                         = desc_idx;
502                                                 vq->used->ring[idx].len
503                                                         = entry_len;
504                                                 vhost_log_used_vring(dev, vq,
505                                                         offsetof(struct vring_used, ring[idx]),
506                                                         sizeof(vq->used->ring[idx]));
507                                                 entry_success++;
508                                                 entry_len = 0;
509                                                 cur_idx++;
510                                         }
511
512                                         /* Get next buffer from buf_vec. */
513                                         vec_idx++;
514                                         vb_addr = gpa_to_vva(dev,
515                                                 vq->buf_vec[vec_idx].buf_addr);
516                                         vb_avail =
517                                                 vq->buf_vec[vec_idx].buf_len;
518                                         vb_offset = 0;
519                                 }
520
521                                 seg_offset = 0;
522                                 seg_avail = rte_pktmbuf_data_len(pkt);
523                                 cpy_len = RTE_MIN(vb_avail, seg_avail);
524                         } else {
525                                 /*
526                                  * The whole packet is now complete.
527                                  */
528                                 /* Update used ring with desc information */
529                                 idx = cur_idx & (vq->size - 1);
530                                 vq->used->ring[idx].id
531                                         = vq->buf_vec[vec_idx].desc_idx;
532                                 vq->used->ring[idx].len = entry_len;
533                                 vhost_log_used_vring(dev, vq,
534                                         offsetof(struct vring_used, ring[idx]),
535                                         sizeof(vq->used->ring[idx]));
536                                 entry_success++;
537                                 break;
538                         }
539                 }
540         }
541
542         return entry_success;
543 }
544
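/*
 * Walk the descriptor chain referenced by avail ring entry 'id',
 * appending each descriptor to vq->buf_vec and accumulating the total
 * buffer length into *secure_len.
 */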
545 static inline void __attribute__((always_inline))
546 update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
547         uint32_t *secure_len, uint32_t *vec_idx)
548 {
549         uint16_t wrapped_idx = id & (vq->size - 1);
550         uint32_t idx = vq->avail->ring[wrapped_idx];
551         uint8_t next_desc;
552         uint32_t len = *secure_len;
553         uint32_t vec_id = *vec_idx;
554
555         do {
556                 next_desc = 0;
557                 len += vq->desc[idx].len;
558                 vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
559                 vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
560                 vq->buf_vec[vec_id].desc_idx = idx;
561                 vec_id++;
562
563                 if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
564                         idx = vq->desc[idx].next;
565                         next_desc = 1;
566                 }
567         } while (next_desc);
568
569         *secure_len = len;
570         *vec_idx = vec_id;
571 }
572
573 /*
574  * This function handles RX when mergeable buffers (VIRTIO_NET_F_MRG_RXBUF) are negotiated.
575  */
576 static inline uint32_t __attribute__((always_inline))
577 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
578         struct rte_mbuf **pkts, uint32_t count)
579 {
580         struct vhost_virtqueue *vq;
581         uint32_t pkt_idx = 0, entry_success = 0;
582         uint16_t avail_idx;
583         uint16_t res_base_idx, res_cur_idx;
584         uint8_t success = 0;
585
586         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
587                 dev->device_fh);
588         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
589                 RTE_LOG(ERR, VHOST_DATA,
590                         "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
591                         __func__, dev->device_fh, queue_id);
592                 return 0;
593         }
594
595         vq = dev->virtqueue[queue_id];
596         if (unlikely(vq->enabled == 0))
597                 return 0;
598
599         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
600
601         if (count == 0)
602                 return 0;
603
604         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
605                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
606
607                 do {
608                         /*
609                          * As many data cores may want to access the available
610                          * buffers, they need to be reserved.
611                          */
612                         uint32_t secure_len = 0;
613                         uint32_t vec_idx = 0;
614
615                         res_base_idx = vq->last_used_idx_res;
616                         res_cur_idx = res_base_idx;
617
618                         do {
619                                 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
620                                 if (unlikely(res_cur_idx == avail_idx))
621                                         goto merge_rx_exit;
622
623                                 update_secure_len(vq, res_cur_idx,
624                                                   &secure_len, &vec_idx);
625                                 res_cur_idx++;
626                         } while (pkt_len > secure_len);
627
628                         /* vq->last_used_idx_res is atomically updated. */
629                         success = rte_atomic16_cmpset(&vq->last_used_idx_res,
630                                                         res_base_idx,
631                                                         res_cur_idx);
632                 } while (success == 0);
633
634                 entry_success = copy_from_mbuf_to_vring(dev, queue_id,
635                         res_base_idx, res_cur_idx, pkts[pkt_idx]);
636
637                 rte_compiler_barrier();
638
639                 /*
640                  * Wait until it's our turn to add our buffer
641                  * to the used ring.
642                  */
643                 while (unlikely(vq->last_used_idx != res_base_idx))
644                         rte_pause();
645
646                 *(volatile uint16_t *)&vq->used->idx += entry_success;
647                 vq->last_used_idx = res_cur_idx;
648         }
649
650 merge_rx_exit:
651         if (likely(pkt_idx)) {
652                 /* flush used->idx update before we read avail->flags. */
653                 rte_mb();
654
655                 /* Kick the guest if necessary. */
656                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
657                         eventfd_write(vq->callfd, (eventfd_t)1);
658         }
659
660         return pkt_idx;
661 }
662
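/*
 * Enqueue a burst of host mbufs into the guest RX virtqueue, taking the
 * mergeable path when VIRTIO_NET_F_MRG_RXBUF has been negotiated.
 */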
663 uint16_t
664 rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
665         struct rte_mbuf **pkts, uint16_t count)
666 {
667         if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
668                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
669         else
670                 return virtio_dev_rx(dev, queue_id, pkts, count);
671 }
672
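/*
 * Parse the Ethernet (and optional VLAN) and IP headers of an mbuf to
 * locate the L4 protocol and header, filling in l2_len, l3_len and the
 * PKT_TX_IPV4/PKT_TX_IPV6 flags.
 */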
673 static void
674 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
675 {
676         struct ipv4_hdr *ipv4_hdr;
677         struct ipv6_hdr *ipv6_hdr;
678         void *l3_hdr = NULL;
679         struct ether_hdr *eth_hdr;
680         uint16_t ethertype;
681
682         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
683
684         m->l2_len = sizeof(struct ether_hdr);
685         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
686
687         if (ethertype == ETHER_TYPE_VLAN) {
688                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
689
690                 m->l2_len += sizeof(struct vlan_hdr);
691                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
692         }
693
694         l3_hdr = (char *)eth_hdr + m->l2_len;
695
696         switch (ethertype) {
697         case ETHER_TYPE_IPv4:
698                 ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
699                 *l4_proto = ipv4_hdr->next_proto_id;
700                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
701                 *l4_hdr = (char *)l3_hdr + m->l3_len;
702                 m->ol_flags |= PKT_TX_IPV4;
703                 break;
704         case ETHER_TYPE_IPv6:
705                 ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
706                 *l4_proto = ipv6_hdr->proto;
707                 m->l3_len = sizeof(struct ipv6_hdr);
708                 *l4_hdr = (char *)l3_hdr + m->l3_len;
709                 m->ol_flags |= PKT_TX_IPV6;
710                 break;
711         default:
712                 m->l3_len = 0;
713                 *l4_proto = 0;
714                 break;
715         }
716 }
717
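/*
 * Convert the offload requests carried in the guest-provided
 * virtio_net_hdr (checksum, GSO) into mbuf ol_flags and offsets for
 * the dequeued packet.
 */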
718 static inline void __attribute__((always_inline))
719 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
720 {
721         uint16_t l4_proto = 0;
722         void *l4_hdr = NULL;
723         struct tcp_hdr *tcp_hdr = NULL;
724
725         parse_ethernet(m, &l4_proto, &l4_hdr);
726         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
727                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
728                         switch (hdr->csum_offset) {
729                         case (offsetof(struct tcp_hdr, cksum)):
730                                 if (l4_proto == IPPROTO_TCP)
731                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
732                                 break;
733                         case (offsetof(struct udp_hdr, dgram_cksum)):
734                                 if (l4_proto == IPPROTO_UDP)
735                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
736                                 break;
737                         case (offsetof(struct sctp_hdr, cksum)):
738                                 if (l4_proto == IPPROTO_SCTP)
739                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
740                                 break;
741                         default:
742                                 break;
743                         }
744                 }
745         }
746
747         if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
748                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
749                 case VIRTIO_NET_HDR_GSO_TCPV4:
750                 case VIRTIO_NET_HDR_GSO_TCPV6:
751                         tcp_hdr = (struct tcp_hdr *)l4_hdr;
752                         m->ol_flags |= PKT_TX_TCP_SEG;
753                         m->tso_segsz = hdr->gso_size;
754                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
755                         break;
756                 default:
757                         RTE_LOG(WARNING, VHOST_DATA,
758                                 "unsupported gso type %u.\n", hdr->gso_type);
759                         break;
760                 }
761         }
762 }
763
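/*
 * Dequeue packets the guest has placed on its TX virtqueue, copying
 * them into mbufs allocated from mbuf_pool.
 */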
764 uint16_t
765 rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
766         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
767 {
768         struct rte_mbuf *m, *prev;
769         struct vhost_virtqueue *vq;
770         struct vring_desc *desc;
771         uint64_t vb_addr = 0;
772         uint64_t vb_net_hdr_addr = 0;
773         uint32_t head[MAX_PKT_BURST];
774         uint32_t used_idx;
775         uint32_t i;
776         uint16_t free_entries, entry_success = 0;
777         uint16_t avail_idx;
778         struct virtio_net_hdr *hdr = NULL;
779
780         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
781                 RTE_LOG(ERR, VHOST_DATA,
782                         "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
783                         __func__, dev->device_fh, queue_id);
784                 return 0;
785         }
786
787         vq = dev->virtqueue[queue_id];
788         if (unlikely(vq->enabled == 0))
789                 return 0;
790
791         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
792
793         /* If there are no available buffers then return. */
794         if (vq->last_used_idx == avail_idx)
795                 return 0;
796
797         LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
798                 dev->device_fh);
799
800         /* Prefetch available ring to retrieve head indexes. */
801         rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
802
803         /* Get the number of free entries in the ring. */
804         free_entries = (avail_idx - vq->last_used_idx);
805
806         free_entries = RTE_MIN(free_entries, count);
807         /* Limit to MAX_PKT_BURST. */
808         free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
809
810         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
811                         dev->device_fh, free_entries);
812         /* Retrieve all of the head indexes first to avoid caching issues. */
813         for (i = 0; i < free_entries; i++)
814                 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
815
816         /* Prefetch descriptor index. */
817         rte_prefetch0(&vq->desc[head[entry_success]]);
818         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
819
820         while (entry_success < free_entries) {
821                 uint32_t vb_avail, vb_offset;
822                 uint32_t seg_avail, seg_offset;
823                 uint32_t cpy_len;
824                 uint32_t seg_num = 0;
825                 struct rte_mbuf *cur;
826                 uint8_t alloc_err = 0;
827
828                 desc = &vq->desc[head[entry_success]];
829
830                 vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
831                 hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
832
833                 /* Discard first buffer as it is the virtio header */
834                 if (desc->flags & VRING_DESC_F_NEXT) {
835                         desc = &vq->desc[desc->next];
836                         vb_offset = 0;
837                         vb_avail = desc->len;
838                 } else {
839                         vb_offset = vq->vhost_hlen;
840                         vb_avail = desc->len - vb_offset;
841                 }
842
843                 /* Buffer address translation. */
844                 vb_addr = gpa_to_vva(dev, desc->addr);
845                 /* Prefetch buffer address. */
846                 rte_prefetch0((void *)(uintptr_t)vb_addr);
847
848                 used_idx = vq->last_used_idx & (vq->size - 1);
849
850                 if (entry_success < (free_entries - 1)) {
851                         /* Prefetch descriptor index. */
852                         rte_prefetch0(&vq->desc[head[entry_success+1]]);
853                         rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
854                 }
855
856                 /* Update used index buffer information. */
857                 vq->used->ring[used_idx].id = head[entry_success];
858                 vq->used->ring[used_idx].len = 0;
859                 vhost_log_used_vring(dev, vq,
860                                 offsetof(struct vring_used, ring[used_idx]),
861                                 sizeof(vq->used->ring[used_idx]));
862
863                 /* Allocate an mbuf and populate the structure. */
864                 m = rte_pktmbuf_alloc(mbuf_pool);
865                 if (unlikely(m == NULL)) {
866                         RTE_LOG(ERR, VHOST_DATA,
867                                 "Failed to allocate memory for mbuf.\n");
868                         break;
869                 }
870                 seg_offset = 0;
871                 seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
872                 cpy_len = RTE_MIN(vb_avail, seg_avail);
873
874                 PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
875
876                 seg_num++;
877                 cur = m;
878                 prev = m;
879                 while (cpy_len != 0) {
880                         rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
881                                 (void *)((uintptr_t)(vb_addr + vb_offset)),
882                                 cpy_len);
883
884                         seg_offset += cpy_len;
885                         vb_offset += cpy_len;
886                         vb_avail -= cpy_len;
887                         seg_avail -= cpy_len;
888
889                         if (vb_avail != 0) {
890                                 /*
891                                  * The mbuf segment has reached its end,
892                                  * while the virtio buffer in the TX vring
893                                  * still has more data to be copied.
894                                  */
895                                 cur->data_len = seg_offset;
896                                 m->pkt_len += seg_offset;
897                                 /* Allocate mbuf and populate the structure. */
898                                 cur = rte_pktmbuf_alloc(mbuf_pool);
899                                 if (unlikely(cur == NULL)) {
900                                         RTE_LOG(ERR, VHOST_DATA, "Failed to "
901                                                 "allocate memory for mbuf.\n");
902                                         rte_pktmbuf_free(m);
903                                         alloc_err = 1;
904                                         break;
905                                 }
906
907                                 seg_num++;
908                                 prev->next = cur;
909                                 prev = cur;
910                                 seg_offset = 0;
911                                 seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
912                         } else {
913                                 if (desc->flags & VRING_DESC_F_NEXT) {
914                                         /*
915                                          * There are more virtio buffers in the
916                                          * same vring entry that need to be copied.
917                                          */
918                                         if (seg_avail == 0) {
919                                                 /*
920                                                  * The current segment has no
921                                                  * room left to accommodate
922                                                  * more data.
923                                                  */
924                                                 cur->data_len = seg_offset;
925                                                 m->pkt_len += seg_offset;
926                                                 /*
927                                                  * Allocate an mbuf and
928                                                  * populate the structure.
929                                                  */
930                                                 cur = rte_pktmbuf_alloc(mbuf_pool);
931                                                 if (unlikely(cur == NULL)) {
932                                                         RTE_LOG(ERR,
933                                                                 VHOST_DATA,
934                                                                 "Failed to "
935                                                                 "allocate memory "
936                                                                 "for mbuf\n");
937                                                         rte_pktmbuf_free(m);
938                                                         alloc_err = 1;
939                                                         break;
940                                                 }
941                                                 seg_num++;
942                                                 prev->next = cur;
943                                                 prev = cur;
944                                                 seg_offset = 0;
945                                                 seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
946                                         }
947
948                                         desc = &vq->desc[desc->next];
949
950                                         /* Buffer address translation. */
951                                         vb_addr = gpa_to_vva(dev, desc->addr);
952                                         /* Prefetch buffer address. */
953                                         rte_prefetch0((void *)(uintptr_t)vb_addr);
954                                         vb_offset = 0;
955                                         vb_avail = desc->len;
956
957                                         PRINT_PACKET(dev, (uintptr_t)vb_addr,
958                                                 desc->len, 0);
959                                 } else {
960                                         /* The whole packet completes. */
961                                         cur->data_len = seg_offset;
962                                         m->pkt_len += seg_offset;
963                                         vb_avail = 0;
964                                 }
965                         }
966
967                         cpy_len = RTE_MIN(vb_avail, seg_avail);
968                 }
969
970                 if (unlikely(alloc_err == 1))
971                         break;
972
973                 m->nb_segs = seg_num;
974                 if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
975                         vhost_dequeue_offload(hdr, m);
976
977                 pkts[entry_success] = m;
978                 vq->last_used_idx++;
979                 entry_success++;
980         }
981
982         rte_compiler_barrier();
983         vq->used->idx += entry_success;
984         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
985                         sizeof(vq->used->idx));
986         /* Kick guest if required. */
987         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
988                 eventfd_write(vq->callfd, (eventfd_t)1);
989         return entry_success;
990 }