vhost: log used vring changes
[dpdk.git] / lib / librte_vhost / vhost_rxtx.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdbool.h>
36 #include <linux/virtio_net.h>
37
38 #include <rte_mbuf.h>
39 #include <rte_memcpy.h>
40 #include <rte_ether.h>
41 #include <rte_ip.h>
42 #include <rte_virtio_net.h>
43 #include <rte_tcp.h>
44 #include <rte_udp.h>
45 #include <rte_sctp.h>
46
47 #include "vhost-net.h"
48
49 #define MAX_PKT_BURST 32
50 #define VHOST_LOG_PAGE  4096
51
52 static inline void __attribute__((always_inline))
53 vhost_log_page(uint8_t *log_base, uint64_t page)
54 {
55         log_base[page / 8] |= 1 << (page % 8);
56 }
57
58 static inline void __attribute__((always_inline))
59 vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
60 {
61         uint64_t page;
62
63         if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
64                    !dev->log_base || !len))
65                 return;
66
67         if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
68                 return;
69
70         /* To make sure guest memory updates are committed before logging */
71         rte_smp_wmb();
72
73         page = addr / VHOST_LOG_PAGE;
74         while (page * VHOST_LOG_PAGE < addr + len) {
75                 vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
76                 page += 1;
77         }
78 }
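/*
 * Worked example of the logging math above: the dirty log is a bitmap in
 * which bit (page % 8) of byte (page / 8) covers one VHOST_LOG_PAGE (4KB)
 * page of guest physical memory.  A write with addr = 0x5000 and
 * len = 0x2000 touches pages 5 and 6, so bits 5 and 6 of log_base[0] are
 * set and live migration knows to re-send those guest pages.
 */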
79
80 static inline void __attribute__((always_inline))
81 vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
82                      uint64_t offset, uint64_t len)
83 {
84         vhost_log_write(dev, vq->log_guest_addr + offset, len);
85 }
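/*
 * The offset passed to vhost_log_used_vring() is relative to
 * vq->log_guest_addr, the guest physical address of the used ring supplied
 * with VHOST_SET_VRING_ADDR, so updates to used->ring[] and used->idx are
 * marked dirty on the correct guest pages.
 */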
86
87 static bool
88 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
89 {
90         return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
91 }
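/*
 * Virtqueues are laid out in {RXQ, TXQ} pairs (VIRTIO_QNUM == 2), so RX
 * queue indices are even and TX queue indices are odd; the check above
 * enforces both the parity and the upper bound qp_nb * VIRTIO_QNUM.
 */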
92
93 static void
94 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
95 {
96         memset(net_hdr, 0, sizeof(struct virtio_net_hdr));
97
98         if (m_buf->ol_flags & PKT_TX_L4_MASK) {
99                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
100                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
101
102                 switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
103                 case PKT_TX_TCP_CKSUM:
104                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
105                                                 cksum));
106                         break;
107                 case PKT_TX_UDP_CKSUM:
108                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
109                                                 dgram_cksum));
110                         break;
111                 case PKT_TX_SCTP_CKSUM:
112                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
113                                                 cksum));
114                         break;
115                 }
116         }
117
118         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
119                 if (m_buf->ol_flags & PKT_TX_IPV4)
120                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
121                 else
122                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
123                 net_hdr->gso_size = m_buf->tso_segsz;
124                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
125                                         + m_buf->l4_len;
126         }
127
128         return;
129 }
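/*
 * Usage sketch (hypothetical caller code, not part of this file): for the
 * header built above to carry TCP checksum offload information, the caller
 * of rte_vhost_enqueue_burst() would prepare the mbuf roughly as follows:
 *
 *     m->l2_len    = sizeof(struct ether_hdr);
 *     m->l3_len    = sizeof(struct ipv4_hdr);
 *     m->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;
 *
 * virtio_enqueue_offload() then sets csum_start to l2_len + l3_len and
 * csum_offset to offsetof(struct tcp_hdr, cksum).
 */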
130
131 /**
132  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
133  * be received from the physical port or from another virtio device. A packet
134  * count is returned to indicate the number of packets that are successfully
135  * added to the RX queue. This function works when the mbuf is scattered, but
136  * it doesn't support the mergeable feature.
137  */
138 static inline uint32_t __attribute__((always_inline))
139 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
140         struct rte_mbuf **pkts, uint32_t count)
141 {
142         struct vhost_virtqueue *vq;
143         struct vring_desc *desc;
144         struct rte_mbuf *buff, *first_buff;
145         /* The virtio_hdr is initialised to 0. */
146         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
147         uint64_t buff_addr = 0;
148         uint64_t buff_hdr_addr = 0;
149         uint32_t head[MAX_PKT_BURST];
150         uint32_t head_idx, packet_success = 0;
151         uint16_t avail_idx, res_cur_idx;
152         uint16_t res_base_idx, res_end_idx;
153         uint16_t free_entries;
154         uint8_t success = 0;
155
156         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
157         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
158                 RTE_LOG(ERR, VHOST_DATA,
159                         "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
160                         __func__, dev->device_fh, queue_id);
161                 return 0;
162         }
163
164         vq = dev->virtqueue[queue_id];
165         if (unlikely(vq->enabled == 0))
166                 return 0;
167
168         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
169
170         /*
171          * As many data cores may want access to available buffers,
172          * they need to be reserved.
173          */
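        /*
         * The reservation itself is the rte_atomic16_cmpset() on
         * vq->last_used_idx_res below: each core claims the index range
         * [res_base_idx, res_end_idx) so that concurrent enqueuers to the
         * same virtqueue never fill the same used-ring slots.
         */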
174         do {
175                 res_base_idx = vq->last_used_idx_res;
176                 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
177
178                 free_entries = (avail_idx - res_base_idx);
179                 /* Check that we have enough buffers. */
180                 if (unlikely(count > free_entries))
181                         count = free_entries;
182
183                 if (count == 0)
184                         return 0;
185
186                 res_end_idx = res_base_idx + count;
187                 /* vq->last_used_idx_res is atomically updated. */
188                 /* TODO: Allow disabling cmpset if the application has no concurrency. */
189                 success = rte_atomic16_cmpset(&vq->last_used_idx_res,
190                                 res_base_idx, res_end_idx);
191         } while (unlikely(success == 0));
192         res_cur_idx = res_base_idx;
193         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
194                         dev->device_fh, res_cur_idx, res_end_idx);
195
196         /* Prefetch available ring to retrieve indexes. */
197         rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
198
199         /* Retrieve all of the head indexes first to avoid caching issues. */
200         for (head_idx = 0; head_idx < count; head_idx++)
201                 head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
202                                         (vq->size - 1)];
203
204         /* Prefetch descriptor index. */
205         rte_prefetch0(&vq->desc[head[packet_success]]);
206
207         while (res_cur_idx != res_end_idx) {
208                 uint32_t offset = 0, vb_offset = 0;
209                 uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
210                 uint8_t hdr = 0, uncompleted_pkt = 0;
211                 uint16_t idx;
212
213                 /* Get descriptor from available ring */
214                 desc = &vq->desc[head[packet_success]];
215
216                 buff = pkts[packet_success];
217                 first_buff = buff;
218
219                 /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
220                 buff_addr = gpa_to_vva(dev, desc->addr);
221                 /* Prefetch buffer address. */
222                 rte_prefetch0((void *)(uintptr_t)buff_addr);
223
224                 /* Copy virtio_hdr to packet and increment buffer address */
225                 buff_hdr_addr = buff_addr;
226
227                 /*
228                  * If the descriptors are chained, the header and data are
229                  * placed in separate buffers.
230                  */
231                 if ((desc->flags & VRING_DESC_F_NEXT) &&
232                         (desc->len == vq->vhost_hlen)) {
233                         desc = &vq->desc[desc->next];
234                         /* Buffer address translation. */
235                         buff_addr = gpa_to_vva(dev, desc->addr);
236                 } else {
237                         vb_offset += vq->vhost_hlen;
238                         hdr = 1;
239                 }
240
241                 pkt_len = rte_pktmbuf_pkt_len(buff);
242                 data_len = rte_pktmbuf_data_len(buff);
243                 len_to_cpy = RTE_MIN(data_len,
244                         hdr ? desc->len - vq->vhost_hlen : desc->len);
245                 while (total_copied < pkt_len) {
246                         /* Copy mbuf data to buffer */
247                         rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
248                                 rte_pktmbuf_mtod_offset(buff, const void *, offset),
249                                 len_to_cpy);
250                         PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
251                                 len_to_cpy, 0);
252
253                         offset += len_to_cpy;
254                         vb_offset += len_to_cpy;
255                         total_copied += len_to_cpy;
256
257                         /* The whole packet completes */
258                         if (total_copied == pkt_len)
259                                 break;
260
261                         /* The current segment completes */
262                         if (offset == data_len) {
263                                 buff = buff->next;
264                                 offset = 0;
265                                 data_len = rte_pktmbuf_data_len(buff);
266                         }
267
268                         /* The current vring descriptor is done */
269                         if (vb_offset == desc->len) {
270                                 if (desc->flags & VRING_DESC_F_NEXT) {
271                                         desc = &vq->desc[desc->next];
272                                         buff_addr = gpa_to_vva(dev, desc->addr);
273                                         vb_offset = 0;
274                                 } else {
275                                         /* Not enough room left in the vring buffer */
276                                         uncompleted_pkt = 1;
277                                         break;
278                                 }
279                         }
280                         len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
281                 }
282
283                 /* Update used ring with desc information */
284                 idx = res_cur_idx & (vq->size - 1);
285                 vq->used->ring[idx].id = head[packet_success];
286
287                 /* Drop the packet if it is incomplete */
288                 if (unlikely(uncompleted_pkt == 1))
289                         vq->used->ring[idx].len = vq->vhost_hlen;
290                 else
291                         vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;
292
293                 vhost_log_used_vring(dev, vq,
294                         offsetof(struct vring_used, ring[idx]),
295                         sizeof(vq->used->ring[idx]));
296
297                 res_cur_idx++;
298                 packet_success++;
299
300                 if (unlikely(uncompleted_pkt == 1))
301                         continue;
302
303                 virtio_enqueue_offload(first_buff, &virtio_hdr.hdr);
304
305                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
306                         (const void *)&virtio_hdr, vq->vhost_hlen);
307
308                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
309
310                 if (res_cur_idx < res_end_idx) {
311                         /* Prefetch descriptor index. */
312                         rte_prefetch0(&vq->desc[head[packet_success]]);
313                 }
314         }
315
316         rte_compiler_barrier();
317
318         /* Wait until it's our turn to add our buffer to the used ring. */
319         while (unlikely(vq->last_used_idx != res_base_idx))
320                 rte_pause();
321
322         *(volatile uint16_t *)&vq->used->idx += count;
323         vq->last_used_idx = res_end_idx;
324         vhost_log_used_vring(dev, vq,
325                 offsetof(struct vring_used, idx),
326                 sizeof(vq->used->idx));
327
328         /* flush used->idx update before we read avail->flags. */
329         rte_mb();
330
331         /* Kick the guest if necessary. */
332         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
333                 eventfd_write(vq->callfd, (eventfd_t)1);
334         return count;
335 }
336
337 static inline uint32_t __attribute__((always_inline))
338 copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
339                         uint16_t res_base_idx, uint16_t res_end_idx,
340                         struct rte_mbuf *pkt)
341 {
342         uint32_t vec_idx = 0;
343         uint32_t entry_success = 0;
344         struct vhost_virtqueue *vq;
345         /* The virtio_hdr is initialised to 0. */
346         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
347                 {0, 0, 0, 0, 0, 0}, 0};
348         uint16_t cur_idx = res_base_idx;
349         uint64_t vb_addr = 0;
350         uint64_t vb_hdr_addr = 0;
351         uint32_t seg_offset = 0;
352         uint32_t vb_offset = 0;
353         uint32_t seg_avail;
354         uint32_t vb_avail;
355         uint32_t cpy_len, entry_len;
356         uint16_t idx;
357
358         if (pkt == NULL)
359                 return 0;
360
361         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
362                 "End Index %d\n",
363                 dev->device_fh, cur_idx, res_end_idx);
364
365         /*
366          * Convert from gpa to vva
367          * (guest physical addr -> vhost virtual addr)
368          */
369         vq = dev->virtqueue[queue_id];
370
371         vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
372         vb_hdr_addr = vb_addr;
373
374         /* Prefetch buffer address. */
375         rte_prefetch0((void *)(uintptr_t)vb_addr);
376
377         virtio_hdr.num_buffers = res_end_idx - res_base_idx;
378
379         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
380                 dev->device_fh, virtio_hdr.num_buffers);
381
382         virtio_enqueue_offload(pkt, &virtio_hdr.hdr);
383
384         rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
385                 (const void *)&virtio_hdr, vq->vhost_hlen);
386
387         PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
388
389         seg_avail = rte_pktmbuf_data_len(pkt);
390         vb_offset = vq->vhost_hlen;
391         vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
392
393         entry_len = vq->vhost_hlen;
394
395         if (vb_avail == 0) {
396                 uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;
397
398                 if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
399                         idx = cur_idx & (vq->size - 1);
400
401                         /* Update used ring with desc information */
402                         vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
403                         vq->used->ring[idx].len = entry_len;
404
405                         vhost_log_used_vring(dev, vq,
406                                         offsetof(struct vring_used, ring[idx]),
407                                         sizeof(vq->used->ring[idx]));
408
409                         entry_len = 0;
410                         cur_idx++;
411                         entry_success++;
412                 }
413
414                 vec_idx++;
415                 vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
416
417                 /* Prefetch buffer address. */
418                 rte_prefetch0((void *)(uintptr_t)vb_addr);
419                 vb_offset = 0;
420                 vb_avail = vq->buf_vec[vec_idx].buf_len;
421         }
422
423         cpy_len = RTE_MIN(vb_avail, seg_avail);
424
425         while (cpy_len > 0) {
426                 /* Copy mbuf data to vring buffer */
427                 rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
428                         rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
429                         cpy_len);
430
431                 PRINT_PACKET(dev,
432                         (uintptr_t)(vb_addr + vb_offset),
433                         cpy_len, 0);
434
435                 seg_offset += cpy_len;
436                 vb_offset += cpy_len;
437                 seg_avail -= cpy_len;
438                 vb_avail -= cpy_len;
439                 entry_len += cpy_len;
440
441                 if (seg_avail != 0) {
442                         /*
443                          * The virtio buffer in this vring entry has
444                          * reached its end, but the mbuf segment has
445                          * not been fully copied yet.
446                          */
447                         if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
448                                 VRING_DESC_F_NEXT) == 0) {
449                                 /* Update used ring with desc information */
450                                 idx = cur_idx & (vq->size - 1);
451                                 vq->used->ring[idx].id
452                                         = vq->buf_vec[vec_idx].desc_idx;
453                                 vq->used->ring[idx].len = entry_len;
454                                 vhost_log_used_vring(dev, vq,
455                                         offsetof(struct vring_used, ring[idx]),
456                                         sizeof(vq->used->ring[idx]));
457                                 entry_len = 0;
458                                 cur_idx++;
459                                 entry_success++;
460                         }
461
462                         vec_idx++;
463                         vb_addr = gpa_to_vva(dev,
464                                 vq->buf_vec[vec_idx].buf_addr);
465                         vb_offset = 0;
466                         vb_avail = vq->buf_vec[vec_idx].buf_len;
467                         cpy_len = RTE_MIN(vb_avail, seg_avail);
468                 } else {
469                         /*
470                          * The current segment is complete; check whether
471                          * the whole packet is complete as well.
472                          */
473                         pkt = pkt->next;
474                         if (pkt != NULL) {
475                                 /*
476                                  * There are more segments.
477                                  */
478                                 if (vb_avail == 0) {
479                                         /*
480                                          * The current vring buffer is used
481                                          * up; fetch the next buffer from
482                                          * buf_vec.
483                                          */
484                                         uint32_t desc_idx =
485                                                 vq->buf_vec[vec_idx].desc_idx;
486
487                                         if ((vq->desc[desc_idx].flags &
488                                                 VRING_DESC_F_NEXT) == 0) {
489                                                 idx = cur_idx & (vq->size - 1);
490                                                 /*
491                                                  * Update used ring with the
492                                                  * descriptor information
493                                                  */
494                                                 vq->used->ring[idx].id
495                                                         = desc_idx;
496                                                 vq->used->ring[idx].len
497                                                         = entry_len;
498                                                 vhost_log_used_vring(dev, vq,
499                                                         offsetof(struct vring_used, ring[idx]),
500                                                         sizeof(vq->used->ring[idx]));
501                                                 entry_success++;
502                                                 entry_len = 0;
503                                                 cur_idx++;
504                                         }
505
506                                         /* Get next buffer from buf_vec. */
507                                         vec_idx++;
508                                         vb_addr = gpa_to_vva(dev,
509                                                 vq->buf_vec[vec_idx].buf_addr);
510                                         vb_avail =
511                                                 vq->buf_vec[vec_idx].buf_len;
512                                         vb_offset = 0;
513                                 }
514
515                                 seg_offset = 0;
516                                 seg_avail = rte_pktmbuf_data_len(pkt);
517                                 cpy_len = RTE_MIN(vb_avail, seg_avail);
518                         } else {
519                                 /*
520                                  * This whole packet completes.
521                                  * The whole packet is complete.
522                                 /* Update used ring with desc information */
523                                 idx = cur_idx & (vq->size - 1);
524                                 vq->used->ring[idx].id
525                                         = vq->buf_vec[vec_idx].desc_idx;
526                                 vq->used->ring[idx].len = entry_len;
527                                 vhost_log_used_vring(dev, vq,
528                                         offsetof(struct vring_used, ring[idx]),
529                                         sizeof(vq->used->ring[idx]));
530                                 entry_success++;
531                                 break;
532                         }
533                 }
534         }
535
536         return entry_success;
537 }
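/*
 * With VIRTIO_NET_F_MRG_RXBUF, the num_buffers field written above tells
 * the guest how many used-ring entries (descriptor chains) one packet
 * spans; the function returns that entry count so virtio_dev_merge_rx()
 * can add it to used->idx once it is this core's turn to publish.
 */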
538
539 static inline void __attribute__((always_inline))
540 update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
541         uint32_t *secure_len, uint32_t *vec_idx)
542 {
543         uint16_t wrapped_idx = id & (vq->size - 1);
544         uint32_t idx = vq->avail->ring[wrapped_idx];
545         uint8_t next_desc;
546         uint32_t len = *secure_len;
547         uint32_t vec_id = *vec_idx;
548
549         do {
550                 next_desc = 0;
551                 len += vq->desc[idx].len;
552                 vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
553                 vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
554                 vq->buf_vec[vec_id].desc_idx = idx;
555                 vec_id++;
556
557                 if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
558                         idx = vq->desc[idx].next;
559                         next_desc = 1;
560                 }
561         } while (next_desc);
562
563         *secure_len = len;
564         *vec_idx = vec_id;
565 }
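/*
 * update_secure_len() walks the descriptor chain that starts at avail-ring
 * entry 'id', adding each descriptor's capacity to *secure_len and
 * recording it in vq->buf_vec[].  For example (sizes illustrative only),
 * a chain of two 2048-byte descriptors grows *secure_len by 4096 and
 * appends two buf_vec entries; the caller keeps consuming avail entries
 * until secure_len covers the whole packet plus the virtio header.
 */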
566
567 /*
568  * This function works for mergeable RX.
569  */
570 static inline uint32_t __attribute__((always_inline))
571 virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
572         struct rte_mbuf **pkts, uint32_t count)
573 {
574         struct vhost_virtqueue *vq;
575         uint32_t pkt_idx = 0, entry_success = 0;
576         uint16_t avail_idx;
577         uint16_t res_base_idx, res_cur_idx;
578         uint8_t success = 0;
579
580         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
581                 dev->device_fh);
582         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
583                 RTE_LOG(ERR, VHOST_DATA,
584                         "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
585                         __func__, dev->device_fh, queue_id);
586                 return 0;
587         }
588
589         vq = dev->virtqueue[queue_id];
590         if (unlikely(vq->enabled == 0))
591                 return 0;
592
593         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
594
595         if (count == 0)
596                 return 0;
597
598         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
599                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
600
601                 do {
602                         /*
603                          * As many data cores may want access to available
604                          * buffers, they need to be reserved.
605                          */
606                         uint32_t secure_len = 0;
607                         uint32_t vec_idx = 0;
608
609                         res_base_idx = vq->last_used_idx_res;
610                         res_cur_idx = res_base_idx;
611
612                         do {
613                                 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
614                                 if (unlikely(res_cur_idx == avail_idx))
615                                         goto merge_rx_exit;
616
617                                 update_secure_len(vq, res_cur_idx,
618                                                   &secure_len, &vec_idx);
619                                 res_cur_idx++;
620                         } while (pkt_len > secure_len);
621
622                         /* vq->last_used_idx_res is atomically updated. */
623                         success = rte_atomic16_cmpset(&vq->last_used_idx_res,
624                                                         res_base_idx,
625                                                         res_cur_idx);
626                 } while (success == 0);
627
628                 entry_success = copy_from_mbuf_to_vring(dev, queue_id,
629                         res_base_idx, res_cur_idx, pkts[pkt_idx]);
630
631                 rte_compiler_barrier();
632
633                 /*
634                  * Wait until it's our turn to add our buffer
635                  * to the used ring.
636                  */
637                 while (unlikely(vq->last_used_idx != res_base_idx))
638                         rte_pause();
639
640                 *(volatile uint16_t *)&vq->used->idx += entry_success;
641                 vq->last_used_idx = res_cur_idx;
642         }
643
644 merge_rx_exit:
645         if (likely(pkt_idx)) {
646                 /* flush used->idx update before we read avail->flags. */
647                 rte_mb();
648
649                 /* Kick the guest if necessary. */
650                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
651                         eventfd_write(vq->callfd, (eventfd_t)1);
652         }
653
654         return pkt_idx;
655 }
656
657 uint16_t
658 rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
659         struct rte_mbuf **pkts, uint16_t count)
660 {
661         if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
662                 return virtio_dev_merge_rx(dev, queue_id, pkts, count);
663         else
664                 return virtio_dev_rx(dev, queue_id, pkts, count);
665 }
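/*
 * Usage sketch (hypothetical application code, not part of this library):
 * forward a burst received from a NIC port into the device's first RX
 * virtqueue.  'port_id' and 'dev' are assumed to be set up elsewhere.
 *
 *     struct rte_mbuf *pkts[MAX_PKT_BURST];
 *     uint16_t nb_rx, nb_enq;
 *
 *     nb_rx  = rte_eth_rx_burst(port_id, 0, pkts, MAX_PKT_BURST);
 *     nb_enq = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, nb_rx);
 *     while (nb_enq < nb_rx)
 *             rte_pktmbuf_free(pkts[nb_enq++]);
 */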
666
667 static void
668 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
669 {
670         struct ipv4_hdr *ipv4_hdr;
671         struct ipv6_hdr *ipv6_hdr;
672         void *l3_hdr = NULL;
673         struct ether_hdr *eth_hdr;
674         uint16_t ethertype;
675
676         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
677
678         m->l2_len = sizeof(struct ether_hdr);
679         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
680
681         if (ethertype == ETHER_TYPE_VLAN) {
682                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
683
684                 m->l2_len += sizeof(struct vlan_hdr);
685                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
686         }
687
688         l3_hdr = (char *)eth_hdr + m->l2_len;
689
690         switch (ethertype) {
691         case ETHER_TYPE_IPv4:
692                 ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
693                 *l4_proto = ipv4_hdr->next_proto_id;
694                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
695                 *l4_hdr = (char *)l3_hdr + m->l3_len;
696                 m->ol_flags |= PKT_TX_IPV4;
697                 break;
698         case ETHER_TYPE_IPv6:
699                 ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
700                 *l4_proto = ipv6_hdr->proto;
701                 m->l3_len = sizeof(struct ipv6_hdr);
702                 *l4_hdr = (char *)l3_hdr + m->l3_len;
703                 m->ol_flags |= PKT_TX_IPV6;
704                 break;
705         default:
706                 m->l3_len = 0;
707                 *l4_proto = 0;
708                 break;
709         }
710 }
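/*
 * parse_ethernet() leaves m->l2_len, m->l3_len and the returned
 * l4_proto/l4_hdr describing the frame: for a single-VLAN IPv4 frame, for
 * instance, l2_len becomes 14 + 4 = 18 bytes and l3_len is taken from the
 * IPv4 IHL field (IHL * 4).
 */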
711
712 static inline void __attribute__((always_inline))
713 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
714 {
715         uint16_t l4_proto = 0;
716         void *l4_hdr = NULL;
717         struct tcp_hdr *tcp_hdr = NULL;
718
719         parse_ethernet(m, &l4_proto, &l4_hdr);
720         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
721                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
722                         switch (hdr->csum_offset) {
723                         case (offsetof(struct tcp_hdr, cksum)):
724                                 if (l4_proto == IPPROTO_TCP)
725                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
726                                 break;
727                         case (offsetof(struct udp_hdr, dgram_cksum)):
728                                 if (l4_proto == IPPROTO_UDP)
729                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
730                                 break;
731                         case (offsetof(struct sctp_hdr, cksum)):
732                                 if (l4_proto == IPPROTO_SCTP)
733                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
734                                 break;
735                         default:
736                                 break;
737                         }
738                 }
739         }
740
741         if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
742                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
743                 case VIRTIO_NET_HDR_GSO_TCPV4:
744                 case VIRTIO_NET_HDR_GSO_TCPV6:
745                         tcp_hdr = (struct tcp_hdr *)l4_hdr;
746                         m->ol_flags |= PKT_TX_TCP_SEG;
747                         m->tso_segsz = hdr->gso_size;
748                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
749                         break;
750                 default:
751                         RTE_LOG(WARNING, VHOST_DATA,
752                                 "unsupported gso type %u.\n", hdr->gso_type);
753                         break;
754                 }
755         }
756 }
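/*
 * Example of the mapping above (values illustrative): for an untagged
 * IPv4/TCP packet the guest sets csum_start = 14 + 20 = 34 and
 * csum_offset = offsetof(struct tcp_hdr, cksum) = 16, which becomes
 * PKT_TX_TCP_CKSUM on the mbuf; a gso_type of VIRTIO_NET_HDR_GSO_TCPV4
 * additionally sets PKT_TX_TCP_SEG and copies gso_size into tso_segsz.
 */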
757
758 uint16_t
759 rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
760         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
761 {
762         struct rte_mbuf *m, *prev;
763         struct vhost_virtqueue *vq;
764         struct vring_desc *desc;
765         uint64_t vb_addr = 0;
766         uint64_t vb_net_hdr_addr = 0;
767         uint32_t head[MAX_PKT_BURST];
768         uint32_t used_idx;
769         uint32_t i;
770         uint16_t free_entries, entry_success = 0;
771         uint16_t avail_idx;
772         struct virtio_net_hdr *hdr = NULL;
773
774         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
775                 RTE_LOG(ERR, VHOST_DATA,
776                         "%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
777                         __func__, dev->device_fh, queue_id);
778                 return 0;
779         }
780
781         vq = dev->virtqueue[queue_id];
782         if (unlikely(vq->enabled == 0))
783                 return 0;
784
785         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
786
787         /* If there are no available buffers then return. */
788         if (vq->last_used_idx == avail_idx)
789                 return 0;
790
791         LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
792                 dev->device_fh);
793
794         /* Prefetch available ring to retrieve head indexes. */
795         rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
796
797         /* Get the number of free entries in the ring. */
798         free_entries = (avail_idx - vq->last_used_idx);
799
800         free_entries = RTE_MIN(free_entries, count);
801         /* Limit to MAX_PKT_BURST. */
802         free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
803
804         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
805                         dev->device_fh, free_entries);
806         /* Retrieve all of the head indexes first to avoid caching issues. */
807         for (i = 0; i < free_entries; i++)
808                 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
809
810         /* Prefetch descriptor index. */
811         rte_prefetch0(&vq->desc[head[entry_success]]);
812         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
813
814         while (entry_success < free_entries) {
815                 uint32_t vb_avail, vb_offset;
816                 uint32_t seg_avail, seg_offset;
817                 uint32_t cpy_len;
818                 uint32_t seg_num = 0;
819                 struct rte_mbuf *cur;
820                 uint8_t alloc_err = 0;
821
822                 desc = &vq->desc[head[entry_success]];
823
824                 vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
825                 hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);
826
827                 /* Discard the first buffer as it holds the virtio net header */
828                 if (desc->flags & VRING_DESC_F_NEXT) {
829                         desc = &vq->desc[desc->next];
830                         vb_offset = 0;
831                         vb_avail = desc->len;
832                 } else {
833                         vb_offset = vq->vhost_hlen;
834                         vb_avail = desc->len - vb_offset;
835                 }
836
837                 /* Buffer address translation. */
838                 vb_addr = gpa_to_vva(dev, desc->addr);
839                 /* Prefetch buffer address. */
840                 rte_prefetch0((void *)(uintptr_t)vb_addr);
841
842                 used_idx = vq->last_used_idx & (vq->size - 1);
843
844                 if (entry_success < (free_entries - 1)) {
845                         /* Prefetch descriptor index. */
846                         rte_prefetch0(&vq->desc[head[entry_success+1]]);
847                         rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
848                 }
849
850                 /* Update used index buffer information. */
851                 vq->used->ring[used_idx].id = head[entry_success];
852                 vq->used->ring[used_idx].len = 0;
853                 vhost_log_used_vring(dev, vq,
854                                 offsetof(struct vring_used, ring[used_idx]),
855                                 sizeof(vq->used->ring[used_idx]));
856
857                 /* Allocate an mbuf and populate the structure. */
858                 m = rte_pktmbuf_alloc(mbuf_pool);
859                 if (unlikely(m == NULL)) {
860                         RTE_LOG(ERR, VHOST_DATA,
861                                 "Failed to allocate memory for mbuf.\n");
862                         break;
863                 }
864                 seg_offset = 0;
865                 seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
866                 cpy_len = RTE_MIN(vb_avail, seg_avail);
867
868                 PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
869
870                 seg_num++;
871                 cur = m;
872                 prev = m;
873                 while (cpy_len != 0) {
874                         rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
875                                 (void *)((uintptr_t)(vb_addr + vb_offset)),
876                                 cpy_len);
877
878                         seg_offset += cpy_len;
879                         vb_offset += cpy_len;
880                         vb_avail -= cpy_len;
881                         seg_avail -= cpy_len;
882
883                         if (vb_avail != 0) {
884                                 /*
885                                  * The mbuf segment has reached its end,
886                                  * while the virtio buffer in the TX vring
887                                  * still has data to be copied.
888                                  */
889                                 cur->data_len = seg_offset;
890                                 m->pkt_len += seg_offset;
891                                 /* Allocate mbuf and populate the structure. */
892                                 cur = rte_pktmbuf_alloc(mbuf_pool);
893                                 if (unlikely(cur == NULL)) {
894                                         RTE_LOG(ERR, VHOST_DATA, "Failed to "
895                                                 "allocate memory for mbuf.\n");
896                                         rte_pktmbuf_free(m);
897                                         alloc_err = 1;
898                                         break;
899                                 }
900
901                                 seg_num++;
902                                 prev->next = cur;
903                                 prev = cur;
904                                 seg_offset = 0;
905                                 seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
906                         } else {
907                                 if (desc->flags & VRING_DESC_F_NEXT) {
908                                         /*
909                                          * There are more virtio buffers in the
910                                          * same vring entry that need to be copied.
911                                          */
912                                         if (seg_avail == 0) {
913                                                 /*
914                                                  * The current segment has no
915                                                  * room to accommodate more
916                                                  * data.
917                                                  */
918                                                 cur->data_len = seg_offset;
919                                                 m->pkt_len += seg_offset;
920                                                 /*
921                                                  * Allocate an mbuf and
922                                                  * populate the structure.
923                                                  */
924                                                 cur = rte_pktmbuf_alloc(mbuf_pool);
925                                                 if (unlikely(cur == NULL)) {
926                                                         RTE_LOG(ERR,
927                                                                 VHOST_DATA,
928                                                                 "Failed to "
929                                                                 "allocate memory "
930                                                                 "for mbuf\n");
931                                                         rte_pktmbuf_free(m);
932                                                         alloc_err = 1;
933                                                         break;
934                                                 }
935                                                 seg_num++;
936                                                 prev->next = cur;
937                                                 prev = cur;
938                                                 seg_offset = 0;
939                                                 seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
940                                         }
941
942                                         desc = &vq->desc[desc->next];
943
944                                         /* Buffer address translation. */
945                                         vb_addr = gpa_to_vva(dev, desc->addr);
946                                         /* Prefetch buffer address. */
947                                         rte_prefetch0((void *)(uintptr_t)vb_addr);
948                                         vb_offset = 0;
949                                         vb_avail = desc->len;
950
951                                         PRINT_PACKET(dev, (uintptr_t)vb_addr,
952                                                 desc->len, 0);
953                                 } else {
954                                         /* The whole packet completes. */
955                                         cur->data_len = seg_offset;
956                                         m->pkt_len += seg_offset;
957                                         vb_avail = 0;
958                                 }
959                         }
960
961                         cpy_len = RTE_MIN(vb_avail, seg_avail);
962                 }
963
964                 if (unlikely(alloc_err == 1))
965                         break;
966
967                 m->nb_segs = seg_num;
968                 if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
969                         vhost_dequeue_offload(hdr, m);
970
971                 pkts[entry_success] = m;
972                 vq->last_used_idx++;
973                 entry_success++;
974         }
975
976         rte_compiler_barrier();
977         vq->used->idx += entry_success;
978         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
979                         sizeof(vq->used->idx));
980         /* Kick guest if required. */
981         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
982                 eventfd_write(vq->callfd, (eventfd_t)1);
983         return entry_success;
984 }
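/*
 * Usage sketch (hypothetical application code, not part of this library):
 * drain packets the guest has transmitted on its first TX virtqueue and
 * send them out of a NIC port.  'port_id', 'dev' and 'mbuf_pool' are
 * assumed to be set up elsewhere.
 *
 *     struct rte_mbuf *pkts[MAX_PKT_BURST];
 *     uint16_t nb_deq, nb_tx;
 *
 *     nb_deq = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
 *                                      pkts, MAX_PKT_BURST);
 *     nb_tx  = rte_eth_tx_burst(port_id, 0, pkts, nb_deq);
 *     while (nb_tx < nb_deq)
 *             rte_pktmbuf_free(pkts[nb_tx++]);
 */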