drivers/net/virtio/virtio_rxtx.c (dpdk.git)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <errno.h>
39
40 #include <rte_cycles.h>
41 #include <rte_memory.h>
42 #include <rte_memzone.h>
43 #include <rte_branch_prediction.h>
44 #include <rte_mempool.h>
45 #include <rte_malloc.h>
46 #include <rte_mbuf.h>
47 #include <rte_ether.h>
48 #include <rte_ethdev.h>
49 #include <rte_prefetch.h>
50 #include <rte_string_fns.h>
51 #include <rte_errno.h>
52 #include <rte_byteorder.h>
53 #include <rte_net.h>
54 #include <rte_ip.h>
55 #include <rte_udp.h>
56 #include <rte_tcp.h>
57
58 #include "virtio_logs.h"
59 #include "virtio_ethdev.h"
60 #include "virtio_pci.h"
61 #include "virtqueue.h"
62 #include "virtio_rxtx.h"
63
64 #ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
65 #define VIRTIO_DUMP_PACKET(m, len) rte_pktmbuf_dump(stdout, m, len)
66 #else
67 #define VIRTIO_DUMP_PACKET(m, len) do { } while (0)
68 #endif
69
70
71 #define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
72         ETH_TXQ_FLAGS_NOOFFLOADS)
73
74 int
75 virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
76 {
77         struct virtnet_rx *rxvq = rxq;
78         struct virtqueue *vq = rxvq->vq;
79
80         return VIRTQUEUE_NUSED(vq) >= offset;
81 }
82
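/*
 * Return the descriptor chain starting at desc_idx to the virtqueue's
 * free list and credit the freed descriptors back to vq_free_cnt.
 */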
83 void
84 vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
85 {
86         struct vring_desc *dp, *dp_tail;
87         struct vq_desc_extra *dxp;
88         uint16_t desc_idx_last = desc_idx;
89
90         dp  = &vq->vq_ring.desc[desc_idx];
91         dxp = &vq->vq_descx[desc_idx];
92         vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
93         if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
94                 while (dp->flags & VRING_DESC_F_NEXT) {
95                         desc_idx_last = dp->next;
96                         dp = &vq->vq_ring.desc[dp->next];
97                 }
98         }
99         dxp->ndescs = 0;
100
101         /*
102          * We must append the existing free chain, if any, to the end of
103          * the newly freed chain. If the virtqueue was completely used,
104          * then the head would be VQ_RING_DESC_CHAIN_END.
105          */
106         if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
107                 vq->vq_desc_head_idx = desc_idx;
108         } else {
109                 dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
110                 dp_tail->next = desc_idx;
111         }
112
113         vq->vq_desc_tail_idx = desc_idx_last;
114         dp->next = VQ_RING_DESC_CHAIN_END;
115 }
116
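/*
 * Dequeue up to 'num' buffers from the used ring, storing the mbufs in
 * rx_pkts[] and their lengths in len[]. The caller guarantees that at
 * least 'num' used entries are available.
 */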
117 static uint16_t
118 virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts,
119                            uint32_t *len, uint16_t num)
120 {
121         struct vring_used_elem *uep;
122         struct rte_mbuf *cookie;
123         uint16_t used_idx, desc_idx;
124         uint16_t i;
125
126         /*  Caller does the check */
127         for (i = 0; i < num ; i++) {
128                 used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
129                 uep = &vq->vq_ring.used->ring[used_idx];
130                 desc_idx = (uint16_t) uep->id;
131                 len[i] = uep->len;
132                 cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;
133
134                 if (unlikely(cookie == NULL)) {
135                         PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
136                                 vq->vq_used_cons_idx);
137                         break;
138                 }
139
140                 rte_prefetch0(cookie);
141                 rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
142                 rx_pkts[i]  = cookie;
143                 vq->vq_used_cons_idx++;
144                 vq_ring_free_chain(vq, desc_idx);
145                 vq->vq_descx[desc_idx].cookie = NULL;
146         }
147
148         return i;
149 }
150
151 #ifndef DEFAULT_TX_FREE_THRESH
152 #define DEFAULT_TX_FREE_THRESH 32
153 #endif
154
155 /* Cleanup from completed transmits. */
156 static void
157 virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num)
158 {
159         uint16_t i, used_idx, desc_idx;
160         for (i = 0; i < num; i++) {
161                 struct vring_used_elem *uep;
162                 struct vq_desc_extra *dxp;
163
164                 used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
165                 uep = &vq->vq_ring.used->ring[used_idx];
166
167                 desc_idx = (uint16_t) uep->id;
168                 dxp = &vq->vq_descx[desc_idx];
169                 vq->vq_used_cons_idx++;
170                 vq_ring_free_chain(vq, desc_idx);
171
172                 if (dxp->cookie != NULL) {
173                         rte_pktmbuf_free(dxp->cookie);
174                         dxp->cookie = NULL;
175                 }
176         }
177 }
178
179
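/*
 * Post a single mbuf as a receive buffer: one device-writable descriptor
 * covering the virtio-net header area plus the mbuf data room.
 */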
180 static inline int
181 virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
182 {
183         struct vq_desc_extra *dxp;
184         struct virtio_hw *hw = vq->hw;
185         struct vring_desc *start_dp;
186         uint16_t needed = 1;
187         uint16_t head_idx, idx;
188
189         if (unlikely(vq->vq_free_cnt == 0))
190                 return -ENOSPC;
191         if (unlikely(vq->vq_free_cnt < needed))
192                 return -EMSGSIZE;
193
194         head_idx = vq->vq_desc_head_idx;
195         if (unlikely(head_idx >= vq->vq_nentries))
196                 return -EFAULT;
197
198         idx = head_idx;
199         dxp = &vq->vq_descx[idx];
200         dxp->cookie = (void *)cookie;
201         dxp->ndescs = needed;
202
203         start_dp = vq->vq_ring.desc;
204         start_dp[idx].addr =
205                 VIRTIO_MBUF_ADDR(cookie, vq) +
206                 RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
207         start_dp[idx].len =
208                 cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
209         start_dp[idx].flags =  VRING_DESC_F_WRITE;
210         idx = start_dp[idx].next;
211         vq->vq_desc_head_idx = idx;
212         if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
213                 vq->vq_desc_tail_idx = idx;
214         vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
215         vq_update_avail_ring(vq, head_idx);
216
217         return 0;
218 }
219
220 /* When doing TSO, the IP length is not included in the pseudo header
221  * checksum of the packet given to the PMD, but for virtio it is
222  * expected.
223  */
224 static void
225 virtio_tso_fix_cksum(struct rte_mbuf *m)
226 {
227         /* common case: header is not fragmented */
228         if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
229                         m->l4_len)) {
230                 struct ipv4_hdr *iph;
231                 struct ipv6_hdr *ip6h;
232                 struct tcp_hdr *th;
233                 uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
234                 uint32_t tmp;
235
236                 iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
237                 th = RTE_PTR_ADD(iph, m->l3_len);
238                 if ((iph->version_ihl >> 4) == 4) {
239                         iph->hdr_checksum = 0;
240                         iph->hdr_checksum = rte_ipv4_cksum(iph);
241                         ip_len = iph->total_length;
242                         ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
243                                 m->l3_len);
244                 } else {
245                         ip6h = (struct ipv6_hdr *)iph;
246                         ip_paylen = ip6h->payload_len;
247                 }
248
249                 /* calculate the new phdr checksum not including ip_paylen */
250                 prev_cksum = th->cksum;
251                 tmp = prev_cksum;
252                 tmp += ip_paylen;
253                 tmp = (tmp & 0xffff) + (tmp >> 16);
254                 new_cksum = tmp;
255
256                 /* replace it in the packet */
257                 th->cksum = new_cksum;
258         }
259 }
260
261 static inline int
262 tx_offload_enabled(struct virtio_hw *hw)
263 {
264         return vtpci_with_feature(hw, VIRTIO_NET_F_CSUM) ||
265                 vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO4) ||
266                 vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO6);
267 }
268
269 /* avoid write operation when possible, to lessen cache issues */
270 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
271         if ((var) != (val))                     \
272                 (var) = (val);                  \
273 } while (0)
274
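/*
 * Enqueue one packet for transmission. Depending on the negotiated
 * features, the virtio-net header is either prepended into the mbuf
 * headroom (can_push), placed in the reserved region and referenced via
 * an indirect descriptor list (use_indirect), or chained as a separate
 * descriptor.
 */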
275 static inline void
276 virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
277                        uint16_t needed, int use_indirect, int can_push)
278 {
279         struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr;
280         struct vq_desc_extra *dxp;
281         struct virtqueue *vq = txvq->vq;
282         struct vring_desc *start_dp;
283         uint16_t seg_num = cookie->nb_segs;
284         uint16_t head_idx, idx;
285         uint16_t head_size = vq->hw->vtnet_hdr_size;
286         struct virtio_net_hdr *hdr;
287         int offload;
288
289         offload = tx_offload_enabled(vq->hw);
290         head_idx = vq->vq_desc_head_idx;
291         idx = head_idx;
292         dxp = &vq->vq_descx[idx];
293         dxp->cookie = (void *)cookie;
294         dxp->ndescs = needed;
295
296         start_dp = vq->vq_ring.desc;
297
298         if (can_push) {
299                 /* prepend cannot fail, checked by caller */
300                 hdr = (struct virtio_net_hdr *)
301                         rte_pktmbuf_prepend(cookie, head_size);
302                 /* rte_pktmbuf_prepend() also adds the header size to pkt_len;
303                  * the subtraction below restores the correct packet size.
304                  */
305                 cookie->pkt_len -= head_size;
306                 /* if offload disabled, it is not zeroed below, do it now */
307                 if (offload == 0) {
308                         ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
309                         ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
310                         ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
311                         ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
312                         ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
313                         ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
314                 }
315         } else if (use_indirect) {
316                 /* setup tx ring slot to point to indirect
317                  * descriptor list stored in reserved region.
318                  *
319                  * the first slot in indirect ring is already preset
320                  * to point to the header in reserved region
321                  */
322                 start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
323                         RTE_PTR_DIFF(&txr[idx].tx_indir, txr);
324                 start_dp[idx].len   = (seg_num + 1) * sizeof(struct vring_desc);
325                 start_dp[idx].flags = VRING_DESC_F_INDIRECT;
326                 hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
327
328                 /* the loop below fills in the rest of the indirect elements */
329                 start_dp = txr[idx].tx_indir;
330                 idx = 1;
331         } else {
332                 /* setup first tx ring slot to point to header
333                  * stored in reserved region.
334                  */
335                 start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
336                         RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
337                 start_dp[idx].len   = vq->hw->vtnet_hdr_size;
338                 start_dp[idx].flags = VRING_DESC_F_NEXT;
339                 hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;
340
341                 idx = start_dp[idx].next;
342         }
343
344         /* Checksum Offload / TSO */
345         if (offload) {
346                 if (cookie->ol_flags & PKT_TX_TCP_SEG)
347                         cookie->ol_flags |= PKT_TX_TCP_CKSUM;
348
349                 switch (cookie->ol_flags & PKT_TX_L4_MASK) {
350                 case PKT_TX_UDP_CKSUM:
351                         hdr->csum_start = cookie->l2_len + cookie->l3_len;
352                         hdr->csum_offset = offsetof(struct udp_hdr,
353                                 dgram_cksum);
354                         hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
355                         break;
356
357                 case PKT_TX_TCP_CKSUM:
358                         hdr->csum_start = cookie->l2_len + cookie->l3_len;
359                         hdr->csum_offset = offsetof(struct tcp_hdr, cksum);
360                         hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
361                         break;
362
363                 default:
364                         ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
365                         ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
366                         ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
367                         break;
368                 }
369
370                 /* TCP Segmentation Offload */
371                 if (cookie->ol_flags & PKT_TX_TCP_SEG) {
372                         virtio_tso_fix_cksum(cookie);
373                         hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ?
374                                 VIRTIO_NET_HDR_GSO_TCPV6 :
375                                 VIRTIO_NET_HDR_GSO_TCPV4;
376                         hdr->gso_size = cookie->tso_segsz;
377                         hdr->hdr_len =
378                                 cookie->l2_len +
379                                 cookie->l3_len +
380                                 cookie->l4_len;
381                 } else {
382                         ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
383                         ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
384                         ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
385                 }
386         }
387
388         do {
389                 start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);
390                 start_dp[idx].len   = cookie->data_len;
391                 start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
392                 idx = start_dp[idx].next;
393         } while ((cookie = cookie->next) != NULL);
394
395         if (use_indirect)
396                 idx = vq->vq_ring.desc[head_idx].next;
397
398         vq->vq_desc_head_idx = idx;
399         if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
400                 vq->vq_desc_tail_idx = idx;
401         vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
402         vq_update_avail_ring(vq, head_idx);
403 }
404
405 void
406 virtio_dev_cq_start(struct rte_eth_dev *dev)
407 {
408         struct virtio_hw *hw = dev->data->dev_private;
409
410         if (hw->cvq && hw->cvq->vq) {
411                 VIRTQUEUE_DUMP((struct virtqueue *)hw->cvq->vq);
412         }
413 }
414
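/*
 * Rx queue setup: clamp the descriptor count to the virtqueue size,
 * record the mempool and export the queue to the ethdev layer.
 */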
415 int
416 virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
417                         uint16_t queue_idx,
418                         uint16_t nb_desc,
419                         unsigned int socket_id __rte_unused,
420                         __rte_unused const struct rte_eth_rxconf *rx_conf,
421                         struct rte_mempool *mp)
422 {
423         uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
424         struct virtio_hw *hw = dev->data->dev_private;
425         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
426         struct virtnet_rx *rxvq;
427
428         PMD_INIT_FUNC_TRACE();
429
430         if (nb_desc == 0 || nb_desc > vq->vq_nentries)
431                 nb_desc = vq->vq_nentries;
432         vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
433
434         rxvq = &vq->rxq;
435         rxvq->queue_id = queue_idx;
436         rxvq->mpool = mp;
437         if (rxvq->mpool == NULL) {
438                 rte_exit(EXIT_FAILURE,
439                         "Cannot allocate mbufs for rx virtqueue");
440         }
441         dev->data->rx_queues[queue_idx] = rxvq;
442
443         return 0;
444 }
445
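/*
 * Second stage of rx queue setup, run at device start: prime the
 * virtqueue with mbufs from the queue's mempool until it is full.
 */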
446 int
447 virtio_dev_rx_queue_setup_finish(struct rte_eth_dev *dev, uint16_t queue_idx)
448 {
449         uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
450         struct virtio_hw *hw = dev->data->dev_private;
451         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
452         struct virtnet_rx *rxvq = &vq->rxq;
453         struct rte_mbuf *m;
454         uint16_t desc_idx;
455         int error, nbufs;
456
457         PMD_INIT_FUNC_TRACE();
458
459         /* Allocate blank mbufs for each rx descriptor */
460         nbufs = 0;
461
462         if (hw->use_simple_rx) {
463                 for (desc_idx = 0; desc_idx < vq->vq_nentries;
464                      desc_idx++) {
465                         vq->vq_ring.avail->ring[desc_idx] = desc_idx;
466                         vq->vq_ring.desc[desc_idx].flags =
467                                 VRING_DESC_F_WRITE;
468                 }
469         }
470
471         memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
472         for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
473              desc_idx++) {
474                 vq->sw_ring[vq->vq_nentries + desc_idx] =
475                         &rxvq->fake_mbuf;
476         }
477
478         while (!virtqueue_full(vq)) {
479                 m = rte_mbuf_raw_alloc(rxvq->mpool);
480                 if (m == NULL)
481                         break;
482
483                 /* Enqueue allocated buffers */
484                 if (hw->use_simple_rx)
485                         error = virtqueue_enqueue_recv_refill_simple(vq, m);
486                 else
487                         error = virtqueue_enqueue_recv_refill(vq, m);
488
489                 if (error) {
490                         rte_pktmbuf_free(m);
491                         break;
492                 }
493                 nbufs++;
494         }
495
496         vq_update_avail_idx(vq);
497
498         PMD_INIT_LOG(DEBUG, "Allocated %d bufs", nbufs);
499
500         virtio_rxq_vec_setup(rxvq);
501
502         VIRTQUEUE_DUMP(vq);
503
504         return 0;
505 }
506
507 /*
508  * struct rte_eth_dev *dev: device whose tx queue is being configured
509  * uint16_t nb_desc: defaults to the ring size read from the config space
510  * unsigned int socket_id: used to allocate the memzone
511  * const struct rte_eth_txconf *tx_conf: used to set up the tx engine
512  * uint16_t queue_idx: index into the device's tx queue list
513  */
514 int
515 virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
516                         uint16_t queue_idx,
517                         uint16_t nb_desc,
518                         unsigned int socket_id __rte_unused,
519                         const struct rte_eth_txconf *tx_conf)
520 {
521         uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
522         struct virtio_hw *hw = dev->data->dev_private;
523         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
524         struct virtnet_tx *txvq;
525         uint16_t tx_free_thresh;
526
527         PMD_INIT_FUNC_TRACE();
528
529         /* cannot use simple rxtx funcs with multisegs or offloads */
530         if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) != VIRTIO_SIMPLE_FLAGS)
531                 hw->use_simple_tx = 0;
532
533         if (nb_desc == 0 || nb_desc > vq->vq_nentries)
534                 nb_desc = vq->vq_nentries;
535         vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
536
537         txvq = &vq->txq;
538         txvq->queue_id = queue_idx;
539
540         tx_free_thresh = tx_conf->tx_free_thresh;
541         if (tx_free_thresh == 0)
542                 tx_free_thresh =
543                         RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH);
544
545         if (tx_free_thresh >= (vq->vq_nentries - 3)) {
546                 RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
547                         "number of TX entries minus 3 (%u)."
548                         " (tx_free_thresh=%u port=%u queue=%u)\n",
549                         vq->vq_nentries - 3,
550                         tx_free_thresh, dev->data->port_id, queue_idx);
551                 return -EINVAL;
552         }
553
554         vq->vq_free_thresh = tx_free_thresh;
555
556         dev->data->tx_queues[queue_idx] = txvq;
557         return 0;
558 }
559
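/*
 * Second stage of tx queue setup, run at device start: for the simple
 * tx path, pre-chain the upper half of the ring as virtio-net header
 * descriptors pointing at the lower-half data descriptors.
 */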
560 int
561 virtio_dev_tx_queue_setup_finish(struct rte_eth_dev *dev,
562                                 uint16_t queue_idx)
563 {
564         uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
565         struct virtio_hw *hw = dev->data->dev_private;
566         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
567         uint16_t mid_idx = vq->vq_nentries >> 1;
568         struct virtnet_tx *txvq = &vq->txq;
569         uint16_t desc_idx;
570
571         PMD_INIT_FUNC_TRACE();
572
573         if (hw->use_simple_tx) {
574                 for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
575                         vq->vq_ring.avail->ring[desc_idx] =
576                                 desc_idx + mid_idx;
577                         vq->vq_ring.desc[desc_idx + mid_idx].next =
578                                 desc_idx;
579                         vq->vq_ring.desc[desc_idx + mid_idx].addr =
580                                 txvq->virtio_net_hdr_mem +
581                                 offsetof(struct virtio_tx_region, tx_hdr);
582                         vq->vq_ring.desc[desc_idx + mid_idx].len =
583                                 vq->hw->vtnet_hdr_size;
584                         vq->vq_ring.desc[desc_idx + mid_idx].flags =
585                                 VRING_DESC_F_NEXT;
586                         vq->vq_ring.desc[desc_idx].flags = 0;
587                 }
588                 for (desc_idx = mid_idx; desc_idx < vq->vq_nentries;
589                      desc_idx++)
590                         vq->vq_ring.avail->ring[desc_idx] = desc_idx;
591         }
592
593         VIRTQUEUE_DUMP(vq);
594
595         return 0;
596 }
597
598 static void
599 virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
600 {
601         int error;
602         /*
603          * Requeue the discarded mbuf. This should always be
604          * successful since it was just dequeued.
605          */
606         error = virtqueue_enqueue_recv_refill(vq, m);
607         if (unlikely(error)) {
608                 RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf\n");
609                 rte_pktmbuf_free(m);
610         }
611 }
612
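/* Update the size-bin, multicast and broadcast counters for one packet. */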
613 static void
614 virtio_update_packet_stats(struct virtnet_stats *stats, struct rte_mbuf *mbuf)
615 {
616         uint32_t s = mbuf->pkt_len;
617         struct ether_addr *ea;
618
619         if (s == 64) {
620                 stats->size_bins[1]++;
621         } else if (s > 64 && s < 1024) {
622                 uint32_t bin;
623
624                 /* count leading zeros to offset into the correct bin */
625                 bin = (sizeof(s) * 8) - __builtin_clz(s) - 5;
626                 stats->size_bins[bin]++;
627         } else {
628                 if (s < 64)
629                         stats->size_bins[0]++;
630                 else if (s < 1519)
631                         stats->size_bins[6]++;
632                 else if (s >= 1519)
633                         stats->size_bins[7]++;
634         }
635
636         ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
637         if (is_multicast_ether_addr(ea)) {
638                 if (is_broadcast_ether_addr(ea))
639                         stats->broadcast++;
640                 else
641                         stats->multicast++;
642         }
643 }
644
645 /* Optionally fill checksum and LRO offload information into the mbuf */
646 static int
647 virtio_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
648 {
649         struct rte_net_hdr_lens hdr_lens;
650         uint32_t hdrlen, ptype;
651         int l4_supported = 0;
652
653         /* nothing to do */
654         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
655                 return 0;
656
657         m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
658
659         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
660         m->packet_type = ptype;
661         if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
662             (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
663             (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
664                 l4_supported = 1;
665
666         if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
667                 hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
668                 if (hdr->csum_start <= hdrlen && l4_supported) {
669                         m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
670                 } else {
671                         /* Unknown proto or tunnel, do sw cksum. We can assume
672                          * the cksum field is in the first segment since the
673                          * buffers we provided to the host are large enough.
674                          * In case of SCTP, this will be wrong since it's a CRC
675                          * but there's nothing we can do.
676                          */
677                         uint16_t csum = 0, off;
678
679                         rte_raw_cksum_mbuf(m, hdr->csum_start,
680                                 rte_pktmbuf_pkt_len(m) - hdr->csum_start,
681                                 &csum);
682                         if (likely(csum != 0xffff))
683                                 csum = ~csum;
684                         off = hdr->csum_offset + hdr->csum_start;
685                         if (rte_pktmbuf_data_len(m) >= off + 1)
686                                 *rte_pktmbuf_mtod_offset(m, uint16_t *,
687                                         off) = csum;
688                 }
689         } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
690                 m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
691         }
692
693         /* GSO request, save required information in mbuf */
694         if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
695                 /* Check unsupported modes */
696                 if ((hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) ||
697                     (hdr->gso_size == 0)) {
698                         return -EINVAL;
699                 }
700
701                 /* Update mss length in mbuf */
702                 m->tso_segsz = hdr->gso_size;
703                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
704                 case VIRTIO_NET_HDR_GSO_TCPV4:
705                 case VIRTIO_NET_HDR_GSO_TCPV6:
706                         m->ol_flags |= PKT_RX_LRO |
707                                 PKT_RX_L4_CKSUM_NONE;
708                         break;
709                 default:
710                         return -EINVAL;
711                 }
712         }
713
714         return 0;
715 }
716
717 static inline int
718 rx_offload_enabled(struct virtio_hw *hw)
719 {
720         return vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM) ||
721                 vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) ||
722                 vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6);
723 }
724
725 #define VIRTIO_MBUF_BURST_SZ 64
726 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
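/*
 * Non-mergeable receive path: each packet fits in a single descriptor.
 * Used buffers are dequeued, offload flags are derived from the
 * virtio-net header, and the ring is refilled with fresh mbufs before
 * notifying the host if required.
 */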
727 uint16_t
728 virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
729 {
730         struct virtnet_rx *rxvq = rx_queue;
731         struct virtqueue *vq = rxvq->vq;
732         struct virtio_hw *hw = vq->hw;
733         struct rte_mbuf *rxm, *new_mbuf;
734         uint16_t nb_used, num, nb_rx;
735         uint32_t len[VIRTIO_MBUF_BURST_SZ];
736         struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
737         int error;
738         uint32_t i, nb_enqueued;
739         uint32_t hdr_size;
740         int offload;
741         struct virtio_net_hdr *hdr;
742
743         nb_rx = 0;
744         if (unlikely(hw->started == 0))
745                 return nb_rx;
746
747         nb_used = VIRTQUEUE_NUSED(vq);
748
749         virtio_rmb();
750
751         num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
752         if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
753                 num = VIRTIO_MBUF_BURST_SZ;
754         if (likely(num > DESC_PER_CACHELINE))
755                 num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);
756
757         num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num);
758         PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num);
759
760         nb_enqueued = 0;
761         hdr_size = hw->vtnet_hdr_size;
762         offload = rx_offload_enabled(hw);
763
764         for (i = 0; i < num ; i++) {
765                 rxm = rcv_pkts[i];
766
767                 PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
768
769                 if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
770                         PMD_RX_LOG(ERR, "Packet drop");
771                         nb_enqueued++;
772                         virtio_discard_rxbuf(vq, rxm);
773                         rxvq->stats.errors++;
774                         continue;
775                 }
776
777                 rxm->port = rxvq->port_id;
778                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
779                 rxm->ol_flags = 0;
780                 rxm->vlan_tci = 0;
781
782                 rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
783                 rxm->data_len = (uint16_t)(len[i] - hdr_size);
784
785                 hdr = (struct virtio_net_hdr *)((char *)rxm->buf_addr +
786                         RTE_PKTMBUF_HEADROOM - hdr_size);
787
788                 if (hw->vlan_strip)
789                         rte_vlan_strip(rxm);
790
791                 if (offload && virtio_rx_offload(rxm, hdr) < 0) {
792                         virtio_discard_rxbuf(vq, rxm);
793                         rxvq->stats.errors++;
794                         continue;
795                 }
796
797                 VIRTIO_DUMP_PACKET(rxm, rxm->data_len);
798
799                 rx_pkts[nb_rx++] = rxm;
800
801                 rxvq->stats.bytes += rxm->pkt_len;
802                 virtio_update_packet_stats(&rxvq->stats, rxm);
803         }
804
805         rxvq->stats.packets += nb_rx;
806
807         /* Allocate new mbuf for the used descriptor */
808         error = ENOSPC;
809         while (likely(!virtqueue_full(vq))) {
810                 new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
811                 if (unlikely(new_mbuf == NULL)) {
812                         struct rte_eth_dev *dev
813                                 = &rte_eth_devices[rxvq->port_id];
814                         dev->data->rx_mbuf_alloc_failed++;
815                         break;
816                 }
817                 error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
818                 if (unlikely(error)) {
819                         rte_pktmbuf_free(new_mbuf);
820                         break;
821                 }
822                 nb_enqueued++;
823         }
824
825         if (likely(nb_enqueued)) {
826                 vq_update_avail_idx(vq);
827
828                 if (unlikely(virtqueue_kick_prepare(vq))) {
829                         virtqueue_notify(vq);
830                         PMD_RX_LOG(DEBUG, "Notified");
831                 }
832         }
833
834         return nb_rx;
835 }
836
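/*
 * Mergeable receive path: a packet may span several descriptors, as
 * indicated by the num_buffers field of the virtio-net header. Extra
 * segments are dequeued and chained onto the first mbuf.
 */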
837 uint16_t
838 virtio_recv_mergeable_pkts(void *rx_queue,
839                         struct rte_mbuf **rx_pkts,
840                         uint16_t nb_pkts)
841 {
842         struct virtnet_rx *rxvq = rx_queue;
843         struct virtqueue *vq = rxvq->vq;
844         struct virtio_hw *hw = vq->hw;
845         struct rte_mbuf *rxm, *new_mbuf;
846         uint16_t nb_used, num, nb_rx;
847         uint32_t len[VIRTIO_MBUF_BURST_SZ];
848         struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
849         struct rte_mbuf *prev;
850         int error;
851         uint32_t i, nb_enqueued;
852         uint32_t seg_num;
853         uint16_t extra_idx;
854         uint32_t seg_res;
855         uint32_t hdr_size;
856         int offload;
857
858         nb_rx = 0;
859         if (unlikely(hw->started == 0))
860                 return nb_rx;
861
862         nb_used = VIRTQUEUE_NUSED(vq);
863
864         virtio_rmb();
865
866         PMD_RX_LOG(DEBUG, "used:%d", nb_used);
867
868         i = 0;
869         nb_enqueued = 0;
870         seg_num = 0;
871         extra_idx = 0;
872         seg_res = 0;
873         hdr_size = hw->vtnet_hdr_size;
874         offload = rx_offload_enabled(hw);
875
876         while (i < nb_used) {
877                 struct virtio_net_hdr_mrg_rxbuf *header;
878
879                 if (nb_rx == nb_pkts)
880                         break;
881
882                 num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, 1);
883                 if (num != 1)
884                         continue;
885
886                 i++;
887
888                 PMD_RX_LOG(DEBUG, "dequeue:%d", num);
889                 PMD_RX_LOG(DEBUG, "packet len:%d", len[0]);
890
891                 rxm = rcv_pkts[0];
892
893                 if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
894                         PMD_RX_LOG(ERR, "Packet drop");
895                         nb_enqueued++;
896                         virtio_discard_rxbuf(vq, rxm);
897                         rxvq->stats.errors++;
898                         continue;
899                 }
900
901                 header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
902                         RTE_PKTMBUF_HEADROOM - hdr_size);
903                 seg_num = header->num_buffers;
904
905                 if (seg_num == 0)
906                         seg_num = 1;
907
908                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
909                 rxm->nb_segs = seg_num;
910                 rxm->ol_flags = 0;
911                 rxm->vlan_tci = 0;
912                 rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
913                 rxm->data_len = (uint16_t)(len[0] - hdr_size);
914
915                 rxm->port = rxvq->port_id;
916                 rx_pkts[nb_rx] = rxm;
917                 prev = rxm;
918
919                 if (offload && virtio_rx_offload(rxm, &header->hdr) < 0) {
920                         virtio_discard_rxbuf(vq, rxm);
921                         rxvq->stats.errors++;
922                         continue;
923                 }
924
925                 seg_res = seg_num - 1;
926
927                 while (seg_res != 0) {
928                         /*
929                          * Get extra segments for current uncompleted packet.
930                          */
931                         uint16_t  rcv_cnt =
932                                 RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
933                         if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
934                                 uint32_t rx_num =
935                                         virtqueue_dequeue_burst_rx(vq,
936                                         rcv_pkts, len, rcv_cnt);
937                                 i += rx_num;
938                                 rcv_cnt = rx_num;
939                         } else {
940                                 PMD_RX_LOG(ERR,
941                                            "Not enough segments for packet.");
942                                 nb_enqueued++;
943                                 virtio_discard_rxbuf(vq, rxm);
944                                 rxvq->stats.errors++;
945                                 break;
946                         }
947
948                         extra_idx = 0;
949
950                         while (extra_idx < rcv_cnt) {
951                                 rxm = rcv_pkts[extra_idx];
952
953                                 rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
954                                 rxm->pkt_len = (uint32_t)(len[extra_idx]);
955                                 rxm->data_len = (uint16_t)(len[extra_idx]);
956
957                                 if (prev)
958                                         prev->next = rxm;
959
960                                 prev = rxm;
961                                 rx_pkts[nb_rx]->pkt_len += rxm->pkt_len;
962                                 extra_idx++;
963                         }
964                         seg_res -= rcv_cnt;
965                 }
966
967                 if (hw->vlan_strip)
968                         rte_vlan_strip(rx_pkts[nb_rx]);
969
970                 VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
971                         rx_pkts[nb_rx]->data_len);
972
973                 rxvq->stats.bytes += rx_pkts[nb_rx]->pkt_len;
974                 virtio_update_packet_stats(&rxvq->stats, rx_pkts[nb_rx]);
975                 nb_rx++;
976         }
977
978         rxvq->stats.packets += nb_rx;
979
980         /* Allocate new mbuf for the used descriptor */
981         error = ENOSPC;
982         while (likely(!virtqueue_full(vq))) {
983                 new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
984                 if (unlikely(new_mbuf == NULL)) {
985                         struct rte_eth_dev *dev
986                                 = &rte_eth_devices[rxvq->port_id];
987                         dev->data->rx_mbuf_alloc_failed++;
988                         break;
989                 }
990                 error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
991                 if (unlikely(error)) {
992                         rte_pktmbuf_free(new_mbuf);
993                         break;
994                 }
995                 nb_enqueued++;
996         }
997
998         if (likely(nb_enqueued)) {
999                 vq_update_avail_idx(vq);
1000
1001                 if (unlikely(virtqueue_kick_prepare(vq))) {
1002                         virtqueue_notify(vq);
1003                         PMD_RX_LOG(DEBUG, "Notified");
1004                 }
1005         }
1006
1007         return nb_rx;
1008 }
1009
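/*
 * Transmit burst: reclaim completed descriptors when the free count is
 * low, insert VLAN tags in software when requested, and enqueue each
 * packet using header push, indirect descriptors or a regular chain,
 * depending on the negotiated features.
 */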
1010 uint16_t
1011 virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1012 {
1013         struct virtnet_tx *txvq = tx_queue;
1014         struct virtqueue *vq = txvq->vq;
1015         struct virtio_hw *hw = vq->hw;
1016         uint16_t hdr_size = hw->vtnet_hdr_size;
1017         uint16_t nb_used, nb_tx = 0;
1018         int error;
1019
1020         if (unlikely(hw->started == 0))
1021                 return nb_tx;
1022
1023         if (unlikely(nb_pkts < 1))
1024                 return nb_pkts;
1025
1026         PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
1027         nb_used = VIRTQUEUE_NUSED(vq);
1028
1029         virtio_rmb();
1030         if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
1031                 virtio_xmit_cleanup(vq, nb_used);
1032
1033         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
1034                 struct rte_mbuf *txm = tx_pkts[nb_tx];
1035                 int can_push = 0, use_indirect = 0, slots, need;
1036
1037                 /* Do VLAN tag insertion */
1038                 if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
1039                         error = rte_vlan_insert(&txm);
1040                         if (unlikely(error)) {
1041                                 rte_pktmbuf_free(txm);
1042                                 continue;
1043                         }
1044                 }
1045
1046                 /* optimize ring usage */
1047                 if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
1048                       vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
1049                     rte_mbuf_refcnt_read(txm) == 1 &&
1050                     RTE_MBUF_DIRECT(txm) &&
1051                     txm->nb_segs == 1 &&
1052                     rte_pktmbuf_headroom(txm) >= hdr_size &&
1053                     rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
1054                                    __alignof__(struct virtio_net_hdr_mrg_rxbuf)))
1055                         can_push = 1;
1056                 else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
1057                          txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
1058                         use_indirect = 1;
1059
1060                 /* How many main ring entries are needed for this Tx?
1061                  * any_layout => number of segments
1062                  * indirect   => 1
1063                  * default    => number of segments + 1
1064                  */
1065                 slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
1066                 need = slots - vq->vq_free_cnt;
1067
1068                 /* A positive value means we need to reclaim used descriptors */
1069                 if (unlikely(need > 0)) {
1070                         nb_used = VIRTQUEUE_NUSED(vq);
1071                         virtio_rmb();
1072                         need = RTE_MIN(need, (int)nb_used);
1073
1074                         virtio_xmit_cleanup(vq, need);
1075                         need = slots - vq->vq_free_cnt;
1076                         if (unlikely(need > 0)) {
1077                                 PMD_TX_LOG(ERR,
1078                                            "No free tx descriptors to transmit");
1079                                 break;
1080                         }
1081                 }
1082
1083                 /* Enqueue Packet buffers */
1084                 virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
1085
1086                 txvq->stats.bytes += txm->pkt_len;
1087                 virtio_update_packet_stats(&txvq->stats, txm);
1088         }
1089
1090         txvq->stats.packets += nb_tx;
1091
1092         if (likely(nb_tx)) {
1093                 vq_update_avail_idx(vq);
1094
1095                 if (unlikely(virtqueue_kick_prepare(vq))) {
1096                         virtqueue_notify(vq);
1097                         PMD_TX_LOG(DEBUG, "Notified backend after xmit");
1098                 }
1099         }
1100
1101         return nb_tx;
1102 }