net/virtio: make control queue thread-safe
[dpdk.git] / drivers / net / virtio / virtio_rxtx.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <errno.h>
10
11 #include <rte_cycles.h>
12 #include <rte_memory.h>
13 #include <rte_branch_prediction.h>
14 #include <rte_mempool.h>
15 #include <rte_malloc.h>
16 #include <rte_mbuf.h>
17 #include <rte_ether.h>
18 #include <rte_ethdev.h>
19 #include <rte_prefetch.h>
20 #include <rte_string_fns.h>
21 #include <rte_errno.h>
22 #include <rte_byteorder.h>
23 #include <rte_net.h>
24 #include <rte_ip.h>
25 #include <rte_udp.h>
26 #include <rte_tcp.h>
27
28 #include "virtio_logs.h"
29 #include "virtio_ethdev.h"
30 #include "virtio_pci.h"
31 #include "virtqueue.h"
32 #include "virtio_rxtx.h"
33
/*
 * VIRTIO_DUMP_PACKET(m, len): when RTE_LIBRTE_VIRTIO_DEBUG_DUMP is defined,
 * forwards m and len to rte_pktmbuf_dump() with stdout as the output stream;
 * otherwise it expands to an empty statement.
 */
#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
#define VIRTIO_DUMP_PACKET(m, len) rte_pktmbuf_dump(stdout, m, len)
#else
#define  VIRTIO_DUMP_PACKET(m, len) do { } while (0)
#endif
39
40
/*
 * Tx queue flags (no multi-segment mbufs, no offloads) that must all be set
 * in txq_flags; otherwise virtio_dev_tx_queue_setup() clears
 * hw->use_simple_tx.
 */
#define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
        ETH_TXQ_FLAGS_NOOFFLOADS)
43
44 int
45 virtio_dev_rx_queue_done(void *rxq, uint16_t offset)
46 {
47         struct virtnet_rx *rxvq = rxq;
48         struct virtqueue *vq = rxvq->vq;
49
50         return VIRTQUEUE_NUSED(vq) >= offset;
51 }
52
53 void
54 vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
55 {
56         struct vring_desc *dp, *dp_tail;
57         struct vq_desc_extra *dxp;
58         uint16_t desc_idx_last = desc_idx;
59
60         dp  = &vq->vq_ring.desc[desc_idx];
61         dxp = &vq->vq_descx[desc_idx];
62         vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
63         if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
64                 while (dp->flags & VRING_DESC_F_NEXT) {
65                         desc_idx_last = dp->next;
66                         dp = &vq->vq_ring.desc[dp->next];
67                 }
68         }
69         dxp->ndescs = 0;
70
71         /*
72          * We must append the existing free chain, if any, to the end of
73          * newly freed chain. If the virtqueue was completely used, then
74          * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
75          */
76         if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
77                 vq->vq_desc_head_idx = desc_idx;
78         } else {
79                 dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
80                 dp_tail->next = desc_idx;
81         }
82
83         vq->vq_desc_tail_idx = desc_idx_last;
84         dp->next = VQ_RING_DESC_CHAIN_END;
85 }
86
87 static uint16_t
88 virtqueue_dequeue_burst_rx(struct virtqueue *vq, struct rte_mbuf **rx_pkts,
89                            uint32_t *len, uint16_t num)
90 {
91         struct vring_used_elem *uep;
92         struct rte_mbuf *cookie;
93         uint16_t used_idx, desc_idx;
94         uint16_t i;
95
96         /*  Caller does the check */
97         for (i = 0; i < num ; i++) {
98                 used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
99                 uep = &vq->vq_ring.used->ring[used_idx];
100                 desc_idx = (uint16_t) uep->id;
101                 len[i] = uep->len;
102                 cookie = (struct rte_mbuf *)vq->vq_descx[desc_idx].cookie;
103
104                 if (unlikely(cookie == NULL)) {
105                         PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
106                                 vq->vq_used_cons_idx);
107                         break;
108                 }
109
110                 rte_prefetch0(cookie);
111                 rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
112                 rx_pkts[i]  = cookie;
113                 vq->vq_used_cons_idx++;
114                 vq_ring_free_chain(vq, desc_idx);
115                 vq->vq_descx[desc_idx].cookie = NULL;
116         }
117
118         return i;
119 }
120
/*
 * Upper bound for the Tx free threshold chosen by
 * virtio_dev_tx_queue_setup() when the application passes
 * tx_free_thresh == 0. May be overridden at build time.
 */
#ifndef DEFAULT_TX_FREE_THRESH
#define DEFAULT_TX_FREE_THRESH 32
#endif
124
125 /* Cleanup from completed transmits. */
126 static void
127 virtio_xmit_cleanup(struct virtqueue *vq, uint16_t num)
128 {
129         uint16_t i, used_idx, desc_idx;
130         for (i = 0; i < num; i++) {
131                 struct vring_used_elem *uep;
132                 struct vq_desc_extra *dxp;
133
134                 used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
135                 uep = &vq->vq_ring.used->ring[used_idx];
136
137                 desc_idx = (uint16_t) uep->id;
138                 dxp = &vq->vq_descx[desc_idx];
139                 vq->vq_used_cons_idx++;
140                 vq_ring_free_chain(vq, desc_idx);
141
142                 if (dxp->cookie != NULL) {
143                         rte_pktmbuf_free(dxp->cookie);
144                         dxp->cookie = NULL;
145                 }
146         }
147 }
148
149
150 static inline int
151 virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
152 {
153         struct vq_desc_extra *dxp;
154         struct virtio_hw *hw = vq->hw;
155         struct vring_desc *start_dp;
156         uint16_t needed = 1;
157         uint16_t head_idx, idx;
158
159         if (unlikely(vq->vq_free_cnt == 0))
160                 return -ENOSPC;
161         if (unlikely(vq->vq_free_cnt < needed))
162                 return -EMSGSIZE;
163
164         head_idx = vq->vq_desc_head_idx;
165         if (unlikely(head_idx >= vq->vq_nentries))
166                 return -EFAULT;
167
168         idx = head_idx;
169         dxp = &vq->vq_descx[idx];
170         dxp->cookie = (void *)cookie;
171         dxp->ndescs = needed;
172
173         start_dp = vq->vq_ring.desc;
174         start_dp[idx].addr =
175                 VIRTIO_MBUF_ADDR(cookie, vq) +
176                 RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
177         start_dp[idx].len =
178                 cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
179         start_dp[idx].flags =  VRING_DESC_F_WRITE;
180         idx = start_dp[idx].next;
181         vq->vq_desc_head_idx = idx;
182         if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
183                 vq->vq_desc_tail_idx = idx;
184         vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
185         vq_update_avail_ring(vq, head_idx);
186
187         return 0;
188 }
189
190 /* When doing TSO, the IP length is not included in the pseudo header
191  * checksum of the packet given to the PMD, but for virtio it is
192  * expected.
193  */
194 static void
195 virtio_tso_fix_cksum(struct rte_mbuf *m)
196 {
197         /* common case: header is not fragmented */
198         if (likely(rte_pktmbuf_data_len(m) >= m->l2_len + m->l3_len +
199                         m->l4_len)) {
200                 struct ipv4_hdr *iph;
201                 struct ipv6_hdr *ip6h;
202                 struct tcp_hdr *th;
203                 uint16_t prev_cksum, new_cksum, ip_len, ip_paylen;
204                 uint32_t tmp;
205
206                 iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
207                 th = RTE_PTR_ADD(iph, m->l3_len);
208                 if ((iph->version_ihl >> 4) == 4) {
209                         iph->hdr_checksum = 0;
210                         iph->hdr_checksum = rte_ipv4_cksum(iph);
211                         ip_len = iph->total_length;
212                         ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) -
213                                 m->l3_len);
214                 } else {
215                         ip6h = (struct ipv6_hdr *)iph;
216                         ip_paylen = ip6h->payload_len;
217                 }
218
219                 /* calculate the new phdr checksum not including ip_paylen */
220                 prev_cksum = th->cksum;
221                 tmp = prev_cksum;
222                 tmp += ip_paylen;
223                 tmp = (tmp & 0xffff) + (tmp >> 16);
224                 new_cksum = tmp;
225
226                 /* replace it in the packet */
227                 th->cksum = new_cksum;
228         }
229 }
230
231 static inline int
232 tx_offload_enabled(struct virtio_hw *hw)
233 {
234         return vtpci_with_feature(hw, VIRTIO_NET_F_CSUM) ||
235                 vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO4) ||
236                 vtpci_with_feature(hw, VIRTIO_NET_F_HOST_TSO6);
237 }
238
/*
 * Store val into var only when the two differ, so that an already-correct
 * location is not written again (lessens cache issues).
 * Note: both var and val are evaluated more than once.
 */
#define ASSIGN_UNLESS_EQUAL(var, val) do {      \
        if ((var) != (val))                     \
                (var) = (val);                  \
} while (0)
244
/*
 * Place one packet (mbuf chain) on the Tx virtqueue.
 *
 * txvq:         Tx queue; supplies the virtqueue and the reserved memzone
 *               holding per-slot virtio-net headers and indirect tables.
 * cookie:       first mbuf of the packet; stored in vq_descx[] for the head
 *               descriptor.
 * needed:       number of ring descriptors charged to this packet; recorded
 *               in vq_descx[] and subtracted from vq_free_cnt.
 * use_indirect: non-zero to describe the packet through the slot's indirect
 *               descriptor table (only considered when can_push is zero).
 * can_push:     non-zero to prepend the virtio-net header to the mbuf data
 *               itself instead of using the reserved header region.
 *
 * No free-descriptor check is done here.
 * NOTE(review): presumably the caller guarantees that `needed` descriptors
 * are free and that `needed` matches the chosen layout -- confirm against
 * the callers.
 */
static inline void
virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,
                       uint16_t needed, int use_indirect, int can_push)
{
        struct virtio_tx_region *txr = txvq->virtio_net_hdr_mz->addr;
        struct vq_desc_extra *dxp;
        struct virtqueue *vq = txvq->vq;
        struct vring_desc *start_dp;
        uint16_t seg_num = cookie->nb_segs;
        uint16_t head_idx, idx;
        uint16_t head_size = vq->hw->vtnet_hdr_size;
        struct virtio_net_hdr *hdr;
        int offload;

        offload = tx_offload_enabled(vq->hw);
        /* the packet starts at the head of the free descriptor list */
        head_idx = vq->vq_desc_head_idx;
        idx = head_idx;
        dxp = &vq->vq_descx[idx];
        dxp->cookie = (void *)cookie;
        dxp->ndescs = needed;

        start_dp = vq->vq_ring.desc;

        if (can_push) {
                /* prepend cannot fail, checked by caller */
                hdr = (struct virtio_net_hdr *)
                        rte_pktmbuf_prepend(cookie, head_size);
                /* rte_pktmbuf_prepend() counts the hdr size to the pkt length,
                 * which is wrong. Below subtract restores correct pkt size.
                 */
                cookie->pkt_len -= head_size;
                /* if offload disabled, it is not zeroed below, do it now */
                if (offload == 0) {
                        ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
                }
        } else if (use_indirect) {
                /* setup tx ring slot to point to indirect
                 * descriptor list stored in reserved region.
                 *
                 * the first slot in indirect ring is already preset
                 * to point to the header in reserved region
                 */
                start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
                        RTE_PTR_DIFF(&txr[idx].tx_indir, txr);
                start_dp[idx].len   = (seg_num + 1) * sizeof(struct vring_desc);
                start_dp[idx].flags = VRING_DESC_F_INDIRECT;
                hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;

                /* loop below will fill in rest of the indirect elements */
                start_dp = txr[idx].tx_indir;
                /* entry 0 of the indirect table is the header; data starts at 1 */
                idx = 1;
        } else {
                /* setup first tx ring slot to point to header
                 * stored in reserved region.
                 */
                start_dp[idx].addr  = txvq->virtio_net_hdr_mem +
                        RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);
                start_dp[idx].len   = vq->hw->vtnet_hdr_size;
                start_dp[idx].flags = VRING_DESC_F_NEXT;
                hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;

                /* data segments go in the descriptors chained after the header */
                idx = start_dp[idx].next;
        }

        /* Checksum Offload / TSO */
        if (offload) {
                /* a TSO request is always handled as a TCP checksum request */
                if (cookie->ol_flags & PKT_TX_TCP_SEG)
                        cookie->ol_flags |= PKT_TX_TCP_CKSUM;

                switch (cookie->ol_flags & PKT_TX_L4_MASK) {
                case PKT_TX_UDP_CKSUM:
                        hdr->csum_start = cookie->l2_len + cookie->l3_len;
                        hdr->csum_offset = offsetof(struct udp_hdr,
                                dgram_cksum);
                        hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                        break;

                case PKT_TX_TCP_CKSUM:
                        hdr->csum_start = cookie->l2_len + cookie->l3_len;
                        hdr->csum_offset = offsetof(struct tcp_hdr, cksum);
                        hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                        break;

                default:
                        /* no L4 checksum requested: clear the checksum fields */
                        ASSIGN_UNLESS_EQUAL(hdr->csum_start, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->csum_offset, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->flags, 0);
                        break;
                }

                /* TCP Segmentation Offload */
                if (cookie->ol_flags & PKT_TX_TCP_SEG) {
                        virtio_tso_fix_cksum(cookie);
                        hdr->gso_type = (cookie->ol_flags & PKT_TX_IPV6) ?
                                VIRTIO_NET_HDR_GSO_TCPV6 :
                                VIRTIO_NET_HDR_GSO_TCPV4;
                        hdr->gso_size = cookie->tso_segsz;
                        hdr->hdr_len =
                                cookie->l2_len +
                                cookie->l3_len +
                                cookie->l4_len;
                } else {
                        ASSIGN_UNLESS_EQUAL(hdr->gso_type, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->gso_size, 0);
                        ASSIGN_UNLESS_EQUAL(hdr->hdr_len, 0);
                }
        }

        /* one descriptor per mbuf segment, following the existing next links */
        do {
                start_dp[idx].addr  = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);
                start_dp[idx].len   = cookie->data_len;
                start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
                idx = start_dp[idx].next;
        } while ((cookie = cookie->next) != NULL);

        /*
         * With an indirect table only the head slot of the main ring was
         * consumed, so the free list resumes right after it.
         */
        if (use_indirect)
                idx = vq->vq_ring.desc[head_idx].next;

        vq->vq_desc_head_idx = idx;
        if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
                vq->vq_desc_tail_idx = idx;
        vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);
        vq_update_avail_ring(vq, head_idx);
}
374
375 void
376 virtio_dev_cq_start(struct rte_eth_dev *dev)
377 {
378         struct virtio_hw *hw = dev->data->dev_private;
379
380         if (hw->cvq && hw->cvq->vq) {
381                 rte_spinlock_init(&hw->cvq->lock);
382                 VIRTQUEUE_DUMP((struct virtqueue *)hw->cvq->vq);
383         }
384 }
385
386 int
387 virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
388                         uint16_t queue_idx,
389                         uint16_t nb_desc,
390                         unsigned int socket_id __rte_unused,
391                         __rte_unused const struct rte_eth_rxconf *rx_conf,
392                         struct rte_mempool *mp)
393 {
394         uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
395         struct virtio_hw *hw = dev->data->dev_private;
396         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
397         struct virtnet_rx *rxvq;
398
399         PMD_INIT_FUNC_TRACE();
400
401         if (nb_desc == 0 || nb_desc > vq->vq_nentries)
402                 nb_desc = vq->vq_nentries;
403         vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
404
405         rxvq = &vq->rxq;
406         rxvq->queue_id = queue_idx;
407         rxvq->mpool = mp;
408         if (rxvq->mpool == NULL) {
409                 rte_exit(EXIT_FAILURE,
410                         "Cannot allocate mbufs for rx virtqueue");
411         }
412         dev->data->rx_queues[queue_idx] = rxvq;
413
414         return 0;
415 }
416
417 int
418 virtio_dev_rx_queue_setup_finish(struct rte_eth_dev *dev, uint16_t queue_idx)
419 {
420         uint16_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_RQ_QUEUE_IDX;
421         struct virtio_hw *hw = dev->data->dev_private;
422         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
423         struct virtnet_rx *rxvq = &vq->rxq;
424         struct rte_mbuf *m;
425         uint16_t desc_idx;
426         int error, nbufs;
427
428         PMD_INIT_FUNC_TRACE();
429
430         /* Allocate blank mbufs for the each rx descriptor */
431         nbufs = 0;
432
433         if (hw->use_simple_rx) {
434                 for (desc_idx = 0; desc_idx < vq->vq_nentries;
435                      desc_idx++) {
436                         vq->vq_ring.avail->ring[desc_idx] = desc_idx;
437                         vq->vq_ring.desc[desc_idx].flags =
438                                 VRING_DESC_F_WRITE;
439                 }
440         }
441
442         memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
443         for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
444              desc_idx++) {
445                 vq->sw_ring[vq->vq_nentries + desc_idx] =
446                         &rxvq->fake_mbuf;
447         }
448
449         while (!virtqueue_full(vq)) {
450                 m = rte_mbuf_raw_alloc(rxvq->mpool);
451                 if (m == NULL)
452                         break;
453
454                 /* Enqueue allocated buffers */
455                 if (hw->use_simple_rx)
456                         error = virtqueue_enqueue_recv_refill_simple(vq, m);
457                 else
458                         error = virtqueue_enqueue_recv_refill(vq, m);
459
460                 if (error) {
461                         rte_pktmbuf_free(m);
462                         break;
463                 }
464                 nbufs++;
465         }
466
467         vq_update_avail_idx(vq);
468
469         PMD_INIT_LOG(DEBUG, "Allocated %d bufs", nbufs);
470
471         virtio_rxq_vec_setup(rxvq);
472
473         VIRTQUEUE_DUMP(vq);
474
475         return 0;
476 }
477
478 /*
479  * struct rte_eth_dev *dev: Used to update dev
480  * uint16_t nb_desc: Defaults to values read from config space
481  * unsigned int socket_id: Used to allocate memzone
482  * const struct rte_eth_txconf *tx_conf: Used to setup tx engine
483  * uint16_t queue_idx: Just used as an index in dev txq list
484  */
485 int
486 virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
487                         uint16_t queue_idx,
488                         uint16_t nb_desc,
489                         unsigned int socket_id __rte_unused,
490                         const struct rte_eth_txconf *tx_conf)
491 {
492         uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
493         struct virtio_hw *hw = dev->data->dev_private;
494         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
495         struct virtnet_tx *txvq;
496         uint16_t tx_free_thresh;
497
498         PMD_INIT_FUNC_TRACE();
499
500         /* cannot use simple rxtx funcs with multisegs or offloads */
501         if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) != VIRTIO_SIMPLE_FLAGS)
502                 hw->use_simple_tx = 0;
503
504         if (nb_desc == 0 || nb_desc > vq->vq_nentries)
505                 nb_desc = vq->vq_nentries;
506         vq->vq_free_cnt = RTE_MIN(vq->vq_free_cnt, nb_desc);
507
508         txvq = &vq->txq;
509         txvq->queue_id = queue_idx;
510
511         tx_free_thresh = tx_conf->tx_free_thresh;
512         if (tx_free_thresh == 0)
513                 tx_free_thresh =
514                         RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH);
515
516         if (tx_free_thresh >= (vq->vq_nentries - 3)) {
517                 RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the "
518                         "number of TX entries minus 3 (%u)."
519                         " (tx_free_thresh=%u port=%u queue=%u)\n",
520                         vq->vq_nentries - 3,
521                         tx_free_thresh, dev->data->port_id, queue_idx);
522                 return -EINVAL;
523         }
524
525         vq->vq_free_thresh = tx_free_thresh;
526
527         dev->data->tx_queues[queue_idx] = txvq;
528         return 0;
529 }
530
531 int
532 virtio_dev_tx_queue_setup_finish(struct rte_eth_dev *dev,
533                                 uint16_t queue_idx)
534 {
535         uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
536         struct virtio_hw *hw = dev->data->dev_private;
537         struct virtqueue *vq = hw->vqs[vtpci_queue_idx];
538         uint16_t mid_idx = vq->vq_nentries >> 1;
539         struct virtnet_tx *txvq = &vq->txq;
540         uint16_t desc_idx;
541
542         PMD_INIT_FUNC_TRACE();
543
544         if (hw->use_simple_tx) {
545                 for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
546                         vq->vq_ring.avail->ring[desc_idx] =
547                                 desc_idx + mid_idx;
548                         vq->vq_ring.desc[desc_idx + mid_idx].next =
549                                 desc_idx;
550                         vq->vq_ring.desc[desc_idx + mid_idx].addr =
551                                 txvq->virtio_net_hdr_mem +
552                                 offsetof(struct virtio_tx_region, tx_hdr);
553                         vq->vq_ring.desc[desc_idx + mid_idx].len =
554                                 vq->hw->vtnet_hdr_size;
555                         vq->vq_ring.desc[desc_idx + mid_idx].flags =
556                                 VRING_DESC_F_NEXT;
557                         vq->vq_ring.desc[desc_idx].flags = 0;
558                 }
559                 for (desc_idx = mid_idx; desc_idx < vq->vq_nentries;
560                      desc_idx++)
561                         vq->vq_ring.avail->ring[desc_idx] = desc_idx;
562         }
563
564         VIRTQUEUE_DUMP(vq);
565
566         return 0;
567 }
568
569 static void
570 virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m)
571 {
572         int error;
573         /*
574          * Requeue the discarded mbuf. This should always be
575          * successful since it was just dequeued.
576          */
577         error = virtqueue_enqueue_recv_refill(vq, m);
578         if (unlikely(error)) {
579                 RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf");
580                 rte_pktmbuf_free(m);
581         }
582 }
583
584 static void
585 virtio_update_packet_stats(struct virtnet_stats *stats, struct rte_mbuf *mbuf)
586 {
587         uint32_t s = mbuf->pkt_len;
588         struct ether_addr *ea;
589
590         if (s == 64) {
591                 stats->size_bins[1]++;
592         } else if (s > 64 && s < 1024) {
593                 uint32_t bin;
594
595                 /* count zeros, and offset into correct bin */
596                 bin = (sizeof(s) * 8) - __builtin_clz(s) - 5;
597                 stats->size_bins[bin]++;
598         } else {
599                 if (s < 64)
600                         stats->size_bins[0]++;
601                 else if (s < 1519)
602                         stats->size_bins[6]++;
603                 else if (s >= 1519)
604                         stats->size_bins[7]++;
605         }
606
607         ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
608         if (is_multicast_ether_addr(ea)) {
609                 if (is_broadcast_ether_addr(ea))
610                         stats->broadcast++;
611                 else
612                         stats->multicast++;
613         }
614 }
615
616 /* Optionally fill offload information in structure */
617 static int
618 virtio_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
619 {
620         struct rte_net_hdr_lens hdr_lens;
621         uint32_t hdrlen, ptype;
622         int l4_supported = 0;
623
624         /* nothing to do */
625         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
626                 return 0;
627
628         m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
629
630         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
631         m->packet_type = ptype;
632         if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
633             (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
634             (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
635                 l4_supported = 1;
636
637         if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
638                 hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
639                 if (hdr->csum_start <= hdrlen && l4_supported) {
640                         m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
641                 } else {
642                         /* Unknown proto or tunnel, do sw cksum. We can assume
643                          * the cksum field is in the first segment since the
644                          * buffers we provided to the host are large enough.
645                          * In case of SCTP, this will be wrong since it's a CRC
646                          * but there's nothing we can do.
647                          */
648                         uint16_t csum = 0, off;
649
650                         rte_raw_cksum_mbuf(m, hdr->csum_start,
651                                 rte_pktmbuf_pkt_len(m) - hdr->csum_start,
652                                 &csum);
653                         if (likely(csum != 0xffff))
654                                 csum = ~csum;
655                         off = hdr->csum_offset + hdr->csum_start;
656                         if (rte_pktmbuf_data_len(m) >= off + 1)
657                                 *rte_pktmbuf_mtod_offset(m, uint16_t *,
658                                         off) = csum;
659                 }
660         } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
661                 m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
662         }
663
664         /* GSO request, save required information in mbuf */
665         if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
666                 /* Check unsupported modes */
667                 if ((hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) ||
668                     (hdr->gso_size == 0)) {
669                         return -EINVAL;
670                 }
671
672                 /* Update mss lengthes in mbuf */
673                 m->tso_segsz = hdr->gso_size;
674                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
675                         case VIRTIO_NET_HDR_GSO_TCPV4:
676                         case VIRTIO_NET_HDR_GSO_TCPV6:
677                                 m->ol_flags |= PKT_RX_LRO | \
678                                         PKT_RX_L4_CKSUM_NONE;
679                                 break;
680                         default:
681                                 return -EINVAL;
682                 }
683         }
684
685         return 0;
686 }
687
688 static inline int
689 rx_offload_enabled(struct virtio_hw *hw)
690 {
691         return vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM) ||
692                 vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) ||
693                 vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6);
694 }
695
696 #define VIRTIO_MBUF_BURST_SZ 64
697 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
698 uint16_t
699 virtio_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
700 {
701         struct virtnet_rx *rxvq = rx_queue;
702         struct virtqueue *vq = rxvq->vq;
703         struct virtio_hw *hw = vq->hw;
704         struct rte_mbuf *rxm, *new_mbuf;
705         uint16_t nb_used, num, nb_rx;
706         uint32_t len[VIRTIO_MBUF_BURST_SZ];
707         struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
708         int error;
709         uint32_t i, nb_enqueued;
710         uint32_t hdr_size;
711         int offload;
712         struct virtio_net_hdr *hdr;
713
714         nb_rx = 0;
715         if (unlikely(hw->started == 0))
716                 return nb_rx;
717
718         nb_used = VIRTQUEUE_NUSED(vq);
719
720         virtio_rmb();
721
722         num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;
723         if (unlikely(num > VIRTIO_MBUF_BURST_SZ))
724                 num = VIRTIO_MBUF_BURST_SZ;
725         if (likely(num > DESC_PER_CACHELINE))
726                 num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);
727
728         num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num);
729         PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num);
730
731         nb_enqueued = 0;
732         hdr_size = hw->vtnet_hdr_size;
733         offload = rx_offload_enabled(hw);
734
735         for (i = 0; i < num ; i++) {
736                 rxm = rcv_pkts[i];
737
738                 PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);
739
740                 if (unlikely(len[i] < hdr_size + ETHER_HDR_LEN)) {
741                         PMD_RX_LOG(ERR, "Packet drop");
742                         nb_enqueued++;
743                         virtio_discard_rxbuf(vq, rxm);
744                         rxvq->stats.errors++;
745                         continue;
746                 }
747
748                 rxm->port = rxvq->port_id;
749                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
750                 rxm->ol_flags = 0;
751                 rxm->vlan_tci = 0;
752
753                 rxm->pkt_len = (uint32_t)(len[i] - hdr_size);
754                 rxm->data_len = (uint16_t)(len[i] - hdr_size);
755
756                 hdr = (struct virtio_net_hdr *)((char *)rxm->buf_addr +
757                         RTE_PKTMBUF_HEADROOM - hdr_size);
758
759                 if (hw->vlan_strip)
760                         rte_vlan_strip(rxm);
761
762                 if (offload && virtio_rx_offload(rxm, hdr) < 0) {
763                         virtio_discard_rxbuf(vq, rxm);
764                         rxvq->stats.errors++;
765                         continue;
766                 }
767
768                 VIRTIO_DUMP_PACKET(rxm, rxm->data_len);
769
770                 rx_pkts[nb_rx++] = rxm;
771
772                 rxvq->stats.bytes += rxm->pkt_len;
773                 virtio_update_packet_stats(&rxvq->stats, rxm);
774         }
775
776         rxvq->stats.packets += nb_rx;
777
778         /* Allocate new mbuf for the used descriptor */
779         error = ENOSPC;
780         while (likely(!virtqueue_full(vq))) {
781                 new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
782                 if (unlikely(new_mbuf == NULL)) {
783                         struct rte_eth_dev *dev
784                                 = &rte_eth_devices[rxvq->port_id];
785                         dev->data->rx_mbuf_alloc_failed++;
786                         break;
787                 }
788                 error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
789                 if (unlikely(error)) {
790                         rte_pktmbuf_free(new_mbuf);
791                         break;
792                 }
793                 nb_enqueued++;
794         }
795
796         if (likely(nb_enqueued)) {
797                 vq_update_avail_idx(vq);
798
799                 if (unlikely(virtqueue_kick_prepare(vq))) {
800                         virtqueue_notify(vq);
801                         PMD_RX_LOG(DEBUG, "Notified");
802                 }
803         }
804
805         return nb_rx;
806 }
807
808 uint16_t
809 virtio_recv_mergeable_pkts(void *rx_queue,
810                         struct rte_mbuf **rx_pkts,
811                         uint16_t nb_pkts)
812 {
813         struct virtnet_rx *rxvq = rx_queue;
814         struct virtqueue *vq = rxvq->vq;
815         struct virtio_hw *hw = vq->hw;
816         struct rte_mbuf *rxm, *new_mbuf;
817         uint16_t nb_used, num, nb_rx;
818         uint32_t len[VIRTIO_MBUF_BURST_SZ];
819         struct rte_mbuf *rcv_pkts[VIRTIO_MBUF_BURST_SZ];
820         struct rte_mbuf *prev;
821         int error;
822         uint32_t i, nb_enqueued;
823         uint32_t seg_num;
824         uint16_t extra_idx;
825         uint32_t seg_res;
826         uint32_t hdr_size;
827         int offload;
828
829         nb_rx = 0;
830         if (unlikely(hw->started == 0))
831                 return nb_rx;
832
833         nb_used = VIRTQUEUE_NUSED(vq);
834
835         virtio_rmb();
836
837         PMD_RX_LOG(DEBUG, "used:%d", nb_used);
838
839         i = 0;
840         nb_enqueued = 0;
841         seg_num = 0;
842         extra_idx = 0;
843         seg_res = 0;
844         hdr_size = hw->vtnet_hdr_size;
845         offload = rx_offload_enabled(hw);
846
847         while (i < nb_used) {
848                 struct virtio_net_hdr_mrg_rxbuf *header;
849
850                 if (nb_rx == nb_pkts)
851                         break;
852
853                 num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, 1);
854                 if (num != 1)
855                         continue;
856
857                 i++;
858
859                 PMD_RX_LOG(DEBUG, "dequeue:%d", num);
860                 PMD_RX_LOG(DEBUG, "packet len:%d", len[0]);
861
862                 rxm = rcv_pkts[0];
863
864                 if (unlikely(len[0] < hdr_size + ETHER_HDR_LEN)) {
865                         PMD_RX_LOG(ERR, "Packet drop");
866                         nb_enqueued++;
867                         virtio_discard_rxbuf(vq, rxm);
868                         rxvq->stats.errors++;
869                         continue;
870                 }
871
872                 header = (struct virtio_net_hdr_mrg_rxbuf *)((char *)rxm->buf_addr +
873                         RTE_PKTMBUF_HEADROOM - hdr_size);
874                 seg_num = header->num_buffers;
875
876                 if (seg_num == 0)
877                         seg_num = 1;
878
879                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
880                 rxm->nb_segs = seg_num;
881                 rxm->ol_flags = 0;
882                 rxm->vlan_tci = 0;
883                 rxm->pkt_len = (uint32_t)(len[0] - hdr_size);
884                 rxm->data_len = (uint16_t)(len[0] - hdr_size);
885
886                 rxm->port = rxvq->port_id;
887                 rx_pkts[nb_rx] = rxm;
888                 prev = rxm;
889
890                 if (offload && virtio_rx_offload(rxm, &header->hdr) < 0) {
891                         virtio_discard_rxbuf(vq, rxm);
892                         rxvq->stats.errors++;
893                         continue;
894                 }
895
896                 seg_res = seg_num - 1;
897
898                 while (seg_res != 0) {
899                         /*
900                          * Get extra segments for current uncompleted packet.
901                          */
902                         uint16_t  rcv_cnt =
903                                 RTE_MIN(seg_res, RTE_DIM(rcv_pkts));
904                         if (likely(VIRTQUEUE_NUSED(vq) >= rcv_cnt)) {
905                                 uint32_t rx_num =
906                                         virtqueue_dequeue_burst_rx(vq,
907                                         rcv_pkts, len, rcv_cnt);
908                                 i += rx_num;
909                                 rcv_cnt = rx_num;
910                         } else {
911                                 PMD_RX_LOG(ERR,
912                                            "No enough segments for packet.");
913                                 nb_enqueued++;
914                                 virtio_discard_rxbuf(vq, rxm);
915                                 rxvq->stats.errors++;
916                                 break;
917                         }
918
919                         extra_idx = 0;
920
921                         while (extra_idx < rcv_cnt) {
922                                 rxm = rcv_pkts[extra_idx];
923
924                                 rxm->data_off = RTE_PKTMBUF_HEADROOM - hdr_size;
925                                 rxm->pkt_len = (uint32_t)(len[extra_idx]);
926                                 rxm->data_len = (uint16_t)(len[extra_idx]);
927
928                                 if (prev)
929                                         prev->next = rxm;
930
931                                 prev = rxm;
932                                 rx_pkts[nb_rx]->pkt_len += rxm->pkt_len;
933                                 extra_idx++;
934                         };
935                         seg_res -= rcv_cnt;
936                 }
937
938                 if (hw->vlan_strip)
939                         rte_vlan_strip(rx_pkts[nb_rx]);
940
941                 VIRTIO_DUMP_PACKET(rx_pkts[nb_rx],
942                         rx_pkts[nb_rx]->data_len);
943
944                 rxvq->stats.bytes += rx_pkts[nb_rx]->pkt_len;
945                 virtio_update_packet_stats(&rxvq->stats, rx_pkts[nb_rx]);
946                 nb_rx++;
947         }
948
949         rxvq->stats.packets += nb_rx;
950
951         /* Allocate new mbuf for the used descriptor */
952         error = ENOSPC;
953         while (likely(!virtqueue_full(vq))) {
954                 new_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
955                 if (unlikely(new_mbuf == NULL)) {
956                         struct rte_eth_dev *dev
957                                 = &rte_eth_devices[rxvq->port_id];
958                         dev->data->rx_mbuf_alloc_failed++;
959                         break;
960                 }
961                 error = virtqueue_enqueue_recv_refill(vq, new_mbuf);
962                 if (unlikely(error)) {
963                         rte_pktmbuf_free(new_mbuf);
964                         break;
965                 }
966                 nb_enqueued++;
967         }
968
969         if (likely(nb_enqueued)) {
970                 vq_update_avail_idx(vq);
971
972                 if (unlikely(virtqueue_kick_prepare(vq))) {
973                         virtqueue_notify(vq);
974                         PMD_RX_LOG(DEBUG, "Notified");
975                 }
976         }
977
978         return nb_rx;
979 }
980
981 uint16_t
982 virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
983 {
984         struct virtnet_tx *txvq = tx_queue;
985         struct virtqueue *vq = txvq->vq;
986         struct virtio_hw *hw = vq->hw;
987         uint16_t hdr_size = hw->vtnet_hdr_size;
988         uint16_t nb_used, nb_tx = 0;
989         int error;
990
991         if (unlikely(hw->started == 0))
992                 return nb_tx;
993
994         if (unlikely(nb_pkts < 1))
995                 return nb_pkts;
996
997         PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
998         nb_used = VIRTQUEUE_NUSED(vq);
999
1000         virtio_rmb();
1001         if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
1002                 virtio_xmit_cleanup(vq, nb_used);
1003
1004         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
1005                 struct rte_mbuf *txm = tx_pkts[nb_tx];
1006                 int can_push = 0, use_indirect = 0, slots, need;
1007
1008                 /* Do VLAN tag insertion */
1009                 if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
1010                         error = rte_vlan_insert(&txm);
1011                         if (unlikely(error)) {
1012                                 rte_pktmbuf_free(txm);
1013                                 continue;
1014                         }
1015                 }
1016
1017                 /* optimize ring usage */
1018                 if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
1019                       vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
1020                     rte_mbuf_refcnt_read(txm) == 1 &&
1021                     RTE_MBUF_DIRECT(txm) &&
1022                     txm->nb_segs == 1 &&
1023                     rte_pktmbuf_headroom(txm) >= hdr_size &&
1024                     rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
1025                                    __alignof__(struct virtio_net_hdr_mrg_rxbuf)))
1026                         can_push = 1;
1027                 else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
1028                          txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
1029                         use_indirect = 1;
1030
1031                 /* How many main ring entries are needed to this Tx?
1032                  * any_layout => number of segments
1033                  * indirect   => 1
1034                  * default    => number of segments + 1
1035                  */
1036                 slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
1037                 need = slots - vq->vq_free_cnt;
1038
1039                 /* Positive value indicates it need free vring descriptors */
1040                 if (unlikely(need > 0)) {
1041                         nb_used = VIRTQUEUE_NUSED(vq);
1042                         virtio_rmb();
1043                         need = RTE_MIN(need, (int)nb_used);
1044
1045                         virtio_xmit_cleanup(vq, need);
1046                         need = slots - vq->vq_free_cnt;
1047                         if (unlikely(need > 0)) {
1048                                 PMD_TX_LOG(ERR,
1049                                            "No free tx descriptors to transmit");
1050                                 break;
1051                         }
1052                 }
1053
1054                 /* Enqueue Packet buffers */
1055                 virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
1056
1057                 txvq->stats.bytes += txm->pkt_len;
1058                 virtio_update_packet_stats(&txvq->stats, txm);
1059         }
1060
1061         txvq->stats.packets += nb_tx;
1062
1063         if (likely(nb_tx)) {
1064                 vq_update_avail_idx(vq);
1065
1066                 if (unlikely(virtqueue_kick_prepare(vq))) {
1067                         virtqueue_notify(vq);
1068                         PMD_TX_LOG(DEBUG, "Notified backend after xmit");
1069                 }
1070         }
1071
1072         return nb_tx;
1073 }