net/virtio: add vectorized packed ring NEON Rx
drivers/net/virtio/virtio_rxtx_packed.h
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2020 Intel Corporation
 */

#ifndef _VIRTIO_RXTX_PACKED_H_
#define _VIRTIO_RXTX_PACKED_H_

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <rte_net.h>

#include "virtio_logs.h"
#include "virtio_ethdev.h"
#include "virtio_pci.h"
#include "virtqueue.h"

#define BYTE_SIZE 8

#ifdef CC_AVX512_SUPPORT
/* flag bits offset in packed ring desc higher 64bits */
#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
        offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
#elif defined(RTE_ARCH_ARM)
/* flag bits offset in packed ring desc from ID */
#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
        offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
#endif

#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
        FLAGS_BITS_OFFSET)
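
/* For illustration, assuming the standard virtio 1.1 packed descriptor
 * layout (addr at byte 0, len at 8, id at 12, flags at 14, 16 bytes in
 * total): the AVX512 path handles len/id/flags as one higher 64-bit lane,
 * so the flags sit (14 - 8) * 8 = 48 bits above len, while the NEON path
 * measures from the id field, putting them (14 - 12) * 8 = 16 bits up.
 * PACKED_FLAGS_MASK then isolates the AVAIL/USED bits at that position.
 */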

/* reference count offset in mbuf rearm data */
#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
        offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
/* segment number offset in mbuf rearm data */
#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
        offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)

/* default rearm data */
#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
        1ULL << REFCNT_BITS_OFFSET)
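
/* A worked example, assuming the typical rte_mbuf layout in which the
 * 64-bit rearm_data region covers data_off, refcnt, nb_segs and port:
 * refcnt then sits 16 bits and nb_segs 32 bits into that word, so
 * DEFAULT_REARM_DATA encodes refcnt = 1 and nb_segs = 1 and lets an mbuf
 * header be rearmed with a single 64-bit store.
 */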

/* id bits offset in packed ring desc higher 64bits */
#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
        offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)

/* net hdr short size mask */
#define NET_HDR_MASK 0x3F

#ifdef RTE_ARCH_ARM
/* Cache line sizes differ across Arm platforms, so use a batch size of
 * four to match the minimum cache line size and accommodate the NEON
 * register size.
 */
#define PACKED_BATCH_SIZE 4
#else
#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
        sizeof(struct vring_packed_desc))
#endif
#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
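
/* As a quick sanity check on the numbers: a packed descriptor is 16 bytes,
 * so the generic formula gives 64 / 16 = 4 for a 64-byte cache line. Arm
 * platforms use 64- or 128-byte cache lines, so the batch is pinned to
 * four descriptors, which also maps onto four 128-bit NEON registers.
 */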

#ifdef VIRTIO_GCC_UNROLL_PRAGMA
#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
        for (iter = val; iter < size; iter++)
#endif

#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
        for (iter = val; iter < size; iter++)
#endif

#ifdef VIRTIO_ICC_UNROLL_PRAGMA
#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
        for (iter = val; iter < size; iter++)
#endif

#ifndef virtio_for_each_try_unroll
#define virtio_for_each_try_unroll(iter, val, size) \
        for (iter = val; iter < size; iter++)
#endif
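
/* A minimal usage sketch, mirroring the refill helper further below: the
 * batched paths walk one PACKED_BATCH_SIZE worth of descriptors with this
 * macro so the compiler can fully unroll the fixed-trip-count loop, e.g.
 *
 *	uint16_t i;
 *
 *	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
 *		dxp = &vq->vq_descx[idx + i];
 *		dxp->cookie = (void *)cookie[i];
 *	}
 */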

static inline void
virtio_update_batch_stats(struct virtnet_stats *stats,
                          uint16_t pkt_len1,
                          uint16_t pkt_len2,
                          uint16_t pkt_len3,
                          uint16_t pkt_len4)
{
        stats->bytes += pkt_len1;
        stats->bytes += pkt_len2;
        stats->bytes += pkt_len3;
        stats->bytes += pkt_len4;
}

static inline int
virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
                                    struct rte_mbuf *txm)
{
        struct virtqueue *vq = txvq->vq;
        struct virtio_hw *hw = vq->hw;
        uint16_t hdr_size = hw->vtnet_hdr_size;
        uint16_t slots, can_push = 0, use_indirect = 0;
        int16_t need;

        /* optimize ring usage */
        if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
             vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
             rte_mbuf_refcnt_read(txm) == 1 && RTE_MBUF_DIRECT(txm) &&
             txm->nb_segs == 1 && rte_pktmbuf_headroom(txm) >= hdr_size)
                can_push = 1;
        else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
                 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
                use_indirect = 1;

        /* How many main ring entries are needed for this Tx?
         * indirect   => 1
         * any_layout => number of segments
         * default    => number of segments + 1
         */
        slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
        need = slots - vq->vq_free_cnt;

        /* A positive value indicates that free vring descriptors are needed */
        if (unlikely(need > 0)) {
                virtio_xmit_cleanup_inorder_packed(vq, need);
                need = slots - vq->vq_free_cnt;
                if (unlikely(need > 0)) {
                        PMD_TX_LOG(ERR,
                                   "No free tx descriptors to transmit");
                        return -1;
                }
        }

        /* Enqueue Packet buffers */
        virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect,
                                can_push, 1);

        txvq->stats.bytes += txm->pkt_len;
        return 0;
}
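
/* Slot accounting example (illustrative numbers): with can_push set, a
 * single-segment mbuf fits in one slot because the virtio-net header is
 * pushed into the mbuf headroom; without it, a two-segment mbuf needs
 * 2 + 1 = 3 slots, the extra one carrying the header; with indirect
 * descriptors such a packet consumes only one main-ring slot.
 */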

/* Optionally fill offload information in structure */
static inline int
virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
{
        struct rte_net_hdr_lens hdr_lens;
        uint32_t hdrlen, ptype;
        int l4_supported = 0;

        /* nothing to do */
        if (hdr->flags == 0)
                return 0;

        /* GSO is not supported in the vec path, skip the check */
        m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;

        ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
        m->packet_type = ptype;
        if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
            (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
                l4_supported = 1;

        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
                if (hdr->csum_start <= hdrlen && l4_supported) {
                        m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
                } else {
                        /* Unknown proto or tunnel, do sw cksum. We can assume
                         * the cksum field is in the first segment since the
                         * buffers we provided to the host are large enough.
                         * In case of SCTP, this will be wrong since it's a CRC
                         * but there's nothing we can do.
                         */
                        uint16_t csum = 0, off;

                        if (rte_raw_cksum_mbuf(m, hdr->csum_start,
                                rte_pktmbuf_pkt_len(m) - hdr->csum_start,
                                &csum) < 0)
                                return -1;
                        if (likely(csum != 0xffff))
                                csum = ~csum;
                        off = hdr->csum_offset + hdr->csum_start;
                        if (rte_pktmbuf_data_len(m) >= off + 1)
                                *rte_pktmbuf_mtod_offset(m, uint16_t *,
                                        off) = csum;
                }
        } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
                m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
        }

        return 0;
}
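
/* Checksum fixup example (assuming plain IPv4/TCP with no VLAN): the host
 * sets VIRTIO_NET_HDR_F_NEEDS_CSUM with csum_start = 34 (14 bytes Ethernet
 * + 20 bytes IP) and csum_offset = 16, so the raw checksum is computed
 * over everything from byte 34 onwards and stored at byte 34 + 16 = 50,
 * the TCP checksum field. DATA_VALID instead simply marks the L4 checksum
 * as good.
 */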

static inline uint16_t
virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
                                    struct rte_mbuf **rx_pkts)
{
        uint16_t used_idx, id;
        uint32_t len;
        struct virtqueue *vq = rxvq->vq;
        struct virtio_hw *hw = vq->hw;
        uint32_t hdr_size = hw->vtnet_hdr_size;
        struct virtio_net_hdr *hdr;
        struct vring_packed_desc *desc;
        struct rte_mbuf *cookie;

        desc = vq->vq_packed.ring.desc;
        used_idx = vq->vq_used_cons_idx;
        if (!desc_is_used(&desc[used_idx], vq))
                return -1;

        len = desc[used_idx].len;
        id = desc[used_idx].id;
        cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
        if (unlikely(cookie == NULL)) {
                PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
                                vq->vq_used_cons_idx);
                return -1;
        }
        rte_prefetch0(cookie);
        rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));

        cookie->data_off = RTE_PKTMBUF_HEADROOM;
        cookie->ol_flags = 0;
        cookie->pkt_len = (uint32_t)(len - hdr_size);
        cookie->data_len = (uint32_t)(len - hdr_size);

        hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
                                        RTE_PKTMBUF_HEADROOM - hdr_size);
        if (hw->has_rx_offload)
                virtio_vec_rx_offload(cookie, hdr);

        *rx_pkts = cookie;

        rxvq->stats.bytes += cookie->pkt_len;

        vq->vq_free_cnt++;
        vq->vq_used_cons_idx++;
        if (vq->vq_used_cons_idx >= vq->vq_nentries) {
                vq->vq_used_cons_idx -= vq->vq_nentries;
                vq->vq_packed.used_wrap_counter ^= 1;
        }

        return 0;
}
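
/* A note on the wrap handling above: in a packed ring a descriptor counts
 * as used when its AVAIL and USED flag bits both match the queue's
 * used_wrap_counter, so toggling the counter each time vq_used_cons_idx
 * wraps past vq_nentries keeps descriptors written on the previous lap
 * from being mistaken for freshly used ones.
 */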

static inline void
virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
                              struct rte_mbuf **cookie,
                              uint16_t num)
{
        struct virtqueue *vq = rxvq->vq;
        struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
        uint16_t flags = vq->vq_packed.cached_flags;
        struct virtio_hw *hw = vq->hw;
        struct vq_desc_extra *dxp;
        uint16_t idx, i;
        uint16_t batch_num, total_num = 0;
        uint16_t head_idx = vq->vq_avail_idx;
        uint16_t head_flag = vq->vq_packed.cached_flags;
        uint64_t addr;

        do {
                idx = vq->vq_avail_idx;

                batch_num = PACKED_BATCH_SIZE;
                if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
                        batch_num = vq->vq_nentries - idx;
                if (unlikely((total_num + batch_num) > num))
                        batch_num = num - total_num;

                virtio_for_each_try_unroll(i, 0, batch_num) {
                        dxp = &vq->vq_descx[idx + i];
                        dxp->cookie = (void *)cookie[total_num + i];

                        addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
                                RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
                        start_dp[idx + i].addr = addr;
                        start_dp[idx + i].len = cookie[total_num + i]->buf_len
                                - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
                        /* Skip the head descriptor here: its flags are
                         * stored last so the device never sees a partially
                         * filled refill.
                         */
                        if (total_num || i) {
                                virtqueue_store_flags_packed(&start_dp[idx + i],
                                                flags, hw->weak_barriers);
                        }
                }

                vq->vq_avail_idx += batch_num;
                if (vq->vq_avail_idx >= vq->vq_nentries) {
                        vq->vq_avail_idx -= vq->vq_nentries;
                        vq->vq_packed.cached_flags ^=
                                VRING_PACKED_DESC_F_AVAIL_USED;
                        flags = vq->vq_packed.cached_flags;
                }
                total_num += batch_num;
        } while (total_num < num);

        /* Expose the head descriptor only after all other descriptors
         * have been written.
         */
        virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
                                hw->weak_barriers);
        vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
}

#endif /* _VIRTIO_RXTX_PACKED_H_ */