mbuf: add rte prefix to offload flags
drivers/net/enic/enic_rxtx_vec_avx2.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2008-2018 Cisco Systems, Inc.  All rights reserved.
 * Copyright 2007 Nuova Systems, Inc.  All rights reserved.
 */

#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <rte_vect.h>

#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
#include "enic_rxtx_common.h"

#include <x86intrin.h>
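
/*
 * Scalar helper used by the non-vector paths below: convert one completed
 * CQ descriptor into the mbuf that carries its packet. The mbuf comes from
 * rq->mbuf_ring and has already been filled by the NIC.
 */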
static struct rte_mbuf *
rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
{
        bool tnl;

        *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
        mb->data_len = cqd->bytes_written_flags &
                CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
        mb->pkt_len = mb->data_len;
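        /*
         * The fcoe bit in completed_index_flags doubles as the tunnel
         * indicator when overlay offload is enabled.
         */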
        tnl = enic->overlay_offload && (cqd->completed_index_flags &
                                        CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
        mb->packet_type =
                enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
        enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
        /* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
        if (tnl) {
                mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
                                     RTE_PTYPE_L4_MASK);
        }
        return mb;
}

static uint16_t
enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                             uint16_t nb_pkts)
{
        struct rte_mbuf **rx, **rxmb;
        uint16_t cq_idx, nb_rx, max_rx;
        struct cq_enet_rq_desc *cqd;
        struct rq_enet_desc *rqd;
        struct vnic_cq *cq;
        struct vnic_rq *rq;
        struct enic *enic;
        uint8_t color;

        rq = rx_queue;
        enic = vnic_dev_priv(rq->vdev);
        cq = &enic->cq[enic_cq_rq(enic, rq->index)];
        cq_idx = cq->to_clean;

        /*
         * Fill up the reserve of free mbufs. Below, we restock the receive
         * ring with these mbufs to avoid allocation failures.
         */
        if (rq->num_free_mbufs == 0) {
                if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
                                         ENIC_RX_BURST_MAX))
                        return 0;
                rq->num_free_mbufs = ENIC_RX_BURST_MAX;
        }
        /* Receive until the end of the ring, at most. */
        max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
        max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);

        rxmb = rq->mbuf_ring + cq_idx;
        color = cq->last_color;
        cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
        rx = rx_pkts;
        if (max_rx == 0 ||
            (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
                return 0;

        /*
         * Step 1: Process one packet, if needed, so that the 256-bit loads
         * below are aligned.
         */
        if (cq_idx & 0x1) {
                if (unlikely(cqd->bytes_written_flags &
                             CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
                        rte_pktmbuf_free(*rxmb++);
                        rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
                } else {
                        *rx++ = rx_one(cqd, *rxmb++, enic);
                }
                cqd++;
                max_rx--;
        }

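        /*
         * Mask applied to each pair of 16B descriptors before the field
         * shuffle below. It zeroes the flag bits (bits 14-15) of
         * bytes_written_flags so only the 14-bit length reaches
         * pkt_len/data_len, and clears the fields not copied into the mbuf.
         */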
        const __m256i mask =
                _mm256_set_epi8(/* Second descriptor */
                        0xff, /* type_color */
                        (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
                         CQ_ENET_RQ_DESC_FLAGS_IPV4 |
                         CQ_ENET_RQ_DESC_FLAGS_IPV6 |
                         CQ_ENET_RQ_DESC_FLAGS_TCP |
                         CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
                        0, 0, /* checksum_fcoe */
                        0xff, 0xff, /* vlan */
                        0x3f, 0xff, /* bytes_written_flags */
                        0xff, 0xff, 0xff, 0xff, /* rss_hash */
                        0xff, 0xff, /* q_number_rss_type_flags */
                        0, 0, /* completed_index_flags */
                        /* First descriptor */
                        0xff, /* type_color */
                        (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
                         CQ_ENET_RQ_DESC_FLAGS_IPV4 |
                         CQ_ENET_RQ_DESC_FLAGS_IPV6 |
                         CQ_ENET_RQ_DESC_FLAGS_TCP |
                         CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
                        0, 0, /* checksum_fcoe */
                        0xff, 0xff, /* vlan */
                        0x3f, 0xff, /* bytes_written_flags */
                        0xff, 0xff, 0xff, 0xff, /* rss_hash */
                        0xff, 0xff, /* q_number_rss_type_flags */
                        0, 0 /* completed_index_flags */
                        );
        const __m256i shuffle_mask =
                _mm256_set_epi8(/* Second descriptor */
                        7, 6, 5, 4,             /* rss = rss_hash */
                        11, 10,                 /* vlan_tci = vlan */
                        9, 8,                   /* data_len = bytes_written */
                        0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
                        0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
                        /* First descriptor */
                        7, 6, 5, 4,             /* rss = rss_hash */
                        11, 10,                 /* vlan_tci = vlan */
                        9, 8,                   /* data_len = bytes_written */
                        0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
                        0x80, 0x80, 0x80, 0x80  /* packet_type = 0 */
                        );
        /* Used to collect 8 flags from 8 desc into one register */
        const __m256i flags_shuffle_mask =
                _mm256_set_epi8(/* Second descriptor */
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        /* First descriptor */
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        /*
                         * Byte 3: upper byte of completed_index_flags
                         *         bit 5 = fcoe (tunnel)
                         * Byte 2: upper byte of q_number_rss_type_flags
                         *         bits 2,3,4,5 = rss type
                         *         bit 6 = csum_not_calc
                         * Byte 1: upper byte of bytes_written_flags
                         *         bit 6 = truncated
                         *         bit 7 = vlan stripped
                         * Byte 0: flags
                         */
                        1, 3, 9, 14
                        );
        /* Used to collect 8 VLAN IDs from 8 desc into one register */
        const __m256i vlan_shuffle_mask =
                _mm256_set_epi8(/* Second descriptor */
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        /* First descriptor */
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10);
        /* RTE_MBUF_F_RX_RSS_HASH is 1 << 1, so it fits in an 8-bit integer */
        const __m256i rss_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH,
                        0, /* rss_types = 0 */
                        /* first 128 bits */
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
                        RTE_MBUF_F_RX_RSS_HASH,
                        0 /* rss_types = 0 */);
        /*
         * VLAN offload flags.
         * shuffle index:
         * vlan_stripped => bit 0
         * vlan_id == 0  => bit 1
         */
        const __m256i vlan_shuffle =
                _mm256_set_epi32(0, 0, 0, 0,
                        RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0,
                        RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
                        RTE_MBUF_F_RX_VLAN);
        /* Use the same shuffle index as vlan_shuffle */
        const __m256i vlan_ptype_shuffle =
                _mm256_set_epi32(0, 0, 0, 0,
                                 RTE_PTYPE_L2_ETHER,
                                 RTE_PTYPE_L2_ETHER,
                                 RTE_PTYPE_L2_ETHER,
                                 RTE_PTYPE_L2_ETHER_VLAN);
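        /*
         * Index 0 (not stripped, vlan_id != 0) is the only case where the
         * VLAN tag is still in the packet, hence RTE_PTYPE_L2_ETHER_VLAN.
         */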
        /*
         * CKSUM flags. Shift right so they fit into 8-bit integers.
         * shuffle index:
         * ipv4_csum_ok    => bit 3
         * ip4             => bit 2
         * tcp_or_udp      => bit 1
         * tcp_udp_csum_ok => bit 0
         */
        const __m256i csum_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        /* 1111 ip4+ip4_ok+l4+l4_ok */
                        ((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
                        /* 1110 ip4_ok+ip4+l4+!l4_ok */
                        ((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
                        (RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
                        (RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),  /* 1010 l4+!l4_ok */
                        0, /* 1001 */
                        0, /* 1000 */
                        /* 0111 !ip4_ok+ip4+l4+l4_ok */
                        ((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
                        /* 0110 !ip4_ok+ip4+l4+!l4_ok */
                        ((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
                        (RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1), /* 0101 !ip4_ok+ip4 */
                        (RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1), /* 0100 !ip4_ok+ip4 */
                        (RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
                        (RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),  /* 0010 l4+!l4_ok */
                        0, /* 0001 */
                        0, /* 0000 */
                        /* first 128 bits */
                        ((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
                        ((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
                        (RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
                        (RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),
                        0, 0,
                        ((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
                        ((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
                        (RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1),
                        (RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1),
                        (RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
                        (RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),
                        0, 0);
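        /*
         * Worked example: a descriptor with ipv4 (bit 5), ipv4_csum_ok
         * (bit 3), tcp (bit 2) and tcp_udp_csum_ok (bit 0) set, and
         * csum_not_calc clear, builds index 1111 in the loop below, which
         * this shuffle maps to
         * RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD.
         */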
        /*
         * Non-fragment PTYPEs.
         * Shuffle 4-bit index:
         * ip6 => bit 0
         * ip4 => bit 1
         * udp => bit 2
         * tcp => bit 3
         *   bit
         * 3 2 1 0
         * -------
         * 0 0 0 0 unknown
         * 0 0 0 1 ip6 | nonfrag
         * 0 0 1 0 ip4 | nonfrag
         * 0 0 1 1 unknown
         * 0 1 0 0 unknown
         * 0 1 0 1 ip6 | udp
         * 0 1 1 0 ip4 | udp
         * 0 1 1 1 unknown
         * 1 0 0 0 unknown
         * 1 0 0 1 ip6 | tcp
         * 1 0 1 0 ip4 | tcp
         * 1 0 1 1 unknown
         * 1 1 0 0 unknown
         * 1 1 0 1 unknown
         * 1 1 1 0 unknown
         * 1 1 1 1 unknown
         *
         * PTYPEs do not fit in 8 bits, so shift them right by 4.
         */
        const __m256i nonfrag_ptype_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        RTE_PTYPE_UNKNOWN,
                        /* first 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        RTE_PTYPE_UNKNOWN);
        /* Fragment PTYPEs. Use the same shuffle index as above. */
        const __m256i frag_ptype_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN,
                        /* first 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN);
        /*
         * Tunnel PTYPEs. Use the same shuffle index as above.
         * L4 types are not part of this table. They come from non-tunnel
         * types above.
         */
        const __m256i tnl_l3_ptype_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN,
                        /* first 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN);

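        /*
         * mbuf_initializer is replicated into both 128-bit lanes so the
         * rearm blends below can source data_off/refcnt/nb_segs/port for
         * two mbufs at a time.
         */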
        const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
                                                    0, enic->mbuf_initializer);

        /*
         * --- cq desc fields ---    offset
         * completed_index_flags    - 0   use: fcoe
         * q_number_rss_type_flags  - 2   use: rss types, csum_not_calc
         * rss_hash                 - 4   ==> mbuf.hash.rss
         * bytes_written_flags      - 8   ==> mbuf.pkt_len,data_len
         *                                use: truncated, vlan_stripped
         * vlan                     - 10  ==> mbuf.vlan_tci
         * checksum_fcoe            - 12  (unused)
         * flags                    - 14  use: all bits
         * type_color               - 15  (unused)
         *
         * --- mbuf fields ---       offset
         * rearm_data              ---- 16
         * data_off    - 0      (mbuf_init) -+
         * refcnt      - 2      (mbuf_init)  |
         * nb_segs     - 4      (mbuf_init)  | 16B 128b
         * port        - 6      (mbuf_init)  |
         * ol_flag     - 8      (from cqd)  -+
         * rx_descriptor_fields1   ---- 32
         * packet_type - 0      (from cqd)  -+
         * pkt_len     - 4      (from cqd)   |
         * data_len    - 8      (from cqd)   | 16B 128b
         * vlan_tci    - 10     (from cqd)   |
         * rss         - 12     (from cqd)  -+
         */
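
        /*
         * For reference, one iteration of the vector loop below does roughly
         * the following for each of 8 descriptors (illustrative scalar
         * sketch only; see rx_one() for the real scalar version):
         *
         *   mb = rxmb[i];
         *   *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
         *   mb->ol_flags = rss, checksum and vlan flags from cqd[i];
         *   mb->packet_type = ptype derived from cqd[i] flags;
         *   mb->pkt_len = mb->data_len = cqd[i].bytes_written & 0x3fff;
         *   mb->vlan_tci = cqd[i].vlan;
         *   mb->hash.rss = cqd[i].rss_hash;
         *   rx_pkts[i] = mb;
         */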

        __m256i overlay_enabled =
                _mm256_set1_epi32((uint32_t)enic->overlay_offload);

        /* Step 2: Process 8 packets per loop using SIMD */
        while (max_rx > 7 && (((cqd + 7)->type_color &
                               CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
                /* Load 8 16B CQ descriptors */
                __m256i cqd01 = _mm256_load_si256((void *)cqd);
                __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
                __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
                __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
                /* Copy 8 mbuf pointers to rx_pkts */
                _mm256_storeu_si256((void *)rx,
                                    _mm256_loadu_si256((void *)rxmb));
                _mm256_storeu_si256((void *)(rx + 4),
                                    _mm256_loadu_si256((void *)(rxmb + 4)));

                /*
                 * Collect 8 flags (each 32 bits) into one register.
                 * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
                 */
                __m256i flags01 =
                        _mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
                /*
                 * The shuffle above produces 8 x 32-bit flags for 8
                 * descriptors in this order: 0, 0, 0, 0, 1, 1, 1, 1
                 * The duplicates in each 128-bit lane simplify the blending
                 * below.
                 */
                __m256i flags23 =
                        _mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
                __m256i flags45 =
                        _mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
                __m256i flags67 =
                        _mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
                /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
                __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
                /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
                __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
                /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
                __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
                /*
                 * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
                 * This order simplifies the blend operations further below
                 * that produce 'rearm' data for each mbuf.
                 */
                flags0_7 = _mm256_permute4x64_epi64(flags0_7,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);

                /*
                 * Check the truncated bits and bail out early. The slli/srli
                 * pair isolates the truncated bit (bit 14 of each collected
                 * flags dword).
                 * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
                 */
                __m256i trunc =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
                trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2));
                /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
                if (_mm256_extract_epi64(trunc, 0) ||
                    _mm256_extract_epi64(trunc, 1))
                        break;

                /*
                 * Compute RTE_MBUF_F_RX_RSS_HASH.
                 * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
                 * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
                 * Everything else is zero.
                 */
                __m256i rss_types =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
                /*
                 * RSS flags (RTE_MBUF_F_RX_RSS_HASH) are in
                 * byte 0, 4, 8, 12, 16, 20, 24, 28
                 * Everything else is zero.
                 */
                __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);

                /*
                 * Compute CKSUM flags. First build the index and then
                 * use it to shuffle csum_shuffle.
                 * 20 instructions including const loads: 2.5 inst/desc
                 */
                /*
                 * csum_not_calc (bit 22)
                 * csum_not_calc (0) => 0xffffffff
                 * csum_not_calc (1) => 0x0
                 */
                const __m256i zero4 = _mm256_setzero_si256();
                const __m256i mask22 = _mm256_set1_epi32(0x400000);
                __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
                        _mm256_and_si256(flags0_7, mask22));
                /*
                 * (tcp|udp) && !fragment => bit 1
                 * tcp = bit 2, udp = bit 1, frag = bit 6
                 */
                const __m256i mask1 = _mm256_set1_epi32(0x2);
                __m256i tcp_udp =
                        _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
                                _mm256_or_si256(flags0_7,
                                        _mm256_srli_epi32(flags0_7, 1)));
                tcp_udp = _mm256_and_si256(tcp_udp, mask1);
                /* ipv4 (bit 5) => bit 2 */
                const __m256i mask2 = _mm256_set1_epi32(0x4);
                __m256i ipv4 = _mm256_and_si256(mask2,
                        _mm256_srli_epi32(flags0_7, 3));
                /*
                 * ipv4_csum_ok (bit 3) => bit 3
                 * tcp_udp_csum_ok (bit 0) => bit 0
                 * 0x9
                 */
                const __m256i mask0_3 = _mm256_set1_epi32(0x9);
                __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
                csum_idx = _mm256_and_si256(csum_not_calc,
                        _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
                                tcp_udp));
                __m256i csum_flags =
                        _mm256_shuffle_epi8(csum_shuffle, csum_idx);
                /* Shift left to restore the CKSUM flags. See csum_shuffle. */
                csum_flags = _mm256_slli_epi32(csum_flags, 1);
                /* Combine csum flags and offload flags: 0.125 inst/desc */
                rss_flags = _mm256_or_si256(rss_flags, csum_flags);

                /*
                 * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
                 * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
                 * 1.25 inst/desc
                 */
                __m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
                __m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
                __m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
                __m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
                __m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
                __m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
                /* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
                __m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
                /* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
                vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                /*
                 * Comparing 0 == vlan_id produces 0xffffffff (-1) for
                 * vlan 0 and 0 for vlan non-0. Then subtracting the
                 * result from 0 produces 0 - (-1) = 1 for vlan 0, and
                 * 0 - 0 = 0 for vlan non-0.
                 */
                vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
                /* vlan_id != 0 => 0, vlan_id == 0 => 1 */
                vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);

                /*
                 * Compute RTE_MBUF_F_RX_VLAN and RTE_MBUF_F_RX_VLAN_STRIPPED.
                 * Use 3 shifts, 1 or, 1 permute for 8 desc: 0.625 inst/desc
                 * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
                 * Everything else is zero.
                 */
                __m256i vlan_idx =
                        _mm256_or_si256(/* vlan_stripped => bit 0 */
                                _mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
                                        16), 31),
                                /* (vlan_id == 0) => bit 1 */
                                _mm256_slli_epi32(vlan0_7, 1));
                /*
                 * The index captures 4 cases.
                 * stripped, id = 0   ==> 11b = 3
                 * stripped, id != 0  ==> 01b = 1
                 * not strip, id == 0 ==> 10b = 2
                 * not strip, id != 0 ==> 00b = 0
                 */
                __m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
                        vlan_idx);
                /* Combine vlan and offload flags: 0.125 inst/desc */
                rss_flags = _mm256_or_si256(rss_flags, vlan_flags);

                /*
                 * Compute non-tunnel PTYPEs.
                 * 17 inst / 8 desc = 2.125 inst/desc
                 */
                /* ETHER and ETHER_VLAN */
                __m256i vlan_ptype =
                        _mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
                                vlan_idx);
                /* Build the ptype index from flags */
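                /* tcp (bit 2) => index bit 3, udp (bit 1) => index bit 2 */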
                tcp_udp = _mm256_slli_epi32(flags0_7, 29);
                tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
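                /* ipv4 (bit 5) => index bit 1, ipv6 (bit 4) => index bit 0 */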
                __m256i ip4_ip6 =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
                __m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
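                /* ipv4_fragment (bit 6) => frag_bit */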
                __m256i frag_bit =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
                __m256i nonfrag_ptype =
                        _mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
                __m256i frag_ptype =
                        _mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
                /*
                 * Zero out the unwanted types and combine the remaining bits.
                 * The effect is the same as selecting non-frag or frag types
                 * depending on the frag bit.
                 */
                nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
                        _mm256_cmpeq_epi32(zero4, frag_bit));
                frag_ptype = _mm256_and_si256(frag_ptype,
                        _mm256_cmpgt_epi32(frag_bit, zero4));
                __m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
                ptype = _mm256_slli_epi32(ptype, 4);
                /*
                 * Compute tunnel PTYPEs.
                 * 15 inst / 8 desc = 1.875 inst/desc
                 */
                __m256i tnl_l3_ptype =
                        _mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
                tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
                /*
                 * Shift non-tunnel L4 types to make them tunnel types.
                 * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
                 */
                __m256i tnl_l4_ptype =
                        _mm256_slli_epi32(_mm256_and_si256(ptype,
                                _mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
                __m256i tnl_ptype =
                        _mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
                tnl_ptype = _mm256_or_si256(tnl_ptype,
                        _mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
                                RTE_PTYPE_INNER_L2_ETHER));
                /*
                 * Select non-tunnel or tunnel types by zeroing out the
                 * unwanted ones.
                 */
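                /* The fcoe (tunnel) bit is bit 29 of the collected flags */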
                __m256i tnl_flags = _mm256_and_si256(overlay_enabled,
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
                tnl_ptype = _mm256_and_si256(tnl_ptype,
                        _mm256_sub_epi32(zero4, tnl_flags));
                ptype = _mm256_and_si256(ptype,
                        _mm256_cmpeq_epi32(zero4, tnl_flags));
                /*
                 * Combine types and swap to have ptypes in the same order
                 * as desc.
                 * desc: 0 2 4 6 1 3 5 7
                 * 3 inst / 8 desc = 0.375 inst/desc
                 */
                ptype = _mm256_or_si256(ptype, tnl_ptype);
                ptype = _mm256_or_si256(ptype, vlan_ptype);
                ptype = _mm256_permute4x64_epi64(ptype,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);

                /*
                 * Mask packet length.
                 * Use 4 ands: 0.5 instructions/desc
                 */
                cqd01 = _mm256_and_si256(cqd01, mask);
                cqd23 = _mm256_and_si256(cqd23, mask);
                cqd45 = _mm256_and_si256(cqd45, mask);
                cqd67 = _mm256_and_si256(cqd67, mask);
                /*
                 * Shuffle. Two 16B sets of the mbuf fields.
                 * packet_type, pkt_len, data_len, vlan_tci, rss
                 */
                __m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
                __m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
                __m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
                __m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);

                /*
                 * Blend in ptypes
                 * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
                 */
                rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
                rearm23 = _mm256_blend_epi32(rearm23,
                        _mm256_shuffle_epi32(ptype, 1), 0x11);
                rearm45 = _mm256_blend_epi32(rearm45,
                        _mm256_shuffle_epi32(ptype, 2), 0x11);
                rearm67 = _mm256_blend_epi32(rearm67,
                        _mm256_shuffle_epi32(ptype, 3), 0x11);

                /*
                 * Move rss_flags into ol_flags in mbuf_init.
                 * 4 blends and 3 shifts for 8 desc: 0.875 inst/desc
                 */
                __m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
                        rss_flags, 0x44);
                __m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
                        _mm256_slli_si256(rss_flags, 4), 0x44);
                __m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
                        _mm256_slli_si256(rss_flags, 8), 0x44);
                __m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
                        _mm256_srli_si256(rss_flags, 4), 0x44);

                /*
                 * Build rearm, one per desc.
                 * 8 blends and 4 permutes: 1.5 inst/desc
                 */
                __m256i rearm0 = _mm256_blend_epi32(rearm01,
                        mbuf_init0_1, 0xf0);
                __m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
                        rearm01, 0xf0);
                __m256i rearm2 = _mm256_blend_epi32(rearm23,
                        mbuf_init2_3, 0xf0);
                __m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
                        rearm23, 0xf0);
                /* Swap upper and lower 64 bits */
                rearm0 = _mm256_permute4x64_epi64(rearm0,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                rearm2 = _mm256_permute4x64_epi64(rearm2,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                /* Second set of 4 descriptors */
                __m256i rearm4 = _mm256_blend_epi32(rearm45,
                        mbuf_init4_5, 0xf0);
                __m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
                        rearm45, 0xf0);
                __m256i rearm6 = _mm256_blend_epi32(rearm67,
                        mbuf_init6_7, 0xf0);
                __m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
                        rearm67, 0xf0);
                rearm4 = _mm256_permute4x64_epi64(rearm4,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                rearm6 = _mm256_permute4x64_epi64(rearm6,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);

                /*
                 * Write out 32B of mbuf fields.
                 * data_off    - off 0  (mbuf_init)
                 * refcnt      - 2      (mbuf_init)
                 * nb_segs     - 4      (mbuf_init)
                 * port        - 6      (mbuf_init)
                 * ol_flag     - 8      (from cqd)
                 * packet_type - 16     (from cqd)
                 * pkt_len     - 20     (from cqd)
                 * data_len    - 24     (from cqd)
                 * vlan_tci    - 26     (from cqd)
                 * rss         - 28     (from cqd)
                 */
                _mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
                _mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
                _mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
                _mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
                _mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
                _mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
                _mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
                _mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);

                max_rx -= 8;
                cqd += 8;
                rx += 8;
                rxmb += 8;
        }

        /*
         * Step 3: Slow path to handle a small (<8) number of packets and
         * occasional truncated packets.
         */
        while (max_rx && ((cqd->type_color &
                           CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
                if (unlikely(cqd->bytes_written_flags &
                             CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
                        rte_pktmbuf_free(*rxmb++);
                        rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
                } else {
                        *rx++ = rx_one(cqd, *rxmb++, enic);
                }
                cqd++;
                max_rx--;
        }

        /* Number of descriptors visited */
        nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
        if (nb_rx == 0)
                return 0;
        rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
        rxmb = rq->mbuf_ring + cq_idx;
        cq_idx += nb_rx;
        rq->rx_nb_hold += nb_rx;
        if (unlikely(cq_idx == cq->ring.desc_count)) {
                cq_idx = 0;
                cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
        }
        cq->to_clean = cq_idx;

        /* Step 4: Restock RQ with new mbufs */
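        /*
         * The unused part of the mbuf reserve starts at offset
         * ENIC_RX_BURST_MAX - num_free_mbufs.
         */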
        memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
               sizeof(struct rte_mbuf *) * nb_rx);
        rq->num_free_mbufs -= nb_rx;
        while (nb_rx) {
                rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
                nb_rx--;
                rqd++;
                rxmb++;
        }
        if (rq->rx_nb_hold > rq->rx_free_thresh) {
                rq->posted_index = enic_ring_add(rq->ring.desc_count,
                                                 rq->posted_index,
                                                 rq->rx_nb_hold);
                rq->rx_nb_hold = 0;
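                /* Descriptor writes must land before the posted index update */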
                rte_wmb();
                iowrite32_relaxed(rq->posted_index,
                                  &rq->ctrl->posted_index);
        }

        return rx - rx_pkts;
}

bool
enic_use_vector_rx_handler(struct rte_eth_dev *eth_dev)
{
        struct enic *enic = pmd_priv(eth_dev);

        /* The user must explicitly request the avx2 handler */
        if (!enic->enable_avx2_rx)
                return false;
        /* Scatter Rx is not supported */
        if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
                return false;
        if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
                        rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256) {
                ENICPMD_LOG(DEBUG, " use the non-scatter avx2 Rx handler");
                eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
                enic->use_noscatter_vec_rx_handler = 1;
                return true;
        }
        return false;
}