/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved.
 * Copyright 2007 Nuova Systems, Inc. All rights reserved.
 */

#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <rte_vect.h>
#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
#include "enic_rxtx_common.h"

#include <x86intrin.h>

static struct rte_mbuf *
rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
{
	bool tnl;

	*(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
	mb->data_len = cqd->bytes_written_flags &
		CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
	mb->pkt_len = mb->data_len;
	tnl = enic->overlay_offload && (cqd->completed_index_flags &
		CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
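	/*
	 * Note: with overlay offload enabled, the adapter repurposes the
	 * FCoE bit in completed_index_flags as a "tunneled packet"
	 * indicator, which is why it selects the tunnel ptype path here
	 * (see enic_cq_rx_flags_to_pkt_type()).
	 */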
	mb->packet_type =
		enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
	enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
	/* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
	if (tnl) {
		mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
				     RTE_PTYPE_L4_MASK);
	}
	return mb;
}

static uint16_t
enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct rte_mbuf **rx, **rxmb;
	uint16_t cq_idx, nb_rx, max_rx;
	struct cq_enet_rq_desc *cqd;
	struct rq_enet_desc *rqd;
	struct vnic_cq *cq;
	struct vnic_rq *rq;
	struct enic *enic;
	uint8_t color;

	rq = rx_queue;
	enic = vnic_dev_priv(rq->vdev);
	cq = &enic->cq[enic_cq_rq(enic, rq->index)];
	cq_idx = cq->to_clean;

	/*
	 * Fill up the reserve of free mbufs. Below, we restock the receive
	 * ring with these mbufs to avoid allocation failures.
	 */
	if (rq->num_free_mbufs == 0) {
		if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
					 ENIC_RX_BURST_MAX))
			return 0;
		rq->num_free_mbufs = ENIC_RX_BURST_MAX;
	}

	/* Receive until the end of the ring, at most. */
	max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
	max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);
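	/*
	 * Clamping the burst to the ring end means it never wraps, so the
	 * SIMD loop below can use plain pointer arithmetic on cqd and rxmb.
	 */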

	rx = rx_pkts;
	rxmb = rq->mbuf_ring + cq_idx;
	color = cq->last_color;
	cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;

	if (max_rx == 0 ||
	    (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
		return 0;

	/* Step 1: Process one packet to do aligned 256-bit load below */
	if (cq_idx & 0x1) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}
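
	/*
	 * After the odd-index peel above, cqd is 32B aligned: descriptors
	 * are 16B each, so an even index starts a naturally aligned pair,
	 * which is what the aligned _mm256_load_si256() calls below rely on.
	 */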
	const __m256i mask =
		_mm256_set_epi8(/* Second descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0, /* completed_index_flags */
			/* First descriptor */
			0xff, /* type_color */
			(CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
			 CQ_ENET_RQ_DESC_FLAGS_IPV4 |
			 CQ_ENET_RQ_DESC_FLAGS_IPV6 |
			 CQ_ENET_RQ_DESC_FLAGS_TCP |
			 CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
			0, 0, /* checksum_fcoe */
			0xff, 0xff, /* vlan */
			0x3f, 0xff, /* bytes_written_flags */
			0xff, 0xff, 0xff, 0xff, /* rss_hash */
			0xff, 0xff, /* q_number_rss_type_flags */
			0, 0 /* completed_index_flags */);
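	/*
	 * The 0x3f, 0xff pair keeps only the 14-bit length in
	 * bytes_written_flags (bit 14 = truncated, bit 15 = vlan stripped),
	 * so the masked descriptors yield clean pkt_len/data_len values.
	 */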
	const __m256i shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			7, 6, 5, 4, /* rss = rss_hash */
			11, 10, /* vlan_tci = vlan */
			9, 8, /* data_len = bytes_written */
			0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
			/* First descriptor */
			7, 6, 5, 4, /* rss = rss_hash */
			11, 10, /* vlan_tci = vlan */
			9, 8, /* data_len = bytes_written */
			0x80, 0x80, 9, 8, /* pkt_len = bytes_written */
			0x80, 0x80, 0x80, 0x80 /* packet_type = 0 */);
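	/*
	 * In vpshufb, an index byte with the high bit set (0x80) writes a
	 * zero byte. That is how pkt_len's upper half and packet_type are
	 * cleared here; the real packet_type is blended in later.
	 */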
	/* Used to collect 8 flags from 8 desc into one register */
	const __m256i flags_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			/* First descriptor */
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14,
			1, 3, 9, 14);
	/*
	 * Byte 3: upper byte of completed_index_flags
	 *         bit 5 = fcoe (tunnel)
	 * Byte 2: upper byte of q_number_rss_type_flags
	 *         bits 2,3,4,5 = rss type
	 *         bit 6 = csum_not_calc
	 * Byte 1: upper byte of bytes_written_flags
	 *         bit 6 = truncated
	 *         bit 7 = vlan stripped
	 * Byte 0: flags
	 */
	/* Used to collect 8 VLAN IDs from 8 desc into one register */
	const __m256i vlan_shuffle_mask =
		_mm256_set_epi8(/* Second descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			/* First descriptor */
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10,
			0x80, 0x80, 11, 10);
	/* RTE_MBUF_F_RX_RSS_HASH is 1<<1 so it fits in an 8-bit integer */
	const __m256i rss_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH,
			0, /* rss_types = 0 */
			/* first 128 bits */
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,
			RTE_MBUF_F_RX_RSS_HASH,
			0 /* rss_types = 0 */);
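	/*
	 * rss_types (computed below) is a 4-bit value per descriptor: 0
	 * means "no RSS", and any of the 15 non-zero hash types means a
	 * valid hash. Using it as a vpshufb index against this table turns,
	 * e.g., rss_type 4 into RTE_MBUF_F_RX_RSS_HASH and rss_type 0 into 0.
	 */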
	/*
	 * VLAN offload flags.
	 * Shuffle index:
	 * vlan_stripped => bit 0
	 * vlan_id == 0 => bit 1
	 */
	const __m256i vlan_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN);
	/* Use the same shuffle index as vlan_shuffle */
	const __m256i vlan_ptype_shuffle =
		_mm256_set_epi32(0, 0, 0, 0,
			RTE_PTYPE_L2_ETHER,
			RTE_PTYPE_L2_ETHER,
			RTE_PTYPE_L2_ETHER,
			RTE_PTYPE_L2_ETHER_VLAN);
	/*
	 * CKSUM flags. Shift right so they fit in 8-bit integers.
	 * Shuffle index:
	 * ipv4_csum_ok => bit 3
	 * ipv4 => bit 2
	 * tcp_or_udp => bit 1
	 * tcp_udp_csum_ok => bit 0
	 */
	const __m256i csum_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			/* 1111 ip4+ip4_ok+l4+l4_ok */
			((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			/* 1110 ip4_ok+ip4+l4+!l4_ok */
			((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
			/* 1101 ip4+ip4_ok */
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			/* 1100 ip4_ok+ip4 */
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			/* 1011 l4+l4_ok */
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			/* 1010 l4+!l4_ok */
			(RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),
			0, /* 1001 */
			0, /* 1000 */
			/* 0111 !ip4_ok+ip4+l4+l4_ok */
			((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			/* 0110 !ip4_ok+ip4+l4+!l4_ok */
			((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
			/* 0101 !ip4_ok+ip4 */
			(RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1),
			/* 0100 !ip4_ok+ip4 */
			(RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1),
			/* 0011 l4+l4_ok */
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			/* 0010 l4+!l4_ok */
			(RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),
			0, /* 0001 */
			0, /* 0000 */
			/* first 128 bits */
			((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			(RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),
			0, 0,
			((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
			((RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1),
			(RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1),
			(RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1),
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
			(RTE_MBUF_F_RX_L4_CKSUM_BAD >> 1),
			0, 0);
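	/*
	 * Worked example: flags ip4=1, ip4_ok=1, l4=1, l4_ok=1 build index
	 * 1111, which looks up (IP_CKSUM_GOOD | L4_CKSUM_GOOD) >> 1. The
	 * >> 1 is needed because RTE_MBUF_F_RX_L4_CKSUM_GOOD (1 << 8) does
	 * not fit in a shuffle-table byte; the result is shifted back left
	 * by 1 after the lookup below.
	 */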
	/*
	 * Non-fragment PTYPEs.
	 * Shuffle 4-bit index:
	 * tcp udp ip4 ip6
	 * 0 0 0 0 unknown
	 * 0 0 0 1 ip6 | nonfrag
	 * 0 0 1 0 ip4 | nonfrag
	 * 0 0 1 1 unknown
	 * 0 1 0 0 unknown
	 * 0 1 0 1 ip6 | udp
	 * 0 1 1 0 ip4 | udp
	 * 0 1 1 1 unknown
	 * 1 0 0 0 unknown
	 * 1 0 0 1 ip6 | tcp
	 * 1 0 1 0 ip4 | tcp
	 * 1 0 1 1 unknown
	 * 1 1 0 0 unknown
	 * 1 1 0 1 unknown
	 * 1 1 1 0 unknown
	 * 1 1 1 1 unknown
	 * PTYPEs do not fit in 8 bits, so shift right 4 bits.
	 */
	const __m256i nonfrag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_NONFRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
	/* Fragment PTYPEs. Use the same shuffle index as above. */
	const __m256i frag_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			(RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			(RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
			 RTE_PTYPE_L4_FRAG) >> 4,
			RTE_PTYPE_UNKNOWN);
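	/*
	 * The index may still carry the TCP/UDP bits for a fragment, which
	 * is why every populated entry above maps to L4_FRAG regardless of
	 * the L4 bits; the frag bit itself selects this table below.
	 */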
	/*
	 * Tunnel PTYPEs. Use the same shuffle index as above.
	 * L4 types are not part of this table. They come from non-tunnel
	 * types above.
	 */
	const __m256i tnl_l3_ptype_shuffle =
		_mm256_set_epi8(/* second 128 bits */
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN,
			/* first 128 bits */
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
			RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
			RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
			RTE_PTYPE_UNKNOWN);

	const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
						    0, enic->mbuf_initializer);
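	/*
	 * mbuf_initializer is the 8B rearm template (data_off, refcnt,
	 * nb_segs, port). Placing it in both 128-bit lanes lets one
	 * register be blended against a descriptor pair at a time.
	 */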

	/*
	 * --- cq desc fields ---          offset
	 * completed_index_flags    - 0    use: fcoe
	 * q_number_rss_type_flags  - 2    use: rss types, csum_not_calc
	 * rss_hash                 - 4    ==> mbuf.hash.rss
	 * bytes_written_flags      - 8    ==> mbuf.pkt_len,data_len
	 *                                 use: truncated, vlan_stripped
	 * vlan                     - 10   ==> mbuf.vlan_tci
	 * checksum_fcoe            - 12   (unused)
	 * flags                    - 14   use: all bits
	 * type_color               - 15   (unused)
	 *
	 * --- mbuf fields ---             offset
	 * rearm_data            ---- 16
	 * data_off    - 0  (mbuf_init) -+
	 * refcnt      - 2  (mbuf_init)  |
	 * nb_segs     - 4  (mbuf_init)  | 16B 128b
	 * port        - 6  (mbuf_init)  |
	 * ol_flags    - 8  (from cqd)  -+
	 * rx_descriptor_fields1 ---- 32
	 * packet_type - 0  (from cqd)  -+
	 * pkt_len     - 4  (from cqd)   |
	 * data_len    - 8  (from cqd)   | 16B 128b
	 * vlan_tci    - 10 (from cqd)   |
	 * rss         - 12 (from cqd)  -+
	 */

	__m256i overlay_enabled =
		_mm256_set1_epi32((uint32_t)enic->overlay_offload);

	/* Step 2: Process 8 packets per loop using SIMD */
	while (max_rx > 7 && (((cqd + 7)->type_color &
			       CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
		/* Load 8 16B CQ descriptors */
		__m256i cqd01 = _mm256_load_si256((void *)cqd);
		__m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
		__m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
		__m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
		/* Copy 8 mbuf pointers to rx_pkts */
		_mm256_storeu_si256((void *)rx,
				    _mm256_loadu_si256((void *)rxmb));
		_mm256_storeu_si256((void *)(rx + 4),
				    _mm256_loadu_si256((void *)(rxmb + 4)));

		/*
		 * Collect 8 flags (each 32 bits) into one register.
		 * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
		 */
		__m256i flags01 =
			_mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
		/*
		 * Each shuffle produces 8 x 32-bit flags for 2 descriptors
		 * in this order: 0, 0, 0, 0, 1, 1, 1, 1
		 * The duplicates in each 128-bit lane simplify the blending
		 * below.
		 */
		__m256i flags23 =
			_mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
		__m256i flags45 =
			_mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
		__m256i flags67 =
			_mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
		/* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
		__m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
		/* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
		__m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
		/* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
		/*
		 * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
		 * This order simplifies blend operations way below that
		 * produce 'rearm' data for each mbuf.
		 */
		flags0_7 = _mm256_permute4x64_epi64(flags0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Check truncated bits and bail out early on.
		 * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
		 */
		__m256i trunc =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
		trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
			(1 << 6) + (0 << 4) + (3 << 2) + 2));
		/* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
		if (_mm256_extract_epi64(trunc, 0) ||
		    _mm256_extract_epi64(trunc, 1))
			break;
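		/*
		 * The break leaves cqd pointing at the first descriptor of
		 * this group of 8; Step 3 below re-walks them one at a time
		 * and frees the mbufs of any truncated packets.
		 */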

		/*
		 * Compute RTE_MBUF_F_RX_RSS_HASH.
		 * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
		 * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i rss_types =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
		/*
		 * RSS flags (RTE_MBUF_F_RX_RSS_HASH) are in
		 * byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);

		/*
		 * Compute CKSUM flags. First build the index and then
		 * use it to shuffle csum_shuffle.
		 * 20 instructions including const loads: 2.5 inst/desc
		 *
		 * csum_not_calc (bit 22)
		 * csum_not_calc (0) => 0xffffffff
		 * csum_not_calc (1) => 0x0
		 */
		const __m256i zero4 = _mm256_setzero_si256();
		const __m256i mask22 = _mm256_set1_epi32(0x400000);
		__m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
			_mm256_and_si256(flags0_7, mask22));
		/*
		 * (tcp|udp) && !fragment => bit 1
		 * tcp = bit 2, udp = bit 1, frag = bit 6
		 */
		const __m256i mask1 = _mm256_set1_epi32(0x2);
		__m256i tcp_udp =
			_mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
				_mm256_or_si256(flags0_7,
					_mm256_srli_epi32(flags0_7, 1)));
		tcp_udp = _mm256_and_si256(tcp_udp, mask1);
		/* ipv4 (bit 5) => bit 2 */
		const __m256i mask2 = _mm256_set1_epi32(0x4);
		__m256i ipv4 = _mm256_and_si256(mask2,
			_mm256_srli_epi32(flags0_7, 3));
		/*
		 * ipv4_csum_ok (bit 3) => bit 3
		 * tcp_udp_csum_ok (bit 0) => bit 0
		 */
		const __m256i mask0_3 = _mm256_set1_epi32(0x9);
		__m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
		csum_idx = _mm256_and_si256(csum_not_calc,
			_mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
					tcp_udp));
		__m256i csum_flags =
			_mm256_shuffle_epi8(csum_shuffle, csum_idx);
		/* Shift left to restore CKSUM flags. See csum_shuffle. */
		csum_flags = _mm256_slli_epi32(csum_flags, 1);
		/* Combine csum flags and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, csum_flags);

		/*
		 * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
		 * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
		 * 1.25 inst/desc
		 */
		__m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
		__m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
		__m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
		__m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
		__m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
		__m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
		/* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
		__m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
		/* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
		vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/*
		 * Compare 0 == vlan_id produces 0xffffffff (-1) if
		 * vlan 0 and 0 if vlan non-0. Then subtracting the
		 * result from 0 produces 0 - (-1) = 1 for vlan 0, and
		 * 0 - 0 = 0 for vlan non-0.
		 */
		vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
		/* vlan_id != 0 => 0, vlan_id == 0 => 1 */
		vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);

		/*
		 * Compute RTE_MBUF_F_RX_VLAN and RTE_MBUF_F_RX_VLAN_STRIPPED.
		 * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
		 * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
		 * Everything else is zero.
		 */
		__m256i vlan_idx =
			_mm256_or_si256(/* vlan_stripped => bit 0 */
				_mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
					16), 31),
				/* (vlan_id == 0) => bit 1 */
				_mm256_slli_epi32(vlan0_7, 1));
		/*
		 * The index captures 4 cases.
		 * stripped, id = 0   ==> 11b = 3
		 * stripped, id != 0  ==> 01b = 1
		 * not strip, id == 0 ==> 10b = 2
		 * not strip, id != 0 ==> 00b = 0
		 */
		__m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
			vlan_idx);
		/* Combine vlan and offload flags: 0.125 inst/desc */
		rss_flags = _mm256_or_si256(rss_flags, vlan_flags);

		/*
		 * Compute non-tunnel PTYPEs.
		 * 17 inst / 8 desc = 2.125 inst/desc
		 */
		/* ETHER and ETHER_VLAN */
		__m256i vlan_ptype =
			_mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
				vlan_idx);
		/* Build the ptype index from flags */
		tcp_udp = _mm256_slli_epi32(flags0_7, 29);
		tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
		__m256i ip4_ip6 =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
		__m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
		__m256i frag_bit =
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
		__m256i nonfrag_ptype =
			_mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
		__m256i frag_ptype =
			_mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
		/*
		 * Zero out the unwanted types and combine the remaining bits.
		 * The effect is the same as selecting non-frag or frag types
		 * depending on the frag bit.
		 */
		nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
			_mm256_cmpeq_epi32(zero4, frag_bit));
		frag_ptype = _mm256_and_si256(frag_ptype,
			_mm256_cmpgt_epi32(frag_bit, zero4));
		__m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
		ptype = _mm256_slli_epi32(ptype, 4);

		/*
		 * Compute tunnel PTYPEs.
		 * 15 inst / 8 desc = 1.875 inst/desc
		 */
		__m256i tnl_l3_ptype =
			_mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
		tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
		/*
		 * Shift non-tunnel L4 types to make them tunnel types.
		 * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
		 */
		__m256i tnl_l4_ptype =
			_mm256_slli_epi32(_mm256_and_si256(ptype,
				_mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
		__m256i tnl_ptype =
			_mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
		tnl_ptype = _mm256_or_si256(tnl_ptype,
			_mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
					  RTE_PTYPE_INNER_L2_ETHER));
		/*
		 * Select non-tunnel or tunnel types by zeroing out the
		 * unwanted ones.
		 */
		__m256i tnl_flags = _mm256_and_si256(overlay_enabled,
			_mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
		tnl_ptype = _mm256_and_si256(tnl_ptype,
			_mm256_sub_epi32(zero4, tnl_flags));
		ptype = _mm256_and_si256(ptype,
			_mm256_cmpeq_epi32(zero4, tnl_flags));
		/*
		 * Combine types and swap to have ptypes in this order:
		 * desc: 0 2 4 6 1 3 5 7
		 * 3 inst / 8 desc = 0.375 inst/desc
		 */
		ptype = _mm256_or_si256(ptype, tnl_ptype);
		ptype = _mm256_or_si256(ptype, vlan_ptype);
		ptype = _mm256_permute4x64_epi64(ptype,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);

		/*
		 * Mask packet length.
		 * Use 4 ands: 0.5 instructions/desc
		 */
		cqd01 = _mm256_and_si256(cqd01, mask);
		cqd23 = _mm256_and_si256(cqd23, mask);
		cqd45 = _mm256_and_si256(cqd45, mask);
		cqd67 = _mm256_and_si256(cqd67, mask);
		/*
		 * Shuffle. Two 16B sets of the mbuf fields.
		 * packet_type, pkt_len, data_len, vlan_tci, rss
		 */
		__m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
		__m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
		__m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
		__m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);

		/*
		 * Blend in the packet types computed above.
		 * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
		 */
		rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
		rearm23 = _mm256_blend_epi32(rearm23,
			_mm256_shuffle_epi32(ptype, 1), 0x11);
		rearm45 = _mm256_blend_epi32(rearm45,
			_mm256_shuffle_epi32(ptype, 2), 0x11);
		rearm67 = _mm256_blend_epi32(rearm67,
			_mm256_shuffle_epi32(ptype, 3), 0x11);

		/*
		 * Move rss_flags into ol_flags in mbuf_init.
		 * Use 1 shift and 1 blend for each desc: 2 inst/desc
		 */
		__m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
			rss_flags, 0x44);
		__m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 4), 0x44);
		__m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
			_mm256_slli_si256(rss_flags, 8), 0x44);
		__m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
			_mm256_srli_si256(rss_flags, 4), 0x44);

		/*
		 * Build rearm, one per desc.
		 * 8 blends and 4 permutes: 1.5 inst/desc
		 */
		__m256i rearm0 = _mm256_blend_epi32(rearm01,
			mbuf_init0_1, 0xf0);
		__m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
			rearm01, 0xf0);
		__m256i rearm2 = _mm256_blend_epi32(rearm23,
			mbuf_init2_3, 0xf0);
		__m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
			rearm23, 0xf0);
		/* Swap upper and lower 128 bits */
		rearm0 = _mm256_permute4x64_epi64(rearm0,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm2 = _mm256_permute4x64_epi64(rearm2,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		/* Second set of 4 descriptors */
		__m256i rearm4 = _mm256_blend_epi32(rearm45,
			mbuf_init4_5, 0xf0);
		__m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
			rearm45, 0xf0);
		__m256i rearm6 = _mm256_blend_epi32(rearm67,
			mbuf_init6_7, 0xf0);
		__m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
			rearm67, 0xf0);
		rearm4 = _mm256_permute4x64_epi64(rearm4,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
		rearm6 = _mm256_permute4x64_epi64(rearm6,
			(1 << 6) + (0 << 4) + (3 << 2) + 2);
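		/*
		 * The even-descriptor blends leave the 16B rearm half and
		 * the 16B descriptor-field half in the wrong 128-bit lanes,
		 * hence the extra permutes; the odd descriptors come out of
		 * their blends already in store order.
		 */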

		/*
		 * Write out 32B of mbuf fields.
		 * data_off    - off 0  (mbuf_init)
		 * refcnt      - 2      (mbuf_init)
		 * nb_segs     - 4      (mbuf_init)
		 * port        - 6      (mbuf_init)
		 * ol_flags    - 8      (from cqd)
		 * packet_type - 16     (from cqd)
		 * pkt_len     - 20     (from cqd)
		 * data_len    - 24     (from cqd)
		 * vlan_tci    - 26     (from cqd)
		 * rss         - 28     (from cqd)
		 */
		_mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
		_mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
		_mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
		_mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
		_mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
		_mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
		_mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
		_mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);

		max_rx -= 8;
		cqd += 8;
		rx += 8;
		rxmb += 8;
	}

	/*
	 * Step 3: Slow path to handle a small (<8) number of packets and
	 * occasional truncated packets.
	 */
	while (max_rx && ((cqd->type_color &
			   CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
		if (unlikely(cqd->bytes_written_flags &
			     CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
			rte_pktmbuf_free(*rxmb++);
			rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
		} else {
			*rx++ = rx_one(cqd, *rxmb++, enic);
		}
		cqd++;
		max_rx--;
	}

	/* Number of descriptors visited */
	nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
	if (nb_rx == 0)
		return 0;
	rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
	rxmb = rq->mbuf_ring + cq_idx;
	cq_idx += nb_rx;
	rq->rx_nb_hold += nb_rx;
	if (unlikely(cq_idx == cq->ring.desc_count)) {
		cq_idx = 0;
		cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
	}
	cq->to_clean = cq_idx;

	/* Step 4: Restock RQ with new mbufs */
	memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
	       sizeof(struct rte_mbuf *) * nb_rx);
	rq->num_free_mbufs -= nb_rx;
	while (nb_rx) {
		rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
		nb_rx--;
		rqd++;
		rxmb++;
	}
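	/*
	 * The unused reserve lives at the tail of free_mbufs: each burst
	 * consumes from index ENIC_RX_BURST_MAX - num_free_mbufs, and the
	 * bulk get in Step 0 above refills the array only once it has been
	 * fully drained.
	 */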

	if (rq->rx_nb_hold > rq->rx_free_thresh) {
		rq->posted_index = enic_ring_add(rq->ring.desc_count,
						 rq->posted_index,
						 rq->rx_nb_hold);
		rq->rx_nb_hold = 0;
		/* Ensure the restocked descriptors are visible to the NIC
		 * before the posted index is updated.
		 */
		rte_mb();
		iowrite32_relaxed(rq->posted_index,
				  &rq->ctrl->posted_index);
	}

	return rx - rx_pkts;
}

bool
enic_use_vector_rx_handler(struct rte_eth_dev *eth_dev)
{
	struct enic *enic = pmd_priv(eth_dev);

	/* The user needs to request the avx2 handler */
	if (!enic->enable_avx2_rx)
		return false;
	/* Do not support scatter Rx */
	if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
		return false;
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256) {
		ENICPMD_LOG(DEBUG, " use the non-scatter avx2 Rx handler");
		eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
		enic->use_noscatter_vec_rx_handler = 1;
		return true;
	}
	return false;
}