1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2019 Intel Corporation
5 #include "iavf_rxtx_vec_common.h"
9 #ifndef __INTEL_COMPILER
10 #pragma GCC diagnostic ignored "-Wcast-qual"
13 static __rte_always_inline void
14 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
16 return iavf_rxq_rearm_common(rxq, false);
19 #define PKTLEN_SHIFT 10
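/*
 * The legacy descriptor packs the packet length in the upper bits of
 * qword1 (starting at bit 38, i.e. bit 6 of qword1's high 32-bit word).
 * Shifting each 32-bit lane left by PKTLEN_SHIFT therefore moves the
 * length into bits 31:16 of that word (descriptor bytes 14-15), which is
 * where shuf_msk later picks up pkt_len/data_len.
 */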
21 static inline uint16_t
22 _iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
23 struct rte_mbuf **rx_pkts,
24 uint16_t nb_pkts, uint8_t *split_packet)
26 #define IAVF_DESCS_PER_LOOP_AVX 8
29 const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
31 const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
32 0, rxq->mbuf_initializer);
34 struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
35 volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
36 const int avx_aligned = ((rxq->rx_tail & 1) == 0);
40 /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
41 nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
43 /* See if we need to rearm the RX queue - gives the prefetch a bit
46 if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
49 /* Before we start moving massive data around, check to see if
50 * there is actually a packet available
52 if (!(rxdp->wb.qword1.status_error_len &
53 rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
56 /* constants used in processing loop */
57 const __m256i crc_adjust =
59 (/* first descriptor */
60 0, 0, 0, /* ignore non-length fields */
61 -rxq->crc_len, /* sub crc on data_len */
62 0, /* ignore high-16bits of pkt_len */
63 -rxq->crc_len, /* sub crc on pkt_len */
64 0, 0, /* ignore pkt_type field */
65 /* second descriptor */
66 0, 0, 0, /* ignore non-length fields */
67 -rxq->crc_len, /* sub crc on data_len */
68 0, /* ignore high-16bits of pkt_len */
69 -rxq->crc_len, /* sub crc on pkt_len */
70 0, 0 /* ignore pkt_type field */
73 /* 8 packets DD mask, LSB in each 32-bit value */
74 const __m256i dd_check = _mm256_set1_epi32(1);
76 /* 8 packets EOP mask, second-LSB in each 32-bit value */
77 const __m256i eop_check = _mm256_slli_epi32(dd_check,
78 IAVF_RX_DESC_STATUS_EOF_SHIFT);
80 /* mask to shuffle from desc. to mbuf (2 descriptors) */
81 const __m256i shuf_msk =
83 (/* first descriptor */
84 7, 6, 5, 4, /* octet 4~7, 32bits rss */
85 3, 2, /* octet 2~3, low 16 bits vlan_macip */
86 15, 14, /* octet 15~14, 16 bits data_len */
87 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
88 15, 14, /* octet 15~14, low 16 bits pkt_len */
89 0xFF, 0xFF, /* pkt_type set as unknown */
90 0xFF, 0xFF, /* pkt_type set as unknown */
91 /* second descriptor */
92 7, 6, 5, 4, /* octet 4~7, 32bits rss */
93 3, 2, /* octet 2~3, low 16 bits vlan_macip */
94 15, 14, /* octet 15~14, 16 bits data_len */
95 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
96 15, 14, /* octet 15~14, low 16 bits pkt_len */
97 0xFF, 0xFF, /* pkt_type set as unknown */
98 0xFF, 0xFF /* pkt_type set as unknown */
101  * compile-time check that the above crc and shuffle layout is correct.
102 * NOTE: the first field (lowest address) is given last in set_epi
105 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
106 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
107 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
108 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
109 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
110 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
111 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
112 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
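/*
 * The shuffled 16 bytes map, low to high, onto packet_type (filled in
 * later from the ptype table), pkt_len, data_len, vlan_tci and the 32-bit
 * RSS hash, i.e. exactly the rx_descriptor_fields1 layout asserted above,
 * so they can be written to the mbuf as part of a single 32B store.
 */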
114 /* Status/Error flag masks */
116 * mask everything except RSS, flow director and VLAN flags
117  * bit 2 is for VLAN tag, bit 11 for flow director indication,
118  * bits 13:12 for RSS indication. Bits 3-5 of the error
119  * field (bits 22-24) are for IP/L4 checksum errors
121 const __m256i flags_mask =
122 _mm256_set1_epi32((1 << 2) | (1 << 11) |
123 (3 << 12) | (7 << 22));
125  * data to be shuffled by result of flag mask. If the VLAN bit
126  * (bit 2) is set, then position 4 in this array will be used in the
129 const __m256i vlan_flags_shuf =
130 _mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
131 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
133 * data to be shuffled by result of flag mask, shifted down 11.
134 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
137 const __m256i rss_flags_shuf =
138 _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
139 PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
140 0, 0, 0, 0, PKT_RX_FDIR, 0, /* end up 128-bits */
141 0, 0, 0, 0, 0, 0, 0, 0,
142 PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
143 0, 0, 0, 0, PKT_RX_FDIR, 0);
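/*
 * After the shift by 11, the low byte of each 32-bit lane holds FDIR in
 * bit 0 and the 2-bit RSS indication (status bits 13:12) in bits 2:1,
 * and that byte indexes this table: e.g. index 6 (RSS valid, no FDIR)
 * -> PKT_RX_RSS_HASH, index 7 -> PKT_RX_RSS_HASH | PKT_RX_FDIR,
 * index 1 (FDIR only) -> PKT_RX_FDIR.
 */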
146  * data to be shuffled by the result of the flags mask shifted by 22
147  * bits. This gives us the l3_l4 flags.
149 const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
150 /* shift right 1 bit to make sure it does not exceed 255 */
151 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
152 PKT_RX_IP_CKSUM_BAD) >> 1,
153 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
154 PKT_RX_L4_CKSUM_BAD) >> 1,
155 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
156 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
157 (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
158 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
159 PKT_RX_IP_CKSUM_BAD >> 1,
160 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
161 /* second 128-bits */
162 0, 0, 0, 0, 0, 0, 0, 0,
163 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
164 PKT_RX_IP_CKSUM_BAD) >> 1,
165 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
166 PKT_RX_L4_CKSUM_BAD) >> 1,
167 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
168 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
169 (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
170 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
171 PKT_RX_IP_CKSUM_BAD >> 1,
172 (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
174 const __m256i cksum_mask =
175 _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
176 PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
177 PKT_RX_OUTER_IP_CKSUM_BAD);
179 RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */
181 uint16_t i, received;
183 for (i = 0, received = 0; i < nb_pkts;
184 i += IAVF_DESCS_PER_LOOP_AVX,
185 rxdp += IAVF_DESCS_PER_LOOP_AVX) {
186 /* step 1, copy over 8 mbuf pointers to rx_pkts array */
187 _mm256_storeu_si256((void *)&rx_pkts[i],
188 _mm256_loadu_si256((void *)&sw_ring[i]));
189 #ifdef RTE_ARCH_X86_64
191 ((void *)&rx_pkts[i + 4],
192 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
195 __m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
196 #ifdef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
197 /* for AVX we need alignment otherwise loads are not atomic */
199 /* load in descriptors, 2 at a time, in reverse order */
200 raw_desc6_7 = _mm256_load_si256((void *)(rxdp + 6));
201 rte_compiler_barrier();
202 raw_desc4_5 = _mm256_load_si256((void *)(rxdp + 4));
203 rte_compiler_barrier();
204 raw_desc2_3 = _mm256_load_si256((void *)(rxdp + 2));
205 rte_compiler_barrier();
206 raw_desc0_1 = _mm256_load_si256((void *)(rxdp + 0));
210 const __m128i raw_desc7 =
211 _mm_load_si128((void *)(rxdp + 7));
212 rte_compiler_barrier();
213 const __m128i raw_desc6 =
214 _mm_load_si128((void *)(rxdp + 6));
215 rte_compiler_barrier();
216 const __m128i raw_desc5 =
217 _mm_load_si128((void *)(rxdp + 5));
218 rte_compiler_barrier();
219 const __m128i raw_desc4 =
220 _mm_load_si128((void *)(rxdp + 4));
221 rte_compiler_barrier();
222 const __m128i raw_desc3 =
223 _mm_load_si128((void *)(rxdp + 3));
224 rte_compiler_barrier();
225 const __m128i raw_desc2 =
226 _mm_load_si128((void *)(rxdp + 2));
227 rte_compiler_barrier();
228 const __m128i raw_desc1 =
229 _mm_load_si128((void *)(rxdp + 1));
230 rte_compiler_barrier();
231 const __m128i raw_desc0 =
232 _mm_load_si128((void *)(rxdp + 0));
235 _mm256_inserti128_si256
236 (_mm256_castsi128_si256(raw_desc6),
239 _mm256_inserti128_si256
240 (_mm256_castsi128_si256(raw_desc4),
243 _mm256_inserti128_si256
244 (_mm256_castsi128_si256(raw_desc2),
247 _mm256_inserti128_si256
248 (_mm256_castsi128_si256(raw_desc0),
255 for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
256 rte_mbuf_prefetch_part2(rx_pkts[i + j]);
260 * convert descriptors 4-7 into mbufs, adjusting length and
261 * re-arranging fields. Then write into the mbuf
263 const __m256i len6_7 = _mm256_slli_epi32(raw_desc6_7,
265 const __m256i len4_5 = _mm256_slli_epi32(raw_desc4_5,
267 const __m256i desc6_7 = _mm256_blend_epi16(raw_desc6_7,
269 const __m256i desc4_5 = _mm256_blend_epi16(raw_desc4_5,
271 __m256i mb6_7 = _mm256_shuffle_epi8(desc6_7, shuf_msk);
272 __m256i mb4_5 = _mm256_shuffle_epi8(desc4_5, shuf_msk);
274 mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
275 mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
277  * to get packet types, shift the 64-bit values down 30 bits
278  * so that the ptype ends up in the lower 8 bits of each
280 const __m256i ptypes6_7 = _mm256_srli_epi64(desc6_7, 30);
281 const __m256i ptypes4_5 = _mm256_srli_epi64(desc4_5, 30);
282 const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
283 const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
284 const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
285 const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
287 mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4);
288 mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0);
289 mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4);
290 mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0);
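/*
 * qword1 of each descriptor carries the 8-bit ptype at bits 37:30, so
 * after the 30-bit right shift it sits in the lowest byte of that qword:
 * byte 8 of the register for the even (lower) descriptor and byte 24 for
 * the odd (upper) one, which is what the extracts above pull out before
 * the ptype table lookup.
 */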
291 /* merge the status bits into one register */
292 const __m256i status4_7 = _mm256_unpackhi_epi32(desc6_7,
296 * convert descriptors 0-3 into mbufs, adjusting length and
297 * re-arranging fields. Then write into the mbuf
299 const __m256i len2_3 = _mm256_slli_epi32(raw_desc2_3,
301 const __m256i len0_1 = _mm256_slli_epi32(raw_desc0_1,
303 const __m256i desc2_3 = _mm256_blend_epi16(raw_desc2_3,
305 const __m256i desc0_1 = _mm256_blend_epi16(raw_desc0_1,
307 __m256i mb2_3 = _mm256_shuffle_epi8(desc2_3, shuf_msk);
308 __m256i mb0_1 = _mm256_shuffle_epi8(desc0_1, shuf_msk);
310 mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
311 mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
312 /* get the packet types */
313 const __m256i ptypes2_3 = _mm256_srli_epi64(desc2_3, 30);
314 const __m256i ptypes0_1 = _mm256_srli_epi64(desc0_1, 30);
315 const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
316 const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
317 const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
318 const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
320 mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4);
321 mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0);
322 mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4);
323 mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0);
324 /* merge the status bits into one register */
325 const __m256i status0_3 = _mm256_unpackhi_epi32(desc2_3,
329  * take the two sets of status bits and merge them into one.
330  * After the merge, the packet status flags are in the
331  * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
333 __m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
336 /* now do flag manipulation */
338 /* get only flag/error bits we want */
339 const __m256i flag_bits =
340 _mm256_and_si256(status0_7, flags_mask);
341 /* set vlan and rss flags */
342 const __m256i vlan_flags =
343 _mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
344 const __m256i rss_flags =
345 _mm256_shuffle_epi8(rss_flags_shuf,
346 _mm256_srli_epi32(flag_bits, 11));
348  * l3_l4_error flags: shuffle, then shift left to undo the 1-bit
349  * adjustment applied in l3_l4_flags_shuf, and finally mask out extra bits
351 __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
352 _mm256_srli_epi32(flag_bits, 22));
353 l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
354 l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
357 const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
358 _mm256_or_si256(rss_flags, vlan_flags));
360  * At this point, we have the 8 sets of flags in the low 16-bits
361  * of each 32-bit value in mbuf_flags.
362 * We want to extract these, and merge them with the mbuf init
363 * data so we can do a single write to the mbuf to set the flags
364 * and all the other initialization fields. Extracting the
365 * appropriate flags means that we have to do a shift and blend
366 * for each mbuf before we do the write. However, we can also
367 * add in the previously computed rx_descriptor fields to
368 * make a single 256-bit write per mbuf
370 /* check the structure matches expectations */
371 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
372 offsetof(struct rte_mbuf, rearm_data) + 8);
373 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
374 RTE_ALIGN(offsetof(struct rte_mbuf,
377 /* build up data and do writes */
378 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
380 rearm6 = _mm256_blend_epi32(mbuf_init,
381 _mm256_slli_si256(mbuf_flags, 8),
383 rearm4 = _mm256_blend_epi32(mbuf_init,
384 _mm256_slli_si256(mbuf_flags, 4),
386 rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
387 rearm0 = _mm256_blend_epi32(mbuf_init,
388 _mm256_srli_si256(mbuf_flags, 4),
390 /* permute to add in the rx_descriptor e.g. rss fields */
391 rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
392 rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
393 rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
394 rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
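/*
 * Each rearmX register now holds the mbuf_initializer template plus that
 * packet's ol_flags in its low 128 bits and the descriptor-derived fields
 * (mbX_Y) in its high 128 bits, so the stores below cover rearm_data and
 * rx_descriptor_fields1 of one mbuf with a single 32B write.
 */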
396 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
398 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
400 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
402 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
405 /* repeat for the odd mbufs */
406 const __m256i odd_flags =
407 _mm256_castsi128_si256
408 (_mm256_extracti128_si256(mbuf_flags, 1));
409 rearm7 = _mm256_blend_epi32(mbuf_init,
410 _mm256_slli_si256(odd_flags, 8),
412 rearm5 = _mm256_blend_epi32(mbuf_init,
413 _mm256_slli_si256(odd_flags, 4),
415 rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
416 rearm1 = _mm256_blend_epi32(mbuf_init,
417 _mm256_srli_si256(odd_flags, 4),
419 /* since odd mbufs are already in hi 128-bits use blend */
420 rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
421 rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
422 rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
423 rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
424 /* again write to mbufs */
425 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
427 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
429 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
431 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
434 /* extract and record EOP bit */
436 const __m128i eop_mask =
437 _mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
438 const __m256i eop_bits256 = _mm256_and_si256(status0_7,
440 /* pack status bits into a single 128-bit register */
441 const __m128i eop_bits =
443 (_mm256_castsi256_si128(eop_bits256),
444 _mm256_extractf128_si256(eop_bits256,
447  * flip bits, and mask out the EOP bit, which is now
448  * a split-packet bit, i.e. !EOP, rather than the EOP bit.
450 __m128i split_bits = _mm_andnot_si128(eop_bits,
453 * eop bits are out of order, so we need to shuffle them
454 * back into order again. In doing so, only use low 8
455 * bits, which acts like another pack instruction
456 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
457 * [Since we use epi8, the 16-bit positions are
458 * multiplied by 2 in the eop_shuffle value.]
460 __m128i eop_shuffle =
461 _mm_set_epi8(/* zero hi 64b */
462 0xFF, 0xFF, 0xFF, 0xFF,
463 0xFF, 0xFF, 0xFF, 0xFF,
464 /* move values to lo 64b */
467 split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
468 *(uint64_t *)split_packet =
469 _mm_cvtsi128_si64(split_bits);
470 split_packet += IAVF_DESCS_PER_LOOP_AVX;
473 /* perform dd_check */
474 status0_7 = _mm256_and_si256(status0_7, dd_check);
475 status0_7 = _mm256_packs_epi32(status0_7,
476 _mm256_setzero_si256());
478 uint64_t burst = __builtin_popcountll
480 (_mm256_extracti128_si256
482 burst += __builtin_popcountll
484 (_mm256_castsi256_si128(status0_7)));
486 if (burst != IAVF_DESCS_PER_LOOP_AVX)
490 /* update tail pointers */
491 rxq->rx_tail += received;
492 rxq->rx_tail &= (rxq->nb_rx_desc - 1);
493 if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
497 rxq->rxrearm_nb += received;
501 static inline __m256i
502 flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
504 #define FDID_MIS_MAGIC 0xFFFFFFFF
505 RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
506 RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
507 const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
509 /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
510 const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
511 __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
513 /* this XOR op inverts the fdir_mask (bitwise NOT) */
514 fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
515 const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
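/*
 * cmpeq marks the lanes whose flow_id equals the "mismatch" magic value;
 * the XOR above inverts that, so fdir_flags carries
 * PKT_RX_FDIR | PKT_RX_FDIR_ID only for descriptors reporting a real
 * filter match.
 */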
520 static inline uint16_t
521 _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
522 struct rte_mbuf **rx_pkts,
523 uint16_t nb_pkts, uint8_t *split_packet)
525 #define IAVF_DESCS_PER_LOOP_AVX 8
527 const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
529 const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
530 0, rxq->mbuf_initializer);
531 struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
532 volatile union iavf_rx_flex_desc *rxdp =
533 (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
537 /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
538 nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
540 /* See if we need to rearm the RX queue - gives the prefetch a bit
543 if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
546 /* Before we start moving massive data around, check to see if
547 * there is actually a packet available
549 if (!(rxdp->wb.status_error0 &
550 rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
553 /* constants used in processing loop */
554 const __m256i crc_adjust =
556 (/* first descriptor */
557 0, 0, 0, /* ignore non-length fields */
558 -rxq->crc_len, /* sub crc on data_len */
559 0, /* ignore high-16bits of pkt_len */
560 -rxq->crc_len, /* sub crc on pkt_len */
561 0, 0, /* ignore pkt_type field */
562 /* second descriptor */
563 0, 0, 0, /* ignore non-length fields */
564 -rxq->crc_len, /* sub crc on data_len */
565 0, /* ignore high-16bits of pkt_len */
566 -rxq->crc_len, /* sub crc on pkt_len */
567 0, 0 /* ignore pkt_type field */
570 /* 8 packets DD mask, LSB in each 32-bit value */
571 const __m256i dd_check = _mm256_set1_epi32(1);
573 /* 8 packets EOP mask, second-LSB in each 32-bit value */
574 const __m256i eop_check = _mm256_slli_epi32(dd_check,
575 IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
577 /* mask to shuffle from desc. to mbuf (2 descriptors) */
578 const __m256i shuf_msk =
580 (/* first descriptor */
582 0xFF, 0xFF, /* rss hash parsed separately */
583 11, 10, /* octet 10~11, 16 bits vlan_macip */
584 5, 4, /* octet 4~5, 16 bits data_len */
585 0xFF, 0xFF, /* skip hi 16 bits pkt_len, zero out */
586 5, 4, /* octet 4~5, 16 bits pkt_len */
587 0xFF, 0xFF, /* pkt_type set as unknown */
588 0xFF, 0xFF, /* pkt_type set as unknown */
589 /* second descriptor */
591 0xFF, 0xFF, /* rss hash parsed separately */
592 11, 10, /* octet 10~11, 16 bits vlan_macip */
593 5, 4, /* octet 4~5, 16 bits data_len */
594 0xFF, 0xFF, /* skip hi 16 bits pkt_len, zero out */
595 5, 4, /* octet 4~5, 16 bits pkt_len */
596 0xFF, 0xFF, /* pkt_type set as unknown */
597 0xFF, 0xFF /* pkt_type set as unknown */
600  * compile-time check that the above crc and shuffle layout is correct.
601 * NOTE: the first field (lowest address) is given last in set_epi
604 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
605 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
606 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
607 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
608 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
609 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
610 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
611 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
613 /* Status/Error flag masks */
615 * mask everything except Checksum Reports, RSS indication
616 * and VLAN indication.
617  * bits 6:4 are for IP/L4 checksum errors.
618  * bit 12 is for RSS indication.
619  * bit 13 is for VLAN indication.
621 const __m256i flags_mask =
622 _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
624 * data to be shuffled by the result of the flags mask shifted by 4
625  * bits. This gives us the l3_l4 flags.
627 const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
628 /* shift right 1 bit to make sure it does not exceed 255 */
629 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
630 PKT_RX_IP_CKSUM_BAD) >> 1,
631 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
632 PKT_RX_IP_CKSUM_GOOD) >> 1,
633 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
634 PKT_RX_IP_CKSUM_BAD) >> 1,
635 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
636 PKT_RX_IP_CKSUM_GOOD) >> 1,
637 (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
638 (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
639 (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
640 (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
641 /* second 128-bits */
642 0, 0, 0, 0, 0, 0, 0, 0,
643 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
644 PKT_RX_IP_CKSUM_BAD) >> 1,
645 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
646 PKT_RX_IP_CKSUM_GOOD) >> 1,
647 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
648 PKT_RX_IP_CKSUM_BAD) >> 1,
649 (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
650 PKT_RX_IP_CKSUM_GOOD) >> 1,
651 (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
652 (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
653 (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
654 (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
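/*
 * The 3-bit index into this table is status_error0 bits 6:4 shifted down
 * by 4: bit 0 = IP checksum error, bit 1 = L4 checksum error, bit 2 =
 * outer-IP checksum error. E.g. index 0 yields the IP and L4 "good"
 * flags, index 7 yields all three "bad" flags.
 */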
655 const __m256i cksum_mask =
656 _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
657 PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
658 PKT_RX_OUTER_IP_CKSUM_BAD);
660 * data to be shuffled by result of flag mask, shifted down 12.
661  * If RSS (bit 12) / VLAN (bit 13) are set,
662  * the shuffle moves the appropriate flags into place.
664 const __m256i rss_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
669 /* end up 128-bits */
676 const __m256i vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
679 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
680 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
682 /* end up 128-bits */
686 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
687 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
690 uint16_t i, received;
692 for (i = 0, received = 0; i < nb_pkts;
693 i += IAVF_DESCS_PER_LOOP_AVX,
694 rxdp += IAVF_DESCS_PER_LOOP_AVX) {
695 /* step 1, copy over 8 mbuf pointers to rx_pkts array */
696 _mm256_storeu_si256((void *)&rx_pkts[i],
697 _mm256_loadu_si256((void *)&sw_ring[i]));
698 #ifdef RTE_ARCH_X86_64
700 ((void *)&rx_pkts[i + 4],
701 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
704 __m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
706 const __m128i raw_desc7 =
707 _mm_load_si128((void *)(rxdp + 7));
708 rte_compiler_barrier();
709 const __m128i raw_desc6 =
710 _mm_load_si128((void *)(rxdp + 6));
711 rte_compiler_barrier();
712 const __m128i raw_desc5 =
713 _mm_load_si128((void *)(rxdp + 5));
714 rte_compiler_barrier();
715 const __m128i raw_desc4 =
716 _mm_load_si128((void *)(rxdp + 4));
717 rte_compiler_barrier();
718 const __m128i raw_desc3 =
719 _mm_load_si128((void *)(rxdp + 3));
720 rte_compiler_barrier();
721 const __m128i raw_desc2 =
722 _mm_load_si128((void *)(rxdp + 2));
723 rte_compiler_barrier();
724 const __m128i raw_desc1 =
725 _mm_load_si128((void *)(rxdp + 1));
726 rte_compiler_barrier();
727 const __m128i raw_desc0 =
728 _mm_load_si128((void *)(rxdp + 0));
731 _mm256_inserti128_si256
732 (_mm256_castsi128_si256(raw_desc6),
735 _mm256_inserti128_si256
736 (_mm256_castsi128_si256(raw_desc4),
739 _mm256_inserti128_si256
740 (_mm256_castsi128_si256(raw_desc2),
743 _mm256_inserti128_si256
744 (_mm256_castsi128_si256(raw_desc0),
750 for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
751 rte_mbuf_prefetch_part2(rx_pkts[i + j]);
755 * convert descriptors 4-7 into mbufs, re-arrange fields.
756 * Then write into the mbuf.
758 __m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
759 __m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);
761 mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
762 mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
764  * to get packet types, ptype is located in bits 16-25
767 const __m256i ptype_mask =
768 _mm256_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
769 const __m256i ptypes6_7 =
770 _mm256_and_si256(raw_desc6_7, ptype_mask);
771 const __m256i ptypes4_5 =
772 _mm256_and_si256(raw_desc4_5, ptype_mask);
773 const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
774 const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
775 const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
776 const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
778 mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4);
779 mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0);
780 mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4);
781 mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0);
782 /* merge the status bits into one register */
783 const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
787 * convert descriptors 0-3 into mbufs, re-arrange fields.
788 * Then write into the mbuf.
790 __m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
791 __m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);
793 mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
794 mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
796  * to get packet types, ptype is located in bits 16-25
799 const __m256i ptypes2_3 =
800 _mm256_and_si256(raw_desc2_3, ptype_mask);
801 const __m256i ptypes0_1 =
802 _mm256_and_si256(raw_desc0_1, ptype_mask);
803 const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
804 const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
805 const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
806 const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
808 mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4);
809 mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0);
810 mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4);
811 mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0);
812 /* merge the status bits into one register */
813 const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
817  * take the two sets of status bits and merge them into one.
818  * After the merge, the packet status flags are in the
819  * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
821 __m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
824 /* now do flag manipulation */
826 /* get only flag/error bits we want */
827 const __m256i flag_bits =
828 _mm256_and_si256(status0_7, flags_mask);
830  * l3_l4_error flags: shuffle, then shift left to undo the 1-bit
831  * adjustment applied in l3_l4_flags_shuf, and finally mask out extra bits
833 __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
834 _mm256_srli_epi32(flag_bits, 4));
835 l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
836 l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
838 /* set rss and vlan flags */
839 const __m256i rss_vlan_flag_bits =
840 _mm256_srli_epi32(flag_bits, 12);
841 const __m256i rss_flags =
842 _mm256_shuffle_epi8(rss_flags_shuf,
845 __m256i vlan_flags = _mm256_setzero_si256();
847 if (rxq->rx_flags == IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG1)
849 _mm256_shuffle_epi8(vlan_flags_shuf,
852 const __m256i rss_vlan_flags =
853 _mm256_or_si256(rss_flags, vlan_flags);
856 __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
859 if (rxq->fdir_enabled) {
860 const __m256i fdir_id4_7 =
861 _mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);
863 const __m256i fdir_id0_3 =
864 _mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);
866 const __m256i fdir_id0_7 =
867 _mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);
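/*
 * After the two unpack steps the per-packet flow IDs sit in dword order
 * (lo->hi) [6, 4, 2, 0, 7, 5, 3, 1], hence the scattered extract indices
 * used for packets 0..7 below.
 */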
869 const __m256i fdir_flags =
870 flex_rxd_to_fdir_flags_vec_avx2(fdir_id0_7);
872 /* merge with fdir_flags */
873 mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
875 /* write to mbuf: have to use scalar store here */
876 rx_pkts[i + 0]->hash.fdir.hi =
877 _mm256_extract_epi32(fdir_id0_7, 3);
879 rx_pkts[i + 1]->hash.fdir.hi =
880 _mm256_extract_epi32(fdir_id0_7, 7);
882 rx_pkts[i + 2]->hash.fdir.hi =
883 _mm256_extract_epi32(fdir_id0_7, 2);
885 rx_pkts[i + 3]->hash.fdir.hi =
886 _mm256_extract_epi32(fdir_id0_7, 6);
888 rx_pkts[i + 4]->hash.fdir.hi =
889 _mm256_extract_epi32(fdir_id0_7, 1);
891 rx_pkts[i + 5]->hash.fdir.hi =
892 _mm256_extract_epi32(fdir_id0_7, 5);
894 rx_pkts[i + 6]->hash.fdir.hi =
895 _mm256_extract_epi32(fdir_id0_7, 0);
897 rx_pkts[i + 7]->hash.fdir.hi =
898 _mm256_extract_epi32(fdir_id0_7, 4);
899 } /* if() on fdir_enabled */
901 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
903  * need to load the 2nd 16B of each desc for RSS hash parsing;
904  * getting into this context causes a performance drop.
906 if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
907 DEV_RX_OFFLOAD_RSS_HASH ||
908 rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
909 /* load bottom half of every 32B desc */
910 const __m128i raw_desc_bh7 =
912 ((void *)(&rxdp[7].wb.status_error1));
913 rte_compiler_barrier();
914 const __m128i raw_desc_bh6 =
916 ((void *)(&rxdp[6].wb.status_error1));
917 rte_compiler_barrier();
918 const __m128i raw_desc_bh5 =
920 ((void *)(&rxdp[5].wb.status_error1));
921 rte_compiler_barrier();
922 const __m128i raw_desc_bh4 =
924 ((void *)(&rxdp[4].wb.status_error1));
925 rte_compiler_barrier();
926 const __m128i raw_desc_bh3 =
928 ((void *)(&rxdp[3].wb.status_error1));
929 rte_compiler_barrier();
930 const __m128i raw_desc_bh2 =
932 ((void *)(&rxdp[2].wb.status_error1));
933 rte_compiler_barrier();
934 const __m128i raw_desc_bh1 =
936 ((void *)(&rxdp[1].wb.status_error1));
937 rte_compiler_barrier();
938 const __m128i raw_desc_bh0 =
940 ((void *)(&rxdp[0].wb.status_error1));
942 __m256i raw_desc_bh6_7 =
943 _mm256_inserti128_si256
944 (_mm256_castsi128_si256(raw_desc_bh6),
946 __m256i raw_desc_bh4_5 =
947 _mm256_inserti128_si256
948 (_mm256_castsi128_si256(raw_desc_bh4),
950 __m256i raw_desc_bh2_3 =
951 _mm256_inserti128_si256
952 (_mm256_castsi128_si256(raw_desc_bh2),
954 __m256i raw_desc_bh0_1 =
955 _mm256_inserti128_si256
956 (_mm256_castsi128_si256(raw_desc_bh0),
959 if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
960 DEV_RX_OFFLOAD_RSS_HASH) {
962 * to shift the 32b RSS hash value to the
963 * highest 32b of each 128b before mask
965 __m256i rss_hash6_7 =
966 _mm256_slli_epi64(raw_desc_bh6_7, 32);
967 __m256i rss_hash4_5 =
968 _mm256_slli_epi64(raw_desc_bh4_5, 32);
969 __m256i rss_hash2_3 =
970 _mm256_slli_epi64(raw_desc_bh2_3, 32);
971 __m256i rss_hash0_1 =
972 _mm256_slli_epi64(raw_desc_bh0_1, 32);
974 const __m256i rss_hash_msk =
975 _mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
976 0xFFFFFFFF, 0, 0, 0);
978 rss_hash6_7 = _mm256_and_si256
979 (rss_hash6_7, rss_hash_msk);
980 rss_hash4_5 = _mm256_and_si256
981 (rss_hash4_5, rss_hash_msk);
982 rss_hash2_3 = _mm256_and_si256
983 (rss_hash2_3, rss_hash_msk);
984 rss_hash0_1 = _mm256_and_si256
985 (rss_hash0_1, rss_hash_msk);
987 mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
988 mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
989 mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
990 mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
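/*
 * The shift/mask above kept only the 32-bit RSS hash from the second
 * 16B of each descriptor, aligned with the mbuf hash.rss position in
 * mbX_Y, so a plain OR merges it into the descriptor fields.
 */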
993 if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
994 /* merge the status/error-1 bits into one register */
995 const __m256i status1_4_7 =
996 _mm256_unpacklo_epi32(raw_desc_bh6_7,
998 const __m256i status1_0_3 =
999 _mm256_unpacklo_epi32(raw_desc_bh2_3,
1002 const __m256i status1_0_7 =
1003 _mm256_unpacklo_epi64(status1_4_7,
1006 const __m256i l2tag2p_flag_mask =
1008 (1 << IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);
1010 __m256i l2tag2p_flag_bits =
1012 (status1_0_7, l2tag2p_flag_mask);
1015 _mm256_srli_epi32(l2tag2p_flag_bits,
1016 IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);
1018 const __m256i l2tag2_flags_shuf =
1019 _mm256_set_epi8(0, 0, 0, 0,
1023 /* end up 128-bits */
1029 PKT_RX_VLAN_STRIPPED,
1033 _mm256_shuffle_epi8(l2tag2_flags_shuf,
1036 /* merge with vlan_flags */
1037 mbuf_flags = _mm256_or_si256
1038 (mbuf_flags, vlan_flags);
1041 __m256i vlan_tci6_7 =
1042 _mm256_slli_si256(raw_desc_bh6_7, 4);
1043 __m256i vlan_tci4_5 =
1044 _mm256_slli_si256(raw_desc_bh4_5, 4);
1045 __m256i vlan_tci2_3 =
1046 _mm256_slli_si256(raw_desc_bh2_3, 4);
1047 __m256i vlan_tci0_1 =
1048 _mm256_slli_si256(raw_desc_bh0_1, 4);
1050 const __m256i vlan_tci_msk =
1051 _mm256_set_epi32(0, 0xFFFF0000, 0, 0,
1052 0, 0xFFFF0000, 0, 0);
1054 vlan_tci6_7 = _mm256_and_si256
1055 (vlan_tci6_7, vlan_tci_msk);
1056 vlan_tci4_5 = _mm256_and_si256
1057 (vlan_tci4_5, vlan_tci_msk);
1058 vlan_tci2_3 = _mm256_and_si256
1059 (vlan_tci2_3, vlan_tci_msk);
1060 vlan_tci0_1 = _mm256_and_si256
1061 (vlan_tci0_1, vlan_tci_msk);
1063 mb6_7 = _mm256_or_si256(mb6_7, vlan_tci6_7);
1064 mb4_5 = _mm256_or_si256(mb4_5, vlan_tci4_5);
1065 mb2_3 = _mm256_or_si256(mb2_3, vlan_tci2_3);
1066 mb0_1 = _mm256_or_si256(mb0_1, vlan_tci0_1);
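/*
 * The 4-byte lane shift plus mask above moved the 2nd L2TAG2 value into
 * the 16 bits that correspond to the mbuf vlan_tci slot, so OR-ing it
 * into mbX_Y sets vlan_tci for each packet.
 */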
1068 } /* if() on RSS hash parsing */
1072  * At this point, we have the 8 sets of flags in the low 16-bits
1073  * of each 32-bit value in mbuf_flags.
1074 * We want to extract these, and merge them with the mbuf init
1075 * data so we can do a single write to the mbuf to set the flags
1076 * and all the other initialization fields. Extracting the
1077 * appropriate flags means that we have to do a shift and blend
1078 * for each mbuf before we do the write. However, we can also
1079 * add in the previously computed rx_descriptor fields to
1080 * make a single 256-bit write per mbuf
1082 /* check the structure matches expectations */
1083 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
1084 offsetof(struct rte_mbuf, rearm_data) + 8);
1085 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
1086 RTE_ALIGN(offsetof(struct rte_mbuf,
1089 /* build up data and do writes */
1090 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
1092 rearm6 = _mm256_blend_epi32(mbuf_init,
1093 _mm256_slli_si256(mbuf_flags, 8),
1095 rearm4 = _mm256_blend_epi32(mbuf_init,
1096 _mm256_slli_si256(mbuf_flags, 4),
1098 rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
1099 rearm0 = _mm256_blend_epi32(mbuf_init,
1100 _mm256_srli_si256(mbuf_flags, 4),
1102 /* permute to add in the rx_descriptor e.g. rss fields */
1103 rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
1104 rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
1105 rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
1106 rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
1108 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
1110 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
1112 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
1114 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
1117 /* repeat for the odd mbufs */
1118 const __m256i odd_flags =
1119 _mm256_castsi128_si256
1120 (_mm256_extracti128_si256(mbuf_flags, 1));
1121 rearm7 = _mm256_blend_epi32(mbuf_init,
1122 _mm256_slli_si256(odd_flags, 8),
1124 rearm5 = _mm256_blend_epi32(mbuf_init,
1125 _mm256_slli_si256(odd_flags, 4),
1127 rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
1128 rearm1 = _mm256_blend_epi32(mbuf_init,
1129 _mm256_srli_si256(odd_flags, 4),
1131 /* since odd mbufs are already in hi 128-bits use blend */
1132 rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
1133 rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
1134 rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
1135 rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
1136 /* again write to mbufs */
1137 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
1139 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
1141 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
1143 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
1146 /* extract and record EOP bit */
1148 const __m128i eop_mask =
1150 IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
1151 const __m256i eop_bits256 = _mm256_and_si256(status0_7,
1153 /* pack status bits into a single 128-bit register */
1154 const __m128i eop_bits =
1156 (_mm256_castsi256_si128(eop_bits256),
1157 _mm256_extractf128_si256(eop_bits256,
1160  * flip bits, and mask out the EOP bit, which is now
1161  * a split-packet bit, i.e. !EOP, rather than the EOP bit.
1163 __m128i split_bits = _mm_andnot_si128(eop_bits,
1166 * eop bits are out of order, so we need to shuffle them
1167 * back into order again. In doing so, only use low 8
1168 * bits, which acts like another pack instruction
1169 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
1170 * [Since we use epi8, the 16-bit positions are
1171 * multiplied by 2 in the eop_shuffle value.]
1173 __m128i eop_shuffle =
1174 _mm_set_epi8(/* zero hi 64b */
1175 0xFF, 0xFF, 0xFF, 0xFF,
1176 0xFF, 0xFF, 0xFF, 0xFF,
1177 /* move values to lo 64b */
1180 split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
1181 *(uint64_t *)split_packet =
1182 _mm_cvtsi128_si64(split_bits);
1183 split_packet += IAVF_DESCS_PER_LOOP_AVX;
1186 /* perform dd_check */
1187 status0_7 = _mm256_and_si256(status0_7, dd_check);
1188 status0_7 = _mm256_packs_epi32(status0_7,
1189 _mm256_setzero_si256());
1191 uint64_t burst = __builtin_popcountll
1193 (_mm256_extracti128_si256
1195 burst += __builtin_popcountll
1197 (_mm256_castsi256_si128(status0_7)));
1199 if (burst != IAVF_DESCS_PER_LOOP_AVX)
1203 /* update tail pointers */
1204 rxq->rx_tail += received;
1205 rxq->rx_tail &= (rxq->nb_rx_desc - 1);
1206 if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
1210 rxq->rxrearm_nb += received;
1216 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1219 iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
1222 return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
1227 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1230 iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
1233 return _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rx_queue, rx_pkts,
1238  * vPMD receive routine that reassembles a single burst of 32 scattered packets
1240 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1243 iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
1246 struct iavf_rx_queue *rxq = rx_queue;
1247 uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
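/* one byte per packet: a non-zero entry means that descriptor had no EOP
 * bit, i.e. the buffer is part of a split packet that reassemble_packets()
 * must merge with the buffers that follow it.
 */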
1249 /* get some new buffers */
1250 uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
1255 /* happy day case, full burst + no packets to be joined */
1256 const uint64_t *split_fl64 = (uint64_t *)split_flags;
1258 if (!rxq->pkt_first_seg &&
1259 split_fl64[0] == 0 && split_fl64[1] == 0 &&
1260 split_fl64[2] == 0 && split_fl64[3] == 0)
1263 /* reassemble any packets that need reassembly */
1266 if (!rxq->pkt_first_seg) {
1267 /* find the first split flag, and only reassemble from there */
1268 while (i < nb_bufs && !split_flags[i])
1272 rxq->pkt_first_seg = rx_pkts[i];
1274 return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
1279 * vPMD receive routine that reassembles scattered packets.
1280 * Main receive routine that can handle arbitrary burst sizes
1282 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1285 iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
1288 uint16_t retval = 0;
1290 while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1291 uint16_t burst = iavf_recv_scattered_burst_vec_avx2(rx_queue,
1292 rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
1295 if (burst < IAVF_VPMD_RX_MAX_BURST)
1298 return retval + iavf_recv_scattered_burst_vec_avx2(rx_queue,
1299 rx_pkts + retval, nb_pkts);
1303  * vPMD receive routine that reassembles a single burst of
1304 * 32 scattered packets for flex RxD
1306 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1309 iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
1310 struct rte_mbuf **rx_pkts,
1313 struct iavf_rx_queue *rxq = rx_queue;
1314 uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1316 /* get some new buffers */
1317 uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq,
1318 rx_pkts, nb_pkts, split_flags);
1322 /* happy day case, full burst + no packets to be joined */
1323 const uint64_t *split_fl64 = (uint64_t *)split_flags;
1325 if (!rxq->pkt_first_seg &&
1326 split_fl64[0] == 0 && split_fl64[1] == 0 &&
1327 split_fl64[2] == 0 && split_fl64[3] == 0)
1330 /* reassemble any packets that need reassembly */
1333 if (!rxq->pkt_first_seg) {
1334 /* find the first split flag, and only reassemble from there */
1335 while (i < nb_bufs && !split_flags[i])
1339 rxq->pkt_first_seg = rx_pkts[i];
1341 return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
1346 * vPMD receive routine that reassembles scattered packets for flex RxD.
1347 * Main receive routine that can handle arbitrary burst sizes
1349 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1352 iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
1353 struct rte_mbuf **rx_pkts,
1356 uint16_t retval = 0;
1358 while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1360 iavf_recv_scattered_burst_vec_avx2_flex_rxd
1361 (rx_queue, rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
1364 if (burst < IAVF_VPMD_RX_MAX_BURST)
1367 return retval + iavf_recv_scattered_burst_vec_avx2_flex_rxd(rx_queue,
1368 rx_pkts + retval, nb_pkts);
1372 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
1373 struct rte_mbuf *pkt, uint64_t flags)
1376 (IAVF_TX_DESC_DTYPE_DATA |
1377 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) |
1378 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
1380 __m128i descriptor = _mm_set_epi64x(high_qw,
1381 pkt->buf_iova + pkt->data_off);
1382 _mm_store_si128((__m128i *)txdp, descriptor);
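/*
 * The 16B data descriptor just stored is [buf_addr | qword1]:
 * _mm_set_epi64x() takes the high qword first, so high_qw (DTYPE, command
 * flags and buffer size) lands in qword1 and the DMA address of the
 * packet data in qword0.
 */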
1386 iavf_vtx(volatile struct iavf_tx_desc *txdp,
1387 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
1389 const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
1390 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT));
1392 /* if unaligned on a 32-byte boundary, do one to align */
1393 if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1394 iavf_vtx1(txdp, *pkt, flags);
1395 nb_pkts--, txdp++, pkt++;
1398 /* do 4 at a time while possible, two descriptors per 256-bit store */
1399 for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1402 ((uint64_t)pkt[3]->data_len <<
1403 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1406 ((uint64_t)pkt[2]->data_len <<
1407 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1410 ((uint64_t)pkt[1]->data_len <<
1411 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1414 ((uint64_t)pkt[0]->data_len <<
1415 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1420 pkt[3]->buf_iova + pkt[3]->data_off,
1422 pkt[2]->buf_iova + pkt[2]->data_off);
1426 pkt[1]->buf_iova + pkt[1]->data_off,
1428 pkt[0]->buf_iova + pkt[0]->data_off);
1429 _mm256_store_si256((void *)(txdp + 2), desc2_3);
1430 _mm256_store_si256((void *)txdp, desc0_1);
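/* both stores above are 32B-aligned thanks to the alignment fix-up at the
 * top of this function (the txdp & 0x1F check), which is why the aligned
 * _mm256_store_si256() can be used here.
 */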
1433 /* do any last ones */
1435 iavf_vtx1(txdp, *pkt, flags);
1436 txdp++, pkt++, nb_pkts--;
1440 static inline uint16_t
1441 iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
1444 struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
1445 volatile struct iavf_tx_desc *txdp;
1446 struct iavf_tx_entry *txep;
1447 uint16_t n, nb_commit, tx_id;
1448 /* bit 2 is reserved and must be set to 1 according to the spec */
1449 uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
1450 uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
1452 /* crossing the rs_thresh boundary is not allowed */
1453 nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1455 if (txq->nb_free < txq->free_thresh)
1456 iavf_tx_free_bufs(txq);
1458 nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1459 if (unlikely(nb_pkts == 0))
1462 tx_id = txq->tx_tail;
1463 txdp = &txq->tx_ring[tx_id];
1464 txep = &txq->sw_ring[tx_id];
1466 txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1468 n = (uint16_t)(txq->nb_tx_desc - tx_id);
1469 if (nb_commit >= n) {
1470 tx_backlog_entry(txep, tx_pkts, n);
1472 iavf_vtx(txdp, tx_pkts, n - 1, flags);
1476 iavf_vtx1(txdp, *tx_pkts++, rs);
1478 nb_commit = (uint16_t)(nb_commit - n);
1481 txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1483 /* avoid reaching the end of the ring */
1484 txdp = &txq->tx_ring[tx_id];
1485 txep = &txq->sw_ring[tx_id];
1488 tx_backlog_entry(txep, tx_pkts, nb_commit);
1490 iavf_vtx(txdp, tx_pkts, nb_commit, flags);
1492 tx_id = (uint16_t)(tx_id + nb_commit);
1493 if (tx_id > txq->next_rs) {
1494 txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
1495 rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
1496 IAVF_TXD_QW1_CMD_SHIFT);
1498 (uint16_t)(txq->next_rs + txq->rs_thresh);
1501 txq->tx_tail = tx_id;
1503 IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
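/* the write above publishes the new tail (doorbell) so the HW starts
 * fetching the descriptors just queued.
 */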
1509 iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
1513 struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
1518 num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1519 ret = iavf_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],