/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "iavf_rxtx_vec_common.h"

#include <rte_vect.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

static __rte_always_inline void
iavf_rxq_rearm(struct iavf_rx_queue *rxq)
{
	return iavf_rxq_rearm_common(rxq, false);
}

#define PKTLEN_SHIFT 10

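/*
 * Note: in the 32B descriptor write-back format, the packet length is a
 * 14-bit field starting at bit 38 of qword1, i.e. bit 6 of the high 32-bit
 * word. Shifting each 32-bit lane left by PKTLEN_SHIFT (10) moves that field
 * into the top 16 bits of the word, where the 16-bit blends in the receive
 * loop below can pick it up.
 */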
static inline uint16_t
_iavf_recv_raw_pkts_vec_avx2(struct iavf_rx_queue *rxq,
			     struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts, uint8_t *split_packet)
{
#define IAVF_DESCS_PER_LOOP_AVX 8

	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;

	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
	const int avx_aligned = ((rxq->rx_tail & 1) == 0);

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
		iavf_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

56 /* constants used in processing loop */
57 const __m256i crc_adjust =
59 (/* first descriptor */
60 0, 0, 0, /* ignore non-length fields */
61 -rxq->crc_len, /* sub crc on data_len */
62 0, /* ignore high-16bits of pkt_len */
63 -rxq->crc_len, /* sub crc on pkt_len */
64 0, 0, /* ignore pkt_type field */
65 /* second descriptor */
66 0, 0, 0, /* ignore non-length fields */
67 -rxq->crc_len, /* sub crc on data_len */
68 0, /* ignore high-16bits of pkt_len */
69 -rxq->crc_len, /* sub crc on pkt_len */
70 0, 0 /* ignore pkt_type field */
	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			IAVF_RX_DESC_STATUS_EOF_SHIFT);

	/* mask to shuffle from desc. to mbuf (2 descriptors) */
	const __m256i shuf_msk =
		_mm256_set_epi8
			(/* first descriptor */
			 7, 6, 5, 4,  /* octet 4~7, 32bits rss */
			 3, 2,        /* octet 2~3, low 16 bits vlan_macip */
			 15, 14,      /* octet 15~14, 16 bits data_len */
			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			 15, 14,      /* octet 15~14, low 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 /* second descriptor */
			 7, 6, 5, 4,  /* octet 4~7, 32bits rss */
			 3, 2,        /* octet 2~3, low 16 bits vlan_macip */
			 15, 14,      /* octet 15~14, 16 bits data_len */
			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			 15, 14,      /* octet 15~14, low 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF   /* pkt_type set as unknown */
			);
	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

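	/*
	 * These static asserts pin down the rx_descriptor_fields1 layout that
	 * shuf_msk and crc_adjust assume: pkt_len, data_len, vlan_tci and the
	 * RSS hash must keep these offsets so the shuffled descriptor data
	 * lands on the right mbuf fields. A change to struct rte_mbuf breaks
	 * the build here rather than corrupting packets at run time.
	 */
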
	/* Status/Error flag masks */
	/**
	 * mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication. Bits 3-5 of error
	 * field (bits 22-24) are for IP/L4 checksum errors
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((1 << 2) | (1 << 11) |
				  (3 << 12) | (7 << 22));
	/**
	 * data to be shuffled by result of flag mask. If VLAN bit is set,
	 * (bit 2), then position 4 in this array will be used in the
	 * destination
	 */
	const __m256i vlan_flags_shuf =
		_mm256_set_epi32(0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0,
				 0, 0, RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0);
	/**
	 * data to be shuffled by result of flag mask, shifted down 11.
	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
	 * place.
	 */
	const __m256i rss_flags_shuf =
		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
				RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH,
				0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0, /* end up 128-bits */
				0, 0, 0, 0, 0, 0, 0, 0,
				RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, RTE_MBUF_F_RX_RSS_HASH,
				0, 0, 0, 0, RTE_MBUF_F_RX_FDIR, 0);

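	/*
	 * vlan_flags_shuf, rss_flags_shuf and the l3_l4 table below are used
	 * with _mm256_shuffle_epi8 as per-lane 16-entry lookup tables: the
	 * low four bits of each byte of the (shifted) flag word select one
	 * byte per position, turning descriptor status bits into mbuf
	 * ol_flags without branches.
	 */
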
	/**
	 * data to be shuffled by the result of the flags mask shifted by 22
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it not exceed 255 */
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
			 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
			RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1,
			/* second 128-bits */
			0, 0, 0, 0, 0, 0, 0, 0,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
			 RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
			RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
			(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1);

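	/*
	 * Each entry above is pre-shifted right by one so that the widest
	 * flag combination still fits in a byte (vpshufb tables hold bytes);
	 * the single left shift applied after the lookup restores the real
	 * RTE_MBUF_F_RX_* values.
	 */
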
	const __m256i cksum_mask =
		_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
				  RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
				  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += IAVF_DESCS_PER_LOOP_AVX,
	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
#ifdef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
		/* for AVX we need alignment otherwise loads are not atomic */
		if (avx_aligned) {
			/* load in descriptors, 2 at a time, in reverse order */
			raw_desc6_7 = _mm256_load_si256((void *)(rxdp + 6));
			rte_compiler_barrier();
			raw_desc4_5 = _mm256_load_si256((void *)(rxdp + 4));
			rte_compiler_barrier();
			raw_desc2_3 = _mm256_load_si256((void *)(rxdp + 2));
			rte_compiler_barrier();
			raw_desc0_1 = _mm256_load_si256((void *)(rxdp + 0));
		} else
#endif
		{
			const __m128i raw_desc7 =
				_mm_load_si128((void *)(rxdp + 7));
			rte_compiler_barrier();
			const __m128i raw_desc6 =
				_mm_load_si128((void *)(rxdp + 6));
			rte_compiler_barrier();
			const __m128i raw_desc5 =
				_mm_load_si128((void *)(rxdp + 5));
			rte_compiler_barrier();
			const __m128i raw_desc4 =
				_mm_load_si128((void *)(rxdp + 4));
			rte_compiler_barrier();
			const __m128i raw_desc3 =
				_mm_load_si128((void *)(rxdp + 3));
			rte_compiler_barrier();
			const __m128i raw_desc2 =
				_mm_load_si128((void *)(rxdp + 2));
			rte_compiler_barrier();
			const __m128i raw_desc1 =
				_mm_load_si128((void *)(rxdp + 1));
			rte_compiler_barrier();
			const __m128i raw_desc0 =
				_mm_load_si128((void *)(rxdp + 0));

			raw_desc6_7 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc6),
					 raw_desc7, 1);
			raw_desc4_5 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc4),
					 raw_desc5, 1);
			raw_desc2_3 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc2),
					 raw_desc3, 1);
			raw_desc0_1 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc0),
					 raw_desc1, 1);
		}

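		/*
		 * The loads above run from the highest descriptor down, with
		 * compiler barriers in between: hardware completes descriptors
		 * in ascending order, so when a later descriptor is seen with
		 * its DD bit set, the earlier ones (read afterwards) must be
		 * complete as well, leaving no holes inside a counted burst.
		 */
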
		if (split_packet) {
			int j;

			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/**
		 * convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m256i len6_7 = _mm256_slli_epi32(raw_desc6_7,
							 PKTLEN_SHIFT);
		const __m256i len4_5 = _mm256_slli_epi32(raw_desc4_5,
							 PKTLEN_SHIFT);
		const __m256i desc6_7 = _mm256_blend_epi16(raw_desc6_7,
							   len6_7, 0x80);
		const __m256i desc4_5 = _mm256_blend_epi16(raw_desc4_5,
							   len4_5, 0x80);
		__m256i mb6_7 = _mm256_shuffle_epi8(desc6_7, shuf_msk);
		__m256i mb4_5 = _mm256_shuffle_epi8(desc4_5, shuf_msk);

		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
		/**
		 * to get packet types, shift 64-bit values down 30 bits
		 * and so ptype is in lower 8-bits in each
		 */
		const __m256i ptypes6_7 = _mm256_srli_epi64(desc6_7, 30);
		const __m256i ptypes4_5 = _mm256_srli_epi64(desc4_5, 30);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);

		mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4);
		mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0);
		mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4);
		mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0);
		/* merge the status bits into one register */
		const __m256i status4_7 = _mm256_unpackhi_epi32(desc6_7,
								desc4_5);

		/**
		 * convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m256i len2_3 = _mm256_slli_epi32(raw_desc2_3,
							 PKTLEN_SHIFT);
		const __m256i len0_1 = _mm256_slli_epi32(raw_desc0_1,
							 PKTLEN_SHIFT);
		const __m256i desc2_3 = _mm256_blend_epi16(raw_desc2_3,
							   len2_3, 0x80);
		const __m256i desc0_1 = _mm256_blend_epi16(raw_desc0_1,
							   len0_1, 0x80);
		__m256i mb2_3 = _mm256_shuffle_epi8(desc2_3, shuf_msk);
		__m256i mb0_1 = _mm256_shuffle_epi8(desc0_1, shuf_msk);

		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
		/* get the packet types */
		const __m256i ptypes2_3 = _mm256_srli_epi64(desc2_3, 30);
		const __m256i ptypes0_1 = _mm256_srli_epi64(desc0_1, 30);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);

		mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4);
		mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0);
		mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4);
		mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0);
		/* merge the status bits into one register */
		const __m256i status0_3 = _mm256_unpackhi_epi32(desc2_3,
								desc0_1);

		/**
		 * take the two sets of status bits and merge to one
		 * After merge, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
							  status0_3);

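		/*
		 * The odd-looking [1, 3, 5, 7, 0, 2, 4, 6] order falls out of
		 * vpunpckhdq/vpunpcklqdq working within each 128-bit lane
		 * rather than across the register; the per-mbuf blends below
		 * (evens from the low lane, odds from the high lane) undo it.
		 */
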
		/* now do flag manipulation */

		/* get only flag/error bits we want */
		const __m256i flag_bits =
			_mm256_and_si256(status0_7, flags_mask);
		/* set vlan and rss flags */
		const __m256i vlan_flags =
			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
		const __m256i rss_flags =
			_mm256_shuffle_epi8(rss_flags_shuf,
					    _mm256_srli_epi32(flag_bits, 11));
		/**
		 * l3_l4_error flags, shuffle, then shift to correct adjustment
		 * of flags in flags_shuf, and finally mask out extra bits
		 */
		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
				_mm256_srli_epi32(flag_bits, 22));
		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);

		/* merge the flags */
		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
				_mm256_or_si256(rss_flags, vlan_flags));
		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
					   16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);
		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);
		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

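		/*
		 * Each 256-bit store fills rearm_data (mbuf_init with the
		 * ol_flags dword blended in) in its low half and the shuffled
		 * rx_descriptor_fields1 in its high half. Even-numbered
		 * packets take their fields from the low 128-bit lane of the
		 * mb* registers and odd-numbered ones from the high lane,
		 * which is why the flags are extracted and blended twice.
		 */
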
		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit i.e. !EOP, rather than EOP one.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
					eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += IAVF_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = __builtin_popcountll
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += __builtin_popcountll
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != IAVF_DESCS_PER_LOOP_AVX)
			break;
	}

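	/*
	 * After the pack, each 64-bit half of status0_7 holds four 16-bit
	 * words that are 1 only where a DD bit was set, so the two popcounts
	 * simply count completed descriptors; packet order is irrelevant to
	 * the count, and any burst short of 8 ends the loop.
	 */
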
	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

static inline __m256i
flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
	RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
	const __m256i pkt_fdir_bit = _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR |
						       RTE_MBUF_F_RX_FDIR_ID);
	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
					       fdir_mis_mask);
	/* XOR with all-ones inverts fdir_mask, leaving 1s only for matches */
	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);

	return fdir_flags;
}

static inline uint16_t
_iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq,
				      struct rte_mbuf **rx_pkts,
				      uint16_t nb_pkts, uint8_t *split_packet)
{
#define IAVF_DESCS_PER_LOOP_AVX 8

	struct iavf_adapter *adapter = rxq->vsi->adapter;

	uint64_t offloads = adapter->dev_data->dev_conf.rxmode.offloads;
	const uint32_t *type_table = adapter->ptype_tbl;

	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union iavf_rx_flex_desc *rxdp =
		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
		iavf_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.status_error0 &
			rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
		return 0;

	/* constants used in processing loop */
	const __m256i crc_adjust =
		_mm256_set_epi16
			(/* first descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0,          /* ignore pkt_type field */
			 /* second descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0           /* ignore pkt_type field */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			IAVF_RX_FLEX_DESC_STATUS0_EOF_S);

	/* mask to shuffle from desc. to mbuf (2 descriptors) */
	const __m256i shuf_msk =
		_mm256_set_epi8
			(/* first descriptor */
			 0xFF, 0xFF,
			 0xFF, 0xFF,  /* rss hash parsed separately */
			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
			 5, 4,        /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,  /* skip hi 16 bits pkt_len, zero out */
			 5, 4,        /* octet 4~5, 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 /* second descriptor */
			 0xFF, 0xFF,
			 0xFF, 0xFF,  /* rss hash parsed separately */
			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
			 5, 4,        /* octet 4~5, 16 bits data_len */
			 0xFF, 0xFF,  /* skip hi 16 bits pkt_len, zero out */
			 5, 4,        /* octet 4~5, 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF   /* pkt_type set as unknown */
			);
	/**
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Status/Error flag masks */
	/**
	 * mask everything except Checksum Reports, RSS indication
	 * and VLAN indication.
	 * bit6:4 for IP/L4 checksum errors.
	 * bit12 is for RSS indication.
	 * bit13 is for VLAN indication.
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
	/**
	 * data to be shuffled by the result of the flags mask shifted by 4
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it not exceed 255 */
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			/* second 128-bits */
			0, 0, 0, 0, 0, 0, 0, 0,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
			 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
			 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
			(RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
	const __m256i cksum_mask =
		_mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
				  RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
				  RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
	/**
	 * data to be shuffled by result of flag mask, shifted down 12.
	 * If RSS(bit12)/VLAN(bit13) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			/* end up 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_RSS_HASH, 0,
			RTE_MBUF_F_RX_RSS_HASH, 0);

	const __m256i vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			0, 0,
			/* end up 128-bits */
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
			0, 0);

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += IAVF_DESCS_PER_LOOP_AVX,
	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;

		const __m128i raw_desc7 =
			_mm_load_si128((void *)(rxdp + 7));
		rte_compiler_barrier();
		const __m128i raw_desc6 =
			_mm_load_si128((void *)(rxdp + 6));
		rte_compiler_barrier();
		const __m128i raw_desc5 =
			_mm_load_si128((void *)(rxdp + 5));
		rte_compiler_barrier();
		const __m128i raw_desc4 =
			_mm_load_si128((void *)(rxdp + 4));
		rte_compiler_barrier();
		const __m128i raw_desc3 =
			_mm_load_si128((void *)(rxdp + 3));
		rte_compiler_barrier();
		const __m128i raw_desc2 =
			_mm_load_si128((void *)(rxdp + 2));
		rte_compiler_barrier();
		const __m128i raw_desc1 =
			_mm_load_si128((void *)(rxdp + 1));
		rte_compiler_barrier();
		const __m128i raw_desc0 =
			_mm_load_si128((void *)(rxdp + 0));

		raw_desc6_7 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc6),
				 raw_desc7, 1);
		raw_desc4_5 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc4),
				 raw_desc5, 1);
		raw_desc2_3 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc2),
				 raw_desc3, 1);
		raw_desc0_1 =
			_mm256_inserti128_si256
				(_mm256_castsi128_si256(raw_desc0),
				 raw_desc1, 1);

		if (split_packet) {
			int j;

			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/**
		 * convert descriptors 4-7 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m256i mb6_7 = _mm256_shuffle_epi8(raw_desc6_7, shuf_msk);
		__m256i mb4_5 = _mm256_shuffle_epi8(raw_desc4_5, shuf_msk);

		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
		/**
		 * to get packet types, ptype is located in bit16-25
		 * of each 128bits
		 */
		const __m256i ptype_mask =
			_mm256_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
		const __m256i ptypes6_7 =
			_mm256_and_si256(raw_desc6_7, ptype_mask);
		const __m256i ptypes4_5 =
			_mm256_and_si256(raw_desc4_5, ptype_mask);
		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);

		mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype7], 4);
		mb6_7 = _mm256_insert_epi32(mb6_7, type_table[ptype6], 0);
		mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype5], 4);
		mb4_5 = _mm256_insert_epi32(mb4_5, type_table[ptype4], 0);
		/* merge the status bits into one register */
		const __m256i status4_7 = _mm256_unpackhi_epi32(raw_desc6_7,
								raw_desc4_5);

		/**
		 * convert descriptors 0-3 into mbufs, re-arrange fields.
		 * Then write into the mbuf.
		 */
		__m256i mb2_3 = _mm256_shuffle_epi8(raw_desc2_3, shuf_msk);
		__m256i mb0_1 = _mm256_shuffle_epi8(raw_desc0_1, shuf_msk);

		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
		/**
		 * to get packet types, ptype is located in bit16-25
		 * of each 128bits
		 */
		const __m256i ptypes2_3 =
			_mm256_and_si256(raw_desc2_3, ptype_mask);
		const __m256i ptypes0_1 =
			_mm256_and_si256(raw_desc0_1, ptype_mask);
		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);

		mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype3], 4);
		mb2_3 = _mm256_insert_epi32(mb2_3, type_table[ptype2], 0);
		mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype1], 4);
		mb0_1 = _mm256_insert_epi32(mb0_1, type_table[ptype0], 0);
		/* merge the status bits into one register */
		const __m256i status0_3 = _mm256_unpackhi_epi32(raw_desc2_3,
								raw_desc0_1);

		/**
		 * take the two sets of status bits and merge to one
		 * After merge, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
							  status0_3);

		/* now do flag manipulation */

		/* get only flag/error bits we want */
		const __m256i flag_bits =
			_mm256_and_si256(status0_7, flags_mask);
		/**
		 * l3_l4_error flags, shuffle, then shift to correct adjustment
		 * of flags in flags_shuf, and finally mask out extra bits
		 */
		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
				_mm256_srli_epi32(flag_bits, 4));
		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);

		/* set rss and vlan flags */
		const __m256i rss_vlan_flag_bits =
			_mm256_srli_epi32(flag_bits, 12);
		const __m256i rss_flags =
			_mm256_shuffle_epi8(rss_flags_shuf,
					    rss_vlan_flag_bits);

		__m256i vlan_flags = _mm256_setzero_si256();

		if (rxq->rx_flags == IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG1)
			vlan_flags =
				_mm256_shuffle_epi8(vlan_flags_shuf,
						    rss_vlan_flag_bits);

		const __m256i rss_vlan_flags =
			_mm256_or_si256(rss_flags, vlan_flags);

		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
						     rss_vlan_flags);

		if (rxq->fdir_enabled) {
			const __m256i fdir_id4_7 =
				_mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);

			const __m256i fdir_id0_3 =
				_mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);

			const __m256i fdir_id0_7 =
				_mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);

			const __m256i fdir_flags =
				flex_rxd_to_fdir_flags_vec_avx2(fdir_id0_7);

			/* merge with fdir_flags */
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);

			/* write to mbuf: have to use scalar store here */
			rx_pkts[i + 0]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 3);

			rx_pkts[i + 1]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 7);

			rx_pkts[i + 2]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 2);

			rx_pkts[i + 3]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 6);

			rx_pkts[i + 4]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 1);

			rx_pkts[i + 5]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 5);

			rx_pkts[i + 6]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 0);

			rx_pkts[i + 7]->hash.fdir.hi =
				_mm256_extract_epi32(fdir_id0_7, 4);
		} /* if() on fdir_enabled */

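		/*
		 * The scattered extract indices (3, 7, 2, 6, 1, 5, 0, 4) undo
		 * the per-lane interleaving of the unpack steps that gathered
		 * the eight flow_id dwords into fdir_id0_7 above.
		 */
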
#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
		/**
		 * need to load the 2nd 16B of each desc for RSS hash parsing;
		 * getting into this context causes a performance drop.
		 */
		if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH ||
		    rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
			/* load bottom half of every 32B desc */
			const __m128i raw_desc_bh7 =
				_mm_load_si128
					((void *)(&rxdp[7].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh6 =
				_mm_load_si128
					((void *)(&rxdp[6].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh5 =
				_mm_load_si128
					((void *)(&rxdp[5].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh4 =
				_mm_load_si128
					((void *)(&rxdp[4].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh3 =
				_mm_load_si128
					((void *)(&rxdp[3].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh2 =
				_mm_load_si128
					((void *)(&rxdp[2].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh1 =
				_mm_load_si128
					((void *)(&rxdp[1].wb.status_error1));
			rte_compiler_barrier();
			const __m128i raw_desc_bh0 =
				_mm_load_si128
					((void *)(&rxdp[0].wb.status_error1));

			__m256i raw_desc_bh6_7 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc_bh6),
					 raw_desc_bh7, 1);
			__m256i raw_desc_bh4_5 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc_bh4),
					 raw_desc_bh5, 1);
			__m256i raw_desc_bh2_3 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc_bh2),
					 raw_desc_bh3, 1);
			__m256i raw_desc_bh0_1 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc_bh0),
					 raw_desc_bh1, 1);

			if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) {
				/**
				 * to shift the 32b RSS hash value to the
				 * highest 32b of each 128b before mask
				 */
				__m256i rss_hash6_7 =
					_mm256_slli_epi64(raw_desc_bh6_7, 32);
				__m256i rss_hash4_5 =
					_mm256_slli_epi64(raw_desc_bh4_5, 32);
				__m256i rss_hash2_3 =
					_mm256_slli_epi64(raw_desc_bh2_3, 32);
				__m256i rss_hash0_1 =
					_mm256_slli_epi64(raw_desc_bh0_1, 32);

				const __m256i rss_hash_msk =
					_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
							 0xFFFFFFFF, 0, 0, 0);

				rss_hash6_7 = _mm256_and_si256
						(rss_hash6_7, rss_hash_msk);
				rss_hash4_5 = _mm256_and_si256
						(rss_hash4_5, rss_hash_msk);
				rss_hash2_3 = _mm256_and_si256
						(rss_hash2_3, rss_hash_msk);
				rss_hash0_1 = _mm256_and_si256
						(rss_hash0_1, rss_hash_msk);

				mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
				mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
				mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
				mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
			}

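			/*
			 * The plain OR merge works because shuf_msk zeroed the
			 * hash dword of each mb* register (0xFF entries), and
			 * the shift+mask above leave only the 32-bit hash in
			 * exactly that position.
			 */
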
			if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) {
				/* merge the status/error-1 bits into one register */
				const __m256i status1_4_7 =
					_mm256_unpacklo_epi32(raw_desc_bh6_7,
							      raw_desc_bh4_5);
				const __m256i status1_0_3 =
					_mm256_unpacklo_epi32(raw_desc_bh2_3,
							      raw_desc_bh0_1);

				const __m256i status1_0_7 =
					_mm256_unpacklo_epi64(status1_4_7,
							      status1_0_3);

				const __m256i l2tag2p_flag_mask =
					_mm256_set1_epi32
					(1 << IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);

				__m256i l2tag2p_flag_bits =
					_mm256_and_si256
					(status1_0_7, l2tag2p_flag_mask);

				l2tag2p_flag_bits =
					_mm256_srli_epi32(l2tag2p_flag_bits,
						IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S);

				const __m256i l2tag2_flags_shuf =
					_mm256_set_epi8(0, 0, 0, 0,
							0, 0, 0, 0,
							0, 0, 0, 0,
							0, 0, 0, 0,
							/* end up 128-bits */
							0, 0, 0, 0,
							0, 0, 0, 0,
							0, 0, 0, 0,
							0, 0,
							RTE_MBUF_F_RX_VLAN |
							RTE_MBUF_F_RX_VLAN_STRIPPED,
							0);

				vlan_flags =
					_mm256_shuffle_epi8(l2tag2_flags_shuf,
							    l2tag2p_flag_bits);

				/* merge with vlan_flags */
				mbuf_flags = _mm256_or_si256
						(mbuf_flags, vlan_flags);

				/* extract the 2nd VLAN tag (L2TAG2) into vlan_tci */
				__m256i vlan_tci6_7 =
					_mm256_slli_si256(raw_desc_bh6_7, 4);
				__m256i vlan_tci4_5 =
					_mm256_slli_si256(raw_desc_bh4_5, 4);
				__m256i vlan_tci2_3 =
					_mm256_slli_si256(raw_desc_bh2_3, 4);
				__m256i vlan_tci0_1 =
					_mm256_slli_si256(raw_desc_bh0_1, 4);

				const __m256i vlan_tci_msk =
					_mm256_set_epi32(0, 0xFFFF0000, 0, 0,
							 0, 0xFFFF0000, 0, 0);

				vlan_tci6_7 = _mm256_and_si256
						(vlan_tci6_7, vlan_tci_msk);
				vlan_tci4_5 = _mm256_and_si256
						(vlan_tci4_5, vlan_tci_msk);
				vlan_tci2_3 = _mm256_and_si256
						(vlan_tci2_3, vlan_tci_msk);
				vlan_tci0_1 = _mm256_and_si256
						(vlan_tci0_1, vlan_tci_msk);

				mb6_7 = _mm256_or_si256(mb6_7, vlan_tci6_7);
				mb4_5 = _mm256_or_si256(mb4_5, vlan_tci4_5);
				mb2_3 = _mm256_or_si256(mb2_3, vlan_tci2_3);
				mb0_1 = _mm256_or_si256(mb0_1, vlan_tci0_1);
			}
		} /* if() on RSS hash parsing */
#endif

		/**
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
					   16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);
		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);
		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 <<
					       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/**
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit i.e. !EOP, rather than EOP one.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
					eop_mask);
			/**
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += IAVF_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());

		uint64_t burst = __builtin_popcountll
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += __builtin_popcountll
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != IAVF_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/**
 * Notice:
 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
 */
uint16_t
iavf_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
			uint16_t nb_pkts)
{
	return _iavf_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * Notice:
 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
 */
uint16_t
iavf_recv_pkts_vec_avx2_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	return _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rx_queue, rx_pkts,
						     nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles single burst of 32 scattered packets
 * Notice:
 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
 */
static uint16_t
iavf_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
				   uint16_t nb_pkts)
{
	struct iavf_rx_queue *rxq = rx_queue;
	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
							split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
				      &split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
 */
uint16_t
iavf_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
				  uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
		uint16_t burst = iavf_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < IAVF_VPMD_RX_MAX_BURST)
			return retval;
	}
	return retval + iavf_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, nb_pkts);
}

/**
 * vPMD receive routine that reassembles single burst of
 * 32 scattered packets for flex RxD
 * Notice:
 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
 */
static uint16_t
iavf_recv_scattered_burst_vec_avx2_flex_rxd(void *rx_queue,
					    struct rte_mbuf **rx_pkts,
					    uint16_t nb_pkts)
{
	struct iavf_rx_queue *rxq = rx_queue;
	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx2_flex_rxd(rxq,
					rx_pkts, nb_pkts, split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
				      &split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets for flex RxD.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
 */
uint16_t
iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
					   struct rte_mbuf **rx_pkts,
					   uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
		uint16_t burst =
			iavf_recv_scattered_burst_vec_avx2_flex_rxd
				(rx_queue, rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < IAVF_VPMD_RX_MAX_BURST)
			return retval;
	}
	return retval + iavf_recv_scattered_burst_vec_avx2_flex_rxd(rx_queue,
				rx_pkts + retval, nb_pkts);
}

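/*
 * Each TX data descriptor is two 64-bit words: the low quadword carries the
 * buffer DMA address and the high quadword packs the descriptor type, the
 * command flags (shifted by IAVF_TXD_QW1_CMD_SHIFT) and the buffer size
 * (shifted by IAVF_TXD_QW1_TX_BUF_SZ_SHIFT), so a single 128-bit store below
 * fills a complete descriptor.
 */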
static __rte_always_inline void
iavf_vtx1(volatile struct iavf_tx_desc *txdp,
	  struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw =
		(IAVF_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) |
		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));

	__m128i descriptor = _mm_set_epi64x(high_qw,
					    pkt->buf_iova + pkt->data_off);
	_mm_store_si128((__m128i *)txdp, descriptor);
}

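/*
 * Bulk variant of iavf_vtx1: peels off one descriptor if txdp is not 32-byte
 * aligned so the main loop can use aligned 256-bit stores, writes descriptors
 * four at a time (two stores per iteration), then finishes any remainder one
 * descriptor at a time.
 */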
static __rte_always_inline void
iavf_vtx(volatile struct iavf_tx_desc *txdp,
	 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT));

	/* if unaligned on 32-byte boundary, do one to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		iavf_vtx1(txdp, *pkt, flags);
		nb_pkts--, txdp++, pkt++;
	}

	/* do 4 at a time while possible, in bursts of two 256-bit stores */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);

		__m256i desc2_3 =
			_mm256_set_epi64x
				(hi_qw3,
				 pkt[3]->buf_iova + pkt[3]->data_off,
				 hi_qw2,
				 pkt[2]->buf_iova + pkt[2]->data_off);
		__m256i desc0_1 =
			_mm256_set_epi64x
				(hi_qw1,
				 pkt[1]->buf_iova + pkt[1]->data_off,
				 hi_qw0,
				 pkt[0]->buf_iova + pkt[0]->data_off);
		_mm256_store_si256((void *)(txdp + 2), desc2_3);
		_mm256_store_si256((void *)txdp, desc0_1);
	}

	/* do any last ones */
	while (nb_pkts) {
		iavf_vtx1(txdp, *pkt, flags);
		txdp++, pkt++, nb_pkts--;
	}
}

static inline uint16_t
iavf_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
			       uint16_t nb_pkts)
{
	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
	volatile struct iavf_tx_desc *txdp;
	struct iavf_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	/* bit2 is reserved and must be set to 1 according to Spec */
	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;

	if (txq->nb_free < txq->free_thresh)
		iavf_tx_free_bufs(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		tx_backlog_entry(txep, tx_pkts, n);

		iavf_vtx(txdp, tx_pkts, n - 1, flags);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		iavf_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring[tx_id];
	}

	tx_backlog_entry(txep, tx_pkts, nb_commit);

	iavf_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->next_rs) {
		txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
					 IAVF_TXD_QW1_CMD_SHIFT);
		txq->next_rs =
			(uint16_t)(txq->next_rs + txq->rs_thresh);
	}

	txq->tx_tail = tx_id;

	IAVF_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

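/*
 * Burst-splitting wrapper: the fixed-burst routine above may place at most
 * rs_thresh descriptors per call, since a single burst must not cross an RS
 * threshold boundary, so larger requests are submitted in rs_thresh-sized
 * chunks.
 */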
uint16_t
iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
			uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		/* crossing the rs_thresh boundary is not allowed */
		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
		ret = iavf_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
						     num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}