net/iavf: add offload path for Tx AVX512
dpdk.git: drivers/net/iavf/iavf_rxtx_vec_avx512.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4
5 #include "iavf_rxtx_vec_common.h"
6
7 #include <x86intrin.h>
8
9 #ifndef __INTEL_COMPILER
10 #pragma GCC diagnostic ignored "-Wcast-qual"
11 #endif
12
13 #define IAVF_DESCS_PER_LOOP_AVX 8
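/* In the legacy 32B RX descriptor the 14-bit packet length lives in bits
 * 38-51 of qword1; shifting each 32-bit element left by PKTLEN_SHIFT moves
 * the length into the top 16-bit word of the 128-bit descriptor, from where
 * it is blended over the raw descriptor (see IAVF_RX_LEN_MASK below).
 */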
14 #define PKTLEN_SHIFT 10
15
16 static __rte_always_inline void
17 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
18 {
19         int i;
20         uint16_t rx_id;
21         volatile union iavf_rx_desc *rxdp;
22         struct rte_mempool_cache *cache =
23                 rte_mempool_default_cache(rxq->mp, rte_lcore_id());
24         struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
25
26         rxdp = rxq->rx_ring + rxq->rxrearm_start;
27
28         if (unlikely(!cache))
29                 return iavf_rxq_rearm_common(rxq, true);
30
31         /* We need to pull 'n' more mbufs into the software ring from the
32          * mempool. We inline the mempool function here so we can vectorize
33          * the copy from the cache into the shadow ring.
34          */
35
36         /* Can this be satisfied from the cache? */
37         if (cache->len < IAVF_RXQ_REARM_THRESH) {
38                 /* No. Backfill the cache first, and then fill from it */
39                 uint32_t req = IAVF_RXQ_REARM_THRESH + (cache->size -
40                                                         cache->len);
41
42                 /* How many do we require, i.e. enough to refill the cache plus the request */
43                 int ret = rte_mempool_ops_dequeue_bulk
44                                 (rxq->mp, &cache->objs[cache->len], req);
45                 if (ret == 0) {
46                         cache->len += req;
47                 } else {
48                         if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
49                             rxq->nb_rx_desc) {
50                                 __m128i dma_addr0;
51
52                                 dma_addr0 = _mm_setzero_si128();
53                                 for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
54                                         rxp[i] = &rxq->fake_mbuf;
55                                         _mm_storeu_si128((__m128i *)&rxdp[i].read,
56                                                          dma_addr0);
57                                 }
58                         }
59                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
60                                         IAVF_RXQ_REARM_THRESH;
61                         return;
62                 }
63         }
64
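        /* Vectorized refill: for each group of 8 mbufs taken from the mempool
         * cache, gather the buf_iova field of every mbuf, add the headroom,
         * and permute the resulting addresses into the packet-buffer address
         * slots of the RX descriptors before storing them to the ring.
         */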
65         const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
66                                                         (struct rte_mbuf, buf_iova));
67         const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
68
69 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
70         /* to shuffle the addresses into the correct slots. Elements 4-7 will
71          * contain zeros, so index 7 is used as a zero-value.
72          */
73         const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
74 #else
75         const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
76 #endif
77
78         /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
79          * from mempool cache and populating both shadow and HW rings
80          */
81         for (i = 0; i < IAVF_RXQ_REARM_THRESH / IAVF_DESCS_PER_LOOP_AVX; i++) {
82                 const __m512i mbuf_ptrs = _mm512_loadu_si512
83                         (&cache->objs[cache->len - IAVF_DESCS_PER_LOOP_AVX]);
84                 _mm512_storeu_si512(rxp, mbuf_ptrs);
85
86                 const __m512i iova_base_addrs = _mm512_i64gather_epi64
87                                 (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
88                                  0, /* base */
89                                  1  /* scale */);
90                 const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
91                                 headroom);
92 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
93                 const __m512i iovas0 = _mm512_castsi256_si512
94                                 (_mm512_extracti64x4_epi64(iova_addrs, 0));
95                 const __m512i iovas1 = _mm512_castsi256_si512
96                                 (_mm512_extracti64x4_epi64(iova_addrs, 1));
97
98                 /* permute leaves desc 2-3 addresses in header address slots 0-1
99                  * but these are ignored by the driver since header split is
100                  * not enabled. Similarly for descriptors 6 & 7.
101                  */
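                /* The permute also parks the address destined for descriptor
                 * N+2 in descriptor N's header-address slot, so shifting each
                 * 128-bit lane right by 8 bytes yields the register for the
                 * next descriptor pair (desc2_3/desc6_7) without another
                 * permute.
                 */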
102                 const __m512i desc0_1 = _mm512_permutexvar_epi64
103                                 (permute_idx,
104                                  iovas0);
105                 const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
106
107                 const __m512i desc4_5 = _mm512_permutexvar_epi64
108                                 (permute_idx,
109                                  iovas1);
110                 const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);
111
112                 _mm512_storeu_si512((void *)rxdp, desc0_1);
113                 _mm512_storeu_si512((void *)(rxdp + 2), desc2_3);
114                 _mm512_storeu_si512((void *)(rxdp + 4), desc4_5);
115                 _mm512_storeu_si512((void *)(rxdp + 6), desc6_7);
116 #else
117                 /* permute leaves desc 4-7 addresses in header address slots 0-3
118                  * but these are ignored by driver since header split not
119                  * enabled.
120                  */
121                 const __m512i desc0_3 = _mm512_permutexvar_epi64(permute_idx,
122                                                                  iova_addrs);
123                 const __m512i desc4_7 = _mm512_bsrli_epi128(desc0_3, 8);
124
125                 _mm512_storeu_si512((void *)rxdp, desc0_3);
126                 _mm512_storeu_si512((void *)(rxdp + 4), desc4_7);
127 #endif
128                 rxp += IAVF_DESCS_PER_LOOP_AVX;
129                 rxdp += IAVF_DESCS_PER_LOOP_AVX;
130                 cache->len -= IAVF_DESCS_PER_LOOP_AVX;
131         }
132
133         rxq->rxrearm_start += IAVF_RXQ_REARM_THRESH;
134         if (rxq->rxrearm_start >= rxq->nb_rx_desc)
135                 rxq->rxrearm_start = 0;
136
137         rxq->rxrearm_nb -= IAVF_RXQ_REARM_THRESH;
138
139         rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
140                            (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
141
142         /* Update the tail pointer on the NIC */
143         IAVF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
144 }
145
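/* 16-bit blend mask: bits 7, 15, 23 and 31 select the top 16-bit word of each
 * 128-bit descriptor lane, i.e. where the packet length sits after the
 * PKTLEN_SHIFT left shift.
 */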
146 #define IAVF_RX_LEN_MASK 0x80808080
147 static inline uint16_t
148 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
149                                struct rte_mbuf **rx_pkts,
150                                uint16_t nb_pkts, uint8_t *split_packet)
151 {
152         const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
153
154         const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
155                                                     rxq->mbuf_initializer);
156         struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
157         volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
158
159         rte_prefetch0(rxdp);
160
161         /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
162         nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
163
164         /* See if we need to rearm the RX queue - gives the prefetch a bit
165          * of time to act
166          */
167         if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
168                 iavf_rxq_rearm(rxq);
169
170         /* Before we start moving massive data around, check to see if
171          * there is actually a packet available
172          */
173         if (!(rxdp->wb.qword1.status_error_len &
174               rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
175                 return 0;
176
177         /* constants used in processing loop */
178         const __m512i crc_adjust =
179                 _mm512_set_epi32
180                         (/* 1st descriptor */
181                          0,             /* ignore non-length fields */
182                          -rxq->crc_len, /* sub crc on data_len */
183                          -rxq->crc_len, /* sub crc on pkt_len */
184                          0,             /* ignore pkt_type field */
185                          /* 2nd descriptor */
186                          0,             /* ignore non-length fields */
187                          -rxq->crc_len, /* sub crc on data_len */
188                          -rxq->crc_len, /* sub crc on pkt_len */
189                          0,             /* ignore pkt_type field */
190                          /* 3rd descriptor */
191                          0,             /* ignore non-length fields */
192                          -rxq->crc_len, /* sub crc on data_len */
193                          -rxq->crc_len, /* sub crc on pkt_len */
194                          0,             /* ignore pkt_type field */
195                          /* 4th descriptor */
196                          0,             /* ignore non-length fields */
197                          -rxq->crc_len, /* sub crc on data_len */
198                          -rxq->crc_len, /* sub crc on pkt_len */
199                          0              /* ignore pkt_type field */
200                         );
201
202         /* 8 packets DD mask, LSB in each 32-bit value */
203         const __m256i dd_check = _mm256_set1_epi32(1);
204
205         /* 8 packets EOP mask, second-LSB in each 32-bit value */
206         const __m256i eop_check = _mm256_slli_epi32(dd_check,
207                         IAVF_RX_DESC_STATUS_EOF_SHIFT);
208
209         /* mask to shuffle from desc. to mbuf (4 descriptors)*/
210         const __m512i shuf_msk =
211                 _mm512_set_epi32
212                         (/* 1st descriptor */
213                          0x07060504,    /* octet 4~7, 32bits rss */
214                          0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
215                                         /* octet 15~14, 16 bits data_len */
216                          0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
217                                         /* octet 15~14, low 16 bits pkt_len */
218                          0xFFFFFFFF,    /* pkt_type set as unknown */
219                          /* 2nd descriptor */
220                          0x07060504,    /* octet 4~7, 32bits rss */
221                          0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
222                                         /* octet 15~14, 16 bits data_len */
223                          0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
224                                         /* octet 15~14, low 16 bits pkt_len */
225                          0xFFFFFFFF,    /* pkt_type set as unknown */
226                          /* 3rd descriptor */
227                          0x07060504,    /* octet 4~7, 32bits rss */
228                          0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
229                                         /* octet 15~14, 16 bits data_len */
230                          0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
231                                         /* octet 15~14, low 16 bits pkt_len */
232                          0xFFFFFFFF,    /* pkt_type set as unknown */
233                          /* 4th descriptor */
234                          0x07060504,    /* octet 4~7, 32bits rss */
235                          0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
236                                         /* octet 15~14, 16 bits data_len */
237                          0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
238                                         /* octet 15~14, low 16 bits pkt_len */
239                          0xFFFFFFFF     /* pkt_type set as unknown */
240                         );
241         /**
242          * compile-time check the above crc and shuffle layout is correct.
243          * NOTE: the first field (lowest address) is given last in set_epi
244          * calls above.
245          */
246         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
247                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
248         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
249                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
250         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
251                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
252         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
253                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
254
255         /* Status/Error flag masks */
256         /**
257          * mask everything except RSS, flow director and VLAN flags.
258          * Bit 2 is the VLAN tag, bit 11 the flow director indication and
259          * bits 13:12 the RSS indication. Bits 3-5 of the error
260          * field (bits 22-24) are for IP/L4 checksum errors.
261          */
262         const __m256i flags_mask =
263                 _mm256_set1_epi32((1 << 2) | (1 << 11) |
264                                   (3 << 12) | (7 << 22));
265         /**
266          * data to be shuffled by result of flag mask. If VLAN bit is set,
267          * (bit 2), then position 4 in this array will be used in the
268          * destination
269          */
270         const __m256i vlan_flags_shuf =
271                 _mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
272                                  0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
273         /**
274          * data to be shuffled by result of flag mask, shifted down 11.
275          * If RSS/FDIR bits are set, shuffle moves appropriate flags in
276          * place.
277          */
278         const __m256i rss_flags_shuf =
279                 _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
280                                 PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
281                                 0, 0, 0, 0, PKT_RX_FDIR, 0, /* end of low 128 bits */
282                                 0, 0, 0, 0, 0, 0, 0, 0,
283                                 PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
284                                 0, 0, 0, 0, PKT_RX_FDIR, 0);
285
286         /**
287          * data to be shuffled by the result of the flags mask shifted by 22
288          * bits.  This gives us the l3_l4 flags.
289          */
290         const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
291                         /* shift right 1 bit to make sure it does not exceed 255 */
292                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
293                          PKT_RX_IP_CKSUM_BAD) >> 1,
294                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
295                          PKT_RX_L4_CKSUM_BAD) >> 1,
296                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
297                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
298                         (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
299                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
300                         PKT_RX_IP_CKSUM_BAD >> 1,
301                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
302                         /* second 128-bits */
303                         0, 0, 0, 0, 0, 0, 0, 0,
304                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
305                          PKT_RX_IP_CKSUM_BAD) >> 1,
306                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
307                          PKT_RX_L4_CKSUM_BAD) >> 1,
308                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
309                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
310                         (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
311                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
312                         PKT_RX_IP_CKSUM_BAD >> 1,
313                         (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
314
315         const __m256i cksum_mask =
316                 _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
317                                   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
318                                   PKT_RX_OUTER_IP_CKSUM_BAD);
319
320         uint16_t i, received;
321
322         for (i = 0, received = 0; i < nb_pkts;
323              i += IAVF_DESCS_PER_LOOP_AVX,
324              rxdp += IAVF_DESCS_PER_LOOP_AVX) {
325                 /* step 1, copy over 8 mbuf pointers to rx_pkts array */
326                 _mm256_storeu_si256((void *)&rx_pkts[i],
327                                     _mm256_loadu_si256((void *)&sw_ring[i]));
328 #ifdef RTE_ARCH_X86_64
329                 _mm256_storeu_si256
330                         ((void *)&rx_pkts[i + 4],
331                          _mm256_loadu_si256((void *)&sw_ring[i + 4]));
332 #endif
333
334                 __m512i raw_desc0_3, raw_desc4_7;
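                /* Load the descriptors back to front, with a compiler barrier
                 * between each load: if the DD bit of descriptor n is seen as
                 * set, descriptors 0..n-1 were written by hardware before it
                 * and are read here afterwards, so their contents are complete
                 * by the time they are read.
                 */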
335                 const __m128i raw_desc7 =
336                         _mm_load_si128((void *)(rxdp + 7));
337                 rte_compiler_barrier();
338                 const __m128i raw_desc6 =
339                         _mm_load_si128((void *)(rxdp + 6));
340                 rte_compiler_barrier();
341                 const __m128i raw_desc5 =
342                         _mm_load_si128((void *)(rxdp + 5));
343                 rte_compiler_barrier();
344                 const __m128i raw_desc4 =
345                         _mm_load_si128((void *)(rxdp + 4));
346                 rte_compiler_barrier();
347                 const __m128i raw_desc3 =
348                         _mm_load_si128((void *)(rxdp + 3));
349                 rte_compiler_barrier();
350                 const __m128i raw_desc2 =
351                         _mm_load_si128((void *)(rxdp + 2));
352                 rte_compiler_barrier();
353                 const __m128i raw_desc1 =
354                         _mm_load_si128((void *)(rxdp + 1));
355                 rte_compiler_barrier();
356                 const __m128i raw_desc0 =
357                         _mm_load_si128((void *)(rxdp + 0));
358
359                 raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
360                 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
361                 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
362                 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
363                 raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
364                 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
365                 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
366                 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
367
368                 if (split_packet) {
369                         int j;
370
371                         for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
372                                 rte_mbuf_prefetch_part2(rx_pkts[i + j]);
373                 }
374
375                 /**
376                  * convert descriptors 4-7 into mbufs, adjusting length and
377                  * re-arranging fields. Then write into the mbuf
378                  */
379                 const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
380                                                          PKTLEN_SHIFT);
381                 const __m512i desc4_7 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
382                                                                 raw_desc4_7,
383                                                                 len4_7);
384                 __m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
385
386                 mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
387                 /**
388                  * to get packet types, shift the 64-bit values right by 30
389                  * bits so the ptype ends up in the lower 8 bits of each
390                  */
391                 const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
392                 const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
393                 const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
394                 const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
395                 const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
396                 const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
397                 const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
398
399                 const __m512i ptype4_7 = _mm512_set_epi32
400                         (0, 0, 0, type_table[ptype7],
401                          0, 0, 0, type_table[ptype6],
402                          0, 0, 0, type_table[ptype5],
403                          0, 0, 0, type_table[ptype4]);
404                 mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
405
406                 /**
407                  * convert descriptors 0-3 into mbufs, adjusting length and
408                  * re-arranging fields. Then write into the mbuf
409                  */
410                 const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
411                                                          PKTLEN_SHIFT);
412                 const __m512i desc0_3 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
413                                                                 raw_desc0_3,
414                                                                 len0_3);
415                 __m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
416
417                 mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
418                 /* get the packet types */
419                 const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
420                 const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
421                 const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
422                 const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
423                 const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
424                 const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
425                 const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
426
427                 const __m512i ptype0_3 = _mm512_set_epi32
428                         (0, 0, 0, type_table[ptype3],
429                          0, 0, 0, type_table[ptype2],
430                          0, 0, 0, type_table[ptype1],
431                          0, 0, 0, type_table[ptype0]);
432                 mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
433
434                 /**
435                  * use permute/extract to get status content
436                  * After the operations, the packets' status flags are in the
437                  * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
438                  */
439                 /* merge the status bits into one register */
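                /* Each index in the permute mask selects the 32-bit
                 * status/error word (dword 2) of one descriptor: indices 0-15
                 * address raw_desc4_7, indices 16-31 address raw_desc0_3.
                 */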
440                 const __m512i status_permute_msk = _mm512_set_epi32
441                         (0, 0, 0, 0,
442                          0, 0, 0, 0,
443                          22, 30, 6, 14,
444                          18, 26, 2, 10);
445                 const __m512i raw_status0_7 = _mm512_permutex2var_epi32
446                         (raw_desc4_7, status_permute_msk, raw_desc0_3);
447                 __m256i status0_7 = _mm512_extracti64x4_epi64
448                         (raw_status0_7, 0);
449
450                 /* now do flag manipulation */
451
452                 /* get only flag/error bits we want */
453                 const __m256i flag_bits =
454                         _mm256_and_si256(status0_7, flags_mask);
455                 /* set vlan and rss flags */
456                 const __m256i vlan_flags =
457                         _mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
458                 const __m256i rss_flags =
459                         _mm256_shuffle_epi8(rss_flags_shuf,
460                                             _mm256_srli_epi32(flag_bits, 11));
461                 /**
462                  * l3_l4 error flags: shuffle, then shift left to undo the
463                  * adjustment made in l3_l4_flags_shuf, and finally mask out extra bits
464                  */
465                 __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
466                                                 _mm256_srli_epi32(flag_bits, 22));
467                 l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
468                 l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
469
470                 /* merge flags */
471                 const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
472                                 _mm256_or_si256(rss_flags, vlan_flags));
473                 /**
474                  * At this point, we have the 8 sets of flags in the low 16-bits
475                  * of each 32-bit value in mbuf_flags.
476                  * We want to extract these, and merge them with the mbuf init
477                  * data so we can do a single write to the mbuf to set the flags
478                  * and all the other initialization fields. Extracting the
479                  * appropriate flags means that we have to do a shift and blend
480                  * for each mbuf before we do the write. However, we can also
481                  * add in the previously computed rx_descriptor fields to
482                  * make a single 256-bit write per mbuf
483                  */
484                 /* check the structure matches expectations */
485                 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
486                                  offsetof(struct rte_mbuf, rearm_data) + 8);
487                 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
488                                  RTE_ALIGN(offsetof(struct rte_mbuf,
489                                                     rearm_data),
490                                            16));
491                 /* build up data and do writes */
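                /* mbuf_init carries the rearm defaults in its low 64 bits. The
                 * 0x04 blend drops the 32-bit flag word into dword 2, the low
                 * half of ol_flags, and the 128-bit permute/blend below appends
                 * the descriptor-derived rx_descriptor_fields1, so one 256-bit
                 * store initializes the whole mbuf header.
                 */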
492                 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
493                         rearm6, rearm7;
494                 const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
495                 const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
496                 const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
497                 const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
498
499                 rearm6 = _mm256_blend_epi32(mbuf_init,
500                                             _mm256_slli_si256(mbuf_flags, 8),
501                                             0x04);
502                 rearm4 = _mm256_blend_epi32(mbuf_init,
503                                             _mm256_slli_si256(mbuf_flags, 4),
504                                             0x04);
505                 rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
506                 rearm0 = _mm256_blend_epi32(mbuf_init,
507                                             _mm256_srli_si256(mbuf_flags, 4),
508                                             0x04);
509                 /* permute to add in the rx_descriptor e.g. rss fields */
510                 rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
511                 rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
512                 rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
513                 rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
514                 /* write to mbuf */
515                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
516                                     rearm6);
517                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
518                                     rearm4);
519                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
520                                     rearm2);
521                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
522                                     rearm0);
523
524                 /* repeat for the odd mbufs */
525                 const __m256i odd_flags =
526                         _mm256_castsi128_si256
527                                 (_mm256_extracti128_si256(mbuf_flags, 1));
528                 rearm7 = _mm256_blend_epi32(mbuf_init,
529                                             _mm256_slli_si256(odd_flags, 8),
530                                             0x04);
531                 rearm5 = _mm256_blend_epi32(mbuf_init,
532                                             _mm256_slli_si256(odd_flags, 4),
533                                             0x04);
534                 rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
535                 rearm1 = _mm256_blend_epi32(mbuf_init,
536                                             _mm256_srli_si256(odd_flags, 4),
537                                             0x04);
538                 /* since odd mbufs are already in hi 128-bits use blend */
539                 rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
540                 rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
541                 rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
542                 rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
543                 /* again write to mbufs */
544                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
545                                     rearm7);
546                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
547                                     rearm5);
548                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
549                                     rearm3);
550                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
551                                     rearm1);
552
553                 /* extract and record EOP bit */
554                 if (split_packet) {
555                         const __m128i eop_mask =
556                                 _mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
557                         const __m256i eop_bits256 = _mm256_and_si256(status0_7,
558                                                                      eop_check);
559                         /* pack status bits into a single 128-bit register */
560                         const __m128i eop_bits =
561                                 _mm_packus_epi32
562                                         (_mm256_castsi256_si128(eop_bits256),
563                                          _mm256_extractf128_si256(eop_bits256,
564                                                                   1));
565                         /**
566                          * flip the bits and mask with the EOP position, so the
567                          * result is a split-packet bit (i.e. !EOP) rather than EOP.
568                          */
569                         __m128i split_bits = _mm_andnot_si128(eop_bits,
570                                                               eop_mask);
571                         /**
572                          * eop bits are out of order, so we need to shuffle them
573                          * back into order again. In doing so, only use low 8
574                          * bits, which acts like another pack instruction
575                          * The original order is (hi->lo): 1,3,5,7,0,2,4,6
576                          * [Since we use epi8, the 16-bit positions are
577                          * multiplied by 2 in the eop_shuffle value.]
578                          */
579                         __m128i eop_shuffle =
580                                 _mm_set_epi8(/* zero hi 64b */
581                                              0xFF, 0xFF, 0xFF, 0xFF,
582                                              0xFF, 0xFF, 0xFF, 0xFF,
583                                              /* move values to lo 64b */
584                                              8, 0, 10, 2,
585                                              12, 4, 14, 6);
586                         split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
587                         *(uint64_t *)split_packet =
588                                 _mm_cvtsi128_si64(split_bits);
589                         split_packet += IAVF_DESCS_PER_LOOP_AVX;
590                 }
591
592                 /* perform dd_check */
593                 status0_7 = _mm256_and_si256(status0_7, dd_check);
594                 status0_7 = _mm256_packs_epi32(status0_7,
595                                                _mm256_setzero_si256());
596
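                /* The pack narrows the eight 32-bit DD flags to 16-bit lanes;
                 * the popcount of the low 64 bits of each 128-bit lane then
                 * gives the number of completed descriptors. Stop at the first
                 * incomplete group of eight.
                 */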
597                 uint64_t burst = __builtin_popcountll
598                                         (_mm_cvtsi128_si64
599                                                 (_mm256_extracti128_si256
600                                                         (status0_7, 1)));
601                 burst += __builtin_popcountll
602                                 (_mm_cvtsi128_si64
603                                         (_mm256_castsi256_si128(status0_7)));
604                 received += burst;
605                 if (burst != IAVF_DESCS_PER_LOOP_AVX)
606                         break;
607         }
608
609         /* update tail pointers */
610         rxq->rx_tail += received;
611         rxq->rx_tail &= (rxq->nb_rx_desc - 1);
612         if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
613                 rxq->rx_tail--;
614                 received--;
615         }
616         rxq->rxrearm_nb += received;
617         return received;
618 }
619
620 static inline __m256i
621 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
622 {
623 #define FDID_MIS_MAGIC 0xFFFFFFFF
624         RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
625         RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
626         const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
627                                                        PKT_RX_FDIR_ID);
628         /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
629         const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
630         __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
631                                                fdir_mis_mask);
632         /* this XOR op inverts the fdir_mask, so matched lanes become all-ones */
633         fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
634         const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
635
636         return fdir_flags;
637 }
638
639 static inline uint16_t
640 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
641                                         struct rte_mbuf **rx_pkts,
642                                         uint16_t nb_pkts, uint8_t *split_packet)
643 {
644         const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
645
646         const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
647                                                     rxq->mbuf_initializer);
648         struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
649         volatile union iavf_rx_flex_desc *rxdp =
650                 (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
651
652         rte_prefetch0(rxdp);
653
654         /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
655         nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
656
657         /* See if we need to rearm the RX queue - gives the prefetch a bit
658          * of time to act
659          */
660         if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
661                 iavf_rxq_rearm(rxq);
662
663         /* Before we start moving massive data around, check to see if
664          * there is actually a packet available
665          */
666         if (!(rxdp->wb.status_error0 &
667               rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
668                 return 0;
669
670         /* constants used in processing loop */
671         const __m512i crc_adjust =
672                 _mm512_set_epi32
673                         (/* 1st descriptor */
674                          0,             /* ignore non-length fields */
675                          -rxq->crc_len, /* sub crc on data_len */
676                          -rxq->crc_len, /* sub crc on pkt_len */
677                          0,             /* ignore pkt_type field */
678                          /* 2nd descriptor */
679                          0,             /* ignore non-length fields */
680                          -rxq->crc_len, /* sub crc on data_len */
681                          -rxq->crc_len, /* sub crc on pkt_len */
682                          0,             /* ignore pkt_type field */
683                          /* 3rd descriptor */
684                          0,             /* ignore non-length fields */
685                          -rxq->crc_len, /* sub crc on data_len */
686                          -rxq->crc_len, /* sub crc on pkt_len */
687                          0,             /* ignore pkt_type field */
688                          /* 4th descriptor */
689                          0,             /* ignore non-length fields */
690                          -rxq->crc_len, /* sub crc on data_len */
691                          -rxq->crc_len, /* sub crc on pkt_len */
692                          0              /* ignore pkt_type field */
693                         );
694
695         /* 8 packets DD mask, LSB in each 32-bit value */
696         const __m256i dd_check = _mm256_set1_epi32(1);
697
698         /* 8 packets EOP mask, second-LSB in each 32-bit value */
699         const __m256i eop_check = _mm256_slli_epi32(dd_check,
700                         IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
701
702         /* mask to shuffle from desc. to mbuf (4 descriptors)*/
703         const __m512i shuf_msk =
704                 _mm512_set_epi32
705                         (/* 1st descriptor */
706                          0xFFFFFFFF,    /* rss hash parsed separately */
707                          0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
708                                         /* octet 4~5, 16 bits data_len */
709                          0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
710                                         /* octet 4~5, 16 bits pkt_len */
711                          0xFFFFFFFF,    /* pkt_type set as unknown */
712                          /* 2nd descriptor */
713                          0xFFFFFFFF,    /* rss hash parsed separately */
714                          0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
715                                         /* octet 4~5, 16 bits data_len */
716                          0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
717                                         /* octet 4~5, 16 bits pkt_len */
718                          0xFFFFFFFF,    /* pkt_type set as unknown */
719                          /* 3rd descriptor */
720                          0xFFFFFFFF,    /* rss hash parsed separately */
721                          0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
722                                         /* octet 4~5, 16 bits data_len */
723                          0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
724                                         /* octet 4~5, 16 bits pkt_len */
725                          0xFFFFFFFF,    /* pkt_type set as unknown */
726                          /* 4th descriptor */
727                          0xFFFFFFFF,    /* rss hash parsed separately */
728                          0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
729                                         /* octet 4~5, 16 bits data_len */
730                          0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
731                                         /* octet 4~5, 16 bits pkt_len */
732                          0xFFFFFFFF     /* pkt_type set as unknown */
733                         );
734         /**
735          * compile-time check the above crc and shuffle layout is correct.
736          * NOTE: the first field (lowest address) is given last in set_epi
737          * calls above.
738          */
739         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
740                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
741         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
742                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
743         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
744                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
745         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
746                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
747
748         /* Status/Error flag masks */
749         /**
750          * mask everything except Checksum Reports, RSS indication
751          * and VLAN indication.
752          * bit6:4 for IP/L4 checksum errors.
753          * bit12 is for RSS indication.
754          * bit13 is for VLAN indication.
755          */
756         const __m256i flags_mask =
757                 _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
758         /**
759          * data to be shuffled by the result of the flags mask shifted by 4
760          * bits.  This gives us the l3_l4 flags.
761          */
762         const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
763                         /* shift right 1 bit to make sure it does not exceed 255 */
764                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
765                          PKT_RX_IP_CKSUM_BAD) >> 1,
766                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
767                          PKT_RX_IP_CKSUM_GOOD) >> 1,
768                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
769                          PKT_RX_IP_CKSUM_BAD) >> 1,
770                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
771                          PKT_RX_IP_CKSUM_GOOD) >> 1,
772                         (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
773                         (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
774                         (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
775                         (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
776                         /* second 128-bits */
777                         0, 0, 0, 0, 0, 0, 0, 0,
778                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
779                          PKT_RX_IP_CKSUM_BAD) >> 1,
780                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
781                          PKT_RX_IP_CKSUM_GOOD) >> 1,
782                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
783                          PKT_RX_IP_CKSUM_BAD) >> 1,
784                         (PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
785                          PKT_RX_IP_CKSUM_GOOD) >> 1,
786                         (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
787                         (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
788                         (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
789                         (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
790         const __m256i cksum_mask =
791                 _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
792                                   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
793                                   PKT_RX_OUTER_IP_CKSUM_BAD);
794         /**
795          * data to be shuffled by result of flag mask, shifted down 12.
796          * If RSS(bit12)/VLAN(bit13) are set,
797          * shuffle moves appropriate flags in place.
798          */
799         const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
800                         0, 0, 0, 0,
801                         0, 0, 0, 0,
802                         PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
803                         PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
804                         PKT_RX_RSS_HASH, 0,
805                         /* end of low 128 bits */
806                         0, 0, 0, 0,
807                         0, 0, 0, 0,
808                         0, 0, 0, 0,
809                         PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
810                         PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
811                         PKT_RX_RSS_HASH, 0);
812
813         uint16_t i, received;
814
815         for (i = 0, received = 0; i < nb_pkts;
816              i += IAVF_DESCS_PER_LOOP_AVX,
817              rxdp += IAVF_DESCS_PER_LOOP_AVX) {
818                 /* step 1, copy over 8 mbuf pointers to rx_pkts array */
819                 _mm256_storeu_si256((void *)&rx_pkts[i],
820                                     _mm256_loadu_si256((void *)&sw_ring[i]));
821 #ifdef RTE_ARCH_X86_64
822                 _mm256_storeu_si256
823                         ((void *)&rx_pkts[i + 4],
824                          _mm256_loadu_si256((void *)&sw_ring[i + 4]));
825 #endif
826
827                 __m512i raw_desc0_3, raw_desc4_7;
828
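                /* As in the legacy path, descriptors are loaded from highest
                 * to lowest with compiler barriers, so a DD bit seen as set
                 * implies the lower-numbered descriptors are complete.
                 */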
829                 const __m128i raw_desc7 =
830                         _mm_load_si128((void *)(rxdp + 7));
831                 rte_compiler_barrier();
832                 const __m128i raw_desc6 =
833                         _mm_load_si128((void *)(rxdp + 6));
834                 rte_compiler_barrier();
835                 const __m128i raw_desc5 =
836                         _mm_load_si128((void *)(rxdp + 5));
837                 rte_compiler_barrier();
838                 const __m128i raw_desc4 =
839                         _mm_load_si128((void *)(rxdp + 4));
840                 rte_compiler_barrier();
841                 const __m128i raw_desc3 =
842                         _mm_load_si128((void *)(rxdp + 3));
843                 rte_compiler_barrier();
844                 const __m128i raw_desc2 =
845                         _mm_load_si128((void *)(rxdp + 2));
846                 rte_compiler_barrier();
847                 const __m128i raw_desc1 =
848                         _mm_load_si128((void *)(rxdp + 1));
849                 rte_compiler_barrier();
850                 const __m128i raw_desc0 =
851                         _mm_load_si128((void *)(rxdp + 0));
852
853                 raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
854                 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
855                 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
856                 raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
857                 raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
858                 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
859                 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
860                 raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
861
862                 if (split_packet) {
863                         int j;
864
865                         for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
866                                 rte_mbuf_prefetch_part2(rx_pkts[i + j]);
867                 }
868
869                 /**
870                  * convert descriptors 4-7 into mbufs, re-arrange fields.
871                  * Then write into the mbuf.
872                  */
873                 __m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
874
875                 mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
876                 /**
877                  * to get packet types, the ptype is located in bits 16-25
878                  * of each 128-bit descriptor
879                  */
880                 const __m512i ptype_mask =
881                         _mm512_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
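                /* The AND keeps the low 10 bits of every 16-bit word; the
                 * extracts below then read word 1 of each descriptor (elements
                 * 1 and 9 of each 256-bit half), which is where the ptype
                 * lives.
                 */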
882                 const __m512i ptypes4_7 =
883                         _mm512_and_si512(raw_desc4_7, ptype_mask);
884                 const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
885                 const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
886                 const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
887                 const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
888                 const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
889                 const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
890
891                 const __m512i ptype4_7 = _mm512_set_epi32
892                         (0, 0, 0, type_table[ptype7],
893                          0, 0, 0, type_table[ptype6],
894                          0, 0, 0, type_table[ptype5],
895                          0, 0, 0, type_table[ptype4]);
896                 mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
897
898                 /**
899                  * convert descriptors 0-3 into mbufs, re-arrange fields.
900                  * Then write into the mbuf.
901                  */
902                 __m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
903
904                 mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
905                 /**
906                  * to get packet types, the ptype is located in bits 16-25
907                  * of each 128-bit descriptor
908                  */
909                 const __m512i ptypes0_3 =
910                         _mm512_and_si512(raw_desc0_3, ptype_mask);
911                 const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
912                 const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
913                 const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
914                 const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
915                 const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
916                 const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
917
918                 const __m512i ptype0_3 = _mm512_set_epi32
919                         (0, 0, 0, type_table[ptype3],
920                          0, 0, 0, type_table[ptype2],
921                          0, 0, 0, type_table[ptype1],
922                          0, 0, 0, type_table[ptype0]);
923                 mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
924
925                 /**
926                  * use permute/extract to get status content
927                  * After the operations, the packets' status flags are in the
928                  * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
929                  */
930                 /* merge the status bits into one register */
931                 const __m512i status_permute_msk = _mm512_set_epi32
932                         (0, 0, 0, 0,
933                          0, 0, 0, 0,
934                          22, 30, 6, 14,
935                          18, 26, 2, 10);
936                 const __m512i raw_status0_7 = _mm512_permutex2var_epi32
937                         (raw_desc4_7, status_permute_msk, raw_desc0_3);
938                 __m256i status0_7 = _mm512_extracti64x4_epi64
939                         (raw_status0_7, 0);
940
941                 /* now do flag manipulation */
942
943                 /* get only flag/error bits we want */
944                 const __m256i flag_bits =
945                         _mm256_and_si256(status0_7, flags_mask);
946                 /**
947                  * l3_l4 error flags: shuffle, then shift left to undo the
948                  * adjustment made in l3_l4_flags_shuf, and finally mask out extra bits
949                  */
950                 __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
951                                 _mm256_srli_epi32(flag_bits, 4));
952                 l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
953                 l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
954                 /* set rss and vlan flags */
955                 const __m256i rss_vlan_flag_bits =
956                         _mm256_srli_epi32(flag_bits, 12);
957                 const __m256i rss_vlan_flags =
958                         _mm256_shuffle_epi8(rss_vlan_flags_shuf,
959                                             rss_vlan_flag_bits);
960
961                 /* merge flags */
962                 __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
963                                                      rss_vlan_flags);
964
965                 if (rxq->fdir_enabled) {
966                         const __m512i fdir_permute_mask = _mm512_set_epi32
967                                 (0, 0, 0, 0,
968                                  0, 0, 0, 0,
969                                  7, 15, 23, 31,
970                                  3, 11, 19, 27);
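                        /* dword 3 of each flex descriptor holds the FDIR flow
                         * ID; the permute gathers the eight IDs (indices 0-15
                         * from raw_desc0_3, 16-31 from raw_desc4_7) into the
                         * low 256 bits, in the same scrambled order as the
                         * status words, hence the extract indices used below.
                         */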
971                         __m512i fdir_tmp = _mm512_permutex2var_epi32
972                                 (raw_desc0_3, fdir_permute_mask, raw_desc4_7);
973                         const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
974                                 (fdir_tmp, 0);
975                         const __m256i fdir_flags =
976                                 flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
977
978                         /* merge with fdir_flags */
979                         mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
980
981                         /* write to mbuf: have to use scalar store here */
982                         rx_pkts[i + 0]->hash.fdir.hi =
983                                 _mm256_extract_epi32(fdir_id0_7, 3);
984
985                         rx_pkts[i + 1]->hash.fdir.hi =
986                                 _mm256_extract_epi32(fdir_id0_7, 7);
987
988                         rx_pkts[i + 2]->hash.fdir.hi =
989                                 _mm256_extract_epi32(fdir_id0_7, 2);
990
991                         rx_pkts[i + 3]->hash.fdir.hi =
992                                 _mm256_extract_epi32(fdir_id0_7, 6);
993
994                         rx_pkts[i + 4]->hash.fdir.hi =
995                                 _mm256_extract_epi32(fdir_id0_7, 1);
996
997                         rx_pkts[i + 5]->hash.fdir.hi =
998                                 _mm256_extract_epi32(fdir_id0_7, 5);
999
1000                         rx_pkts[i + 6]->hash.fdir.hi =
1001                                 _mm256_extract_epi32(fdir_id0_7, 0);
1002
1003                         rx_pkts[i + 7]->hash.fdir.hi =
1004                                 _mm256_extract_epi32(fdir_id0_7, 4);
1005                 } /* if() on fdir_enabled */
1006
1007                 __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
1008                 __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
1009                 __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
1010                 __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
1011
1012 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
1013                 /**
1014                  * the second 16B of each descriptor must be loaded for RSS
1015                  * hash parsing; entering this branch costs some performance.
1016                  */
1017                 if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
1018                     DEV_RX_OFFLOAD_RSS_HASH) {
1019                         /* load bottom half of every 32B desc */
1020                         const __m128i raw_desc_bh7 =
1021                                 _mm_load_si128
1022                                         ((void *)(&rxdp[7].wb.status_error1));
1023                         rte_compiler_barrier();
1024                         const __m128i raw_desc_bh6 =
1025                                 _mm_load_si128
1026                                         ((void *)(&rxdp[6].wb.status_error1));
1027                         rte_compiler_barrier();
1028                         const __m128i raw_desc_bh5 =
1029                                 _mm_load_si128
1030                                         ((void *)(&rxdp[5].wb.status_error1));
1031                         rte_compiler_barrier();
1032                         const __m128i raw_desc_bh4 =
1033                                 _mm_load_si128
1034                                         ((void *)(&rxdp[4].wb.status_error1));
1035                         rte_compiler_barrier();
1036                         const __m128i raw_desc_bh3 =
1037                                 _mm_load_si128
1038                                         ((void *)(&rxdp[3].wb.status_error1));
1039                         rte_compiler_barrier();
1040                         const __m128i raw_desc_bh2 =
1041                                 _mm_load_si128
1042                                         ((void *)(&rxdp[2].wb.status_error1));
1043                         rte_compiler_barrier();
1044                         const __m128i raw_desc_bh1 =
1045                                 _mm_load_si128
1046                                         ((void *)(&rxdp[1].wb.status_error1));
1047                         rte_compiler_barrier();
1048                         const __m128i raw_desc_bh0 =
1049                                 _mm_load_si128
1050                                         ((void *)(&rxdp[0].wb.status_error1));
1051
1052                         __m256i raw_desc_bh6_7 =
1053                                 _mm256_inserti128_si256
1054                                         (_mm256_castsi128_si256(raw_desc_bh6),
1055                                          raw_desc_bh7, 1);
1056                         __m256i raw_desc_bh4_5 =
1057                                 _mm256_inserti128_si256
1058                                         (_mm256_castsi128_si256(raw_desc_bh4),
1059                                          raw_desc_bh5, 1);
1060                         __m256i raw_desc_bh2_3 =
1061                                 _mm256_inserti128_si256
1062                                         (_mm256_castsi128_si256(raw_desc_bh2),
1063                                          raw_desc_bh3, 1);
1064                         __m256i raw_desc_bh0_1 =
1065                                 _mm256_inserti128_si256
1066                                         (_mm256_castsi128_si256(raw_desc_bh0),
1067                                          raw_desc_bh1, 1);
1068
1069                         /**
1070                          * Shift the 32-bit RSS hash value to the highest
1071                          * 32 bits of each 128-bit lane before masking.
1072                          */
1073                         __m256i rss_hash6_7 =
1074                                 _mm256_slli_epi64(raw_desc_bh6_7, 32);
1075                         __m256i rss_hash4_5 =
1076                                 _mm256_slli_epi64(raw_desc_bh4_5, 32);
1077                         __m256i rss_hash2_3 =
1078                                 _mm256_slli_epi64(raw_desc_bh2_3, 32);
1079                         __m256i rss_hash0_1 =
1080                                 _mm256_slli_epi64(raw_desc_bh0_1, 32);
1081
1082                         __m256i rss_hash_msk =
1083                                 _mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
1084                                                  0xFFFFFFFF, 0, 0, 0);
1085
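                            /* the mask keeps only the top dword of each 128-bit lane,
                             * where the shifted hash now sits; OR-ing it into the mb*
                             * vectors below lines it up with the mbuf RSS hash field
                             */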
1086                         rss_hash6_7 = _mm256_and_si256
1087                                         (rss_hash6_7, rss_hash_msk);
1088                         rss_hash4_5 = _mm256_and_si256
1089                                         (rss_hash4_5, rss_hash_msk);
1090                         rss_hash2_3 = _mm256_and_si256
1091                                         (rss_hash2_3, rss_hash_msk);
1092                         rss_hash0_1 = _mm256_and_si256
1093                                         (rss_hash0_1, rss_hash_msk);
1094
1095                         mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
1096                         mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
1097                         mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
1098                         mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
1099                 } /* if() on RSS hash parsing */
1100 #endif
1101
1102                 /**
1103                  * At this point, we have the 8 sets of flags in the low 16 bits
1104                  * of each 32-bit value in mbuf_flags.
1105                  * We want to extract these, and merge them with the mbuf init
1106                  * data so we can do a single write to the mbuf to set the flags
1107                  * and all the other initialization fields. Extracting the
1108                  * appropriate flags means that we have to do a shift and blend
1109                  * for each mbuf before we do the write. However, we can also
1110                  * add in the previously computed rx_descriptor fields to
1111                  * make a single 256-bit write per mbuf.
1112                  */
1113                 /* check the structure matches expectations */
1114                 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
1115                                  offsetof(struct rte_mbuf, rearm_data) + 8);
1116                 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
1117                                  RTE_ALIGN(offsetof(struct rte_mbuf,
1118                                                     rearm_data),
1119                                                     16));
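                    /* rough sketch of the single 32B store done per mbuf below
                     * (informal, derived from the rte_mbuf layout checks above):
                     * 8B rearm_data, 8B ol_flags, then the first 16B of the
                     * rx_descriptor_fields1 area (ptype, pkt_len, data_len,
                     * vlan_tci, hash) taken from the mb* vectors
                     */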
1120                 /* build up data and do writes */
1121                 __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
1122                         rearm6, rearm7;
1123                 rearm6 = _mm256_blend_epi32(mbuf_init,
1124                                             _mm256_slli_si256(mbuf_flags, 8),
1125                                             0x04);
1126                 rearm4 = _mm256_blend_epi32(mbuf_init,
1127                                             _mm256_slli_si256(mbuf_flags, 4),
1128                                             0x04);
1129                 rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
1130                 rearm0 = _mm256_blend_epi32(mbuf_init,
1131                                             _mm256_srli_si256(mbuf_flags, 4),
1132                                             0x04);
1133                 /* permute to add in the rx_descriptor e.g. rss fields */
1134                 rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
1135                 rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
1136                 rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
1137                 rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
1138                 /* write to mbuf */
1139                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
1140                                     rearm6);
1141                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
1142                                     rearm4);
1143                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
1144                                     rearm2);
1145                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
1146                                     rearm0);
1147
1148                 /* repeat for the odd mbufs */
1149                 const __m256i odd_flags =
1150                         _mm256_castsi128_si256
1151                                 (_mm256_extracti128_si256(mbuf_flags, 1));
1152                 rearm7 = _mm256_blend_epi32(mbuf_init,
1153                                             _mm256_slli_si256(odd_flags, 8),
1154                                             0x04);
1155                 rearm5 = _mm256_blend_epi32(mbuf_init,
1156                                             _mm256_slli_si256(odd_flags, 4),
1157                                             0x04);
1158                 rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
1159                 rearm1 = _mm256_blend_epi32(mbuf_init,
1160                                             _mm256_srli_si256(odd_flags, 4),
1161                                             0x04);
1162                 /* since odd mbufs are already in hi 128-bits use blend */
1163                 rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
1164                 rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
1165                 rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
1166                 rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
1167                 /* again write to mbufs */
1168                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
1169                                     rearm7);
1170                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
1171                                     rearm5);
1172                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
1173                                     rearm3);
1174                 _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
1175                                     rearm1);
1176
1177                 /* extract and record EOP bit */
1178                 if (split_packet) {
1179                         const __m128i eop_mask =
1180                                 _mm_set1_epi16(1 <<
1181                                                IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
1182                         const __m256i eop_bits256 = _mm256_and_si256(status0_7,
1183                                                                      eop_check);
1184                         /* pack status bits into a single 128-bit register */
1185                         const __m128i eop_bits =
1186                                 _mm_packus_epi32
1187                                         (_mm256_castsi256_si128(eop_bits256),
1188                                          _mm256_extractf128_si256(eop_bits256,
1189                                                                   1));
1190                         /**
1191                          * Flip the bits and mask with the EOP bit, so each bit
1192                          * becomes a split-packet flag, i.e. !EOP rather than EOP.
1193                          */
1194                         __m128i split_bits = _mm_andnot_si128(eop_bits,
1195                                                               eop_mask);
1196                         /**
1197                          * The EOP bits are out of order, so shuffle them back
1198                          * into order. In doing so, keep only the low 8 bits of
1199                          * each word, which acts like another pack instruction.
1200                          * The original order is (hi->lo): 1,3,5,7,0,2,4,6.
1201                          * [Since we use epi8, the 16-bit positions are
1202                          * multiplied by 2 in the eop_shuffle value.]
1203                          */
1204                         __m128i eop_shuffle =
1205                                 _mm_set_epi8(/* zero hi 64b */
1206                                              0xFF, 0xFF, 0xFF, 0xFF,
1207                                              0xFF, 0xFF, 0xFF, 0xFF,
1208                                              /* move values to lo 64b */
1209                                              8, 0, 10, 2,
1210                                              12, 4, 14, 6);
1211                         split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
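                            /* byte i of split_bits now holds the split (!EOP) flag of
                             * packet i, so the 64-bit store below records all 8 flags
                             * in packet order
                             */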
1212                         *(uint64_t *)split_packet =
1213                                 _mm_cvtsi128_si64(split_bits);
1214                         split_packet += IAVF_DESCS_PER_LOOP_AVX;
1215                 }
1216
1217                 /* perform dd_check */
1218                 status0_7 = _mm256_and_si256(status0_7, dd_check);
1219                 status0_7 = _mm256_packs_epi32(status0_7,
1220                                                _mm256_setzero_si256());
1221
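                    /* count how many of the 8 descriptors in this group have their
                     * DD bit set; anything less than a full group means no more
                     * completed descriptors are available, so the loop stops below
                     */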
1222                 uint64_t burst = __builtin_popcountll
1223                                         (_mm_cvtsi128_si64
1224                                                 (_mm256_extracti128_si256
1225                                                         (status0_7, 1)));
1226                 burst += __builtin_popcountll
1227                                 (_mm_cvtsi128_si64
1228                                         (_mm256_castsi256_si128(status0_7)));
1229                 received += burst;
1230                 if (burst != IAVF_DESCS_PER_LOOP_AVX)
1231                         break;
1232         }
1233
1234         /* update tail pointers */
1235         rxq->rx_tail += received;
1236         rxq->rx_tail &= (rxq->nb_rx_desc - 1);
1237         if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
1238                 rxq->rx_tail--;
1239                 received--;
1240         }
1241         rxq->rxrearm_nb += received;
1242         return received;
1243 }
1244
1245 /**
1246  * Notice:
1247  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1248  */
1249 uint16_t
1250 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1251                           uint16_t nb_pkts)
1252 {
1253         return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
1254 }
1255
1256 /**
1257  * Notice:
1258  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1259  */
1260 uint16_t
1261 iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
1262                                    uint16_t nb_pkts)
1263 {
1264         return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts,
1265                                                        nb_pkts, NULL);
1266 }
1267
1268 /**
1269  * vPMD receive routine that reassembles a single burst of 32 scattered packets
1270  * Notice:
1271  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1272  */
1273 static uint16_t
1274 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1275                                      uint16_t nb_pkts)
1276 {
1277         struct iavf_rx_queue *rxq = rx_queue;
1278         uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1279
1280         /* get some new buffers */
1281         uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
1282                                                           split_flags);
1283         if (nb_bufs == 0)
1284                 return 0;
1285
1286         /* happy day case, full burst + no packets to be joined */
1287         const uint64_t *split_fl64 = (uint64_t *)split_flags;
1288
1289         if (!rxq->pkt_first_seg &&
1290             split_fl64[0] == 0 && split_fl64[1] == 0 &&
1291             split_fl64[2] == 0 && split_fl64[3] == 0)
1292                 return nb_bufs;
1293
1294         /* reassemble any packets that need reassembly */
1295         unsigned int i = 0;
1296
1297         if (!rxq->pkt_first_seg) {
1298                 /* find the first split flag, and only reassemble from there */
1299                 while (i < nb_bufs && !split_flags[i])
1300                         i++;
1301                 if (i == nb_bufs)
1302                         return nb_bufs;
1303                 rxq->pkt_first_seg = rx_pkts[i];
1304         }
1305         return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
1306                                       &split_flags[i]);
1307 }
1308
1309 /**
1310  * vPMD receive routine that reassembles scattered packets.
1311  * Main receive routine that can handle arbitrary burst sizes
1312  * Notice:
1313  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1314  */
1315 uint16_t
1316 iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1317                                     uint16_t nb_pkts)
1318 {
1319         uint16_t retval = 0;
1320
1321         while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1322                 uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
1323                                 rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
1324                 retval += burst;
1325                 nb_pkts -= burst;
1326                 if (burst < IAVF_VPMD_RX_MAX_BURST)
1327                         return retval;
1328         }
1329         return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
1330                                 rx_pkts + retval, nb_pkts);
1331 }
1332
1333 /**
1334  * vPMD receive routine that reassembles a single burst of
1335  * 32 scattered packets for flex RxD
1336  * Notice:
1337  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1338  */
1339 static uint16_t
1340 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue,
1341                                               struct rte_mbuf **rx_pkts,
1342                                               uint16_t nb_pkts)
1343 {
1344         struct iavf_rx_queue *rxq = rx_queue;
1345         uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1346
1347         /* get some new buffers */
1348         uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq,
1349                                         rx_pkts, nb_pkts, split_flags);
1350         if (nb_bufs == 0)
1351                 return 0;
1352
1353         /* happy day case, full burst + no packets to be joined */
1354         const uint64_t *split_fl64 = (uint64_t *)split_flags;
1355
1356         if (!rxq->pkt_first_seg &&
1357             split_fl64[0] == 0 && split_fl64[1] == 0 &&
1358             split_fl64[2] == 0 && split_fl64[3] == 0)
1359                 return nb_bufs;
1360
1361         /* reassemble any packets that need reassembly */
1362         unsigned int i = 0;
1363
1364         if (!rxq->pkt_first_seg) {
1365                 /* find the first split flag, and only reassemble from there */
1366                 while (i < nb_bufs && !split_flags[i])
1367                         i++;
1368                 if (i == nb_bufs)
1369                         return nb_bufs;
1370                 rxq->pkt_first_seg = rx_pkts[i];
1371         }
1372         return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
1373                                       &split_flags[i]);
1374 }
1375
1376 /**
1377  * vPMD receive routine that reassembles scattered packets for flex RxD.
1378  * Main receive routine that can handle arbitrary burst sizes
1379  * Notice:
1380  * - if nb_pkts < IAVF_DESCS_PER_LOOP, no packets are returned
1381  */
1382 uint16_t
1383 iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
1384                                              struct rte_mbuf **rx_pkts,
1385                                              uint16_t nb_pkts)
1386 {
1387         uint16_t retval = 0;
1388
1389         while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1390                 uint16_t burst =
1391                         iavf_recv_scattered_burst_vec_avx512_flex_rxd
1392                                 (rx_queue, rx_pkts + retval,
1393                                  IAVF_VPMD_RX_MAX_BURST);
1394                 retval += burst;
1395                 nb_pkts -= burst;
1396                 if (burst < IAVF_VPMD_RX_MAX_BURST)
1397                         return retval;
1398         }
1399         return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue,
1400                                 rx_pkts + retval, nb_pkts);
1401 }
1402
1403 static __rte_always_inline int
1404 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
1405 {
1406         struct iavf_tx_vec_entry *txep;
1407         uint32_t n;
1408         uint32_t i;
1409         int nb_free = 0;
1410         struct rte_mbuf *m, *free[IAVF_VPMD_TX_MAX_FREE_BUF];
1411
1412         /* check DD bits on threshold descriptor */
1413         if ((txq->tx_ring[txq->next_dd].cmd_type_offset_bsz &
1414              rte_cpu_to_le_64(IAVF_TXD_QW1_DTYPE_MASK)) !=
1415             rte_cpu_to_le_64(IAVF_TX_DESC_DTYPE_DESC_DONE))
1416                 return 0;
1417
1418         n = txq->rs_thresh;
1419
1420          /* first buffer to free from S/W ring is at index
1421           * next_dd - (rs_thresh - 1)
1422           */
1423         txep = (void *)txq->sw_ring;
1424         txep += txq->next_dd - (n - 1);
1425
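            /* DEV_TX_OFFLOAD_MBUF_FAST_FREE guarantees that all mbufs come from the
             * same mempool and are not referenced elsewhere, so they can be returned
             * in bulk without going through rte_pktmbuf_prefree_seg()
             */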
1426         if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
1427                 struct rte_mempool *mp = txep[0].mbuf->pool;
1428                 struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1429                                                                 rte_lcore_id());
1430                 void **cache_objs;
1431
1432                 if (!cache || cache->len == 0)
1433                         goto normal;
1434
1435                 cache_objs = &cache->objs[cache->len];
1436
1437                 if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1438                         rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1439                         goto done;
1440                 }
1441
1442                 /* The cache works as follows:
1443                  *   1. Add the objects to the cache.
1444                  *   2. If the cache length crosses the flush threshold, everything
1445                  *   above the cache size is flushed back to the mempool ring.
1446                  */
1447                 /* Add elements back into the cache */
1448                 uint32_t copied = 0;
1449                 /* n is multiple of 32 */
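                    /* each 512-bit load/store moves 8 pointer-sized entries (64 bytes),
                     * so one iteration copies 32 software-ring entries into the cache
                     */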
1450                 while (copied < n) {
1451                         const __m512i a = _mm512_loadu_si512(&txep[copied]);
1452                         const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
1453                         const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
1454                         const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);
1455
1456                         _mm512_storeu_si512(&cache_objs[copied], a);
1457                         _mm512_storeu_si512(&cache_objs[copied + 8], b);
1458                         _mm512_storeu_si512(&cache_objs[copied + 16], c);
1459                         _mm512_storeu_si512(&cache_objs[copied + 24], d);
1460                         copied += 32;
1461                 }
1462                 cache->len += n;
1463
1464                 if (cache->len >= cache->flushthresh) {
1465                         rte_mempool_ops_enqueue_bulk(mp,
1466                                                      &cache->objs[cache->size],
1467                                                      cache->len - cache->size);
1468                         cache->len = cache->size;
1469                 }
1470                 goto done;
1471         }
1472
1473 normal:
1474         m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
1475         if (likely(m)) {
1476                 free[0] = m;
1477                 nb_free = 1;
1478                 for (i = 1; i < n; i++) {
1479                         m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1480                         if (likely(m)) {
1481                                 if (likely(m->pool == free[0]->pool)) {
1482                                         free[nb_free++] = m;
1483                                 } else {
1484                                         rte_mempool_put_bulk(free[0]->pool,
1485                                                              (void *)free,
1486                                                              nb_free);
1487                                         free[0] = m;
1488                                         nb_free = 1;
1489                                 }
1490                         }
1491                 }
1492                 rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
1493         } else {
1494                 for (i = 1; i < n; i++) {
1495                         m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
1496                         if (m)
1497                                 rte_mempool_put(m->pool, m);
1498                 }
1499         }
1500
1501 done:
1502         /* buffers were freed, update counters */
1503         txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
1504         txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
1505         if (txq->next_dd >= txq->nb_tx_desc)
1506                 txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
1507
1508         return txq->rs_thresh;
1509 }
1510
1511 static __rte_always_inline void
1512 tx_backlog_entry_avx512(struct iavf_tx_vec_entry *txep,
1513                         struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1514 {
1515         int i;
1516
1517         for (i = 0; i < (int)nb_pkts; ++i)
1518                 txep[i].mbuf = tx_pkts[i];
1519 }
1520
1521 static __rte_always_inline void
1522 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
1523           struct rte_mbuf *pkt, uint64_t flags, bool offload)
1524 {
1525         uint64_t high_qw =
1526                 (IAVF_TX_DESC_DTYPE_DATA |
1527                  ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
1528                  ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
1529         if (offload)
1530                 iavf_txd_enable_offload(pkt, &high_qw);
1531
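            /* data descriptor layout: the low quadword carries the buffer DMA address,
             * the high quadword carries dtype, command flags, offload fields and the
             * buffer size
             */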
1532         __m128i descriptor = _mm_set_epi64x(high_qw,
1533                                             pkt->buf_iova + pkt->data_off);
1534         _mm_storeu_si128((__m128i *)txdp, descriptor);
1535 }
1536
1537 #define IAVF_TX_LEN_MASK 0xAA
1538 #define IAVF_TX_OFF_MASK 0x55
1539 static __rte_always_inline void
1540 iavf_vtx(volatile struct iavf_tx_desc *txdp,
1541          struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
1542          bool offload)
1543 {
1544         const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
1545                         ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
1546
1547         /* if unaligned on a 32-byte boundary, do one descriptor to align */
1548         if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
1549                 iavf_vtx1(txdp, *pkt, flags, offload);
1550                 nb_pkts--, txdp++, pkt++;
1551         }
1552
1553         /* do 4 at a time while possible, in bursts */
1554         for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
1555                 uint64_t hi_qw3 =
1556                         hi_qw_tmpl |
1557                         ((uint64_t)pkt[3]->data_len <<
1558                          IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1559                 if (offload)
1560                         iavf_txd_enable_offload(pkt[3], &hi_qw3);
1561                 uint64_t hi_qw2 =
1562                         hi_qw_tmpl |
1563                         ((uint64_t)pkt[2]->data_len <<
1564                          IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1565                 if (offload)
1566                         iavf_txd_enable_offload(pkt[2], &hi_qw2);
1567                 uint64_t hi_qw1 =
1568                         hi_qw_tmpl |
1569                         ((uint64_t)pkt[1]->data_len <<
1570                          IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1571                 if (offload)
1572                         iavf_txd_enable_offload(pkt[1], &hi_qw1);
1573                 uint64_t hi_qw0 =
1574                         hi_qw_tmpl |
1575                         ((uint64_t)pkt[0]->data_len <<
1576                          IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
1577                 if (offload)
1578                         iavf_txd_enable_offload(pkt[0], &hi_qw0);
1579
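                    /* pack the four (address, qword1) pairs into a single zmm register;
                     * pkt[0] lands in the lowest 128 bits, matching ascending descriptor
                     * order in the ring
                     */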
1580                 __m512i desc0_3 =
1581                         _mm512_set_epi64
1582                                 (hi_qw3,
1583                                  pkt[3]->buf_iova + pkt[3]->data_off,
1584                                  hi_qw2,
1585                                  pkt[2]->buf_iova + pkt[2]->data_off,
1586                                  hi_qw1,
1587                                  pkt[1]->buf_iova + pkt[1]->data_off,
1588                                  hi_qw0,
1589                                  pkt[0]->buf_iova + pkt[0]->data_off);
1590                 _mm512_storeu_si512((void *)txdp, desc0_3);
1591         }
1592
1593         /* do any last ones */
1594         while (nb_pkts) {
1595                 iavf_vtx1(txdp, *pkt, flags, offload);
1596                 txdp++, pkt++, nb_pkts--;
1597         }
1598 }
1599
1600 static __rte_always_inline uint16_t
1601 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1602                                  uint16_t nb_pkts, bool offload)
1603 {
1604         struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
1605         volatile struct iavf_tx_desc *txdp;
1606         struct iavf_tx_vec_entry *txep;
1607         uint16_t n, nb_commit, tx_id;
1608         /* bit2 is reserved and must be set to 1 according to the spec */
1609         uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
1610         uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;
1611
1612         /* crossing the rs_thresh boundary is not allowed */
1613         nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);
1614
1615         if (txq->nb_free < txq->free_thresh)
1616                 iavf_tx_free_bufs_avx512(txq);
1617
1618         nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
1619         if (unlikely(nb_pkts == 0))
1620                 return 0;
1621
1622         tx_id = txq->tx_tail;
1623         txdp = &txq->tx_ring[tx_id];
1624         txep = (void *)txq->sw_ring;
1625         txep += tx_id;
1626
1627         txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);
1628
1629         n = (uint16_t)(txq->nb_tx_desc - tx_id);
1630         if (nb_commit >= n) {
1631                 tx_backlog_entry_avx512(txep, tx_pkts, n);
1632
1633                 iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
1634                 tx_pkts += (n - 1);
1635                 txdp += (n - 1);
1636
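                    /* the last descriptor before the ring wrap carries the RS bit so
                     * that hardware reports completion for this stretch of descriptors
                     */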
1637                 iavf_vtx1(txdp, *tx_pkts++, rs, offload);
1638
1639                 nb_commit = (uint16_t)(nb_commit - n);
1640
1641                 tx_id = 0;
1642                 txq->next_rs = (uint16_t)(txq->rs_thresh - 1);
1643
1644                 /* wrap to the start of the ring to avoid running past its end */
1645                 txdp = &txq->tx_ring[tx_id];
1646                 txep = (void *)txq->sw_ring;
1647                 txep += tx_id;
1648         }
1649
1650         tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
1651
1652         iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
1653
1654         tx_id = (uint16_t)(tx_id + nb_commit);
1655         if (tx_id > txq->next_rs) {
1656                 txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
1657                         rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
1658                                          IAVF_TXD_QW1_CMD_SHIFT);
1659                 txq->next_rs =
1660                         (uint16_t)(txq->next_rs + txq->rs_thresh);
1661         }
1662
1663         txq->tx_tail = tx_id;
1664
1665         IAVF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
1666
1667         return nb_pkts;
1668 }
1669
1670 static __rte_always_inline uint16_t
1671 iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
1672                               uint16_t nb_pkts, bool offload)
1673 {
1674         uint16_t nb_tx = 0;
1675         struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
1676
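            /* hand packets to the fixed-burst routine in chunks of at most rs_thresh;
             * a short return means the ring had no room left, so stop early
             */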
1677         while (nb_pkts) {
1678                 uint16_t ret, num;
1679
1680                 num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
1681                 ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
1682                                                        num, offload);
1683                 nb_tx += ret;
1684                 nb_pkts -= ret;
1685                 if (ret < num)
1686                         break;
1687         }
1688
1689         return nb_tx;
1690 }
1691
1692 uint16_t
1693 iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1694                           uint16_t nb_pkts)
1695 {
1696         return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
1697 }
1698
1699 static inline void
1700 iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
1701 {
1702         unsigned int i;
1703         const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
1704         struct iavf_tx_vec_entry *swr = (void *)txq->sw_ring;
1705
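            /* nothing to release if the software ring was never allocated or if every
             * descriptor is already free (no mbufs left in flight)
             */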
1706         if (!txq->sw_ring || txq->nb_free == max_desc)
1707                 return;
1708
1709         i = txq->next_dd - txq->rs_thresh + 1;
1710         if (txq->tx_tail < i) {
1711                 for (; i < txq->nb_tx_desc; i++) {
1712                         rte_pktmbuf_free_seg(swr[i].mbuf);
1713                         swr[i].mbuf = NULL;
1714                 }
1715                 i = 0;
1716         }
1717 }
1718
1719 static const struct iavf_txq_ops avx512_vec_txq_ops = {
1720         .release_mbufs = iavf_tx_queue_release_mbufs_avx512,
1721 };
1722
1723 int __rte_cold
1724 iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq)
1725 {
1726         txq->ops = &avx512_vec_txq_ops;
1727         return 0;
1728 }
1729
1730 uint16_t
1731 iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
1732                                   uint16_t nb_pkts)
1733 {
1734         return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
1735 }