1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2019 Intel Corporation
3  */
4
5 #include "ice_rxtx_vec_common.h"
6
7 #include <tmmintrin.h>
8
9 #ifndef __INTEL_COMPILER
10 #pragma GCC diagnostic ignored "-Wcast-qual"
11 #endif
12
13 static inline __m128i
14 ice_flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3)
15 {
16 #define FDID_MIS_MAGIC 0xFFFFFFFF
17         RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
18         RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
19         const __m128i pkt_fdir_bit = _mm_set1_epi32(PKT_RX_FDIR |
20                         PKT_RX_FDIR_ID);
21         /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
22         const __m128i fdir_mis_mask = _mm_set1_epi32(FDID_MIS_MAGIC);
23         __m128i fdir_mask = _mm_cmpeq_epi32(fdir_id0_3,
24                         fdir_mis_mask);
25         /* this XOR op inverts the fdir_mask */
26         fdir_mask = _mm_xor_si128(fdir_mask, fdir_mis_mask);
27         const __m128i fdir_flags = _mm_and_si128(fdir_mask, pkt_fdir_bit);
28
29         return fdir_flags;
30 }
31
32 static inline void
33 ice_rxq_rearm(struct ice_rx_queue *rxq)
34 {
35         int i;
36         uint16_t rx_id;
37         volatile union ice_rx_flex_desc *rxdp;
38         struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
39         struct rte_mbuf *mb0, *mb1;
40         __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
41                                           RTE_PKTMBUF_HEADROOM);
42         __m128i dma_addr0, dma_addr1;
43
44         rxdp = rxq->rx_ring + rxq->rxrearm_start;
45
46         /* Pull 'n' more MBUFs into the software ring */
47         if (rte_mempool_get_bulk(rxq->mp,
48                                  (void *)rxep,
49                                  ICE_RXQ_REARM_THRESH) < 0) {
50                 if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
51                     rxq->nb_rx_desc) {
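                        /* Allocation failed and nearly the whole ring is
                         * un-rearmed: point the next few sw_ring entries at
                         * the static fake_mbuf and zero their descriptors so
                         * the Rx routine never dereferences a stale pointer.
                         */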
52                         dma_addr0 = _mm_setzero_si128();
53                         for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
54                                 rxep[i].mbuf = &rxq->fake_mbuf;
55                                 _mm_store_si128((__m128i *)&rxdp[i].read,
56                                                 dma_addr0);
57                         }
58                 }
59                 rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
60                         ICE_RXQ_REARM_THRESH;
61                 return;
62         }
63
64         /* Initialize the mbufs in vector, process 2 mbufs in one loop */
65         for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
66                 __m128i vaddr0, vaddr1;
67
68                 mb0 = rxep[0].mbuf;
69                 mb1 = rxep[1].mbuf;
70
71                 /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
72                 RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
73                                  offsetof(struct rte_mbuf, buf_addr) + 8);
74                 vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
75                 vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
76
77                 /* convert pa to dma_addr hdr/data */
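                /* unpackhi duplicates buf_iova into both 64-bit lanes, so the
                 * same DMA address is written to both descriptor address
                 * fields (packet and header buffer) by the stores below.
                 */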
78                 dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
79                 dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
80
81                 /* add headroom to pa values */
82                 dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
83                 dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
84
85                 /* flush desc with pa dma_addr */
86                 _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
87                 _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
88         }
89
90         rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
91         if (rxq->rxrearm_start >= rxq->nb_rx_desc)
92                 rxq->rxrearm_start = 0;
93
94         rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;
95
96         rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
97                            (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
98
99         /* Update the tail pointer on the NIC */
100         ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
101 }
102
103 static inline void
104 ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4],
105                          struct rte_mbuf **rx_pkts)
106 {
107         const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
108         __m128i rearm0, rearm1, rearm2, rearm3;
109
110         __m128i tmp_desc, flags, rss_vlan;
111
112         /* mask everything except checksum, RSS and VLAN flags.
113          * bit 7:4 for checksum.
114          * bit12 for RSS indication.
115          * bit13 for VLAN indication.
116          */
117         const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0,
118                                                 0x30f0, 0x30f0);
119         const __m128i cksum_mask = _mm_set_epi32(PKT_RX_IP_CKSUM_MASK |
120                                                  PKT_RX_L4_CKSUM_MASK |
121                                                  PKT_RX_OUTER_L4_CKSUM_MASK |
122                                                  PKT_RX_OUTER_IP_CKSUM_BAD,
123                                                  PKT_RX_IP_CKSUM_MASK |
124                                                  PKT_RX_L4_CKSUM_MASK |
125                                                  PKT_RX_OUTER_L4_CKSUM_MASK |
126                                                  PKT_RX_OUTER_IP_CKSUM_BAD,
127                                                  PKT_RX_IP_CKSUM_MASK |
128                                                  PKT_RX_L4_CKSUM_MASK |
129                                                  PKT_RX_OUTER_L4_CKSUM_MASK |
130                                                  PKT_RX_OUTER_IP_CKSUM_BAD,
131                                                  PKT_RX_IP_CKSUM_MASK |
132                                                  PKT_RX_L4_CKSUM_MASK |
133                                                  PKT_RX_OUTER_L4_CKSUM_MASK |
134                                                  PKT_RX_OUTER_IP_CKSUM_BAD);
135
136         /* map the checksum, rss and vlan fields to the checksum, rss
137          * and vlan flag
138          */
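        /* 16-entry PSHUFB lookup: after the >>4 shift below, bits 3:0 of each
         * status word are the IP/L4/outer-IP/outer-L4 checksum error bits and
         * form the index. Entries are pre-shifted right by 1 so they fit in a
         * byte (undone by the <<1 later); the outer-L4 flags are pre-shifted
         * right by 20 and restored by the <<20 step further down.
         */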
139         const __m128i cksum_flags =
140                 _mm_set_epi8((PKT_RX_OUTER_L4_CKSUM_BAD >> 20 |
141                  PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
142                   PKT_RX_IP_CKSUM_BAD) >> 1,
143                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
144                  PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
145                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
146                  PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
147                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
148                  PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
149                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_BAD |
150                  PKT_RX_IP_CKSUM_BAD) >> 1,
151                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_BAD |
152                  PKT_RX_IP_CKSUM_GOOD) >> 1,
153                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_GOOD |
154                  PKT_RX_IP_CKSUM_BAD) >> 1,
155                 (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_GOOD |
156                  PKT_RX_IP_CKSUM_GOOD) >> 1,
157                 /**
158                  * shift right 20 bits to use the low two bits to indicate
159                  * outer checksum status
160                  * shift right 1 bit to make sure it not exceed 255
161                  */
162                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
163                  PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
164                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
165                  PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
166                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
167                  PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
168                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD |
169                  PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
170                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_BAD |
171                  PKT_RX_IP_CKSUM_BAD) >> 1,
172                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_BAD |
173                  PKT_RX_IP_CKSUM_GOOD) >> 1,
174                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_GOOD |
175                  PKT_RX_IP_CKSUM_BAD) >> 1,
176                 (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_GOOD |
177                  PKT_RX_IP_CKSUM_GOOD) >> 1);
178
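        /* 4-entry PSHUFB lookup for the RSS and VLAN flags: once tmp_desc has
         * been shifted right by a further 8 bits below, bit 0 of each status
         * byte is the RSS-valid bit and bit 1 is the VLAN (L2TAG1) bit.
         */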
179         const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
180                         0, 0, 0, 0,
181                         0, 0, 0, 0,
182                         PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
183                         PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
184                         PKT_RX_RSS_HASH, 0);
185
186         /* merge 4 descriptors: one status dword per 32-bit lane */
187         flags = _mm_unpackhi_epi32(descs[0], descs[1]);
188         tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
189         tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
190         tmp_desc = _mm_and_si128(tmp_desc, desc_mask);
191
192         /* checksum flags */
193         tmp_desc = _mm_srli_epi32(tmp_desc, 4);
194         flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
195         /* then we shift left 1 bit */
196         flags = _mm_slli_epi32(flags, 1);
197
198         __m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
199         __m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask);
200         l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);
201
202         __m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6);
203         __m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask);
204         flags = _mm_or_si128(l3_l4_flags, l4_outer_flags);
205         /* we need to mask out the redundant bits introduced by RSS or
206          * VLAN fields.
207          */
208         flags = _mm_and_si128(flags, cksum_mask);
209
210         /* RSS, VLAN flag */
211         tmp_desc = _mm_srli_epi32(tmp_desc, 8);
212         rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);
213
214         /* merge the flags */
215         flags = _mm_or_si128(flags, rss_vlan);
216
217         if (rxq->fdir_enabled) {
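                /* dword 3 of each descriptor holds the 32-bit flow_id
                 * (FDIR filter ID); gather one per 32-bit lane.
                 */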
218                 const __m128i fdir_id0_1 =
219                         _mm_unpackhi_epi32(descs[0], descs[1]);
220
221                 const __m128i fdir_id2_3 =
222                         _mm_unpackhi_epi32(descs[2], descs[3]);
223
224                 const __m128i fdir_id0_3 =
225                         _mm_unpackhi_epi64(fdir_id0_1, fdir_id2_3);
226
227                 const __m128i fdir_flags =
228                         ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);
229
230                 /* merge with fdir_flags */
231                 flags = _mm_or_si128(flags, fdir_flags);
232
233                 /* write fdir_id to mbuf */
234                 rx_pkts[0]->hash.fdir.hi =
235                         _mm_extract_epi32(fdir_id0_3, 0);
236
237                 rx_pkts[1]->hash.fdir.hi =
238                         _mm_extract_epi32(fdir_id0_3, 1);
239
240                 rx_pkts[2]->hash.fdir.hi =
241                         _mm_extract_epi32(fdir_id0_3, 2);
242
243                 rx_pkts[3]->hash.fdir.hi =
244                         _mm_extract_epi32(fdir_id0_3, 3);
245         } /* if() on fdir_enabled */
246
247         /**
248          * At this point, we have the 4 sets of flags in the low 16-bits
249          * of each 32-bit value in flags.
250          * We want to extract these, and merge them with the mbuf init data
251          * so we can do a single 16-byte write to the mbuf to set the flags
252          * and all the other initialization fields. Extracting the
253          * appropriate flags means that we have to do a shift and blend for
254          * each mbuf before we do the write.
255          */
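        /* blend mask 0x30 selects 16-bit lanes 4-5 (bytes 8-11), i.e. the low
         * 32 bits of ol_flags relative to rearm_data; the byte shifts first
         * move each packet's 32-bit flag word into that position.
         */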
256         rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30);
257         rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30);
258         rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30);
259         rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30);
260
261         /* write the rearm data and the olflags in one write */
262         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
263                          offsetof(struct rte_mbuf, rearm_data) + 8);
264         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
265                          RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
266         _mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
267         _mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
268         _mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
269         _mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
270 }
271
272 static inline void
273 ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
274                        uint32_t *ptype_tbl)
275 {
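        /* The PTYPE field sits in the upper 16 bits of the first dword of
         * each descriptor: gather those dwords, mask to
         * ICE_RX_FLEX_DESC_PTYPE_M and use the result to index the adapter's
         * ptype translation table.
         */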
276         const __m128i ptype_mask = _mm_set_epi16(ICE_RX_FLEX_DESC_PTYPE_M, 0,
277                                                  ICE_RX_FLEX_DESC_PTYPE_M, 0,
278                                                  ICE_RX_FLEX_DESC_PTYPE_M, 0,
279                                                  ICE_RX_FLEX_DESC_PTYPE_M, 0);
280         __m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
281         __m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
282         __m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);
283
284         ptype_all = _mm_and_si128(ptype_all, ptype_mask);
285
286         rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 1)];
287         rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 3)];
288         rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 5)];
289         rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 7)];
290 }
291
292 /**
293  * vPMD raw receive routine, only accepts nb_pkts >= ICE_DESCS_PER_LOOP
294  *
295  * Notice:
296  * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
297  * - nb_pkts is floor-aligned to a multiple of ICE_DESCS_PER_LOOP
298  */
299 static inline uint16_t
300 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
301                        uint16_t nb_pkts, uint8_t *split_packet)
302 {
303         volatile union ice_rx_flex_desc *rxdp;
304         struct ice_rx_entry *sw_ring;
305         uint16_t nb_pkts_recd;
306         int pos;
307         uint64_t var;
308         uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
309         __m128i crc_adjust = _mm_set_epi16
310                                 (0, 0, 0,       /* ignore non-length fields */
311                                  -rxq->crc_len, /* sub crc on data_len */
312                                  0,          /* ignore high-16bits of pkt_len */
313                                  -rxq->crc_len, /* sub crc on pkt_len */
314                                  0, 0           /* ignore pkt_type field */
315                                 );
316         const __m128i zero = _mm_setzero_si128();
317         /* mask to shuffle from desc. to mbuf */
318         const __m128i shuf_msk = _mm_set_epi8
319                         (0xFF, 0xFF,
320                          0xFF, 0xFF,  /* rss hash parsed separately */
321                          11, 10,      /* octet 10~11, 16 bits vlan_macip */
322                          5, 4,        /* octet 4~5, 16 bits data_len */
323                          0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
324                          5, 4,        /* octet 4~5, low 16 bits pkt_len */
325                          0xFF, 0xFF,  /* pkt_type set as unknown */
326                          0xFF, 0xFF   /* pkt_type set as unknown */
327                         );
328         const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
329                                                    0xFF, 0xFF,
330                                                    0xFF, 0xFF,
331                                                    0xFF, 0xFF,
332                                                    0xFF, 0xFF,
333                                                    0xFF, 0xFF,
334                                                    0x04, 0x0C,
335                                                    0x00, 0x08);
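        /* eop_shuf_mask reorders the per-packet status bytes, which end up in
         * 1,3,0,2 order after the unpacks below, back into packet order 0-3
         * and packs them into the low 32 bits of the result.
         */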
336
337         /**
338          * compile-time check the above crc_adjust layout is correct.
339          * NOTE: the first field (lowest address) is given last in set_epi16
340          * call above.
341          */
342         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
343                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
344         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
345                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
346
347         /* 4 packets DD mask */
348         const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
349                                                 0x0000000100000001LL);
350         /* 4 packets EOP mask */
351         const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
352                                                  0x0000000200000002LL);
353
354         /* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
355         nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
356
357         /* Just the act of getting into the function from the application is
358          * going to cost about 7 cycles
359          */
360         rxdp = rxq->rx_ring + rxq->rx_tail;
361
362         rte_prefetch0(rxdp);
363
364         /* See if we need to rearm the RX queue - gives the prefetch a bit
365          * of time to act
366          */
367         if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
368                 ice_rxq_rearm(rxq);
369
370         /* Before we start moving massive data around, check to see if
371          * there is actually a packet available
372          */
373         if (!(rxdp->wb.status_error0 &
374               rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
375                 return 0;
376
377         /**
378          * Compile-time verify the shuffle mask
379          * NOTE: some field positions already verified above, but duplicated
380          * here for completeness in case of future modifications.
381          */
382         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
383                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
384         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
385                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
386         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
387                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
388         RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
389                          offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
390
391         /* Cache is empty -> need to scan the buffer rings, but first move
392          * the next 'n' mbufs into the cache
393          */
394         sw_ring = &rxq->sw_ring[rxq->rx_tail];
395
396         /* A. load 4 packets in one loop
397          * [A*. mask out 4 unused dirty fields in desc]
398          * B. copy 4 mbuf pointers from sw_ring to rx_pkts
399          * C. calc the number of DD bits among the 4 packets
400          * [C*. extract the end-of-packet bit, if requested]
401          * D. fill info. from desc to mbuf
402          */
403
404         for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
405              pos += ICE_DESCS_PER_LOOP,
406              rxdp += ICE_DESCS_PER_LOOP) {
407                 __m128i descs[ICE_DESCS_PER_LOOP];
408                 __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
409                 __m128i staterr, sterr_tmp1, sterr_tmp2;
410                 /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
411                 __m128i mbp1;
412 #if defined(RTE_ARCH_X86_64)
413                 __m128i mbp2;
414 #endif
415
416                 /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
417                 mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
418                 /* Read desc statuses backwards to avoid race condition */
419                 /* A.1 load desc[3] */
420                 descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
421                 rte_compiler_barrier();
422
423                 /* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
424                 _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);
425
426 #if defined(RTE_ARCH_X86_64)
427                 /* B.1 load 2 64 bit mbuf pointers */
428                 mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
429 #endif
430
431                 /* A.1 load desc[2-0] */
432                 descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
433                 rte_compiler_barrier();
434                 descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
435                 rte_compiler_barrier();
436                 descs[0] = _mm_loadu_si128((__m128i *)(rxdp));
437
438 #if defined(RTE_ARCH_X86_64)
439                 /* B.2 copy 2 mbuf pointers into rx_pkts */
440                 _mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
441 #endif
442
443                 if (split_packet) {
444                         rte_mbuf_prefetch_part2(rx_pkts[pos]);
445                         rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
446                         rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
447                         rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
448                 }
449
450                 /* avoid compiler reorder optimization */
451                 rte_compiler_barrier();
452
453                 /* D.1 pkt 3,4 convert format from desc to pktmbuf */
454                 pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
455                 pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);
456
457                 /* D.1 pkt 1,2 convert format from desc to pktmbuf */
458                 pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
459                 pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);
460
461                 /* C.1 4=>2 filter staterr info only */
462                 sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
463                 /* C.1 4=>2 filter staterr info only */
464                 sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
465
466                 ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
467
468                 /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
469                 pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
470                 pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
471
472                 /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
473                 pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
474                 pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);
475
476 #ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
477                 /**
478                  * we need to load the 2nd 16B of each desc for RSS hash parsing;
479                  * entering this branch costs some extra performance.
480                  */
481                 if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
482                                 DEV_RX_OFFLOAD_RSS_HASH) {
483                         /* load bottom half of every 32B desc */
484                         const __m128i raw_desc_bh3 =
485                                 _mm_load_si128
486                                         ((void *)(&rxdp[3].wb.status_error1));
487                         rte_compiler_barrier();
488                         const __m128i raw_desc_bh2 =
489                                 _mm_load_si128
490                                         ((void *)(&rxdp[2].wb.status_error1));
491                         rte_compiler_barrier();
492                         const __m128i raw_desc_bh1 =
493                                 _mm_load_si128
494                                         ((void *)(&rxdp[1].wb.status_error1));
495                         rte_compiler_barrier();
496                         const __m128i raw_desc_bh0 =
497                                 _mm_load_si128
498                                         ((void *)(&rxdp[0].wb.status_error1));
499
500                         /**
501                          * shift the 32-bit RSS hash value to the
502                          * highest 32 bits of each 128-bit lane before masking
503                          */
504                         __m128i rss_hash3 =
505                                 _mm_slli_epi64(raw_desc_bh3, 32);
506                         __m128i rss_hash2 =
507                                 _mm_slli_epi64(raw_desc_bh2, 32);
508                         __m128i rss_hash1 =
509                                 _mm_slli_epi64(raw_desc_bh1, 32);
510                         __m128i rss_hash0 =
511                                 _mm_slli_epi64(raw_desc_bh0, 32);
512
513                         __m128i rss_hash_msk =
514                                 _mm_set_epi32(0xFFFFFFFF, 0, 0, 0);
515
516                         rss_hash3 = _mm_and_si128
517                                         (rss_hash3, rss_hash_msk);
518                         rss_hash2 = _mm_and_si128
519                                         (rss_hash2, rss_hash_msk);
520                         rss_hash1 = _mm_and_si128
521                                         (rss_hash1, rss_hash_msk);
522                         rss_hash0 = _mm_and_si128
523                                         (rss_hash0, rss_hash_msk);
524
525                         pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
526                         pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
527                         pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
528                         pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
529                 } /* if() on RSS hash parsing */
530 #endif
531
532                 /* C.2 get 4 pkts staterr value  */
533                 staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
534
535                 /* D.3 copy final 3,4 data to rx_pkts */
536                 _mm_storeu_si128
537                         ((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
538                          pkt_mb3);
539                 _mm_storeu_si128
540                         ((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
541                          pkt_mb2);
542
543                 /* C* extract and record EOP bit */
544                 if (split_packet) {
545                         /* and with mask to extract bits, flipping 1-0 */
546                         __m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
547                         /* the staterr values are not in packet order; the
548                          * count of DD bits does not care about order, but
549                          * end-of-packet tracking does, so shuffle. This also
550                          * compresses the 32-bit values to 8-bit
551                          */
552                         eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
553                         /* store the resulting 32-bit value */
554                         *(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
555                         split_packet += ICE_DESCS_PER_LOOP;
556                 }
557
558                 /* C.3 calc available number of desc */
559                 staterr = _mm_and_si128(staterr, dd_check);
560                 staterr = _mm_packs_epi32(staterr, zero);
561
562                 /* D.3 copy final 1,2 data to rx_pkts */
563                 _mm_storeu_si128
564                         ((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
565                          pkt_mb1);
566                 _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
567                                  pkt_mb0);
568                 ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
569                 /* C.4 calc available number of desc */
570                 var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
571                 nb_pkts_recd += var;
572                 if (likely(var != ICE_DESCS_PER_LOOP))
573                         break;
574         }
575
576         /* Update our internal tail pointer */
577         rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
578         rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
579         rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
580
581         return nb_pkts_recd;
582 }
583
584 /**
585  * Notice:
586  * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
587  * - if nb_pkts > ICE_VPMD_RX_BURST, only ICE_VPMD_RX_BURST
588  *   descriptors' DD bits are scanned
589  */
590 uint16_t
591 ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
592                   uint16_t nb_pkts)
593 {
594         return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
595 }
596
597 /**
598  * vPMD receive routine that reassembles a single burst of 32 scattered packets
599  *
600  * Notice:
601  * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
602  */
603 static uint16_t
604 ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
605                              uint16_t nb_pkts)
606 {
607         struct ice_rx_queue *rxq = rx_queue;
608         uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
609
610         /* get some new buffers */
611         uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
612                                                   split_flags);
613         if (nb_bufs == 0)
614                 return 0;
615
616         /* happy day case, full burst + no packets to be joined */
617         const uint64_t *split_fl64 = (uint64_t *)split_flags;
618
619         if (!rxq->pkt_first_seg &&
620             split_fl64[0] == 0 && split_fl64[1] == 0 &&
621             split_fl64[2] == 0 && split_fl64[3] == 0)
622                 return nb_bufs;
623
624         /* reassemble any packets that need reassembly */
625         unsigned int i = 0;
626
627         if (!rxq->pkt_first_seg) {
628                 /* find the first split flag, and only reassemble from there */
629                 while (i < nb_bufs && !split_flags[i])
630                         i++;
631                 if (i == nb_bufs)
632                         return nb_bufs;
633                 rxq->pkt_first_seg = rx_pkts[i];
634         }
635         return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
636                                              &split_flags[i]);
637 }
638
639 /**
640  * vPMD receive routine that reassembles scattered packets.
641  */
642 uint16_t
643 ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
644                             uint16_t nb_pkts)
645 {
646         uint16_t retval = 0;
647
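        /* split the request into bursts of at most ICE_VPMD_RX_BURST packets;
         * stop early once a burst comes back short, i.e. no more packets are
         * ready.
         */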
648         while (nb_pkts > ICE_VPMD_RX_BURST) {
649                 uint16_t burst;
650
651                 burst = ice_recv_scattered_burst_vec(rx_queue,
652                                                      rx_pkts + retval,
653                                                      ICE_VPMD_RX_BURST);
654                 retval += burst;
655                 nb_pkts -= burst;
656                 if (burst < ICE_VPMD_RX_BURST)
657                         return retval;
658         }
659
660         return retval + ice_recv_scattered_burst_vec(rx_queue,
661                                                      rx_pkts + retval,
662                                                      nb_pkts);
663 }
664
665 static inline void
666 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
667          uint64_t flags)
668 {
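        /* Build one 16B Tx data descriptor: the high quadword carries the
         * descriptor type, command flags and buffer size, the low quadword
         * carries the buffer DMA address.
         */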
669         uint64_t high_qw =
670                 (ICE_TX_DESC_DTYPE_DATA |
671                  ((uint64_t)flags  << ICE_TXD_QW1_CMD_S) |
672                  ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));
673
674         __m128i descriptor = _mm_set_epi64x(high_qw,
675                                             pkt->buf_iova + pkt->data_off);
676         _mm_store_si128((__m128i *)txdp, descriptor);
677 }
678
679 static inline void
680 ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
681         uint16_t nb_pkts, uint64_t flags)
682 {
683         int i;
684
685         for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
686                 ice_vtx1(txdp, *pkt, flags);
687 }
688
689 static uint16_t
690 ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
691                          uint16_t nb_pkts)
692 {
693         struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
694         volatile struct ice_tx_desc *txdp;
695         struct ice_tx_entry *txep;
696         uint16_t n, nb_commit, tx_id;
697         uint64_t flags = ICE_TD_CMD;
698         uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;
699         int i;
700
701         /* crossing the tx_rs_thresh boundary is not allowed */
702         nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
703
704         if (txq->nb_tx_free < txq->tx_free_thresh)
705                 ice_tx_free_bufs_vec(txq);
706
707         nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
708         nb_commit = nb_pkts;
709         if (unlikely(nb_pkts == 0))
710                 return 0;
711
712         tx_id = txq->tx_tail;
713         txdp = &txq->tx_ring[tx_id];
714         txep = &txq->sw_ring[tx_id];
715
716         txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
717
718         n = (uint16_t)(txq->nb_tx_desc - tx_id);
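        /* if the burst wraps past the end of the ring, fill the descriptors up
         * to the ring end first (setting RS on the last one), reset to index 0
         * and let the code below handle the remainder.
         */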
719         if (nb_commit >= n) {
720                 ice_tx_backlog_entry(txep, tx_pkts, n);
721
722                 for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
723                         ice_vtx1(txdp, *tx_pkts, flags);
724
725                 ice_vtx1(txdp, *tx_pkts++, rs);
726
727                 nb_commit = (uint16_t)(nb_commit - n);
728
729                 tx_id = 0;
730                 txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
731
732                 /* avoid reaching the end of the ring */
733                 txdp = &txq->tx_ring[tx_id];
734                 txep = &txq->sw_ring[tx_id];
735         }
736
737         ice_tx_backlog_entry(txep, tx_pkts, nb_commit);
738
739         ice_vtx(txdp, tx_pkts, nb_commit, flags);
740
741         tx_id = (uint16_t)(tx_id + nb_commit);
742         if (tx_id > txq->tx_next_rs) {
743                 txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
744                         rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
745                                          ICE_TXD_QW1_CMD_S);
746                 txq->tx_next_rs =
747                         (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
748         }
749
750         txq->tx_tail = tx_id;
751
752         ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
753
754         return nb_pkts;
755 }
756
757 uint16_t
758 ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
759                   uint16_t nb_pkts)
760 {
761         uint16_t nb_tx = 0;
762         struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
763
764         while (nb_pkts) {
765                 uint16_t ret, num;
766
767                 num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
768                 ret = ice_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
769                 nb_tx += ret;
770                 nb_pkts -= ret;
771                 if (ret < num)
772                         break;
773         }
774
775         return nb_tx;
776 }
777
778 int __rte_cold
779 ice_rxq_vec_setup(struct ice_rx_queue *rxq)
780 {
781         if (!rxq)
782                 return -1;
783
784         rxq->rx_rel_mbufs = _ice_rx_queue_release_mbufs_vec;
785         return ice_rxq_vec_setup_default(rxq);
786 }
787
788 int __rte_cold
789 ice_txq_vec_setup(struct ice_tx_queue __rte_unused *txq)
790 {
791         if (!txq)
792                 return -1;
793
794         txq->tx_rel_mbufs = _ice_tx_queue_release_mbufs_vec;
795         return 0;
796 }
797
798 int __rte_cold
799 ice_rx_vec_dev_check(struct rte_eth_dev *dev)
800 {
801         return ice_rx_vec_dev_check_default(dev);
802 }
803
804 int __rte_cold
805 ice_tx_vec_dev_check(struct rte_eth_dev *dev)
806 {
807         return ice_tx_vec_dev_check_default(dev);
808 }