/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include <mlx5_prm.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Store free buffers to RX SW ring.
 *
 * @param elts
 *   Pointer to SW ring to be filled.
 * @param pkts
 *   Pointer to array of packets to be stored.
 * @param n
 *   Number of packets to be stored.
 */
static inline void
rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n)
{
	const uint16_t p = n & -2;
	uint16_t pos;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 *
 * @return
 *   Number of mini-CQEs successfully decompressed.
 */
static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			     -1, -1,         /* skip vlan_tci */
			      6,  7,         /* data_len, bswap16 */
			     -1, -1,  6,  7, /* pkt_len, bswap16 */
			     -1, -1, -1, -1  /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			     -1, -1,         /* skip vlan_tci */
			     14, 15,         /* data_len, bswap16 */
			     -1, -1, 14, 15, /* pkt_len, bswap16 */
			     -1, -1, -1, -1  /* skip packet_type */);
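	/*
	 * Note: _mm_set_epi8() lists bytes from the most significant position
	 * down to byte 0, and a -1 control byte (high bit set) makes
	 * _mm_shuffle_epi8() write zero into that destination byte, so the
	 * "skip" positions above are cleared rather than copied from the
	 * mini CQE.
	 */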
	/* Restore the compressed count. Must be 16 bits. */
	const uint16_t mcqe_n = t_pkt->data_len +
				(rxq->crc_present * RTE_ETHER_CRC_LEN);
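	/*
	 * For a compressed session the title CQE byte_cnt field carries the
	 * number of mini CQEs, not a frame length. The title mbuf data_len
	 * differs from that raw byte_cnt by RTE_ETHER_CRC_LEN when CRC is
	 * present, which is why the CRC adjustment is undone here.
	 */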
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0, 0);
	__m128i ol_flags = _mm_setzero_si128();
	__m128i ol_flags_mask = _mm_setzero_si128();
	const __m128i zero = _mm_setzero_si128();
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
			if (likely(pos + i < mcqe_n))
				rte_prefetch0((void *)(cq + pos + i));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
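		/*
		 * The 0x23 blend immediate above selects 16-bit words 0, 1 and
		 * 5 from the title descriptor (rxdf), i.e. packet_type and
		 * vlan_tci are inherited from the title packet, while pkt_len,
		 * data_len and the RSS hash come from the mini CQE.
		 */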
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
					   mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
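		/*
		 * The shifted all-ones mask above clears byte counts past the
		 * last valid mini CQE when fewer than four remain (a 64-bit
		 * shift count of 64 or more yields zero, so full batches are
		 * left untouched); the two horizontal adds then fold the four
		 * 16-bit lengths into a single sum added to rcvd_byte.
		 */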
#endif
		if (rxq->mark) {
			if (rxq->mcqe_format !=
			    MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
				const uint32_t flow_tag = t_pkt->hash.fdir.hi;

				/* E.1 store flow tag (rte_flow mark). */
				elts[pos]->hash.fdir.hi = flow_tag;
				elts[pos + 1]->hash.fdir.hi = flow_tag;
				elts[pos + 2]->hash.fdir.hi = flow_tag;
				elts[pos + 3]->hash.fdir.hi = flow_tag;
			} else {
				const __m128i flow_mark_adj =
					_mm_set_epi32(-1, -1, -1, -1);
				const __m128i flow_mark_shuf =
					_mm_set_epi8(-1,  9,  8, 12,
						     -1,  1,  0,  4,
						     -1, -1, -1, -1,
						     -1, -1, -1, -1);
				const __m128i ft_mask =
					_mm_set1_epi32(0xffffff00);
				const __m128i fdir_flags =
					_mm_set1_epi32(PKT_RX_FDIR);
				const __m128i fdir_all_flags =
					_mm_set1_epi32(PKT_RX_FDIR |
						       PKT_RX_FDIR_ID);
				__m128i fdir_id_flags =
					_mm_set1_epi32(PKT_RX_FDIR_ID);

				/* Extract flow_tag field. */
				__m128i ftag0 =
					_mm_shuffle_epi8(mcqe1, flow_mark_shuf);
				__m128i ftag1 =
					_mm_shuffle_epi8(mcqe2, flow_mark_shuf);
				__m128i ftag =
					_mm_unpackhi_epi64(ftag0, ftag1);
				__m128i invalid_mask =
					_mm_cmpeq_epi32(ftag, zero);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     fdir_all_flags);
				/* Set PKT_RX_FDIR if flow tag is non-zero. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
				/* Mask out invalid entries. */
				fdir_id_flags = _mm_andnot_si128(invalid_mask,
								 fdir_id_flags);
				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
				ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(_mm_cmpeq_epi32(ftag,
							 ft_mask),
							 fdir_id_flags));
				ftag = _mm_add_epi32(ftag, flow_mark_adj);
				elts[pos]->hash.fdir.hi =
					_mm_extract_epi32(ftag, 0);
				elts[pos + 1]->hash.fdir.hi =
					_mm_extract_epi32(ftag, 1);
				elts[pos + 2]->hash.fdir.hi =
					_mm_extract_epi32(ftag, 2);
				elts[pos + 3]->hash.fdir.hi =
					_mm_extract_epi32(ftag, 3);
			}
		}
		if (unlikely(rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)) {
			if (rxq->mcqe_format ==
			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
				const uint8_t pkt_info =
					(cq->pkt_info & 0x3) << 6;
				const uint8_t pkt_hdr0 =
					_mm_extract_epi8(mcqe1, 0);
				const uint8_t pkt_hdr1 =
					_mm_extract_epi8(mcqe1, 8);
				const uint8_t pkt_hdr2 =
					_mm_extract_epi8(mcqe2, 0);
				const uint8_t pkt_hdr3 =
					_mm_extract_epi8(mcqe2, 8);
				const __m128i vlan_mask =
					_mm_set1_epi32(PKT_RX_VLAN |
						       PKT_RX_VLAN_STRIPPED);
				const __m128i cv_mask =
					_mm_set1_epi32(MLX5_CQE_VLAN_STRIPPED);
				const __m128i pkt_cv =
					_mm_set_epi32(pkt_hdr0 & 0x1,
						      pkt_hdr1 & 0x1,
						      pkt_hdr2 & 0x1,
						      pkt_hdr3 & 0x1);

				ol_flags_mask = _mm_or_si128(ol_flags_mask,
							     vlan_mask);
				ol_flags = _mm_or_si128(ol_flags,
					_mm_and_si128(_mm_cmpeq_epi32(pkt_cv,
						      cv_mask), vlan_mask));
				elts[pos]->packet_type =
					mlx5_ptype_table[(pkt_hdr0 >> 2) |
							 pkt_info];
				elts[pos + 1]->packet_type =
					mlx5_ptype_table[(pkt_hdr1 >> 2) |
							 pkt_info];
				elts[pos + 2]->packet_type =
					mlx5_ptype_table[(pkt_hdr2 >> 2) |
							 pkt_info];
				elts[pos + 3]->packet_type =
					mlx5_ptype_table[(pkt_hdr3 >> 2) |
							 pkt_info];
				if (rxq->tunnel) {
					elts[pos]->packet_type |=
						!!(((pkt_hdr0 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 1]->packet_type |=
						!!(((pkt_hdr1 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 2]->packet_type |=
						!!(((pkt_hdr2 >> 2) |
						pkt_info) & (1 << 6));
					elts[pos + 3]->packet_type |=
						!!(((pkt_hdr3 >> 2) |
						pkt_info) & (1 << 6));
				}
			}
			const __m128i hash_flags =
				_mm_set1_epi32(PKT_RX_RSS_HASH);
			const __m128i rearm_flags =
				_mm_set1_epi32((uint32_t)t_pkt->ol_flags);

			ol_flags_mask = _mm_or_si128(ol_flags_mask, hash_flags);
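			/*
			 * These mini CQE formats carry no per-packet RSS hash,
			 * so PKT_RX_RSS_HASH is added to the mask and thus not
			 * inherited from the title packet flags below, and
			 * hash.rss is cleared explicitly.
			 */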
			ol_flags = _mm_or_si128(ol_flags,
				_mm_andnot_si128(ol_flags_mask, rearm_flags));
			elts[pos]->ol_flags =
				_mm_extract_epi32(ol_flags, 0);
			elts[pos + 1]->ol_flags =
				_mm_extract_epi32(ol_flags, 1);
			elts[pos + 2]->ol_flags =
				_mm_extract_epi32(ol_flags, 2);
			elts[pos + 3]->ol_flags =
				_mm_extract_epi32(ol_flags, 3);
			elts[pos]->hash.rss = 0;
			elts[pos + 1]->hash.rss = 0;
			elts[pos + 2]->hash.rss = 0;
			elts[pos + 3]->hash.rss = 0;
		}
		if (rxq->dynf_meta) {
			int32_t offs = rxq->flow_meta_offset;
			const uint32_t meta =
				*RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);

			/* Check if title packet has valid metadata. */
			if (meta) {
				MLX5_ASSERT(t_pkt->ol_flags &
					    rxq->flow_meta_mask);
				*RTE_MBUF_DYNFIELD(elts[pos], offs,
						   uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
						   uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
						   uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
						   uint32_t *) = meta;
			}
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!(pos & 0x7) && pos < mcqe_n) {
			if (pos + 8 < mcqe_n)
				rte_prefetch0((void *)(cq + pos + 8));
			mcq = (void *)(cq + pos);
			for (i = 0; i < 8; ++i)
				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
		}
	}
	/* Invalidate the rest of CQEs. */
	for (; inv < mcqe_n; ++inv)
		cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += mcqe_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	return mcqe_n;
}

/**
 * Calculate packet type and offload flag for mbuf and store it.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
			    rxq->hw_timestamp * rxq->timestamp_rx_flag);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			       PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;
	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
		__m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Make pinfo carry the merged fields for ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pt_idx0 = _mm_extract_epi8(ptype, 0);
	pt_idx1 = _mm_extract_epi8(ptype, 2);
	pt_idx2 = _mm_extract_epi8(ptype, 4);
	pt_idx3 = _mm_extract_epi8(ptype, 6);
	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
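	/*
	 * After the 10-bit shift above, bit 6 of each table index is the
	 * "tunneled" bit (bit[16] of the merged fields), so rxq->tunnel is
	 * OR-ed on top of the table lookup only for tunneled packets.
	 */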
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
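	/*
	 * Blend immediate 0x30 takes 16-bit words 4-5 (bytes 8-11) from the
	 * second operand, which is where ol_flags sits right after the 8-byte
	 * rearm_data in each mbuf; the byte shifts move packet N's 32-bit
	 * flags from lane N of ol_flags into that position.
	 */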
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Process a non-compressed completion and fill in mbufs in RX SW ring
 * with data extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a non-compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 * @param pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 * @param err
 *   Pointer to a flag. Set non-zero value if pkts array has at least one error
 *   packet to handle.
 * @param comp
 *   Pointer to an index. Set it to the first compressed completion if any.
 *
 * @return
 *   Number of CQEs successfully processed.
 */
static inline uint16_t
rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		 struct rte_mbuf **elts, struct rte_mbuf **pkts,
		 uint16_t pkts_n, uint64_t *err, uint64_t *comp)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	unsigned int pos;
	uint64_t n = 0;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
	const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL);
	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
	const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			      4,  5,  0,  1);
#endif
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			      4,  5,         /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			      4,  5          /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			      0,  0,  0,  0,
			      0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
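	/*
	 * shuf_mask above places the flow tag in the top dword of each
	 * extracted descriptor (stored at &pkt_len, so that dword lands in
	 * hash.fdir.hi). Adding -1 there compensates for the tag being
	 * programmed with a +1 offset so that zero can mean "no mark"; the
	 * adjustment is zero when the mark action is disabled on the queue.
	 */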
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    Final 16-byte cqes[] extracted from the original 64-byte CQE has
	 *    the following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  rsvd4;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_hash_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
		/* A.1 load a block having op_own. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					  &cq[pos].sop_drop_qpn);
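		/*
		 * Each 8-byte load from sop_drop_qpn covers the tail of the
		 * 64-byte CQE, including the op_own byte checked below for
		 * validity, ownership and compression format; the compiler
		 * barriers keep these CQE tail reads from being reordered by
		 * the compiler.
		 */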
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_io_rmb();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 flip owner bit to mark CQEs from last round. */
		owner_mask = _mm_and_si128(op_own, owner_check);
		if (ownership)
			owner_mask = _mm_xor_si128(owner_mask, owner_check);
		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
		owner_mask = _mm_packs_epi32(owner_mask, zero);
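		/*
		 * The owner bit written by HW alternates on every pass around
		 * the CQ ring; a CQE whose owner bit does not match the value
		 * expected for the current pass has not been updated by HW
		 * yet, so owner_mask flags it and it is merged into the
		 * invalid mask below.
		 */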
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
			   __builtin_ctzll(comp_idx) /
			   (sizeof(uint16_t) * 8) :
			   MLX5_VPMD_DESCS_PER_LOOP;
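		/*
		 * After packs_epi32 each CQE is represented by 16 bits in the
		 * low 64-bit lane, so counting trailing zero bits and dividing
		 * by 16 gives the index of the first compressed CQE, while
		 * MLX5_VPMD_DESCS_PER_LOOP means none was found in this batch.
		 */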
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
			MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(invalid_mask, opcode);
		/* D.4 mark if any error is set. */
		*err |= _mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (rxq->hw_timestamp) {
			int offset = rxq->timestamp_offset;
			if (rxq->rt_timestamp) {
				struct mlx5_dev_ctx_shared *sh = rxq->sh;
				uint64_t ts;

				ts = rte_be_to_cpu_64(cq[pos].timestamp);
				mlx5_timestamp_set(pkts[pos], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p1].timestamp);
				mlx5_timestamp_set(pkts[pos + 1], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p2].timestamp);
				mlx5_timestamp_set(pkts[pos + 2], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
				ts = rte_be_to_cpu_64(cq[pos + p3].timestamp);
				mlx5_timestamp_set(pkts[pos + 3], offset,
					mlx5_txpp_convert_rx_ts(sh, ts));
			} else {
				mlx5_timestamp_set(pkts[pos], offset,
					rte_be_to_cpu_64(cq[pos].timestamp));
				mlx5_timestamp_set(pkts[pos + 1], offset,
					rte_be_to_cpu_64(cq[pos + p1].timestamp));
				mlx5_timestamp_set(pkts[pos + 2], offset,
					rte_be_to_cpu_64(cq[pos + p2].timestamp));
				mlx5_timestamp_set(pkts[pos + 3], offset,
					rte_be_to_cpu_64(cq[pos + p3].timestamp));
			}
		}
		if (rxq->dynf_meta) {
			/* This code is subject to further optimization. */
			int32_t offs = rxq->flow_meta_offset;
			uint32_t mask = rxq->flow_meta_port_mask;

			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p1].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p2].flow_table_metadata) & mask;
			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
				rte_be_to_cpu_32
				(cq[pos + p3].flow_table_metadata) & mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	if (comp_idx == n)
		*comp = comp_idx;
	return nocmp_n;
}

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */