/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#ifndef _RTE_NET_CRC_SSE_H_
#define _RTE_NET_CRC_SSE_H_

#include <string.h>

#include <rte_branch_prediction.h>

#include <x86intrin.h>

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
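
/*
 * The three constant pairs drive the three stages implemented below:
 * rk1/rk2 fold 16 byte blocks, rk5/rk6 reduce the 128 bit remainder to
 * 64 bits and rk7/rk8 perform the final Barrett reduction to 32 bits.
 */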

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     T1 = HIGH_64(FOLD) x RK1
 *     T2 = LOW_64(FOLD) x RK2
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 constant
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block, __m128i precomp, __m128i fold)
{
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
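
/*
 * Note: "precomp" is the rk1_rk2 vector prepared by rte_net_crc_sse42_init(),
 * with rk1 in the low 64 bits and rk2 in the high 64 bits; the 0x01 and 0x10
 * immediates above pick rk1 and rk2 respectively.
 */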

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 precomputed constant
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	/* the CRC ends up in the third 32 bit word of tmp2 */
	return _mm_extract_epi32(tmp2, 2);
}
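
/*
 * PSHUFB control masks used for byte-granular shifts: the 16 byte window
 * taken at offset (16 - num) holds the identity indexes 0x00..0x0f preceded
 * by 0xff entries, and 0xff entries make _mm_shuffle_epi8() produce zero
 * bytes.
 */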
static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   16 byte value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
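
/*
 * For example, xmm_shift_left(reg, 2) loads the mask at offset 14: two 0xff
 * bytes followed by 0x00..0x0d, so the two lowest result bytes are zeroed
 * and byte i of reg moves to byte i + 2.
 */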

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */

	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}

		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
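	/*
	 * Handle the trailing 1 to 15 bytes: the last 16 bytes of the buffer
	 * are reloaded and combined with a shifted copy of "fold" so that one
	 * more folding round absorbs the partial block.
	 */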
	if (likely(n < data_len)) {

		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};
		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* one more folding round with rk1/rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/** Reduction 128 -> 32 Assumes: fold holds 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

static inline void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/** Initialize CRC16 data */

	/** Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/** Initialize CRC32 data */

	/** Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
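
	/*
	 * _mm_setr_epi64() puts its first argument in the low 64 bits, so
	 * rk1, rk5 and rk7 (q) land in the low qwords that the 0x00/0x01
	 * immediates select in the routines above.
	 */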

	/**
	 * Reset the MMX state, as the following calculations may
	 * use other data types such as float, double, etc.
	 */
	_mm_empty();
}

static inline uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data,
	uint32_t data_len)
{
	/** return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

static inline uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data,
	uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
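
/*
 * Illustrative usage sketch only ("buf" and "buf_len" are placeholders for
 * the caller's data): the contexts above must be filled in once by
 * rte_net_crc_sse42_init() before either handler is called.
 *
 *	rte_net_crc_sse42_init();
 *	uint32_t crc = rte_crc32_eth_sse42_handler(buf, buf_len);
 */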

#endif /* _RTE_NET_CRC_SSE_H_ */