1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2017-2020 Intel Corporation
7 #include <rte_common.h>
8 #include <rte_branch_prediction.h>
9 #include <rte_cpuflags.h>
13 #include <x86intrin.h>
15 /** PCLMULQDQ CRC computation context structure */
16 struct crc_pclmulqdq_ctx {
/* NOTE(review): the struct's member lines are missing from this excerpt.
 * The assignments in rte_net_crc_sse42_init() (rk1_rk2, rk5_rk6, rk7_rk8
 * set from _mm_setr_epi64) show it holds three __m128i constant pairs:
 * the folding constants (rk1, rk2), the 128->64 reduction constants
 * (rk5, rk6) and the Barrett reduction constants (rk7, rk8) — confirm
 * against the full file. */

/* One precomputed-constant context per supported CRC polynomial,
 * 16-byte aligned so the __m128i members can be loaded with aligned
 * SSE instructions. Filled in by rte_net_crc_sse42_init(). */
22 static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
23 static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
25 * @brief Performs one folding round
27 * Logically function operates as follows:
28 * DATA = READ_NEXT_16BYTES();
33 * FOLD = XOR(T1, T2, DATA)
38 * Precomputed rk1 constant
40 * Current 16 byte folded data
43 * New 16 byte folded data
45 static __rte_always_inline __m128i
46 crcr32_folding_round(__m128i data_block,
/* NOTE(review): the remaining parameter lines ('precomp' and 'fold') and
 * the opening brace are missing from this excerpt; both names are used in
 * the body below. */
/* Carry-less multiply each 64-bit half of the running fold by the matching
 * rk constant: imm8 0x01 selects high(fold) x low(precomp), imm8 0x10
 * selects low(fold) x high(precomp). XOR-ing both products with the next
 * 16-byte data block yields the new folded value. */
50 __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
51 __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
53 return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
57 * Performs reduction from 128 bits to 64 bits
60 * 128 bits data to be reduced
62 * precomputed constants rk5, rk6
65 * 64 bits reduced data
68 static __rte_always_inline __m128i
69 crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
/* NOTE(review): the opening brace line is missing from this excerpt. */
71 __m128i tmp0, tmp1, tmp2;
/* Fold the low 64 bits with rk5 (imm8 0x00: low x low) and XOR the
 * product with the high 64 bits of the input. */
74 tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
75 tmp1 = _mm_srli_si128(data128, 8);
76 tmp0 = _mm_xor_si128(tmp0, tmp1);
/* Fold the remaining upper 32 bits with rk6 (imm8 0x10 selects the
 * high half of 'precomp'); the byte shift positions them for the
 * multiply. */
79 tmp2 = _mm_slli_si128(tmp0, 4);
80 tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
82 return _mm_xor_si128(tmp1, tmp0);
86 * Performs Barrett's reduction from 64 bits to 32 bits
89 * 64 bits data to be reduced
91 * rk7 precomputed constant
94 * reduced 32 bits data
97 static __rte_always_inline uint32_t
98 crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
/* NOTE(review): the opening brace line is missing from this excerpt. */
/* mask1 keeps only the low 64 bits of a register;
 * mask2 clears only the low 32 bits. */
100 static const uint32_t mask1[4] __rte_aligned(16) = {
101 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
104 static const uint32_t mask2[4] __rte_aligned(16) = {
105 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
107 __m128i tmp0, tmp1, tmp2;
/* Drop the low 32 bits before the first Barrett multiply. */
109 tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
/* First multiply: by the low half of 'precomp' (rk7). */
111 tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
112 tmp1 = _mm_xor_si128(tmp1, tmp0);
113 tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
/* Second multiply: by the high half of 'precomp' (rk8), then fold the
 * intermediate values back in. */
115 tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
116 tmp2 = _mm_xor_si128(tmp2, tmp1);
117 tmp2 = _mm_xor_si128(tmp2, tmp0);
/* The reduced 32-bit CRC ends up in dword lane 2 of tmp2. */
119 return _mm_extract_epi32(tmp2, 2);
/* PSHUFB control table used by xmm_shift_left(): loading 16 bytes at
 * offset (16 - num) yields a shuffle mask that moves every byte up by
 * 'num' positions. Entries with the top bit set (0xff) make PSHUFB write
 * zero into the vacated low bytes. */
122 static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
123 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
124 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
125 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
126 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
127 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
128 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
/* NOTE(review): the closing '};' line is missing from this excerpt. */
132 * Shifts left 128 bit register by specified number of bytes
137 * number of bytes to shift left reg by (0-16)
143 static __rte_always_inline __m128i
144 xmm_shift_left(__m128i reg, const unsigned int num)
/* NOTE(review): the opening brace line is missing from this excerpt. */
/* The load must be unaligned: the table offset depends on 'num'.
 * The selected 16-byte window of crc_xmm_shift_tab is the PSHUFB mask
 * that performs the byte shift (0xff entries zero-fill). */
146 const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
148 return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
/**
 * Core CRC routine: folds an arbitrary-length buffer into a single
 * 128-bit value with PCLMULQDQ rounds, then reduces 128 -> 64 -> 32 bits.
 * Shared by the CRC16-CCITT and CRC32 Ethernet handlers via 'params'.
 */
151 static __rte_always_inline uint32_t
152 crc32_eth_calc_pclmulqdq(
/* NOTE(review): the 'data', 'data_len' and 'crc' parameter lines (and the
 * declaration of loop counter 'n') are missing from this excerpt; usage
 * below implies (const uint8_t *data, uint32_t data_len, uint32_t crc).
 * Confirm against the full file. */
156 const struct crc_pclmulqdq_ctx *params)
158 __m128i temp, fold, k;
/* Place the initial CRC value in the low 32 bits; all other bits zero. */
162 temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
165 * Folding all data into single 16 byte data block
166 * Assumes: fold holds first 16 bytes of data
169 if (unlikely(data_len < 32)) {
170 if (unlikely(data_len == 16)) {
/* Exactly one block: XOR in the init value, reduce directly. */
172 fold = _mm_loadu_si128((const __m128i *)data);
173 fold = _mm_xor_si128(fold, temp);
174 goto reduction_128_64;
177 if (unlikely(data_len < 16)) {
/* Short input: stage it in a zero-padded, aligned stack buffer
 * so a full 16-byte aligned load is safe. */
179 uint8_t buffer[16] __rte_aligned(16);
181 memset(buffer, 0, sizeof(buffer));
182 memcpy(buffer, data, data_len);
184 fold = _mm_load_si128((const __m128i *)buffer);
185 fold = _mm_xor_si128(fold, temp);
186 if (unlikely(data_len < 4)) {
/* Under 4 bytes only the 64->32 Barrett step is needed;
 * shift the data into position for that step. */
187 fold = xmm_shift_left(fold, 8 - data_len);
188 goto barret_reduction;
/* NOTE(review): the closing brace of the data_len < 4 branch is
 * missing from this excerpt. */
/* 4..15 bytes: left-justify, then do the full reduction. */
190 fold = xmm_shift_left(fold, 16 - data_len);
191 goto reduction_128_64;
/* 17..31 bytes: load the first block here and fall through to the
 * trailing-partial-block handling below. */
194 fold = _mm_loadu_si128((const __m128i *)data);
195 fold = _mm_xor_si128(fold, temp);
201 /** At least 32 bytes in the buffer */
202 /** Apply CRC initial value */
203 fold = _mm_loadu_si128((const __m128i *)data);
204 fold = _mm_xor_si128(fold, temp);
206 /** Main folding loop - the last 16 bytes is processed separately */
/* NOTE(review): the load of the folding constants into 'k' (presumably
 * k = params->rk1_rk2) is missing from this excerpt. */
208 for (n = 16; (n + 16) <= data_len; n += 16) {
209 temp = _mm_loadu_si128((const __m128i *)&data[n]);
210 fold = crcr32_folding_round(temp, k, fold);
/* Handle a trailing partial block when data_len is not a multiple
 * of 16. */
214 if (likely(n < data_len)) {
/* All-0x80 mask: XOR-ing it into a PSHUFB control flips each
 * entry between "select byte" and "write zero". */
216 const uint32_t mask3[4] __rte_aligned(16) = {
217 0x80808080, 0x80808080, 0x80808080, 0x80808080
/* Window into this table (indexed by data_len & 15) splits a
 * register between zeroed (0x8x) and selected (0x0x) bytes. */
220 const uint8_t shf_table[32] __rte_aligned(16) = {
221 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
222 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
223 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
224 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
227 __m128i last16, a, b;
/* Last 16 bytes of the buffer; this load overlaps data already
 * folded, the shuffles below discard the overlap. */
229 last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
/* 'a' = the part of 'fold' belonging to complete blocks ... */
231 temp = _mm_loadu_si128((const __m128i *)
232 &shf_table[data_len & 15]);
233 a = _mm_shuffle_epi8(fold, temp);
/* ... and 'b' = the complementary part, merged with the trailing
 * bytes via the blend (mask3 flips the control's zero bits). */
235 temp = _mm_xor_si128(temp,
236 _mm_load_si128((const __m128i *)mask3));
237 b = _mm_shuffle_epi8(fold, temp);
238 b = _mm_blendv_epi8(b, last16, temp);
/* One final folding round on 'a', then XOR in 'b'. */
241 temp = _mm_clmulepi64_si128(a, k, 0x01);
242 fold = _mm_clmulepi64_si128(a, k, 0x10);
244 fold = _mm_xor_si128(fold, temp);
245 fold = _mm_xor_si128(fold, b);
248 /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
/* NOTE(review): the 'reduction_128_64:' label and the load of
 * k = params->rk5_rk6 are missing from this excerpt. */
251 fold = crcr32_reduce_128_to_64(fold, k);
/* NOTE(review): the 'barret_reduction:' label and the load of
 * k = params->rk7_rk8 are missing from this excerpt. */
255 n = crcr32_reduce_64_to_32(fold, k);
/* Precomputes the PCLMULQDQ folding/reduction constants for both the
 * CRC16-CCITT and CRC32 Ethernet polynomials and caches them in the two
 * static context structures above. Must run before the handlers. */
/* NOTE(review): the return-type line and the lines that compute k1, k2,
 * k5, k6, p and q for each polynomial are missing from this excerpt. */
261 rte_net_crc_sse42_init(void)
263 uint64_t k1, k2, k5, k6;
264 uint64_t p = 0, q = 0;
266 /** Initialize CRC16 data */
274 /** Save the params in context structure */
275 crc16_ccitt_pclmulqdq.rk1_rk2 =
276 _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
277 crc16_ccitt_pclmulqdq.rk5_rk6 =
278 _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
279 crc16_ccitt_pclmulqdq.rk7_rk8 =
280 _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
282 /** Initialize CRC32 data */
290 /** Save the params in context structure */
291 crc32_eth_pclmulqdq.rk1_rk2 =
292 _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
293 crc32_eth_pclmulqdq.rk5_rk6 =
294 _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
295 crc32_eth_pclmulqdq.rk7_rk8 =
296 _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
299 * Reset the register as following calculation may
300 * use other data types such as float, double, etc.
/* NOTE(review): the MMX-state reset call this comment refers to
 * (presumably _mm_empty()) is missing from this excerpt — the
 * _mm_cvtsi64_m64 conversions above touch MMX registers. */
/* SSE4.2 handler for CRC16-CCITT over 'data_len' bytes of 'data'. */
/* NOTE(review): the return-type line (presumably uint16_t) is missing
 * from this excerpt. */
306 rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
308 /** return 16-bit CRC value */
/* Complement the folded result and truncate it to 16 bits. */
309 return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
/* NOTE(review): the argument lines passing data_len and the initial CRC
 * seed are missing from this excerpt. */
312 &crc16_ccitt_pclmulqdq);
316 rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
318 return ~crc32_eth_calc_pclmulqdq(data,
321 &crc32_eth_pclmulqdq);