/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>
/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
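
/*
 * Note: rk1/rk2 are the folding constants consumed by the main 16-byte
 * loop, rk5/rk6 drive the 128- to 64-bit reduction and rk7/rk8 hold the
 * Barrett reduction pair (quotient approximation and CRC polynomial).
 * All of them are filled in by rte_net_crc_sse42_init() below.
 */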

/**
 * @brief Performs one folding round
 *
 * Logically function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F2, RK1)
 *     T2 = CLMUL(F1, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1:rk2 constant pair
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
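
/*
 * Selector note: in _mm_clmulepi64_si128() bit 0 of the immediate picks
 * the 64-bit half of the first operand and bit 4 the half of the second,
 * so 0x01 multiplies the upper half of fold by rk1 (low qword of precomp)
 * and 0x10 multiplies the lower half of fold by rk2 (high qword).
 */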

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}
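
/*
 * The two carry-less multiplications above implement the standard
 * two-step reduction: the low quadword is folded into the high one with
 * rk5 (the "64b fold"), then the remaining low 32 bits are folded with
 * rk6 (the "32b fold"), leaving a 64-bit value ready for Barrett
 * reduction.
 */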

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7:rk8 precomputed constants
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}
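
/*
 * Barrett reduction in two multiplies: the clmul by rk7 (low qword of
 * precomp) approximates the quotient of the folded value by the CRC
 * polynomial, the clmul by rk8 (high qword) multiplies that quotient back
 * by the polynomial, and the XORs recover the remainder; for this
 * reflected variant the 32-bit result lands in the third dword, hence
 * _mm_extract_epi32(tmp2, 2).
 */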

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
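
/*
 * The mask loaded from crc_xmm_shift_tab + 16 - num moves input byte i to
 * output byte i + num; the 0xff entries that slide into the low positions
 * have their top bit set, which makes _mm_shuffle_epi8() write zeros
 * there, so the net effect is a byte-granular left shift with zero fill.
 */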

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */

	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}
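
	/*
	 * Short inputs (< 16 bytes) above are zero-padded into an aligned
	 * buffer and shifted so the data sits at the top of the register:
	 * with at least 4 bytes the value goes through the normal
	 * 128- to 64-bit reduction, while with fewer than 4 bytes there is
	 * no full 32-bit word to reduce, so it is aligned as a 64-bit
	 * quantity (shift by 8 - data_len) and jumps straight to Barrett
	 * reduction.
	 */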

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}
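
	/*
	 * Each iteration above folds the running 16-byte remainder forward
	 * by 128 bits (the rk1/rk2 carry-less multiplications) and XORs in
	 * the next 16 input bytes, so a buffer of any length collapses into
	 * a single 16-byte value.
	 */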

partial_bytes:
	if (likely(n < data_len)) {

		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}
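
	/*
	 * The tail handling above finishes the last (data_len & 15) bytes
	 * without a scalar pass: shf_table indexed by the remainder length
	 * produces two complementary pshufb masks (the XOR with mask3 flips
	 * the byte-zeroing bit), splitting fold into `a` and `b` so that the
	 * final, partially overlapping 16-byte load can be merged in with
	 * _mm_blendv_epi8() and folded in one extra round.
	 */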

	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/** Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q  = 0x11c581910LLU;
	p  = 0x10811LLU;

	/** Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
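
	/*
	 * The k/q/p constants above (and the CRC32 set below) are the
	 * precomputed x^n mod P(x) folding terms and the Barrett pair for
	 * each polynomial, following the derivation in Intel's "Fast CRC
	 * Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	 * white paper; they depend only on the polynomial, so they are
	 * computed once and cached in the context structures.
	 */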

	/** Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q  = 0x1f7011640LLU;
	p  = 0x1db710641LLU;

	/** Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/**
	 * Reset the register as following calculation may
	 * use other data types such as float, double, etc.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/** return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
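
/*
 * Usage sketch (hypothetical caller, not part of this file's API surface;
 * assumes the CPU supports SSE4.2/PCLMULQDQ and that
 * rte_net_crc_sse42_init() has already run):
 *
 *	const uint8_t frame[64] = { 0 };
 *	uint32_t crc32 = rte_crc32_eth_sse42_handler(frame, sizeof(frame));
 *	uint16_t crc16 = (uint16_t)rte_crc16_ccitt_sse42_handler(frame,
 *		sizeof(frame));
 */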