/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2017 Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef _RTE_NET_CRC_SSE_H_
#define _RTE_NET_CRC_SSE_H_

#include <stdint.h>	/* uint8_t, uint32_t, uint64_t */
#include <string.h>	/* memset(), memcpy() */

#include <rte_branch_prediction.h>
#include <rte_common.h>	/* __rte_always_inline, __rte_aligned() */

#include <x86intrin.h>
/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;	/**< rk1 (low), rk2 (high) folding constants */
	__m128i rk5_rk6;	/**< rk5 (low), rk6 (high) folding constants */
	__m128i rk7_rk8;	/**< rk7 (q, low), rk8 (p, high) Barrett constants */
};

struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
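
/*
 * The two contexts above are populated once by rte_net_crc_sse42_init()
 * below with the folding constants for their respective polynomials.
 */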
/**
 * @brief Performs one folding round
 *
 * Logically function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1/rk2 constants (packed in one register)
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
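
/*
 * In GF(2) polynomial terms a round computes
 *     FOLD' = (one 64-bit half of FOLD clmul rk1) ^
 *             (the other half clmul rk2) ^ DATA;
 * the imm8 arguments (0x01 and 0x10) select the two cross half-products
 * of fold and the packed rk1:rk2 constant register.
 */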
/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}
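
/*
 * The reduction happens in two fold steps: the low 64 bits of data128 are
 * carry-less multiplied out and XORed into the high 64 bits (64b fold),
 * then the surviving low 32 bits of that intermediate are folded once more
 * (32b fold), leaving a 64-bit value for the final Barrett step.
 */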
/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 precomputed constant
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}
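
/*
 * Barrett reduction sketch: the first CLMUL multiplies the masked input by
 * the precomputed quotient constant (rk7, low half of precomp) to estimate
 * the quotient of the division by the CRC polynomial; the second CLMUL
 * multiplies that estimate back by the polynomial (rk8, high half). The
 * XORs cancel the high-order terms, and the 32-bit remainder is read out
 * of the third dword of the result.
 */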
static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
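
/*
 * Example: xmm_shift_left(reg, 2) loads the selector at offset 14 of
 * crc_xmm_shift_tab, i.e. two 0xff bytes followed by indices 0x00-0x0d.
 * PSHUFB turns the 0xff entries into zero bytes, so byte 0 of reg lands
 * in byte 2 of the result and bytes 0-1 are zero-filled.
 */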
static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;
	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}

		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}
	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}
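
	/*
	 * The loop carries the whole running remainder in a single XMM
	 * register: every iteration folds that accumulator over the next
	 * 16-byte block, leaving at most 15 tail bytes for the partial
	 * block handling below.
	 */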
partial_bytes:
	if (likely(n < data_len)) {

		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;
		last16 = _mm_loadu_si128((const __m128i *)
			&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
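
		/*
		 * Tail trick: "a" receives the low bytes of fold that still
		 * need one more folding round, moved to the top of the
		 * register, while "b" merges the remaining bytes of fold
		 * with the final unprocessed data bytes selected from
		 * last16 by the blend. Folding "a" with rk1/rk2 and XORing
		 * "b" in therefore behaves like one extra 16-byte round
		 * over the partial block.
		 */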
	}

	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
static inline void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/** Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q =  0x11c581910LLU;
	p =  0x10811LLU;

	/** Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
	/** Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q =  0x1f7011640LLU;
	p =  0x1db710641LLU;

	/** Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
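
	/*
	 * _mm_setr_epi64() stores its first argument in the low 64 bits of
	 * the vector, so rk1, rk5 and the quotient constant sit in the low
	 * halves and rk2, rk6 and the polynomial in the high halves, which
	 * is the layout the CLMUL imm8 selectors above rely on.
	 */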
	/**
	 * Reset the register as following calculation may
	 * use other data types such as float, double, etc.
	 */
	_mm_empty();
}
static inline uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data,
	uint32_t data_len)
{
	/** return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}
static inline uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data,
	uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
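
/**
 * Usage sketch (illustrative only): one-time constant setup followed by
 * CRC computation over a caller-provided buffer.
 *
 * @code
 * uint8_t frame[64];
 * // ... fill frame ...
 * rte_net_crc_sse42_init();
 * uint32_t crc32 = rte_crc32_eth_sse42_handler(frame, sizeof(frame));
 * uint32_t crc16 = rte_crc16_ccitt_sse42_handler(frame, sizeof(frame));
 * @endcode
 */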
#endif /* _RTE_NET_CRC_SSE_H_ */