/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>

#include "net_crc.h"

#include <x86intrin.h>
/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2; /**< constants used by the main folding loop */
	__m128i rk5_rk6; /**< constants used by the 128 -> 64 bit reduction */
	__m128i rk7_rk8; /**< Barrett reduction constants (quotient, polynomial) */
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = MSB8(FOLD)
 *     F2 = LSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
	/* Carry-less multiply: upper half of fold by rk1 (imm 0x01),
	 * lower half of fold by rk2 (imm 0x10)
	 */
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	/* Fold both products and the next data block together */
	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
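
/*
 * For reference, a minimal scalar sketch of the 64 x 64 -> 128 bit
 * carry-less multiplication that _mm_clmulepi64_si128() performs in
 * hardware. The helper below (clmul64_scalar, a hypothetical name) is
 * illustrative only and is not used by the SIMD path.
 */
static __rte_unused void
clmul64_scalar(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;
	unsigned int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			/* XOR in "a" shifted left by i bits
			 * (polynomial addition, i.e. no carries)
			 */
			l ^= a << i;
			if (i != 0)
				h ^= a >> (64 - i);
		}
	}
	*lo = l;
	*hi = h;
}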

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 and rk8 precomputed constants
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
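
/*
 * Note: _mm_shuffle_epi8() zeroes a destination byte whenever the most
 * significant bit of the corresponding mask byte is set, so the 0xff
 * entries above clear bytes while the 0x00-0x0f entries select input
 * bytes. Reading the mask at offset (16 - num) therefore shifts the
 * register left by num bytes.
 */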

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
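
/*
 * Example (for illustration): with num == 4 the mask loaded from
 * crc_xmm_shift_tab + 12 is { 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, ...,
 * 0x0b }, which zeroes the four lowest result bytes and moves input
 * bytes 0-11 up to positions 4-15, i.e. reg << 32.
 */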

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/*
	 * Fold all data into a single 16 byte data block.
	 * Assumes: fold holds the first 16 bytes of data.
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barrett_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/* At least 32 bytes in the buffer */
	/* Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/* Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/* Reduction 128 -> 32; assumes fold holds the 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barrett_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
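
/*
 * A plain bitwise reference implementation of a reflected CRC, useful as
 * a sketch for cross-checking the PCLMULQDQ path above (pass the
 * reflected polynomial, e.g. 0xedb88320 for CRC32 Ethernet). The helper
 * name crc32_bitwise_ref is hypothetical; the function is not used here.
 */
static __rte_unused uint32_t
crc32_bitwise_ref(const uint8_t *data, uint32_t data_len, uint32_t crc,
	uint32_t poly)
{
	uint32_t i;
	int bit;

	for (i = 0; i < data_len; i++) {
		crc ^= data[i];
		/* Shift one bit at a time, XORing in the polynomial
		 * whenever the bit shifted out is set
		 */
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
	}
	return crc;
}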

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/* Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q =  0x11c581910LLU;
	p =  0x10811LLU;

	/* Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/* Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q =  0x1f7011640LLU;
	p =  0x1db710641LLU;

	/* Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/*
	 * Reset the MMX state touched by _mm_cvtsi64_m64() above, since
	 * following calculations may use the x87 FPU (float, double, etc.).
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/* Return the 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
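
/*
 * Usage sketch (hypothetical caller): after verifying CPU support for
 * SSE4.2 and PCLMULQDQ, e.g. via rte_cpu_get_flag_enabled(), initialize
 * the fold constants once and then hash buffers directly:
 *
 *	rte_net_crc_sse42_init();
 *	uint32_t crc = rte_crc32_eth_sse42_handler(buf, buf_len);
 */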