lib/librte_net/net_crc_sse.h
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2017 Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _RTE_NET_CRC_SSE_H_
#define _RTE_NET_CRC_SSE_H_

#include <string.h> /* memset(), memcpy() */

#include <rte_branch_prediction.h>
#include <rte_common.h> /* __rte_aligned, __rte_always_inline */

#include <x86intrin.h>
#include <cpuid.h>

#ifdef __cplusplus
extern "C" {
#endif

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
};
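
/*
 * Note (descriptive, not from the original): rk1_rk2 holds the constants
 * used by the 16 byte folding rounds, rk5_rk6 those used by the
 * 128 -> 64 bit reduction, and rk7_rk8 the Barrett constants for the final
 * 64 -> 32 bit reduction. All of them are filled in by
 * rte_net_crc_sse42_init() below.
 */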

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
                __m128i precomp,
                __m128i fold)
{
        __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
        __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

        return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
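
/*
 * Usage sketch (illustrative only): the main loop below consumes the input
 * one 16 byte block per round, e.g.
 *
 *      fold = _mm_loadu_si128((const __m128i *)data);
 *      fold = crcr32_folding_round(
 *              _mm_loadu_si128((const __m128i *)&data[16]),
 *              params->rk1_rk2, fold);
 *
 * Each round carry-less multiplies both 64 bit halves of the running fold
 * by the rk1/rk2 constants and XORs in the next data block.
 */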

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
        __m128i tmp0, tmp1, tmp2;

        /* 64b fold */
        tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
        tmp1 = _mm_srli_si128(data128, 8);
        tmp0 = _mm_xor_si128(tmp0, tmp1);

        /* 32b fold */
        tmp2 = _mm_slli_si128(tmp0, 4);
        tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

        return _mm_xor_si128(tmp1, tmp0);
}
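
/*
 * Descriptive note (not from the original): the low 64 bits are multiplied
 * by rk5 and XORed with the high 64 bits (64b fold); then the low 32 bits
 * of that intermediate, shifted into the upper half of the low 64 bit lane,
 * are multiplied by rk6 and XORed back in (32b fold), leaving a 64 bit
 * remainder-equivalent value ready for Barrett reduction.
 */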

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   precomputed constants rk7, rk8
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
        static const uint32_t mask1[4] __rte_aligned(16) = {
                0xffffffff, 0xffffffff, 0x00000000, 0x00000000
        };

        static const uint32_t mask2[4] __rte_aligned(16) = {
                0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
        };
        __m128i tmp0, tmp1, tmp2;

        tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

        tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
        tmp1 = _mm_xor_si128(tmp1, tmp0);
        tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

        tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
        tmp2 = _mm_xor_si128(tmp2, tmp1);
        tmp2 = _mm_xor_si128(tmp2, tmp0);

        return _mm_extract_epi32(tmp2, 2);
}
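
/*
 * Descriptive note (not from the original): this is the classic two-step
 * Barrett reduction for a reflected CRC. The first CLMUL (by rk7, the
 * quotient constant) estimates the quotient, the second CLMUL (by rk8, the
 * polynomial constant) multiplies it back, and the XORs leave the 32 bit
 * remainder, which lands in the third 32 bit lane of tmp2.
 */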

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
        const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

        return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
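
/*
 * Illustrative example (not part of the original): with num == 2 the mask
 * is loaded from crc_xmm_shift_tab + 14, i.e. {0xff, 0xff, 0x00, 0x01, ...},
 * so _mm_shuffle_epi8() zeroes the two lowest result bytes (0xff entries
 * select zero) and moves byte i of reg to byte i + 2 -- a 2 byte left shift:
 *
 *      __m128i r = xmm_shift_left(reg, 2);     // r == reg << 16 (bits)
 */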

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
        const uint8_t *data,
        uint32_t data_len,
        uint32_t crc,
        const struct crc_pclmulqdq_ctx *params)
{
        __m128i temp, fold, k;
        uint32_t n;

        /* Get CRC init value */
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

        /**
         * Fold all data into a single 16 byte data block.
         * Assumes: fold holds the first 16 bytes of data.
         */

        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
                        /* 16 bytes */
                        fold = _mm_loadu_si128((const __m128i *)data);
                        fold = _mm_xor_si128(fold, temp);
                        goto reduction_128_64;
                }

                if (unlikely(data_len < 16)) {
                        /* 0 to 15 bytes */
                        uint8_t buffer[16] __rte_aligned(16);

                        memset(buffer, 0, sizeof(buffer));
                        memcpy(buffer, data, data_len);

                        fold = _mm_load_si128((const __m128i *)buffer);
                        fold = _mm_xor_si128(fold, temp);
                        if (unlikely(data_len < 4)) {
                                fold = xmm_shift_left(fold, 8 - data_len);
                                goto barret_reduction;
                        }
                        fold = xmm_shift_left(fold, 16 - data_len);
                        goto reduction_128_64;
                }
                /* 17 to 31 bytes */
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
                k = params->rk1_rk2;
                goto partial_bytes;
        }

        /** At least 32 bytes in the buffer */
        /** Apply CRC initial value */
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);

        /** Main folding loop - the last 16 bytes are processed separately */
        k = params->rk1_rk2;
        for (n = 16; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }

partial_bytes:
        if (likely(n < data_len)) {

                const uint32_t mask3[4] __rte_aligned(16) = {
                        0x80808080, 0x80808080, 0x80808080, 0x80808080
                };

                const uint8_t shf_table[32] __rte_aligned(16) = {
                        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
                };

                __m128i last16, a, b;

                last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

                temp = _mm_loadu_si128((const __m128i *)
                        &shf_table[data_len & 15]);
                a = _mm_shuffle_epi8(fold, temp);

                temp = _mm_xor_si128(temp,
                        _mm_load_si128((const __m128i *)mask3));
                b = _mm_shuffle_epi8(fold, temp);
                b = _mm_blendv_epi8(b, last16, temp);

                /* k still holds rk1 and rk2 */
                temp = _mm_clmulepi64_si128(a, k, 0x01);
                fold = _mm_clmulepi64_si128(a, k, 0x10);

                fold = _mm_xor_si128(fold, temp);
                fold = _mm_xor_si128(fold, b);
        }

        /** Reduction 128 -> 32. Assumes: fold holds 128 bit folded data */
reduction_128_64:
        k = params->rk5_rk6;
        fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
        k = params->rk7_rk8;
        n = crcr32_reduce_64_to_32(fold, k);

        return n;
}
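
/*
 * Descriptive note (not from the original): the routine above runs in three
 * stages. The buffer is first folded 16 bytes at a time with rk1/rk2; any
 * 1 to 15 byte tail is merged via the shf_table shuffle, which splits the
 * current fold so that the final (possibly overlapping) 16 byte load of the
 * buffer can be blended in without reading past the end. The folded 128 bit
 * value is then reduced to 64 bits with rk5/rk6 and finally to the 32 bit
 * CRC with the Barrett constants rk7/rk8.
 */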

static inline void
rte_net_crc_sse42_init(void)
{
        uint64_t k1, k2, k5, k6;
        uint64_t p = 0, q = 0;

        /** Initialize CRC16 data */
        k1 = 0x189aeLLU;
        k2 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q =  0x11c581910LLU;
        p =  0x10811LLU;
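
        /*
         * Descriptive note (not from the original): 0x10811 is the
         * bit-reflected form of the CRC16-CCITT polynomial 0x11021, q is
         * the matching Barrett quotient constant, and k1/k2/k5/k6 are the
         * folding constants derived from that polynomial.
         */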

        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc16_ccitt_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc16_ccitt_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /** Initialize CRC32 data */
        k1 = 0xccaa009eLLU;
        k2 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q =  0x1f7011640LLU;
        p =  0x1db710641LLU;
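
        /*
         * Descriptive note (not from the original): 0x1db710641 is the
         * bit-reflected form of the Ethernet CRC32 polynomial 0x104c11db7,
         * with q the matching Barrett quotient constant and k1/k2/k5/k6
         * the folding constants derived from that polynomial.
         */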

        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc32_eth_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc32_eth_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /**
         * Reset the MMX state: the _mm_cvtsi64_m64() conversions above use
         * MMX registers, and the following calculations may use other data
         * types such as float and double.
         */
        _mm_empty();
}

static inline uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        /** return 16-bit CRC value */
        return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffff,
                &crc16_ccitt_pclmulqdq);
}
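
/*
 * Note (descriptive, not from the original): the CRC16 result is computed
 * with the same 32 bit folding engine; the cast truncates the inverted
 * result to the meaningful low 16 bits while keeping the uint32_t return
 * type shared with the CRC32 handler below.
 */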

static inline uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        return ~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffffffffUL,
                &crc32_eth_pclmulqdq);
}
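
/*
 * Usage sketch (illustrative only; `frame` and `frame_len` are
 * hypothetical):
 *
 *      rte_net_crc_sse42_init();
 *      uint32_t crc = rte_crc32_eth_sse42_handler(frame, frame_len);
 *
 * rte_net_crc_sse42_init() must run once before either handler so that the
 * pclmulqdq contexts above are populated. In DPDK these handlers are
 * normally reached through the public rte_net_crc API rather than called
 * directly.
 */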

#ifdef __cplusplus
}
#endif

#endif /* _RTE_NET_CRC_SSE_H_ */