/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
};
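
/*
 * The rk* fields hold folding and reduction constants derived from the CRC
 * polynomial, following the scheme in Intel's white paper "Fast CRC
 * Computation for Generic Polynomials Using PCLMULQDQ Instruction":
 * rk1/rk2 fold the data 128 bits at a time, rk5/rk6 reduce the folded
 * remainder to 64 bits, and rk7/rk8 hold the Barrett constants (the
 * quotient mu and the polynomial P) for the final 64->32 bit reduction.
 */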

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
                __m128i precomp,
                __m128i fold)
{
        __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
        __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

        return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
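
/*
 * Note on the PCLMULQDQ immediates above: bit 0 of imm8 selects which
 * 64-bit half of the first source operand is multiplied and bit 4 selects
 * the half of the second, so the 0x01/0x10 pair covers both halves of
 * fold, each multiplied by one of the two folding constants packed into
 * precomp.
 */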

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits of data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits of reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
        __m128i tmp0, tmp1, tmp2;

        /* 64b fold */
        tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
        tmp1 = _mm_srli_si128(data128, 8);
        tmp0 = _mm_xor_si128(tmp0, tmp1);

        /* 32b fold */
        tmp2 = _mm_slli_si128(tmp0, 4);
        tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

        return _mm_xor_si128(tmp1, tmp0);
}
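
/*
 * The reduction above folds the low 64 bits of data128 into the high 64
 * bits using rk5 (the "64b fold"), then folds the low 32 bits of that
 * result using rk6 (the "32b fold"), leaving at most 64 significant bits
 * for the Barrett step that follows.
 */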

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits of data to be reduced
 * @param precomp
 *   precomputed constants rk7, rk8
 *
 * @return
 *   reduced 32-bit data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
        static const uint32_t mask1[4] __rte_aligned(16) = {
                0xffffffff, 0xffffffff, 0x00000000, 0x00000000
        };

        static const uint32_t mask2[4] __rte_aligned(16) = {
                0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
        };
        __m128i tmp0, tmp1, tmp2;

        tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

        tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
        tmp1 = _mm_xor_si128(tmp1, tmp0);
        tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

        tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
        tmp2 = _mm_xor_si128(tmp2, tmp1);
        tmp2 = _mm_xor_si128(tmp2, tmp0);

        return _mm_extract_epi32(tmp2, 2);
}
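
/*
 * This is the classic two-multiply Barrett reduction for CRCs: the first
 * PCLMULQDQ multiplies by the precomputed quotient mu (rk7, the low half
 * of precomp) to estimate floor(R / P), the second multiplies that
 * estimate by the polynomial P (rk8, the high half), and the final XORs
 * leave R mod P, extracted from the appropriate 32-bit lane.
 */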

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts a 128-bit register left by the specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
        const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

        return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
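
/*
 * Illustration: for num = 3 the load starts at crc_xmm_shift_tab + 13,
 * yielding the PSHUFB mask {0xff, 0xff, 0xff, 0x00, 0x01, ..., 0x0c}.
 * Mask bytes with the top bit set (0xff) produce zero bytes, so the three
 * lowest result bytes are cleared and byte i of reg moves to position
 * i + 3, i.e. a left shift by 3 bytes.
 */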

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
        const uint8_t *data,
        uint32_t data_len,
        uint32_t crc,
        const struct crc_pclmulqdq_ctx *params)
{
        __m128i temp, fold, k;
        uint32_t n;

        /* Get CRC init value */
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

        /**
         * Folding all data into a single 16 byte data block
         * Assumes: fold holds first 16 bytes of data
         */

        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
                        /* 16 bytes */
                        fold = _mm_loadu_si128((const __m128i *)data);
                        fold = _mm_xor_si128(fold, temp);
                        goto reduction_128_64;
                }

                if (unlikely(data_len < 16)) {
                        /* 0 to 15 bytes */
                        uint8_t buffer[16] __rte_aligned(16);

                        memset(buffer, 0, sizeof(buffer));
                        memcpy(buffer, data, data_len);

                        fold = _mm_load_si128((const __m128i *)buffer);
                        fold = _mm_xor_si128(fold, temp);
                        if (unlikely(data_len < 4)) {
                                fold = xmm_shift_left(fold, 8 - data_len);
                                goto barret_reduction;
                        }
                        fold = xmm_shift_left(fold, 16 - data_len);
                        goto reduction_128_64;
                }
                /* 17 to 31 bytes */
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
                k = params->rk1_rk2;
                goto partial_bytes;
        }

        /** At least 32 bytes in the buffer */
        /** Apply CRC initial value */
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);

        /** Main folding loop - the last 16 bytes are processed separately */
        k = params->rk1_rk2;
        for (n = 16; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }

partial_bytes:
        if (likely(n < data_len)) {

                const uint32_t mask3[4] __rte_aligned(16) = {
                        0x80808080, 0x80808080, 0x80808080, 0x80808080
                };

                const uint8_t shf_table[32] __rte_aligned(16) = {
                        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
                };

                __m128i last16, a, b;

                last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

                temp = _mm_loadu_si128((const __m128i *)
                        &shf_table[data_len & 15]);
                a = _mm_shuffle_epi8(fold, temp);

                temp = _mm_xor_si128(temp,
                        _mm_load_si128((const __m128i *)mask3));
                b = _mm_shuffle_epi8(fold, temp);
                b = _mm_blendv_epi8(b, last16, temp);

                /* k holds rk1 and rk2 */
                temp = _mm_clmulepi64_si128(a, k, 0x01);
                fold = _mm_clmulepi64_si128(a, k, 0x10);

                fold = _mm_xor_si128(fold, temp);
                fold = _mm_xor_si128(fold, b);
        }

        /** Reduction 128 -> 32 Assumes: fold holds 128-bit folded data */
reduction_128_64:
        k = params->rk5_rk6;
        fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
        k = params->rk7_rk8;
        n = crcr32_reduce_64_to_32(fold, k);

        return n;
}
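
/*
 * Summary of the paths through crc32_eth_calc_pclmulqdq() by input size:
 * fewer than 4 bytes go straight to the Barrett reduction after an
 * 8 - data_len byte shift; 4 to 15 bytes are zero-padded, shifted and
 * reduced from 128 bits; exactly 16 bytes skip folding entirely; 17 to
 * 31 bytes take only the partial_bytes tail; 32 bytes or more run the
 * main 16-byte folding loop before the tail and the reductions.
 */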

void
rte_net_crc_sse42_init(void)
{
        uint64_t k1, k2, k5, k6;
        uint64_t p = 0, q = 0;

        /** Initialize CRC16 data */
        k1 = 0x189aeLLU;
        k2 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q =  0x11c581910LLU;
        p =  0x10811LLU;

        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc16_ccitt_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc16_ccitt_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /** Initialize CRC32 data */
        k1 = 0xccaa009eLLU;
        k2 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q =  0x1f7011640LLU;
        p =  0x1db710641LLU;

        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc32_eth_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc32_eth_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /**
         * Empty the MMX state left by the __m64 conversions above, as
         * the following calculations may use other data types such as
         * float and double.
         */
        _mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
        /** return 16-bit CRC value */
        return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffff,
                &crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
        return ~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffffffffUL,
                &crc32_eth_pclmulqdq);
}
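
/*
 * Illustrative self-test, not part of the upstream file: compiling with a
 * hypothetical NET_CRC_SSE_SELF_TEST macro checks both handlers against the
 * standard "123456789" check values (0xCBF43926 for CRC-32/Ethernet, 0x906E
 * for the reflected CRC-16/X-25 variant computed here). It assumes the
 * folding constants have been set up by rte_net_crc_sse42_init().
 */
#ifdef NET_CRC_SSE_SELF_TEST
#include <stdio.h>

int
net_crc_sse_self_test(void)
{
        /* 9-byte check string, no terminating NUL included */
        static const uint8_t check[9] = "123456789";
        uint32_t crc32, crc16;

        rte_net_crc_sse42_init();

        crc32 = rte_crc32_eth_sse42_handler(check, sizeof(check));
        crc16 = rte_crc16_ccitt_sse42_handler(check, sizeof(check));

        printf("crc32: 0x%08x (expect 0xcbf43926)\n", crc32);
        printf("crc16: 0x%04x (expect 0x906e)\n", crc16);

        return (crc32 == 0xcbf43926 && crc16 == 0x906e) ? 0 : -1;
}
#endif /* NET_CRC_SSE_SELF_TEST */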