examples/pipeline: fix build
[dpdk.git] / lib / net / net_crc_avx512.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4
5
6 #include <rte_common.h>
7
8 #include "net_crc.h"
9
10 #include <x86intrin.h>
11
/*
 * VPCLMULQDQ CRC computation context.
 *
 * Groups every carry-less multiplication constant consumed by the AVX512
 * CRC routines of this file. One instance is kept per supported CRC.
 */
struct crc_vpclmulqdq_ctx {
	/* constants for the main folding loop (4 x 64 bytes per round) */
	__m512i rk1_rk2;
	/* constants for the 256-byte to 128-byte fold */
	__m512i rk3_rk4;
	/* per-lane constants multiplied with fold0 in crc32_fold_128() */
	__m512i fold_7x128b;
	/* per-lane constants multiplied with fold1 in crc32_fold_128() */
	__m512i fold_3x128b;
	/* constants used by done_128() */
	__m128i rk5_rk6;
	/* constants used by barrett_reduction() */
	__m128i rk7_rk8;
	/* constants for a single 16-byte fold step */
	__m128i fold_1x128b;
};
22
23 static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
24 static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);
25
/*
 * Byte count (0..16) to load mask: entry i has its i lowest bits set,
 * i.e. (1 << i) - 1. The value is used as the mask of a byte-granular
 * masked 16-byte load, so only the first i bytes are read.
 * The table is never written, hence const.
 */
static const uint16_t byte_len_to_mask_table[17] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f, 0x001f, 0x003f, 0x007f,
	0x00ff, 0x01ff, 0x03ff, 0x07ff,
	0x0fff, 0x1fff, 0x3fff, 0x7fff,
	0xffff
};
32
33 static const uint8_t shf_table[32] __rte_aligned(16) = {
34         0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
35         0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
36         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
37         0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
38 };
39
40 static const uint32_t mask[4] __rte_aligned(16) = {
41         0xffffffff, 0xffffffff, 0x00000000, 0x00000000
42 };
43
44 static const uint32_t mask2[4] __rte_aligned(16) = {
45         0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
46 };
47
48 static __rte_always_inline __m512i
49 crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
50 {
51         __m512i tmp0, tmp1;
52
53         tmp0 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
54         tmp1 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);
55
56         return _mm512_ternarylogic_epi64(tmp0, tmp1, data_block, 0x96);
57 }
58
59 static __rte_always_inline __m128i
60 crc32_fold_128(__m512i fold0, __m512i fold1,
61         const struct crc_vpclmulqdq_ctx *params)
62 {
63         __m128i res, res2;
64         __m256i a;
65         __m512i tmp0, tmp1, tmp2, tmp3;
66         __m512i tmp4;
67
68         tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
69         tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);
70
71         res = _mm512_extracti64x2_epi64(fold1, 3);
72         tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);
73
74         tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
75         tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);
76
77         tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
78         tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);
79
80         tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);
81
82         a = _mm256_xor_si256(*(__m256i *)&tmp1, *(__m256i *)&tmp0);
83         res = _mm256_extracti64x2_epi64(a, 1);
84         res2 = _mm_xor_si128(res, *(__m128i *)&a);
85
86         return res2;
87 }
88
89 static __rte_always_inline __m128i
90 last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
91         const struct crc_vpclmulqdq_ctx *params)
92 {
93         uint32_t offset;
94         __m128i res2, res3, res4, pshufb_shf;
95
96         const uint32_t mask3[4] __rte_aligned(16) = {
97                    0x80808080, 0x80808080, 0x80808080, 0x80808080
98         };
99
100         res2 = res;
101         offset = data_len - n;
102         res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);
103
104         pshufb_shf = _mm_loadu_si128((const __m128i *)
105                         (shf_table + (data_len-n)));
106
107         res = _mm_shuffle_epi8(res, pshufb_shf);
108         pshufb_shf = _mm_xor_si128(pshufb_shf,
109                         _mm_load_si128((const __m128i *) mask3));
110         res2 = _mm_shuffle_epi8(res2, pshufb_shf);
111
112         res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);
113
114         res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
115         res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
116         res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);
117
118         return res;
119 }
120
121 static __rte_always_inline __m128i
122 done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
123 {
124         __m128i res1;
125
126         res1 = res;
127
128         res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
129         res1 = _mm_srli_si128(res1, 8);
130         res = _mm_xor_si128(res, res1);
131
132         res1 = res;
133         res = _mm_slli_si128(res, 4);
134         res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
135         res = _mm_xor_si128(res, res1);
136
137         return res;
138 }
139
140 static __rte_always_inline uint32_t
141 barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
142 {
143         __m128i tmp0, tmp1;
144
145         data64 =  _mm_and_si128(data64, *(const __m128i *)mask2);
146         tmp0 = data64;
147         tmp1 = data64;
148
149         data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
150         data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
151                         0x28);
152
153         tmp1 = data64;
154         data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
155         data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);
156
157         return _mm_extract_epi32(data64, 2);
158 }
159
160 static __rte_always_inline void
161 reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
162         const struct crc_vpclmulqdq_ctx *params)
163 {
164         __m128i tmp, tmp1;
165
166         tmp = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x1);
167         *fold = _mm_clmulepi64_si128(*fold, params->fold_1x128b, 0x10);
168         *fold = _mm_xor_si128(*fold, tmp);
169         tmp1 = _mm_loadu_si128((const __m128i *)&data[*n]);
170         *fold = _mm_xor_si128(*fold, tmp1);
171         *n += 16;
172         *len -= 16;
173 }
174
175 static __rte_always_inline uint32_t
176 crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
177         const struct crc_vpclmulqdq_ctx *params)
178 {
179         __m128i res, d, b;
180         __m512i temp, k;
181         __m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
182         __m512i fold0, fold1, fold2, fold3;
183         __mmask16 mask;
184         uint32_t n = 0;
185         int reduction = 0;
186
187         /* Get CRC init value */
188         b = _mm_cvtsi32_si128(crc);
189         temp = _mm512_castsi128_si512(b);
190
191         if (data_len > 255) {
192                 fold0 = _mm512_loadu_si512((const __m512i *)data);
193                 fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
194                 fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
195                 fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
196                 fold0 = _mm512_xor_si512(fold0, temp);
197
198                 /* Main folding loop */
199                 k = params->rk1_rk2;
200                 for (n = 256; (n + 256) <= data_len; n += 256) {
201                         qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
202                         qw1 = _mm512_loadu_si512((const __m512i *)
203                                         &(data[n+64]));
204                         qw2 = _mm512_loadu_si512((const __m512i *)
205                                         &(data[n+128]));
206                         qw3 = _mm512_loadu_si512((const __m512i *)
207                                         &(data[n+192]));
208                         fold0 = crcr32_folding_round(qw0, k, fold0);
209                         fold1 = crcr32_folding_round(qw1, k, fold1);
210                         fold2 = crcr32_folding_round(qw2, k, fold2);
211                         fold3 = crcr32_folding_round(qw3, k, fold3);
212                 }
213
214                 /* 256 to 128 fold */
215                 k = params->rk3_rk4;
216                 fold0 = crcr32_folding_round(fold2, k, fold0);
217                 fold1 = crcr32_folding_round(fold3, k, fold1);
218
219                 res = crc32_fold_128(fold0, fold1, params);
220
221                 reduction = 240 - ((n+256)-data_len);
222
223                 while (reduction > 0)
224                         reduction_loop(&res, &reduction, data, &n,
225                                         params);
226
227                 reduction += 16;
228
229                 if (n != data_len)
230                         res = last_two_xmm(data, data_len, n, res,
231                                         params);
232         } else {
233                 if (data_len > 31) {
234                         res = _mm_cvtsi32_si128(crc);
235                         d = _mm_loadu_si128((const __m128i *)data);
236                         res = _mm_xor_si128(res, d);
237                         n += 16;
238
239                         reduction = 240 - ((n+256)-data_len);
240
241                         while (reduction > 0)
242                                 reduction_loop(&res, &reduction, data, &n,
243                                                 params);
244
245                         if (n != data_len)
246                                 res = last_two_xmm(data, data_len, n, res,
247                                                 params);
248                 } else if (data_len > 16) {
249                         res = _mm_cvtsi32_si128(crc);
250                         d = _mm_loadu_si128((const __m128i *)data);
251                         res = _mm_xor_si128(res, d);
252                         n += 16;
253
254                         if (n != data_len)
255                                 res = last_two_xmm(data, data_len, n, res,
256                                                 params);
257                 } else if (data_len == 16) {
258                         res = _mm_cvtsi32_si128(crc);
259                         d = _mm_loadu_si128((const __m128i *)data);
260                         res = _mm_xor_si128(res, d);
261                 } else {
262                         res = _mm_cvtsi32_si128(crc);
263                         mask = byte_len_to_mask_table[data_len];
264                         d = _mm_maskz_loadu_epi8(mask, data);
265                         res = _mm_xor_si128(res, d);
266
267                         if (data_len > 3) {
268                                 d = _mm_loadu_si128((const __m128i *)
269                                                 &shf_table[data_len]);
270                                 res = _mm_shuffle_epi8(res, d);
271                         } else if (data_len > 2) {
272                                 res = _mm_slli_si128(res, 5);
273                                 goto do_barrett_reduction;
274                         } else if (data_len > 1) {
275                                 res = _mm_slli_si128(res, 6);
276                                 goto do_barrett_reduction;
277                         } else if (data_len > 0) {
278                                 res = _mm_slli_si128(res, 7);
279                                 goto do_barrett_reduction;
280                         } else {
281                                 /* zero length case */
282                                 return crc;
283                         }
284                 }
285         }
286
287         res = done_128(res, params);
288
289 do_barrett_reduction:
290         n = barrett_reduction(res, params);
291
292         return n;
293 }
294
295 static void
296 crc32_load_init_constants(void)
297 {
298         __m128i a;
299         /* fold constants */
300         uint64_t c0 = 0x00000000e95c1271;
301         uint64_t c1 = 0x00000000ce3371cb;
302         uint64_t c2 = 0x00000000910eeec1;
303         uint64_t c3 = 0x0000000033fff533;
304         uint64_t c4 = 0x000000000cbec0ed;
305         uint64_t c5 = 0x0000000031f8303f;
306         uint64_t c6 = 0x0000000057c54819;
307         uint64_t c7 = 0x00000000df068dc2;
308         uint64_t c8 = 0x00000000ae0b5394;
309         uint64_t c9 = 0x000000001c279815;
310         uint64_t c10 = 0x000000001d9513d7;
311         uint64_t c11 = 0x000000008f352d95;
312         uint64_t c12 = 0x00000000af449247;
313         uint64_t c13 = 0x000000003db1ecdc;
314         uint64_t c14 = 0x0000000081256527;
315         uint64_t c15 = 0x00000000f1da05aa;
316         uint64_t c16 = 0x00000000ccaa009e;
317         uint64_t c17 = 0x00000000ae689191;
318         uint64_t c18 = 0x00000000ccaa009e;
319         uint64_t c19 = 0x00000000b8bc6765;
320         uint64_t c20 = 0x00000001f7011640;
321         uint64_t c21 = 0x00000001db710640;
322
323         a = _mm_set_epi64x(c1, c0);
324         crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);
325
326         a = _mm_set_epi64x(c3, c2);
327         crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);
328
329         crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
330                         c9, c10, c11);
331         crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
332                         c16, c17, 0, 0);
333         crc32_eth.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
334                         _mm_cvtsi64_m64(c17));
335
336         crc32_eth.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
337                         _mm_cvtsi64_m64(c19));
338         crc32_eth.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
339                         _mm_cvtsi64_m64(c21));
340 }
341
342 static void
343 crc16_load_init_constants(void)
344 {
345         __m128i a;
346         /* fold constants */
347         uint64_t c0 = 0x0000000000009a19;
348         uint64_t c1 = 0x0000000000002df8;
349         uint64_t c2 = 0x00000000000068af;
350         uint64_t c3 = 0x000000000000b6c9;
351         uint64_t c4 = 0x000000000000c64f;
352         uint64_t c5 = 0x000000000000cd95;
353         uint64_t c6 = 0x000000000000d341;
354         uint64_t c7 = 0x000000000000b8f2;
355         uint64_t c8 = 0x0000000000000842;
356         uint64_t c9 = 0x000000000000b072;
357         uint64_t c10 = 0x00000000000047e3;
358         uint64_t c11 = 0x000000000000922d;
359         uint64_t c12 = 0x0000000000000e3a;
360         uint64_t c13 = 0x0000000000004d7a;
361         uint64_t c14 = 0x0000000000005b44;
362         uint64_t c15 = 0x0000000000007762;
363         uint64_t c16 = 0x00000000000081bf;
364         uint64_t c17 = 0x0000000000008e10;
365         uint64_t c18 = 0x00000000000081bf;
366         uint64_t c19 = 0x0000000000001cbb;
367         uint64_t c20 = 0x000000011c581910;
368         uint64_t c21 = 0x0000000000010810;
369
370         a = _mm_set_epi64x(c1, c0);
371         crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);
372
373         a = _mm_set_epi64x(c3, c2);
374         crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);
375
376         crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
377                         c9, c10, c11);
378         crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
379                         c16, c17, 0, 0);
380         crc16_ccitt.fold_1x128b = _mm_setr_epi64(_mm_cvtsi64_m64(c16),
381                         _mm_cvtsi64_m64(c17));
382
383         crc16_ccitt.rk5_rk6 = _mm_setr_epi64(_mm_cvtsi64_m64(c18),
384                         _mm_cvtsi64_m64(c19));
385         crc16_ccitt.rk7_rk8 = _mm_setr_epi64(_mm_cvtsi64_m64(c20),
386                         _mm_cvtsi64_m64(c21));
387 }
388
/*
 * Prepare the AVX512 CRC implementation: fills the constant sets of both
 * supported CRCs. Must run before either handler is used.
 */
void
rte_net_crc_avx512_init(void)
{
	crc32_load_init_constants();
	crc16_load_init_constants();

	/*
	 * Clear the MMX state in case the constant setup touched it, so
	 * that later code using other data types (float, double, ...) is
	 * not disturbed.
	 */
	_mm_empty();
}
401
402 uint32_t
403 rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
404 {
405         /* return 16-bit CRC value */
406         return (uint16_t)~crc32_eth_calc_vpclmulqdq(data,
407                 data_len,
408                 0xffff,
409                 &crc16_ccitt);
410 }
411
412 uint32_t
413 rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
414 {
415         /* return 32-bit CRC value */
416         return ~crc32_eth_calc_vpclmulqdq(data,
417                 data_len,
418                 0xffffffffUL,
419                 &crc32_eth);
420 }