net: add CRC AVX512 implementation

author Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>

Fri, 9 Oct 2020 13:50:45 +0000 (14:50 +0100)

committer David Marchand <david.marchand@redhat.com>

Tue, 13 Oct 2020 17:26:15 +0000 (19:26 +0200)
author Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
Fri, 9 Oct 2020 13:50:45 +0000 (14:50 +0100)
committer David Marchand <david.marchand@redhat.com>
Tue, 13 Oct 2020 17:26:15 +0000 (19:26 +0200)
diff --git a/MAINTAINERS b/MAINTAINERS

index 0796190..cdaf0b0 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1245,6 +1245,7 @@ Packet CRC
  M: Jasvinder Singh <jasvinder.singh@intel.com>
  F: lib/librte_net/net_crc.h
  F: lib/librte_net/rte_net_crc*
+F: lib/librte_net/net_crc_avx512.c
  F: lib/librte_net/net_crc_sse.c
  F: app/test/test_crc.c
  
diff --git a/app/test/test_crc.c b/app/test/test_crc.c

index f8a74e0..bf1d344 100644 (file)
--- a/app/test/test_crc.c
+++ b/app/test/test_crc.c
@@ -1,5 +1,5 @@
  /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
   */
  
  #include "test.h"
@@ -149,6 +149,15 @@ test_crc(void)
                 return ret;
         }
  
+       /* set CRC avx512 mode */
+       rte_net_crc_set_alg(RTE_NET_CRC_AVX512);
+
+       ret = test_crc_calc();
+       if (ret < 0) {
+               printf("test crc (x86_64 AVX512): failed (%d)\n", ret);
+               return ret;
+       }
+
         /* set CRC neon mode */
         rte_net_crc_set_alg(RTE_NET_CRC_NEON);
  
diff --git a/config/x86/meson.build b/config/x86/meson.build

index fea4d54..172b72b 100644 (file)
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -1,5 +1,5 @@
  # SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017-2019 Intel Corporation
+# Copyright(c) 2017-2020 Intel Corporation
  
  # get binutils version for the workaround of Bug 97
  if not is_windows
@@ -23,7 +23,9 @@ endforeach
  
  optional_flags = ['AES', 'PCLMUL',
                 'AVX', 'AVX2', 'AVX512F',
-               'RDRND', 'RDSEED']
+               'RDRND', 'RDSEED',
+               'AVX512BW', 'AVX512DQ',
+               'AVX512VL', 'VPCLMULQDQ']
  foreach f:optional_flags
         if cc.get_define('__@0@__'.format(f), args: machine_args) == '1'
                 if f == 'PCLMUL' # special case flags with different defines
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst

index 2ccf35f..71665c1 100644 (file)
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -65,6 +65,8 @@ New Features
  * **Updated CRC modules of the net library.**
  
    * Added runtime selection of the optimal architecture-specific CRC path.
+  * Added optimized implementations of CRC32-Ethernet and CRC16-CCITT
+    using the AVX512 and VPCLMULQDQ instruction sets.
  
  * **Added the FEC API, for a generic FEC query and config.**
  
diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build

index fa439b9..6c96b36 100644 (file)
--- a/lib/librte_net/meson.build
+++ b/lib/librte_net/meson.build
@@ -24,18 +24,62 @@ deps += ['mbuf']
  if dpdk_conf.has('RTE_ARCH_X86_64')
         net_crc_sse42_cpu_support = (
                 cc.get_define('__PCLMUL__', args: machine_args) != '')
+       net_crc_avx512_cpu_support = (
+               cc.get_define('__AVX512F__', args: machine_args) != '' and
+               cc.get_define('__AVX512BW__', args: machine_args) != '' and
+               cc.get_define('__AVX512DQ__', args: machine_args) != '' and
+               cc.get_define('__AVX512VL__', args: machine_args) != '' and
+               cc.get_define('__VPCLMULQDQ__', args: machine_args) != '')
+
         net_crc_sse42_cc_support = (
                 cc.has_argument('-mpclmul') and cc.has_argument('-maes'))
+       net_crc_avx512_cc_support = (
+               not machine_args.contains('-mno-avx512f') and
+               cc.has_argument('-mavx512f') and
+               cc.has_argument('-mavx512bw') and
+               cc.has_argument('-mavx512dq') and
+               cc.has_argument('-mavx512vl') and
+               cc.has_argument('-mvpclmulqdq') and
+               cc.has_argument('-mavx2') and
+               cc.has_argument('-mavx'))
  
         build_static_net_crc_sse42_lib = 0
+       build_static_net_crc_avx512_lib = 0
  
         if net_crc_sse42_cpu_support == true
                 sources += files('net_crc_sse.c')
                 cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+               if net_crc_avx512_cpu_support == true
+                       sources += files('net_crc_avx512.c')
+                       cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT']
+               elif net_crc_avx512_cc_support == true
+                       build_static_net_crc_avx512_lib = 1
+                       net_crc_avx512_lib_cflags = ['-mavx512f',
+                                                       '-mavx512bw',
+                                                       '-mavx512dq',
+                                                       '-mavx512vl',
+                                                       '-mvpclmulqdq',
+                                                       '-mavx2',
+                                                       '-mavx']
+                       cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT']
+               endif
         elif net_crc_sse42_cc_support == true
                 build_static_net_crc_sse42_lib = 1
                 net_crc_sse42_lib_cflags = ['-mpclmul', '-maes']
                 cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+               if net_crc_avx512_cc_support == true
+                       build_static_net_crc_avx512_lib = 1
+                       net_crc_avx512_lib_cflags = ['-mpclmul',
+                                                       '-maes',
+                                                       '-mavx512f',
+                                                       '-mavx512bw',
+                                                       '-mavx512dq',
+                                                       '-mavx512vl',
+                                                       '-mvpclmulqdq',
+                                                       '-mavx2',
+                                                       '-mavx']
+                       cflags += ['-DCC_X86_64_AVX512_VPCLMULQDQ_SUPPORT']
+               endif
         endif
  
         if build_static_net_crc_sse42_lib == 1
@@ -47,6 +91,17 @@ if dpdk_conf.has('RTE_ARCH_X86_64')
                                                 net_crc_sse42_lib_cflags])
                 objs += net_crc_sse42_lib.extract_objects('net_crc_sse.c')
         endif
+
+       if build_static_net_crc_avx512_lib == 1
+               net_crc_avx512_lib = static_library(
+                                       'net_crc_avx512_lib',
+                                       'net_crc_avx512.c',
+                                       dependencies: static_rte_eal,
+                                       c_args: [cflags,
+                                               net_crc_avx512_lib_cflags])
+               objs += net_crc_avx512_lib.extract_objects('net_crc_avx512.c')
+       endif
+
  elif (dpdk_conf.has('RTE_ARCH_ARM64') and
                 cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != '')
         sources += files('net_crc_neon.c')
diff --git a/lib/librte_net/net_crc.h b/lib/librte_net/net_crc.h

index a1578a5..7a74d54 100644 (file)
--- a/lib/librte_net/net_crc.h
+++ b/lib/librte_net/net_crc.h
@@ -20,6 +20,17 @@ rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len);
  uint32_t
  rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len);
  
+/* AVX512 */
+
+void
+rte_net_crc_avx512_init(void);
+
+uint32_t
+rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len);
+
  /* NEON */
  
  void
diff --git a/lib/librte_net/net_crc_avx512.c b/lib/librte_net/net_crc_avx512.c

new file mode 100644 (file)

index 0000000..3740fe3
--- /dev/null
+++ b/lib/librte_net/net_crc_avx512.c
@@ -0,0 +1,423 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+#include <rte_cpuflags.h>
+
+#include "net_crc.h"
+
+#include <x86intrin.h>
+
+/*
+ * VPCLMULQDQ CRC computation context structure.
+ *
+ * Holds the carry-less multiplication constants for one CRC variant. One
+ * instance per supported CRC is filled in by the *_load_init_constants()
+ * routines of this file.
+ */
+struct crc_vpclmulqdq_ctx {
+       __m512i rk1_rk2; /* main 256-byte folding loop */
+       __m512i rk3_rk4; /* "256 to 128 fold" step */
+       __m512i fold_7x128b; /* per-lane constants for fold0, crc32_fold_128() */
+       __m512i fold_3x128b; /* per-lane constants for fold1, crc32_fold_128() */
+       __m128i rk5_rk6; /* done_128() */
+       __m128i rk7_rk8; /* barrett_reduction() */
+       __m128i fold_1x128b; /* 16-byte folds: reduction_loop(), last_two_xmm() */
+};
+
+/* Per-CRC constant sets, populated by rte_net_crc_avx512_init(). */
+static struct crc_vpclmulqdq_ctx crc32_eth __rte_aligned(64);
+static struct crc_vpclmulqdq_ctx crc16_ccitt __rte_aligned(64);
+
+/*
+ * Byte count to load-mask lookup: entry i has the low i bits set
+ * (0 <= i <= 16). It supplies the __mmask16 for the masked load of inputs
+ * shorter than 16 bytes. The table is never written, so it is const like
+ * the other lookup tables in this file.
+ */
+static const uint16_t byte_len_to_mask_table[] = {
+       0x0000, 0x0001, 0x0003, 0x0007,
+       0x000f, 0x001f, 0x003f, 0x007f,
+       0x00ff, 0x01ff, 0x03ff, 0x07ff,
+       0x0fff, 0x1fff, 0x3fff, 0x7fff,
+       0xffff};
+
+/*
+ * Byte-shuffle control values. A 16-byte window is loaded from this table
+ * at a byte offset (see last_two_xmm() and the short-input path of
+ * crc32_eth_calc_vpclmulqdq()) and handed to _mm_shuffle_epi8(). A control
+ * byte with its top bit set (0x8X) makes the shuffle write zero to that
+ * destination byte.
+ */
+static const uint8_t shf_table[32] __rte_aligned(16) = {
+       0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+       0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+       0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+       0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+
+/* Selects the low 64 bits of a 128-bit value; used by barrett_reduction(). */
+static const uint32_t mask[4] __rte_aligned(16) = {
+       0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+};
+
+/* Clears the low 32 bits of a 128-bit value; used by barrett_reduction(). */
+static const uint32_t mask2[4] __rte_aligned(16) = {
+       0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+/*
+ * One folding step on a 512-bit accumulator: carry-less multiply each
+ * 128-bit lane of fold by the constants in precomp (both qword pairings)
+ * and XOR the two products with data_block.
+ */
+static __rte_always_inline __m512i
+crcr32_folding_round(__m512i data_block, __m512i precomp, __m512i fold)
+{
+       const __m512i prod_01 = _mm512_clmulepi64_epi128(fold, precomp, 0x01);
+       const __m512i prod_10 = _mm512_clmulepi64_epi128(fold, precomp, 0x10);
+
+       /* imm8 0x96 is the three-input XOR truth table. */
+       return _mm512_ternarylogic_epi64(prod_01, prod_10, data_block, 0x96);
+}
+
+/*
+ * Reduce the two 512-bit accumulators fold0 and fold1 to one 128-bit value.
+ *
+ * Each 128-bit lane is carry-less multiplied by its per-lane constant from
+ * params->fold_7x128b (fold0) or params->fold_3x128b (fold1). The top lane
+ * of fold1 is added unmultiplied. All lanes are then XORed together.
+ */
+static __rte_always_inline __m128i
+crc32_fold_128(__m512i fold0, __m512i fold1,
+       const struct crc_vpclmulqdq_ctx *params)
+{
+       __m128i res, res2;
+       __m256i a;
+       __m512i tmp0, tmp1, tmp2, tmp3;
+       __m512i tmp4;
+
+       tmp0 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x01);
+       tmp1 = _mm512_clmulepi64_epi128(fold0, params->fold_7x128b, 0x10);
+
+       /* Put lane 3 of fold1 into lane 0 of tmp4; other lanes are zero. */
+       res = _mm512_extracti64x2_epi64(fold1, 3);
+       tmp4 = _mm512_maskz_broadcast_i32x4(0xF, res);
+
+       tmp2 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x01);
+       tmp3 = _mm512_clmulepi64_epi128(fold1, params->fold_3x128b, 0x10);
+
+       /* imm8 0x96 is the three-input XOR truth table. */
+       tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp1, tmp2, 0x96);
+       tmp0 = _mm512_ternarylogic_epi64(tmp0, tmp3, tmp4, 0x96);
+
+       /* Swap the 256-bit halves so the next XOR folds 512 bits to 256. */
+       tmp1 = _mm512_shuffle_i64x2(tmp0, tmp0, 0x4e);
+
+       /*
+        * Narrow with the cast intrinsics (low 256/128 bits) instead of
+        * dereferencing pointer casts of the wider vectors.
+        */
+       a = _mm256_xor_si256(_mm512_castsi512_si256(tmp1),
+                       _mm512_castsi512_si256(tmp0));
+       res = _mm256_extracti64x2_epi64(a, 1);
+       res2 = _mm_xor_si128(res, _mm256_castsi256_si128(a));
+
+       return res2;
+}
+
+/*
+ * Fold the trailing (data_len - n) bytes of the buffer into the 128-bit
+ * accumulator res and return the updated accumulator.
+ *
+ * The tail is read as the last 16 bytes of the buffer (ending at
+ * data[data_len - 1]), so data_len must be at least 16.
+ * NOTE(review): the 16-byte window load from shf_table needs
+ * data_len - n <= 16; the callers in this file appear to guarantee that,
+ * confirm before adding new callers.
+ */
+static __rte_always_inline __m128i
+last_two_xmm(const uint8_t *data, uint32_t data_len, uint32_t n, __m128i res,
+       const struct crc_vpclmulqdq_ctx *params)
+{
+       uint32_t offset;
+       __m128i res2, res3, res4, pshufb_shf;
+
+       const uint32_t mask3[4] __rte_aligned(16) = {
+                  0x80808080, 0x80808080, 0x80808080, 0x80808080
+       };
+
+       res2 = res;
+       offset = data_len - n;
+       /* n + offset - 16 == data_len - 16: the final 16 bytes of the buffer */
+       res3 = _mm_loadu_si128((const __m128i *)&data[n+offset-16]);
+
+       pshufb_shf = _mm_loadu_si128((const __m128i *)
+                       (shf_table + (data_len-n)));
+
+       res = _mm_shuffle_epi8(res, pshufb_shf);
+       /* flip the top bit of every control byte for the second shuffle */
+       pshufb_shf = _mm_xor_si128(pshufb_shf,
+                       _mm_load_si128((const __m128i *) mask3));
+       res2 = _mm_shuffle_epi8(res2, pshufb_shf);
+
+       /* take bytes from res3 wherever the control byte has its top bit set */
+       res2 = _mm_blendv_epi8(res2, res3, pshufb_shf);
+
+       res4 = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x01);
+       res = _mm_clmulepi64_si128(res, params->fold_1x128b, 0x10);
+       /* imm8 0x96: three-input XOR */
+       res = _mm_ternarylogic_epi64(res, res2, res4, 0x96);
+
+       return res;
+}
+
+/*
+ * Apply the two final folding multiplications with params->rk5_rk6 to the
+ * 128-bit accumulator. The result is the input of barrett_reduction().
+ */
+static __rte_always_inline __m128i
+done_128(__m128i res, const struct crc_vpclmulqdq_ctx *params)
+{
+       __m128i res1;
+
+       res1 = res;
+
+       /* imm8 0x0: low qword of res times low qword of rk5_rk6 */
+       res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x0);
+       res1 = _mm_srli_si128(res1, 8);
+       res = _mm_xor_si128(res, res1);
+
+       res1 = res;
+       res = _mm_slli_si128(res, 4);
+       /* imm8 0x10: low qword of res times high qword of rk5_rk6 */
+       res = _mm_clmulepi64_si128(res, params->rk5_rk6, 0x10);
+       res = _mm_xor_si128(res, res1);
+
+       return res;
+}
+
+/*
+ * Final reduction step: two carry-less multiplications by params->rk7_rk8
+ * turn the folded value data64 into the CRC, which is returned from 32-bit
+ * element 2 of the result vector.
+ */
+static __rte_always_inline uint32_t
+barrett_reduction(__m128i data64, const struct crc_vpclmulqdq_ctx *params)
+{
+       __m128i tmp0, tmp1;
+
+       /* mask2 clears the low 32 bits */
+       data64 =  _mm_and_si128(data64, *(const __m128i *)mask2);
+       tmp0 = data64;
+       tmp1 = data64;
+
+       data64 = _mm_clmulepi64_si128(tmp0, params->rk7_rk8, 0x0);
+       /* imm8 0x28: (data64 XOR tmp1) AND mask */
+       data64 = _mm_ternarylogic_epi64(data64, tmp1, *(const __m128i *)mask,
+                       0x28);
+
+       tmp1 = data64;
+       data64 = _mm_clmulepi64_si128(data64, params->rk7_rk8, 0x10);
+       /* imm8 0x96: three-input XOR */
+       data64 = _mm_ternarylogic_epi64(data64, tmp1, tmp0, 0x96);
+
+       return _mm_extract_epi32(data64, 2);
+}
+
+/*
+ * Fold the next 16 input bytes, read at data[*n], into the 128-bit
+ * accumulator *fold using params->fold_1x128b. Advances *n by 16 and
+ * decrements *len by 16.
+ */
+static __rte_always_inline void
+reduction_loop(__m128i *fold, int *len, const uint8_t *data, uint32_t *n,
+       const struct crc_vpclmulqdq_ctx *params)
+{
+       const __m128i acc = *fold;
+       const __m128i block = _mm_loadu_si128((const __m128i *)&data[*n]);
+       const __m128i prod_01 = _mm_clmulepi64_si128(acc,
+                       params->fold_1x128b, 0x1);
+       const __m128i prod_10 = _mm_clmulepi64_si128(acc,
+                       params->fold_1x128b, 0x10);
+
+       *fold = _mm_xor_si128(_mm_xor_si128(prod_10, prod_01), block);
+       *n += 16;
+       *len -= 16;
+}
+
+/*
+ * Compute the CRC of data[0, data_len) with the constants in params,
+ * starting from the initial value crc.
+ *
+ * Returns the raw value produced by barrett_reduction(); the public
+ * handlers invert it. A zero-length input returns crc unchanged.
+ *
+ * Paths by length:
+ *   > 255   : four 512-bit accumulators, 256 bytes per iteration, then
+ *             folded to 128 bits and continued 16 bytes at a time
+ *   32..255 : 16 bytes at a time
+ *   17..31  : one 16-byte block plus the tail
+ *   16      : a single 16-byte block
+ *   < 16    : masked load; 1..3 bytes go straight to Barrett reduction
+ * A tail of 1..16 bytes left after the 16-byte steps is handled by
+ * last_two_xmm().
+ */
+static __rte_always_inline uint32_t
+crc32_eth_calc_vpclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc,
+       const struct crc_vpclmulqdq_ctx *params)
+{
+       __m128i res, d, b;
+       __m512i temp, k;
+       __m512i qw0 = _mm512_set1_epi64(0), qw1, qw2, qw3;
+       __m512i fold0, fold1, fold2, fold3;
+       __mmask16 mask;
+       uint32_t n = 0;
+       int reduction = 0;
+
+       /*
+        * Get CRC init value in lane 0 with every other lane zeroed.
+        * _mm512_castsi128_si512() would leave the upper 384 bits undefined,
+        * and they are XORed into the first 64 data bytes below.
+        */
+       b = _mm_cvtsi32_si128(crc);
+       temp = _mm512_maskz_broadcast_i32x4(0xF, b);
+
+       if (data_len > 255) {
+               fold0 = _mm512_loadu_si512((const __m512i *)data);
+               fold1 = _mm512_loadu_si512((const __m512i *)(data+64));
+               fold2 = _mm512_loadu_si512((const __m512i *)(data+128));
+               fold3 = _mm512_loadu_si512((const __m512i *)(data+192));
+               fold0 = _mm512_xor_si512(fold0, temp);
+
+               /* Main folding loop */
+               k = params->rk1_rk2;
+               for (n = 256; (n + 256) <= data_len; n += 256) {
+                       qw0 = _mm512_loadu_si512((const __m512i *)&data[n]);
+                       qw1 = _mm512_loadu_si512((const __m512i *)
+                                       &(data[n+64]));
+                       qw2 = _mm512_loadu_si512((const __m512i *)
+                                       &(data[n+128]));
+                       qw3 = _mm512_loadu_si512((const __m512i *)
+                                       &(data[n+192]));
+                       fold0 = crcr32_folding_round(qw0, k, fold0);
+                       fold1 = crcr32_folding_round(qw1, k, fold1);
+                       fold2 = crcr32_folding_round(qw2, k, fold2);
+                       fold3 = crcr32_folding_round(qw3, k, fold3);
+               }
+
+               /* 256 to 128 fold */
+               k = params->rk3_rk4;
+               fold0 = crcr32_folding_round(fold2, k, fold0);
+               fold1 = crcr32_folding_round(fold3, k, fold1);
+
+               res = crc32_fold_128(fold0, fold1, params);
+
+               /* equals data_len - n - 16: loop while more than 16 remain */
+               reduction = 240 - ((n+256)-data_len);
+
+               while (reduction > 0)
+                       reduction_loop(&res, &reduction, data, &n,
+                                       params);
+
+               if (n != data_len)
+                       res = last_two_xmm(data, data_len, n, res,
+                                       params);
+       } else {
+               if (data_len > 31) {
+                       res = _mm_cvtsi32_si128(crc);
+                       d = _mm_loadu_si128((const __m128i *)data);
+                       res = _mm_xor_si128(res, d);
+                       n += 16;
+
+                       /* equals data_len - n - 16, as in the long path */
+                       reduction = 240 - ((n+256)-data_len);
+
+                       while (reduction > 0)
+                               reduction_loop(&res, &reduction, data, &n,
+                                               params);
+
+                       if (n != data_len)
+                               res = last_two_xmm(data, data_len, n, res,
+                                               params);
+               } else if (data_len > 16) {
+                       res = _mm_cvtsi32_si128(crc);
+                       d = _mm_loadu_si128((const __m128i *)data);
+                       res = _mm_xor_si128(res, d);
+                       n += 16;
+
+                       if (n != data_len)
+                               res = last_two_xmm(data, data_len, n, res,
+                                               params);
+               } else if (data_len == 16) {
+                       res = _mm_cvtsi32_si128(crc);
+                       d = _mm_loadu_si128((const __m128i *)data);
+                       res = _mm_xor_si128(res, d);
+               } else {
+                       /* fewer than 16 bytes: load only the valid bytes */
+                       res = _mm_cvtsi32_si128(crc);
+                       mask = byte_len_to_mask_table[data_len];
+                       d = _mm_maskz_loadu_epi8(mask, data);
+                       res = _mm_xor_si128(res, d);
+
+                       if (data_len > 3) {
+                               d = _mm_loadu_si128((const __m128i *)
+                                               &shf_table[data_len]);
+                               res = _mm_shuffle_epi8(res, d);
+                       } else if (data_len > 2) {
+                               res = _mm_slli_si128(res, 5);
+                               goto do_barrett_reduction;
+                       } else if (data_len > 1) {
+                               res = _mm_slli_si128(res, 6);
+                               goto do_barrett_reduction;
+                       } else if (data_len > 0) {
+                               res = _mm_slli_si128(res, 7);
+                               goto do_barrett_reduction;
+                       } else {
+                               /* zero length case */
+                               return crc;
+                       }
+               }
+       }
+
+       res = done_128(res, params);
+
+do_barrett_reduction:
+       n = barrett_reduction(res, params);
+
+       return n;
+}
+
+/*
+ * Fill the crc32_eth context with its folding and reduction constants.
+ *
+ * All 128-bit constants are built with _mm_set_epi64x(high, low), so no
+ * MMX (__m64) values are created here.
+ */
+static void
+crc32_load_init_constants(void)
+{
+       __m128i a;
+       /* fold constants */
+       uint64_t c0 = 0x00000000e95c1271;
+       uint64_t c1 = 0x00000000ce3371cb;
+       uint64_t c2 = 0x00000000910eeec1;
+       uint64_t c3 = 0x0000000033fff533;
+       uint64_t c4 = 0x000000000cbec0ed;
+       uint64_t c5 = 0x0000000031f8303f;
+       uint64_t c6 = 0x0000000057c54819;
+       uint64_t c7 = 0x00000000df068dc2;
+       uint64_t c8 = 0x00000000ae0b5394;
+       uint64_t c9 = 0x000000001c279815;
+       uint64_t c10 = 0x000000001d9513d7;
+       uint64_t c11 = 0x000000008f352d95;
+       uint64_t c12 = 0x00000000af449247;
+       uint64_t c13 = 0x000000003db1ecdc;
+       uint64_t c14 = 0x0000000081256527;
+       uint64_t c15 = 0x00000000f1da05aa;
+       uint64_t c16 = 0x00000000ccaa009e;
+       uint64_t c17 = 0x00000000ae689191;
+       uint64_t c18 = 0x00000000ccaa009e;
+       uint64_t c19 = 0x00000000b8bc6765;
+       uint64_t c20 = 0x00000001f7011640;
+       uint64_t c21 = 0x00000001db710640;
+
+       /* same 128-bit constant pair replicated into all four lanes */
+       a = _mm_set_epi64x(c1, c0);
+       crc32_eth.rk1_rk2 = _mm512_broadcast_i32x4(a);
+
+       a = _mm_set_epi64x(c3, c2);
+       crc32_eth.rk3_rk4 = _mm512_broadcast_i32x4(a);
+
+       /* a different constant pair per 128-bit lane */
+       crc32_eth.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
+                       c9, c10, c11);
+       crc32_eth.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
+                       c16, c17, 0, 0);
+       crc32_eth.fold_1x128b = _mm_set_epi64x(c17, c16);
+
+       crc32_eth.rk5_rk6 = _mm_set_epi64x(c19, c18);
+       crc32_eth.rk7_rk8 = _mm_set_epi64x(c21, c20);
+}
+
+/*
+ * Fill the crc16_ccitt context with its folding and reduction constants.
+ *
+ * All 128-bit constants are built with _mm_set_epi64x(high, low), so no
+ * MMX (__m64) values are created here.
+ */
+static void
+crc16_load_init_constants(void)
+{
+       __m128i a;
+       /* fold constants */
+       uint64_t c0 = 0x0000000000009a19;
+       uint64_t c1 = 0x0000000000002df8;
+       uint64_t c2 = 0x00000000000068af;
+       uint64_t c3 = 0x000000000000b6c9;
+       uint64_t c4 = 0x000000000000c64f;
+       uint64_t c5 = 0x000000000000cd95;
+       uint64_t c6 = 0x000000000000d341;
+       uint64_t c7 = 0x000000000000b8f2;
+       uint64_t c8 = 0x0000000000000842;
+       uint64_t c9 = 0x000000000000b072;
+       uint64_t c10 = 0x00000000000047e3;
+       uint64_t c11 = 0x000000000000922d;
+       uint64_t c12 = 0x0000000000000e3a;
+       uint64_t c13 = 0x0000000000004d7a;
+       uint64_t c14 = 0x0000000000005b44;
+       uint64_t c15 = 0x0000000000007762;
+       uint64_t c16 = 0x00000000000081bf;
+       uint64_t c17 = 0x0000000000008e10;
+       uint64_t c18 = 0x00000000000081bf;
+       uint64_t c19 = 0x0000000000001cbb;
+       uint64_t c20 = 0x000000011c581910;
+       uint64_t c21 = 0x0000000000010810;
+
+       /* same 128-bit constant pair replicated into all four lanes */
+       a = _mm_set_epi64x(c1, c0);
+       crc16_ccitt.rk1_rk2 = _mm512_broadcast_i32x4(a);
+
+       a = _mm_set_epi64x(c3, c2);
+       crc16_ccitt.rk3_rk4 = _mm512_broadcast_i32x4(a);
+
+       /* a different constant pair per 128-bit lane */
+       crc16_ccitt.fold_7x128b = _mm512_setr_epi64(c4, c5, c6, c7, c8,
+                       c9, c10, c11);
+       crc16_ccitt.fold_3x128b = _mm512_setr_epi64(c12, c13, c14, c15,
+                       c16, c17, 0, 0);
+       crc16_ccitt.fold_1x128b = _mm_set_epi64x(c17, c16);
+
+       crc16_ccitt.rk5_rk6 = _mm_set_epi64x(c19, c18);
+       crc16_ccitt.rk7_rk8 = _mm_set_epi64x(c21, c20);
+}
+
+/*
+ * Load the constant sets for both supported CRCs (crc32_eth and
+ * crc16_ccitt). Must run before either AVX512 handler is called.
+ */
+void
+rte_net_crc_avx512_init(void)
+{
+       crc32_load_init_constants();
+       crc16_load_init_constants();
+
+       /*
+        * Clear the MMX state in case the constant setup above used any
+        * MMX (__m64) values, so that later code using other data types
+        * such as float, double, etc. is not affected.
+        */
+       _mm_empty();
+}
+
+/*
+ * CRC16-CCITT of data[0, data_len) using the AVX512/VPCLMULQDQ path.
+ * The initial value is 0xffff and the result is inverted and truncated
+ * to 16 bits.
+ */
+uint32_t
+rte_crc16_ccitt_avx512_handler(const uint8_t *data, uint32_t data_len)
+{
+       const uint32_t raw = crc32_eth_calc_vpclmulqdq(data, data_len,
+                       0xffff, &crc16_ccitt);
+
+       /* return 16-bit CRC value */
+       return (uint16_t)~raw;
+}
+
+/*
+ * CRC32-Ethernet of data[0, data_len) using the AVX512/VPCLMULQDQ path.
+ * The initial value is all ones and the result is inverted.
+ */
+uint32_t
+rte_crc32_eth_avx512_handler(const uint8_t *data, uint32_t data_len)
+{
+       const uint32_t raw = crc32_eth_calc_vpclmulqdq(data, data_len,
+                       0xffffffffUL, &crc32_eth);
+
+       /* return 32-bit CRC value */
+       return ~raw;
+}
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c

index d271d52..32a3665 100644 (file)
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -37,6 +37,12 @@ static const rte_net_crc_handler handlers_scalar[] = {
         [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
         [RTE_NET_CRC32_ETH] = rte_crc32_eth_handler,
  };
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+/* AVX512/VPCLMULQDQ handlers, indexed by CRC type. */
+static const rte_net_crc_handler handlers_avx512[] = {
+       [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_avx512_handler,
+       [RTE_NET_CRC32_ETH] = rte_crc32_eth_avx512_handler,
+};
+#endif
  #ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
  static const rte_net_crc_handler handlers_sse42[] = {
         [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
@@ -134,6 +140,39 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
                 crc32_eth_lut);
  }
  
+/* AVX512/VPCLMULQDQ handling */
+
+/*
+ * Non-zero when the running CPU reports every feature flag listed below.
+ * The flags are queried again on each expansion of the macro.
+ */
+#define AVX512_VPCLMULQDQ_CPU_SUPPORTED ( \
+       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) && \
+       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) && \
+       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512DQ) && \
+       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) && \
+       rte_cpu_get_flag_enabled(RTE_CPUFLAG_PCLMULQDQ) && \
+       rte_cpu_get_flag_enabled(RTE_CPUFLAG_VPCLMULQDQ) \
+)
+
+/*
+ * Return the AVX512 handler table when the path was compiled in and the
+ * CPU supports it, NULL otherwise.
+ */
+static const rte_net_crc_handler *
+avx512_vpclmulqdq_get_handlers(void)
+{
+       const rte_net_crc_handler *selected = NULL;
+
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+       if (AVX512_VPCLMULQDQ_CPU_SUPPORTED)
+               selected = handlers_avx512;
+#endif
+       return selected;
+}
+
+/*
+ * Initialise the AVX512 CRC constants when the path was compiled in and
+ * the CPU supports it. Returns 1 if initialisation ran, 0 otherwise.
+ */
+static uint8_t
+avx512_vpclmulqdq_init(void)
+{
+       uint8_t initialised = 0;
+
+#ifdef CC_X86_64_AVX512_VPCLMULQDQ_SUPPORT
+       if (AVX512_VPCLMULQDQ_CPU_SUPPORTED) {
+               rte_net_crc_avx512_init();
+               initialised = 1;
+       }
+#endif
+       return initialised;
+}
+
  /* SSE4.2/PCLMULQDQ handling */
  
  #define SSE42_PCLMULQDQ_CPU_SUPPORTED \
@@ -196,6 +235,11 @@ rte_net_crc_set_alg(enum rte_net_crc_alg alg)
         handlers = NULL;
  
         switch (alg) {
+       case RTE_NET_CRC_AVX512:
+               handlers = avx512_vpclmulqdq_get_handlers();
+               if (handlers != NULL)
+                       break;
+               /* fall-through */
         case RTE_NET_CRC_SSE42:
                 handlers = sse42_pclmulqdq_get_handlers();
                 break; /* for x86, always break here */
@@ -235,6 +279,8 @@ RTE_INIT(rte_net_crc_init)
  
         if (sse42_pclmulqdq_init())
                 alg = RTE_NET_CRC_SSE42;
+       if (avx512_vpclmulqdq_init())
+               alg = RTE_NET_CRC_AVX512;
         if (neon_pmull_init())
                 alg = RTE_NET_CRC_NEON;
  
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h

index 16e85ca..72d3e10 100644 (file)
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -1,5 +1,5 @@
  /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
   */
  
  #ifndef _RTE_NET_CRC_H_
@@ -23,6 +23,7 @@ enum rte_net_crc_alg {
         RTE_NET_CRC_SCALAR = 0,
         RTE_NET_CRC_SSE42,
         RTE_NET_CRC_NEON,
+       RTE_NET_CRC_AVX512,
  };
  
  /**
@@ -35,6 +36,7 @@ enum rte_net_crc_alg {
   *   - RTE_NET_CRC_SCALAR
   *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
   *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
+ *   - RTE_NET_CRC_AVX512 (Use 512-bit AVX-512 and VPCLMULQDQ intrinsics)
   */
  void
  rte_net_crc_set_alg(enum rte_net_crc_alg alg);
author	Mairtin o Loingsigh <mairtin.oloingsigh@intel.com>
	Fri, 9 Oct 2020 13:50:45 +0000 (14:50 +0100)
committer	David Marchand <david.marchand@redhat.com>
	Tue, 13 Oct 2020 17:26:15 +0000 (19:26 +0200)
MAINTAINERS		patch \| blob \| history
app/test/test_crc.c		patch \| blob \| history
config/x86/meson.build		patch \| blob \| history
doc/guides/rel_notes/release_20_11.rst		patch \| blob \| history
lib/librte_net/meson.build		patch \| blob \| history
lib/librte_net/net_crc.h		patch \| blob \| history
lib/librte_net/net_crc_avx512.c	[new file with mode: 0644]	patch \| blob
lib/librte_net/rte_net_crc.c		patch \| blob \| history
lib/librte_net/rte_net_crc.h		patch \| blob \| history