acl: add infrastructure for AVX512 classify methods
author Konstantin Ananyev <konstantin.ananyev@intel.com>
Tue, 6 Oct 2020 15:03:09 +0000 (16:03 +0100)
committer David Marchand <david.marchand@redhat.com>
Wed, 14 Oct 2020 12:23:00 +0000 (14:23 +0200)
Add the changes necessary to support the new AVX512-specific ACL classify
algorithm:
 - changes in meson.build to check that the build tools
   (compiler, assembler, etc.) properly support AVX512.
 - run-time checks to make sure the target platform supports AVX512.
 - dummy rte_acl_classify_avx512x16() and rte_acl_classify_avx512x32()
   for targets where the AVX512 implementation can't be properly supported.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
config/x86/meson.build
lib/librte_acl/acl.h
lib/librte_acl/acl_run_avx512.c [new file with mode: 0644]
lib/librte_acl/meson.build
lib/librte_acl/rte_acl.c
lib/librte_acl/rte_acl.h

index 172b72b..31bfa63 100644 (file)
@@ -21,11 +21,20 @@ foreach f:base_flags
        compile_time_cpuflags += ['RTE_CPUFLAG_' + f]
 endforeach
 
-optional_flags = ['AES', 'PCLMUL',
-               'AVX', 'AVX2', 'AVX512F',
-               'RDRND', 'RDSEED',
-               'AVX512BW', 'AVX512DQ',
-               'AVX512VL', 'VPCLMULQDQ']
+optional_flags = [
+       'AES',
+       'AVX',
+       'AVX2',
+       'AVX512BW',
+       'AVX512CD',
+       'AVX512DQ',
+       'AVX512F',
+       'AVX512VL',
+       'PCLMUL',
+       'RDRND',
+       'RDSEED',
+       'VPCLMULQDQ',
+]
 foreach f:optional_flags
        if cc.get_define('__@0@__'.format(f), args: machine_args) == '1'
                if f == 'PCLMUL' # special case flags with different defines
index 39d45a0..543ce55 100644 (file)
@@ -201,6 +201,14 @@ int
 rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
        uint32_t *results, uint32_t num, uint32_t categories);
 
+int
+rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
+       uint32_t *results, uint32_t num, uint32_t categories);
+
+int
+rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data,
+       uint32_t *results, uint32_t num, uint32_t categories);
+
 int
 rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
        uint32_t *results, uint32_t num, uint32_t categories);
diff --git a/lib/librte_acl/acl_run_avx512.c b/lib/librte_acl/acl_run_avx512.c
new file mode 100644 (file)
index 0000000..1817f88
--- /dev/null
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include "acl_run_sse.h"
+/* AVX512x16 classify entry: dispatches on num to search_sse_8/4 or scalar. */
+int
+rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
+       uint32_t *results, uint32_t num, uint32_t categories)
+{
+       if (num >= MAX_SEARCHES_SSE8)
+               return search_sse_8(ctx, data, results, num, categories);
+       if (num >= MAX_SEARCHES_SSE4)
+               return search_sse_4(ctx, data, results, num, categories);
+
+       return rte_acl_classify_scalar(ctx, data, results, num, categories);
+}
+/* AVX512x32 classify entry: dispatches on num to search_sse_8/4 or scalar. */
+int
+rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data,
+       uint32_t *results, uint32_t num, uint32_t categories)
+{
+       if (num >= MAX_SEARCHES_SSE8)
+               return search_sse_8(ctx, data, results, num, categories);
+       if (num >= MAX_SEARCHES_SSE4)
+               return search_sse_4(ctx, data, results, num, categories);
+
+       return rte_acl_classify_scalar(ctx, data, results, num, categories);
+}
index 751ee60..ee4e229 100644 (file)
@@ -27,6 +27,54 @@ if dpdk_conf.has('RTE_ARCH_X86')
                cflags += '-DCC_AVX2_SUPPORT'
        endif
 
+       # build the AVX512 version only if we are building a 64-bit binary
+       # AND binutils can generate proper code (binutils_ok check passed)
+
+       if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok.returncode() == 0
+
+               # the AVX512 version is then compiled in one of two cases:
+               # a. all required AVX512 flags are already part of the
+               #    minimum instruction set baseline (machine_args)
+               # b. they are not in the baseline, but the compiler
+               #    accepts the corresponding -mavx512* options
+               #
+               # in case (a), just add the AVX512 C file to the sources list.
+               # in case (b), compile the C file into a static lib with the
+               # extra compiler flags, and then have the .o file extracted
+               # from that static lib linked into the main lib.
+
+               # check whether all required flags are already enabled (case a).
+               acl_avx512_flags = ['__AVX512F__', '__AVX512VL__',
+                       '__AVX512CD__', '__AVX512BW__']
+
+               acl_avx512_on = true
+               foreach f:acl_avx512_flags
+
+                       if cc.get_define(f, args: machine_args) == ''
+                               acl_avx512_on = false
+                       endif
+               endforeach
+
+               if acl_avx512_on == true
+
+                       sources += files('acl_run_avx512.c')
+                       cflags += '-DCC_AVX512_SUPPORT'
+
+               elif cc.has_multi_arguments('-mavx512f', '-mavx512vl',
+                                       '-mavx512cd', '-mavx512bw')
+
+                       avx512_tmplib = static_library('avx512_tmp',
+                               'acl_run_avx512.c',
+                               dependencies: static_rte_eal,
+                               c_args: cflags +
+                                       ['-mavx512f', '-mavx512vl',
+                                        '-mavx512cd', '-mavx512bw'])
+                       objs += avx512_tmplib.extract_objects(
+                                       'acl_run_avx512.c')
+                       cflags += '-DCC_AVX512_SUPPORT'
+               endif
+       endif
+
 elif dpdk_conf.has('RTE_ARCH_ARM')
        cflags += '-flax-vector-conversions'
        sources += files('acl_run_neon.c')
index d1583c5..cb8ccc5 100644 (file)
@@ -16,6 +16,32 @@ static struct rte_tailq_elem rte_acl_tailq = {
 };
 EAL_REGISTER_TAILQ(rte_acl_tailq)
 
+#ifndef CC_AVX512_SUPPORT
+/*
+ * If CC_AVX512_SUPPORT is not defined, acl_run_avx512.c is not built, so
+ * these dummies are used instead; they do nothing and return -ENOTSUP.
+ */
+int
+rte_acl_classify_avx512x16(__rte_unused const struct rte_acl_ctx *ctx,
+       __rte_unused const uint8_t **data,
+       __rte_unused uint32_t *results,
+       __rte_unused uint32_t num,
+       __rte_unused uint32_t categories)
+{
+       return -ENOTSUP;
+}
+
+int
+rte_acl_classify_avx512x32(__rte_unused const struct rte_acl_ctx *ctx,
+       __rte_unused const uint8_t **data,
+       __rte_unused uint32_t *results,
+       __rte_unused uint32_t num,
+       __rte_unused uint32_t categories)
+{
+       return -ENOTSUP;
+}
+#endif
+
 #ifndef CC_AVX2_SUPPORT
 /*
  * If the compiler doesn't support AVX2 instructions,
@@ -75,6 +101,8 @@ static const rte_acl_classify_t classify_fns[] = {
        [RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
        [RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
        [RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec,
+       [RTE_ACL_CLASSIFY_AVX512X16] = rte_acl_classify_avx512x16,
+       [RTE_ACL_CLASSIFY_AVX512X32] = rte_acl_classify_avx512x32,
 };
 
 /*
@@ -124,6 +152,18 @@ acl_check_alg_ppc(enum rte_acl_classify_alg alg)
 static int
 acl_check_alg_x86(enum rte_acl_classify_alg alg)
 {
+       if (alg == RTE_ACL_CLASSIFY_AVX512X16 ||
+                       alg == RTE_ACL_CLASSIFY_AVX512X32) {
+#ifdef CC_AVX512_SUPPORT
+               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) &&
+                       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+                       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512CD) &&
+                       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW))
+                       return 0;
+#endif
+               return -ENOTSUP;
+       }
+
        if (alg == RTE_ACL_CLASSIFY_AVX2) {
 #ifdef CC_AVX2_SUPPORT
                if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
@@ -157,6 +197,8 @@ acl_check_alg(enum rte_acl_classify_alg alg)
                return acl_check_alg_arm(alg);
        case RTE_ACL_CLASSIFY_ALTIVEC:
                return acl_check_alg_ppc(alg);
+       case RTE_ACL_CLASSIFY_AVX512X32:
+       case RTE_ACL_CLASSIFY_AVX512X16:
        case RTE_ACL_CLASSIFY_AVX2:
        case RTE_ACL_CLASSIFY_SSE:
                return acl_check_alg_x86(alg);
index 3999f15..1bfed00 100644 (file)
@@ -241,6 +241,8 @@ enum rte_acl_classify_alg {
        RTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */
        RTE_ACL_CLASSIFY_NEON = 4,    /**< requires NEON support. */
        RTE_ACL_CLASSIFY_ALTIVEC = 5,    /**< requires ALTIVEC support. */
+       RTE_ACL_CLASSIFY_AVX512X16 = 6,  /**< requires AVX512 support. */
+       RTE_ACL_CLASSIFY_AVX512X32 = 7,  /**< requires AVX512 support. */
 };
 
 /**