From: Mairtin o Loingsigh
Date: Fri, 9 Oct 2020 13:50:44 +0000 (+0100)
Subject: net: add CRC implementation runtime selection
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=ef94569cf9f404838e5caf1c3b2799e703168c02;p=dpdk.git

net: add CRC implementation runtime selection

This patch adds support for run-time selection of the optimal
architecture-specific CRC path, based on the supported instruction
set(s) of the CPU.

The compiler option checks have been moved from the C files to the
meson script. The rte_cpu_get_flag_enabled function is called
automatically by the library at process initialization time to
determine which instructions the CPU supports, with the most optimal
supported CRC path ultimately selected.

Signed-off-by: Mairtin o Loingsigh
Signed-off-by: David Coyle
Acked-by: Konstantin Ananyev
Reviewed-by: Jasvinder Singh
Reviewed-by: Pablo de Lara
Reviewed-by: Ruifeng Wang
Acked-by: Bruce Richardson
---

diff --git a/MAINTAINERS b/MAINTAINERS
index 4b232570cd..07961904b7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -262,7 +262,7 @@ ARM v8
 M: Jerin Jacob
 M: Ruifeng Wang
 F: lib/librte_eal/arm/include/*_64.h
-F: lib/librte_net/net_crc_neon.h
+F: lib/librte_net/net_crc_neon.c
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_bpf/bpf_jit_arm64.c
 F: lib/librte_lpm/rte_lpm_neon.h
@@ -1243,8 +1243,9 @@ F: lib/librte_net/

 Packet CRC
 M: Jasvinder Singh
+F: lib/librte_net/net_crc.h
 F: lib/librte_net/rte_net_crc*
-F: lib/librte_net/net_crc_sse.h
+F: lib/librte_net/net_crc_sse.c
 F: app/test/test_crc.c

 IP fragmentation & reassembly
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index b7881f2e9d..2ccf35f0df 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -62,6 +62,10 @@ New Features
   The functions are provided as a generic stubs and
   x86 specific implementation.

+* **Updated CRC modules of the net library.**
+
+  * Added runtime selection of the optimal architecture-specific CRC path.
+
 * **Added the FEC API, for a generic FEC query and config.**

   Added the FEC API which provides functions for query FEC capabilities and
diff --git a/lib/librte_net/meson.build b/lib/librte_net/meson.build
index 24ed8253b4..fa439b9e53 100644
--- a/lib/librte_net/meson.build
+++ b/lib/librte_net/meson.build
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017 Intel Corporation
+# Copyright(c) 2017-2020 Intel Corporation

 headers = files('rte_ip.h',
 	'rte_tcp.h',
@@ -20,3 +20,35 @@ headers = files('rte_ip.h',
 sources = files('rte_arp.c',
 	'rte_ether.c', 'rte_net.c', 'rte_net_crc.c')
 deps += ['mbuf']
+
+if dpdk_conf.has('RTE_ARCH_X86_64')
+	net_crc_sse42_cpu_support = (
+		cc.get_define('__PCLMUL__', args: machine_args) != '')
+	net_crc_sse42_cc_support = (
+		cc.has_argument('-mpclmul') and cc.has_argument('-maes'))
+
+	build_static_net_crc_sse42_lib = 0
+
+	if net_crc_sse42_cpu_support == true
+		sources += files('net_crc_sse.c')
+		cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+	elif net_crc_sse42_cc_support == true
+		build_static_net_crc_sse42_lib = 1
+		net_crc_sse42_lib_cflags = ['-mpclmul', '-maes']
+		cflags += ['-DCC_X86_64_SSE42_PCLMULQDQ_SUPPORT']
+	endif
+
+	if build_static_net_crc_sse42_lib == 1
+		net_crc_sse42_lib = static_library(
+				'net_crc_sse42_lib',
+				'net_crc_sse.c',
+				dependencies: static_rte_eal,
+				c_args: [cflags,
+					net_crc_sse42_lib_cflags])
+		objs += net_crc_sse42_lib.extract_objects('net_crc_sse.c')
+	endif
+elif (dpdk_conf.has('RTE_ARCH_ARM64') and
+		cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != '')
+	sources += files('net_crc_neon.c')
+	cflags += ['-DCC_ARM64_NEON_PMULL_SUPPORT']
+endif
diff --git a/lib/librte_net/net_crc.h b/lib/librte_net/net_crc.h
new file mode 100644
index 0000000000..a1578a56c3
--- /dev/null
+++ b/lib/librte_net/net_crc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _NET_CRC_H_
+#define _NET_CRC_H_
+
+/*
+ * Different implementations of CRC
+ */
+
+/* SSE4.2 */
+
+void
+rte_net_crc_sse42_init(void);
+
+uint32_t
+rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len);
+
+/* NEON */
+
+void
+rte_net_crc_neon_init(void);
+
+uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len);
+
+#endif /* _NET_CRC_H_ */
diff --git a/lib/librte_net/net_crc_neon.c b/lib/librte_net/net_crc_neon.c
new file mode 100644
index 0000000000..f61d75a8c6
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.c
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017 Cavium, Inc
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "net_crc.h"
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *	DATA = READ_NEXT_16BYTES();
+ *	F1 = LSB8(FOLD)
+ *	F2 = MSB8(FOLD)
+ *	T1 = CLMUL(F1, RK1)
+ *	T2 = CLMUL(F2, RK2)
+ *	FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constant
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = vshift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = vshift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = vshift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = vshift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = vshift_bytes_left(fold, 16 - rem);
+		b = vshift_bytes_right(fold, rem);
+		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
deleted file mode 100644
index 63fa1d4a11..0000000000
--- a/lib/librte_net/net_crc_neon.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Cavium, Inc
- */
-
-#ifndef _NET_CRC_NEON_H_
-#define _NET_CRC_NEON_H_
-
-#include
-#include
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/** PMULL CRC computation context structure */
-struct crc_pmull_ctx {
-	uint64x2_t rk1_rk2;
-	uint64x2_t rk5_rk6;
-	uint64x2_t rk7_rk8;
-};
-
-struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
-struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
-
-/**
- * @brief Performs one folding round
- *
- * Logically function operates as follows:
- *	DATA = READ_NEXT_16BYTES();
- *	F1 = LSB8(FOLD)
- *	F2 = MSB8(FOLD)
- *	T1 = CLMUL(F1, RK1)
- *	T2 = CLMUL(F2, RK2)
- *	FOLD = XOR(T1, T2, DATA)
- *
- * @param data_block 16 byte data block
- * @param precomp precomputed rk1 constant
- * @param fold running 16 byte folded data
- *
- * @return New 16 byte folded data
- */
-static inline uint64x2_t
-crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
-	uint64x2_t fold)
-{
-	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
-			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
-			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
-
-	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
-			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
-			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
-
-	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
-}
-
-/**
- * Performs reduction from 128 bits to 64 bits
- *
- * @param data128 128 bits data to be reduced
- * @param precomp rk5 and rk6 precomputed constants
- *
- * @return data reduced to 64 bits
- */
-static inline uint64x2_t
-crcr32_reduce_128_to_64(uint64x2_t data128,
-	uint64x2_t precomp)
-{
-	uint64x2_t tmp0, tmp1, tmp2;
-
-	/* 64b fold */
-	tmp0 = vreinterpretq_u64_p128(vmull_p64(
-			vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
-			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
-	tmp1 = vshift_bytes_right(data128, 8);
-	tmp0 = veorq_u64(tmp0, tmp1);
-
-	/* 32b fold */
-	tmp2 = vshift_bytes_left(tmp0, 4);
-	tmp1 = vreinterpretq_u64_p128(vmull_p64(
-			vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
-			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
-
-	return veorq_u64(tmp1, tmp0);
-}
-
-/**
- * Performs Barret's reduction from 64 bits to 32 bits
- *
- * @param data64 64 bits data to be reduced
- * @param precomp rk7 precomputed constant
- *
- * @return data reduced to 32 bits
- */
-static inline uint32_t
-crcr32_reduce_64_to_32(uint64x2_t data64,
-	uint64x2_t precomp)
-{
-	static uint32_t mask1[4] __rte_aligned(16) = {
-		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
-	};
-	static uint32_t mask2[4] __rte_aligned(16) = {
-		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
-	};
-	uint64x2_t tmp0, tmp1, tmp2;
-
-	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
-
-	tmp1 = vreinterpretq_u64_p128(vmull_p64(
-			vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
-			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
-	tmp1 = veorq_u64(tmp1, tmp0);
-	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
-
-	tmp2 = vreinterpretq_u64_p128(vmull_p64(
-			vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
-			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
-	tmp2 = veorq_u64(tmp2, tmp1);
-	tmp2 = veorq_u64(tmp2, tmp0);
-
-	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
-}
-
-static inline uint32_t
-crc32_eth_calc_pmull(
-	const uint8_t *data,
-	uint32_t data_len,
-	uint32_t crc,
-	const struct crc_pmull_ctx *params)
-{
-	uint64x2_t temp, fold, k;
-	uint32_t n;
-
-	/* Get CRC init value */
-	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
-
-	/**
-	 * Folding all data into single 16 byte data block
-	 * Assumes: fold holds first 16 bytes of data
-	 */
-	if (unlikely(data_len < 32)) {
-		if (unlikely(data_len == 16)) {
-			/* 16 bytes */
-			fold = vld1q_u64((const uint64_t *)data);
-			fold = veorq_u64(fold, temp);
-			goto reduction_128_64;
-		}
-
-		if (unlikely(data_len < 16)) {
-			/* 0 to 15 bytes */
-			uint8_t buffer[16] __rte_aligned(16);
-
-			memset(buffer, 0, sizeof(buffer));
-			memcpy(buffer, data, data_len);
-
-			fold = vld1q_u64((uint64_t *)buffer);
-			fold = veorq_u64(fold, temp);
-			if (unlikely(data_len < 4)) {
-				fold = vshift_bytes_left(fold, 8 - data_len);
-				goto barret_reduction;
-			}
-			fold = vshift_bytes_left(fold, 16 - data_len);
-			goto reduction_128_64;
-		}
-		/* 17 to 31 bytes */
-		fold = vld1q_u64((const uint64_t *)data);
-		fold = veorq_u64(fold, temp);
-		n = 16;
-		k = params->rk1_rk2;
-		goto partial_bytes;
-	}
-
-	/** At least 32 bytes in the buffer */
-	/** Apply CRC initial value */
-	fold = vld1q_u64((const uint64_t *)data);
-	fold = veorq_u64(fold, temp);
-
-	/** Main folding loop - the last 16 bytes is processed separately */
-	k = params->rk1_rk2;
-	for (n = 16; (n + 16) <= data_len; n += 16) {
-		temp = vld1q_u64((const uint64_t *)&data[n]);
-		fold = crcr32_folding_round(temp, k, fold);
-	}
-
-partial_bytes:
-	if (likely(n < data_len)) {
-		uint64x2_t last16, a, b, mask;
-		uint32_t rem = data_len & 15;
-
-		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
-		a = vshift_bytes_left(fold, 16 - rem);
-		b = vshift_bytes_right(fold, rem);
-		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
-		b = vorrq_u64(b, vandq_u64(mask, last16));
-
-		/* k = rk1 & rk2 */
-		temp = vreinterpretq_u64_p128(vmull_p64(
-				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
-				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
-		fold = vreinterpretq_u64_p128(vmull_p64(
-				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
-				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
-		fold = veorq_u64(fold, temp);
-		fold = veorq_u64(fold, b);
-	}
-
-	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
-reduction_128_64:
-	k = params->rk5_rk6;
-	fold = crcr32_reduce_128_to_64(fold, k);
-
-barret_reduction:
-	k = params->rk7_rk8;
-	n = crcr32_reduce_64_to_32(fold, k);
-
-	return n;
-}
-
-static inline void
-rte_net_crc_neon_init(void)
-{
-	/* Initialize CRC16 data */
-	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
-	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
-	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
-
-	/* Initialize CRC32 data */
-	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
-	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
-	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
-
-	/** Save the params in context structure */
-	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
-	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
-	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
-
-	/** Save the params in context structure */
-	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
-	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
-	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
-}
-
-static inline uint32_t
-rte_crc16_ccitt_neon_handler(const uint8_t *data,
-	uint32_t data_len)
-{
-	return (uint16_t)~crc32_eth_calc_pmull(data,
-		data_len,
-		0xffff,
-		&crc16_ccitt_pmull);
-}
-
-static inline uint32_t
-rte_crc32_eth_neon_handler(const uint8_t *data,
-	uint32_t data_len)
-{
-	return ~crc32_eth_calc_pmull(data,
-		data_len,
-		0xffffffffUL,
-		&crc32_eth_pmull);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/net_crc_sse.c b/lib/librte_net/net_crc_sse.c
new file mode 100644
index 0000000000..053b54b390
--- /dev/null
+++ b/lib/librte_net/net_crc_sse.c
@@ -0,0 +1,322 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2020 Intel Corporation
+ */
+
+#include
+
+#include
+#include
+#include
+
+#include "net_crc.h"
+
+#include
+
+/** PCLMULQDQ CRC computation context structure */
+struct crc_pclmulqdq_ctx {
+	__m128i rk1_rk2;
+	__m128i rk5_rk6;
+	__m128i rk7_rk8;
+};
+
+static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
+static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *	DATA = READ_NEXT_16BYTES();
+ *	F1 = LSB8(FOLD)
+ *	F2 = MSB8(FOLD)
+ *	T1 = CLMUL(F1, RK1)
+ *	T2 = CLMUL(F2, RK2)
+ *	FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block
+ *   16 byte data block
+ * @param precomp
+ *   Precomputed rk1 constant
+ * @param fold
+ *   Current16 byte folded data
+ *
+ * @return
+ *   New 16 byte folded data
+ */
+static __rte_always_inline __m128i
+crcr32_folding_round(__m128i data_block,
+		__m128i precomp,
+		__m128i fold)
+{
+	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
+	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
+
+	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128
+ *   128 bits data to be reduced
+ * @param precomp
+ *   precomputed constants rk5, rk6
+ *
+ * @return
+ *   64 bits reduced data
+ */
+
+static __rte_always_inline __m128i
+crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
+{
+	__m128i tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
+	tmp1 = _mm_srli_si128(data128, 8);
+	tmp0 = _mm_xor_si128(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = _mm_slli_si128(tmp0, 4);
+	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
+
+	return _mm_xor_si128(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64
+ *   64 bits data to be reduced
+ * @param precomp
+ *   rk7 precomputed constant
+ *
+ * @return
+ *   reduced 32 bits data
+ */
+
+static __rte_always_inline uint32_t
+crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
+{
+	static const uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+
+	static const uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	__m128i tmp0, tmp1, tmp2;
+
+	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
+
+	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
+	tmp1 = _mm_xor_si128(tmp1, tmp0);
+	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
+
+	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
+	tmp2 = _mm_xor_si128(tmp2, tmp1);
+	tmp2 = _mm_xor_si128(tmp2, tmp0);
+
+	return _mm_extract_epi32(tmp2, 2);
+}
+
+static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg
+ *   128 bit value
+ * @param num
+ *   number of bytes to shift left reg by (0-16)
+ *
+ * @return
+ *   reg << (num * 8)
+ */
+
+static __rte_always_inline __m128i
+xmm_shift_left(__m128i reg, const unsigned int num)
+{
+	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
+
+	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
+}
+
+static __rte_always_inline uint32_t
+crc32_eth_calc_pclmulqdq(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pclmulqdq_ctx *params)
+{
+	__m128i temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = _mm_loadu_si128((const __m128i *)data);
+			fold = _mm_xor_si128(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = _mm_load_si128((const __m128i *)buffer);
+			fold = _mm_xor_si128(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = xmm_shift_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = xmm_shift_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = _mm_loadu_si128((const __m128i *)data);
+		fold = _mm_xor_si128(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = _mm_loadu_si128((const __m128i *)data);
+	fold = _mm_xor_si128(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = _mm_loadu_si128((const __m128i *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+
+		const uint32_t mask3[4] __rte_aligned(16) = {
+			0x80808080, 0x80808080, 0x80808080, 0x80808080
+		};
+
+		const uint8_t shf_table[32] __rte_aligned(16) = {
+			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+		};
+
+		__m128i last16, a, b;
+
+		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
+
+		temp = _mm_loadu_si128((const __m128i *)
+			&shf_table[data_len & 15]);
+		a = _mm_shuffle_epi8(fold, temp);
+
+		temp = _mm_xor_si128(temp,
+			_mm_load_si128((const __m128i *)mask3));
+		b = _mm_shuffle_epi8(fold, temp);
+		b = _mm_blendv_epi8(b, last16, temp);
+
+		/* k = rk1 & rk2 */
+		temp = _mm_clmulepi64_si128(a, k, 0x01);
+		fold = _mm_clmulepi64_si128(a, k, 0x10);
+
+		fold = _mm_xor_si128(fold, temp);
+		fold = _mm_xor_si128(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+void
+rte_net_crc_sse42_init(void)
+{
+	uint64_t k1, k2, k5, k6;
+	uint64_t p = 0, q = 0;
+
+	/** Initialize CRC16 data */
+	k1 = 0x189aeLLU;
+	k2 = 0x8e10LLU;
+	k5 = 0x189aeLLU;
+	k6 = 0x114aaLLU;
+	q = 0x11c581910LLU;
+	p = 0x10811LLU;
+
+	/** Save the params in context structure */
+	crc16_ccitt_pclmulqdq.rk1_rk2 =
+		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
+	crc16_ccitt_pclmulqdq.rk5_rk6 =
+		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
+	crc16_ccitt_pclmulqdq.rk7_rk8 =
+		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
+
+	/** Initialize CRC32 data */
+	k1 = 0xccaa009eLLU;
+	k2 = 0x1751997d0LLU;
+	k5 = 0xccaa009eLLU;
+	k6 = 0x163cd6124LLU;
+	q = 0x1f7011640LLU;
+	p = 0x1db710641LLU;
+
+	/** Save the params in context structure */
+	crc32_eth_pclmulqdq.rk1_rk2 =
+		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
+	crc32_eth_pclmulqdq.rk5_rk6 =
+		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
+	crc32_eth_pclmulqdq.rk7_rk8 =
+		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
+
+	/**
+	 * Reset the register as following calculation may
+	 * use other data types such as float, double, etc.
+	 */
+	_mm_empty();
+}
+
+uint32_t
+rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
+{
+	/** return 16-bit CRC value */
+	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pclmulqdq);
+}
+
+uint32_t
+rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
+{
+	return ~crc32_eth_calc_pclmulqdq(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pclmulqdq);
+}
diff --git a/lib/librte_net/net_crc_sse.h b/lib/librte_net/net_crc_sse.h
deleted file mode 100644
index 1c7b7a548a..0000000000
--- a/lib/librte_net/net_crc_sse.h
+++ /dev/null
@@ -1,334 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
- */
-
-#ifndef _RTE_NET_CRC_SSE_H_
-#define _RTE_NET_CRC_SSE_H_
-
-#include
-
-#include
-#include
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/** PCLMULQDQ CRC computation context structure */
-struct crc_pclmulqdq_ctx {
-	__m128i rk1_rk2;
-	__m128i rk5_rk6;
-	__m128i rk7_rk8;
-};
-
-static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
-static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
-/**
- * @brief Performs one folding round
- *
- * Logically function operates as follows:
- *	DATA = READ_NEXT_16BYTES();
- *	F1 = LSB8(FOLD)
- *	F2 = MSB8(FOLD)
- *	T1 = CLMUL(F1, RK1)
- *	T2 = CLMUL(F2, RK2)
- *	FOLD = XOR(T1, T2, DATA)
- *
- * @param data_block
- *   16 byte data block
- * @param precomp
- *   Precomputed rk1 constant
- * @param fold
- *   Current16 byte folded data
- *
- * @return
- *   New 16 byte folded data
- */
-static __rte_always_inline __m128i
-crcr32_folding_round(__m128i data_block,
-		__m128i precomp,
-		__m128i fold)
-{
-	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
-	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
-
-	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
-}
-
-/**
- * Performs reduction from 128 bits to 64 bits
- *
- * @param data128
- *   128 bits data to be reduced
- * @param precomp
- *   precomputed constants rk5, rk6
- *
- * @return
- *   64 bits reduced data
- */
-
-static __rte_always_inline __m128i
-crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
-{
-	__m128i tmp0, tmp1, tmp2;
-
-	/* 64b fold */
-	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
-	tmp1 = _mm_srli_si128(data128, 8);
-	tmp0 = _mm_xor_si128(tmp0, tmp1);
-
-	/* 32b fold */
-	tmp2 = _mm_slli_si128(tmp0, 4);
-	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
-
-	return _mm_xor_si128(tmp1, tmp0);
-}
-
-/**
- * Performs Barret's reduction from 64 bits to 32 bits
- *
- * @param data64
- *   64 bits data to be reduced
- * @param precomp
- *   rk7 precomputed constant
- *
- * @return
- *   reduced 32 bits data
- */
-
-static __rte_always_inline uint32_t
-crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
-{
-	static const uint32_t mask1[4] __rte_aligned(16) = {
-		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
-	};
-
-	static const uint32_t mask2[4] __rte_aligned(16) = {
-		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
-	};
-	__m128i tmp0, tmp1, tmp2;
-
-	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
-
-	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
-	tmp1 = _mm_xor_si128(tmp1, tmp0);
-	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
-
-	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
-	tmp2 = _mm_xor_si128(tmp2, tmp1);
-	tmp2 = _mm_xor_si128(tmp2, tmp0);
-
-	return _mm_extract_epi32(tmp2, 2);
-}
-
-static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
-
-/**
- * Shifts left 128 bit register by specified number of bytes
- *
- * @param reg
- *   128 bit value
- * @param num
- *   number of bytes to shift left reg by (0-16)
- *
- * @return
- *   reg << (num * 8)
- */
-
-static __rte_always_inline __m128i
-xmm_shift_left(__m128i reg, const unsigned int num)
-{
-	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
-
-	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
-}
-
-static __rte_always_inline uint32_t
-crc32_eth_calc_pclmulqdq(
-	const uint8_t *data,
-	uint32_t data_len,
-	uint32_t crc,
-	const struct crc_pclmulqdq_ctx *params)
-{
-	__m128i temp, fold, k;
-	uint32_t n;
-
-	/* Get CRC init value */
-	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
-
-	/**
-	 * Folding all data into single 16 byte data block
-	 * Assumes: fold holds first 16 bytes of data
-	 */
-
-	if (unlikely(data_len < 32)) {
-		if (unlikely(data_len == 16)) {
-			/* 16 bytes */
-			fold = _mm_loadu_si128((const __m128i *)data);
-			fold = _mm_xor_si128(fold, temp);
-			goto reduction_128_64;
-		}
-
-		if (unlikely(data_len < 16)) {
-			/* 0 to 15 bytes */
-			uint8_t buffer[16] __rte_aligned(16);
-
-			memset(buffer, 0, sizeof(buffer));
-			memcpy(buffer, data, data_len);
-
-			fold = _mm_load_si128((const __m128i *)buffer);
-			fold = _mm_xor_si128(fold, temp);
-			if (unlikely(data_len < 4)) {
-				fold = xmm_shift_left(fold, 8 - data_len);
-				goto barret_reduction;
-			}
-			fold = xmm_shift_left(fold, 16 - data_len);
-			goto reduction_128_64;
-		}
-		/* 17 to 31 bytes */
-		fold = _mm_loadu_si128((const __m128i *)data);
-		fold = _mm_xor_si128(fold, temp);
-		n = 16;
-		k = params->rk1_rk2;
-		goto partial_bytes;
-	}
-
-	/** At least 32 bytes in the buffer */
-	/** Apply CRC initial value */
-	fold = _mm_loadu_si128((const __m128i *)data);
-	fold = _mm_xor_si128(fold, temp);
-
-	/** Main folding loop - the last 16 bytes is processed separately */
-	k = params->rk1_rk2;
-	for (n = 16; (n + 16) <= data_len; n += 16) {
-		temp = _mm_loadu_si128((const __m128i *)&data[n]);
-		fold = crcr32_folding_round(temp, k, fold);
-	}
-
-partial_bytes:
-	if (likely(n < data_len)) {
-
-		const uint32_t mask3[4] __rte_aligned(16) = {
-			0x80808080, 0x80808080, 0x80808080, 0x80808080
-		};
-
-		const uint8_t shf_table[32] __rte_aligned(16) = {
-			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-		};
-
-		__m128i last16, a, b;
-
-		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
-
-		temp = _mm_loadu_si128((const __m128i *)
-			&shf_table[data_len & 15]);
-		a = _mm_shuffle_epi8(fold, temp);
-
-		temp = _mm_xor_si128(temp,
-			_mm_load_si128((const __m128i *)mask3));
-		b = _mm_shuffle_epi8(fold, temp);
-		b = _mm_blendv_epi8(b, last16, temp);
-
-		/* k = rk1 & rk2 */
-		temp = _mm_clmulepi64_si128(a, k, 0x01);
-		fold = _mm_clmulepi64_si128(a, k, 0x10);
-
-		fold = _mm_xor_si128(fold, temp);
-		fold = _mm_xor_si128(fold, b);
-	}
-
-	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
-reduction_128_64:
-	k = params->rk5_rk6;
-	fold = crcr32_reduce_128_to_64(fold, k);
-
-barret_reduction:
-	k = params->rk7_rk8;
-	n = crcr32_reduce_64_to_32(fold, k);
-
-	return n;
-}
-
-
-static inline void
-rte_net_crc_sse42_init(void)
-{
-	uint64_t k1, k2, k5, k6;
-	uint64_t p = 0, q = 0;
-
-	/** Initialize CRC16 data */
-	k1 = 0x189aeLLU;
-	k2 = 0x8e10LLU;
-	k5 = 0x189aeLLU;
-	k6 = 0x114aaLLU;
-	q = 0x11c581910LLU;
-	p = 0x10811LLU;
-
-	/** Save the params in context structure */
-	crc16_ccitt_pclmulqdq.rk1_rk2 =
-		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
-	crc16_ccitt_pclmulqdq.rk5_rk6 =
-		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
-	crc16_ccitt_pclmulqdq.rk7_rk8 =
-		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
-
-	/** Initialize CRC32 data */
-	k1 = 0xccaa009eLLU;
-	k2 = 0x1751997d0LLU;
-	k5 = 0xccaa009eLLU;
-	k6 = 0x163cd6124LLU;
-	q = 0x1f7011640LLU;
-	p = 0x1db710641LLU;
-
-	/** Save the params in context structure */
-	crc32_eth_pclmulqdq.rk1_rk2 =
-		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
-	crc32_eth_pclmulqdq.rk5_rk6 =
-		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
-	crc32_eth_pclmulqdq.rk7_rk8 =
-		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
-
-	/**
-	 * Reset the register as following calculation may
-	 * use other data types such as float, double, etc.
-	 */
-	_mm_empty();
-
-}
-
-static inline uint32_t
-rte_crc16_ccitt_sse42_handler(const uint8_t *data,
-	uint32_t data_len)
-{
-	/** return 16-bit CRC value */
-	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
-		data_len,
-		0xffff,
-		&crc16_ccitt_pclmulqdq);
-}
-
-static inline uint32_t
-rte_crc32_eth_sse42_handler(const uint8_t *data,
-	uint32_t data_len)
-{
-	return ~crc32_eth_calc_pclmulqdq(data,
-		data_len,
-		0xffffffffUL,
-		&crc32_eth_pclmulqdq);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _RTE_NET_CRC_SSE_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 4f5b9e8286..d271d5205b 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017 Intel Corporation
+ * Copyright(c) 2017-2020 Intel Corporation
 */

 #include
@@ -10,17 +10,7 @@
 #include
 #include

-#if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
-#define X86_64_SSE42_PCLMULQDQ 1
-#elif defined(RTE_ARCH_ARM64) && defined(__ARM_FEATURE_CRYPTO)
-#define ARM64_NEON_PMULL 1
-#endif
-
-#ifdef X86_64_SSE42_PCLMULQDQ
-#include
-#elif defined ARM64_NEON_PMULL
-#include
-#endif
+#include "net_crc.h"

 /** CRC polynomials */
 #define CRC32_ETH_POLYNOMIAL 0x04c11db7UL
@@ -41,25 +31,27 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
 typedef uint32_t
 (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);

-static rte_net_crc_handler *handlers;
+static const rte_net_crc_handler *handlers;

-static rte_net_crc_handler handlers_scalar[] = {
+static const rte_net_crc_handler handlers_scalar[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_handler,
 };
-
-#ifdef X86_64_SSE42_PCLMULQDQ
-static rte_net_crc_handler handlers_sse42[] = {
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
+static const rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
-#elif defined ARM64_NEON_PMULL
-static rte_net_crc_handler handlers_neon[] = {
+#endif
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
+static const rte_net_crc_handler handlers_neon[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
 };
 #endif

+/* Scalar handling */
+
 /**
  * Reflect the bits about the middle
  *
@@ -142,29 +134,82 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
 		crc32_eth_lut);
 }

+/* SSE4.2/PCLMULQDQ handling */
+
+#define SSE42_PCLMULQDQ_CPU_SUPPORTED \
+	rte_cpu_get_flag_enabled(RTE_CPUFLAG_PCLMULQDQ)
+
+static const rte_net_crc_handler *
+sse42_pclmulqdq_get_handlers(void)
+{
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
+	if (SSE42_PCLMULQDQ_CPU_SUPPORTED)
+		return handlers_sse42;
+#endif
+	return NULL;
+}
+
+static uint8_t
+sse42_pclmulqdq_init(void)
+{
+#ifdef CC_X86_64_SSE42_PCLMULQDQ_SUPPORT
+	if (SSE42_PCLMULQDQ_CPU_SUPPORTED) {
+		rte_net_crc_sse42_init();
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/* NEON/PMULL handling */
+
+#define NEON_PMULL_CPU_SUPPORTED \
+	rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)
+
+static const rte_net_crc_handler *
+neon_pmull_get_handlers(void)
+{
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
+	if (NEON_PMULL_CPU_SUPPORTED)
+		return handlers_neon;
+#endif
+	return NULL;
+}
+
+static uint8_t
+neon_pmull_init(void)
+{
+#ifdef CC_ARM64_NEON_PMULL_SUPPORT
+	if (NEON_PMULL_CPU_SUPPORTED) {
+		rte_net_crc_neon_init();
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/* Public API */
+
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
+	handlers = NULL;
+
 	switch (alg) {
-#ifdef X86_64_SSE42_PCLMULQDQ
 	case RTE_NET_CRC_SSE42:
-		handlers = handlers_sse42;
-		break;
-#elif defined ARM64_NEON_PMULL
-		/* fall-through */
+		handlers = sse42_pclmulqdq_get_handlers();
+		break; /* for x86, always break here */
 	case RTE_NET_CRC_NEON:
-		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
-			handlers = handlers_neon;
-			break;
-		}
-#endif
+		handlers = neon_pmull_get_handlers();
 		/* fall-through */
 	case RTE_NET_CRC_SCALAR:
 		/* fall-through */
 	default:
-		handlers = handlers_scalar;
 		break;
 	}
+
+	if (handlers == NULL)
+		handlers = handlers_scalar;
 }

 uint32_t
@@ -188,15 +233,10 @@ RTE_INIT(rte_net_crc_init)

 	rte_net_crc_scalar_init();

-#ifdef X86_64_SSE42_PCLMULQDQ
-	alg = RTE_NET_CRC_SSE42;
-	rte_net_crc_sse42_init();
-#elif defined ARM64_NEON_PMULL
-	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+	if (sse42_pclmulqdq_init())
+		alg = RTE_NET_CRC_SSE42;
+	if (neon_pmull_init())
 		alg = RTE_NET_CRC_NEON;
-		rte_net_crc_neon_init();
-	}
-#endif

 	rte_net_crc_set_alg(alg);
 }
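
For reference, a minimal sketch of how an application might exercise the
runtime-selected CRC paths through the public API declared in rte_net_crc.h.
rte_net_crc_calc() and the enum values below belong to the existing librte_net
API rather than to this patch, and the buffer contents are arbitrary
placeholders:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <rte_net_crc.h>

int
main(void)
{
	uint8_t buf[64];
	uint32_t crc;

	memset(buf, 0xa5, sizeof(buf));

	/*
	 * The RTE_INIT constructor in rte_net_crc.c has already probed the
	 * CPU flags at process start-up and registered the fastest handlers.
	 * Requesting SSE4.2/PCLMULQDQ explicitly is still safe: the library
	 * falls back to the scalar handlers when the CPU or the build lacks
	 * support.
	 */
	rte_net_crc_set_alg(RTE_NET_CRC_SSE42);

	crc = rte_net_crc_calc(buf, sizeof(buf), RTE_NET_CRC32_ETH);
	printf("CRC32 Ethernet: 0x%08x\n", crc);

	crc = rte_net_crc_calc(buf, sizeof(buf), RTE_NET_CRC16_CCITT);
	printf("CRC16 CCITT: 0x%04x\n", crc);

	return 0;
}

On an x86 build compiled without PCLMULQDQ support, or on a CPU lacking the
instruction, the same program silently computes both CRCs with the scalar
look-up-table path.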