From: Konstantin Ananyev Date: Tue, 6 Oct 2020 15:03:14 +0000 (+0100) Subject: acl: deduplicate AVX512 code X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=c5cf148d8915be0559b15aeff1fba649eccd1b5f;p=dpdk.git acl: deduplicate AVX512 code Current rte_acl_classify_avx512x32() and rte_acl_classify_avx512x16() code paths are very similar. The only differences are due to 256/512 register/instrincts naming conventions. So to deduplicate the code: - Move common code into “acl_run_avx512_common.h” - Use macros to hide difference in naming conventions Signed-off-by: Konstantin Ananyev --- diff --git a/lib/librte_acl/acl_run_avx512_common.h b/lib/librte_acl/acl_run_avx512_common.h new file mode 100644 index 0000000000..1baf79b7ae --- /dev/null +++ b/lib/librte_acl/acl_run_avx512_common.h @@ -0,0 +1,477 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +/* + * WARNING: It is not recommended to include this file directly. + * Please include "acl_run_avx512x*.h" instead. + * To make this file to generate proper code an includer has to + * define several macros, refer to "acl_run_avx512x*.h" for more details. + */ + +/* + * Calculate the address of the next transition for + * all types of nodes. Note that only DFA nodes and range + * nodes actually transition to another node. Match + * nodes not supposed to be encountered here. + * For quad range nodes: + * Calculate number of range boundaries that are less than the + * input value. Range boundaries for each node are in signed 8 bit, + * ordered from -128 to 127. + * This is effectively a popcnt of bytes that are greater than the + * input byte. + * Single nodes are processed in the same ways as quad range nodes. + */ +static __rte_always_inline _T_simd +_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input, + _T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi) +{ + __mmask64 qm; + _T_mask dfa_msk; + _T_simd addr, in, node_type, r, t; + _T_simd dfa_ofs, quad_ofs; + + t = _M_SI_(xor)(index_mask, index_mask); + in = _M_I_(shuffle_epi8)(next_input, shuffle_input); + + /* Calc node type and node addr */ + node_type = _M_SI_(andnot)(index_mask, tr_lo); + addr = _M_SI_(and)(index_mask, tr_lo); + + /* mask for DFA type(0) nodes */ + dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t); + + /* DFA calculations. */ + r = _M_I_(srli_epi32)(in, 30); + r = _M_I_(add_epi8)(r, range_base); + t = _M_I_(srli_epi32)(in, 24); + r = _M_I_(shuffle_epi8)(tr_hi, r); + + dfa_ofs = _M_I_(sub_epi32)(t, r); + + /* QUAD/SINGLE calculations. */ + qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi); + t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX); + t = _M_I_(lzcnt_epi32)(t); + t = _M_I_(srli_epi32)(t, 3); + quad_ofs = _M_I_(sub_epi32)(four_32, t); + + /* blend DFA and QUAD/SINGLE. */ + t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs); + + /* calculate address for next transitions. */ + addr = _M_I_(add_epi32)(addr, t); + return addr; +} + +/* + * Process _N_ transitions in parallel. + * tr_lo contains low 32 bits for _N_ transition. + * tr_hi contains high 32 bits for _N_ transition. + * next_input contains up to 4 input bytes for _N_ flows. + */ +static __rte_always_inline _T_simd +_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo, + _T_simd *tr_hi) +{ + const int32_t *tr; + _T_simd addr; + + tr = (const int32_t *)(uintptr_t)trans; + + /* Calculate the address (array index) for all _N_ transitions. 
*/ + addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input), + _SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi); + + /* load lower 32 bits of _N_ transactions at once. */ + *tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0])); + + next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT); + + /* load high 32 bits of _N_ transactions at once. */ + *tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0])); + + return next_input; +} + +/* + * Execute first transition for up to _N_ flows in parallel. + * next_input should contain one input byte for up to _N_ flows. + * msk - mask of active flows. + * tr_lo contains low 32 bits for up to _N_ transitions. + * tr_hi contains high 32 bits for up to _N_ transitions. + */ +static __rte_always_inline void +_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input, + _T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi) +{ + const int32_t *tr; + _T_simd addr, root; + + tr = (const int32_t *)(uintptr_t)flow->trans; + + addr = _M_I_(set1_epi32)(UINT8_MAX); + root = _M_I_(set1_epi32)(flow->root_index); + + addr = _M_SI_(and)(next_input, addr); + addr = _M_I_(add_epi32)(root, addr); + + /* load lower 32 bits of _N_ transactions at once. */ + *tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr, + sizeof(flow->trans[0])); + + /* load high 32 bits of _N_ transactions at once. */ + *tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1), + sizeof(flow->trans[0])); +} + +/* + * Load and return next 4 input bytes for up to _N_ flows in parallel. + * pdata - 8x2 pointers to flow input data + * mask - mask of active flows. + * di - data indexes for these _N_ flows. + */ +static inline _T_simd +_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2], + uint32_t msk, _T_simd *di, uint32_t bnum) +{ + const int32_t *div; + uint32_t m[2]; + _T_simd one, zero, t, p[2]; + + div = (const int32_t *)flow->data_index; + + one = _M_I_(set1_epi32)(1); + zero = _M_SI_(xor)(one, one); + + /* load data offsets for given indexes */ + t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0])); + + /* increment data indexes */ + *di = _M_I_(mask_add_epi32)(*di, msk, *di, one); + + /* + * unsigned expand 32-bit indexes to 64-bit + * (for later pointer arithmetic), i.e: + * for (i = 0; i != _N_; i++) + * p[i/8].u64[i%8] = (uint64_t)t.u32[i]; + */ + p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]), + t); + p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]), + t); + + p[0] = _M_I_(add_epi64)(p[0], pdata[0]); + p[1] = _M_I_(add_epi64)(p[1], pdata[1]); + + /* load input byte(s), either one or four */ + + m[0] = msk & _SIMD_PTR_MSK_; + m[1] = msk >> _SIMD_PTR_NUM_; + + return _F_(gather_bytes)(zero, p, m, bnum); +} + +/* + * Start up to _N_ new flows. + * num - number of flows to start + * msk - mask of new flows. + * pdata - pointers to flow input data + * idx - match indexed for given flows + * di - data indexes for these flows. 
+ */ +static inline void +_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk, + _T_simd pdata[2], _T_simd *idx, _T_simd *di) +{ + uint32_t n, m[2], nm[2]; + _T_simd ni, nd[2]; + + /* split mask into two - one for each pdata[] */ + m[0] = msk & _SIMD_PTR_MSK_; + m[1] = msk >> _SIMD_PTR_NUM_; + + /* calculate masks for new flows */ + n = __builtin_popcount(m[0]); + nm[0] = (1 << n) - 1; + nm[1] = (1 << (num - n)) - 1; + + /* load input data pointers for new flows */ + nd[0] = _M_I_(maskz_loadu_epi64)(nm[0], + flow->idata + flow->num_packets); + nd[1] = _M_I_(maskz_loadu_epi64)(nm[1], + flow->idata + flow->num_packets + n); + + /* calculate match indexes of new flows */ + ni = _M_I_(set1_epi32)(flow->num_packets); + ni = _M_I_(add_epi32)(ni, _SV_(idx_add)); + + /* merge new and existing flows data */ + pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]); + pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]); + + /* update match and data indexes */ + *idx = _M_I_(mask_expand_epi32)(*idx, msk, ni); + *di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di); + + flow->num_packets += num; +} + +/* + * Process found matches for up to _N_ flows. + * fmsk - mask of active flows + * rmsk - mask of found matches + * pdata - pointers to flow input data + * di - data indexes for these flows + * idx - match indexed for given flows + * tr_lo contains low 32 bits for up to _N_ transitions. + * tr_hi contains high 32 bits for up to _N_ transitions. + */ +static inline uint32_t +_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk, + uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx, + _T_simd *tr_lo, _T_simd *tr_hi) +{ + uint32_t n; + _T_simd res; + + if (rmsk[0] == 0) + return 0; + + /* extract match indexes */ + res = _M_SI_(and)(tr_lo[0], _SV_(index_mask)); + + /* mask matched transitions to nop */ + tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle)); + tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle)); + + /* save found match indexes */ + _M_I_(mask_i32scatter_epi32)(flow->matches, rmsk[0], idx[0], res, + sizeof(flow->matches[0])); + + /* update masks and start new flows for matches */ + n = update_flow_mask(flow, fmsk, rmsk); + _F_(start_flow)(flow, n, rmsk[0], pdata, idx, di); + + return n; +} + +/* + * Test for matches ut to (2 * _N_) flows at once, + * if matches exist - process them and start new flows. 
+ */ +static inline void +_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2], + _T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2], + _T_simd tr_lo[2], _T_simd tr_hi[2]) +{ + uint32_t n[2]; + uint32_t rm[2]; + + /* check for matches */ + rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask)); + rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask)); + + /* till unprocessed matches exist */ + while ((rm[0] | rm[1]) != 0) { + + /* process matches and start new flows */ + n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0], + &di[0], &idx[0], &tr_lo[0], &tr_hi[0]); + n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2], + &di[1], &idx[1], &tr_lo[1], &tr_hi[1]); + + /* execute first transition for new flows, if any */ + + if (n[0] != 0) { + inp[0] = _F_(get_next_bytes)(flow, &pdata[0], + rm[0], &di[0], flow->first_load_sz); + _F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0], + &tr_hi[0]); + rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], + _SV_(match_mask)); + } + + if (n[1] != 0) { + inp[1] = _F_(get_next_bytes)(flow, &pdata[2], + rm[1], &di[1], flow->first_load_sz); + _F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1], + &tr_hi[1]); + rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], + _SV_(match_mask)); + } + } +} + +/* + * Perform search for up to (2 * _N_) flows in parallel. + * Use two sets of metadata, each serves _N_ flows max. + */ +static inline void +_F_(search_trie)(struct acl_flow_avx512 *flow) +{ + uint32_t fm[2]; + _T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2]; + + /* first 1B load */ + _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, + &pdata[0], &idx[0], &di[0]); + _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, + &pdata[2], &idx[1], &di[1]); + + in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0], + flow->first_load_sz); + in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1], + flow->first_load_sz); + + _F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]); + _F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]); + + fm[0] = _SIMD_MASK_MAX_; + fm[1] = _SIMD_MASK_MAX_; + + /* match check */ + _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi); + + while ((fm[0] | fm[1]) != 0) { + + /* load next 4B */ + + in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0], + &di[0], sizeof(uint32_t)); + in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1], + &di[1], sizeof(uint32_t)); + + /* main 4B loop */ + + in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + /* check for matches */ + _F_(match_check_process)(flow, fm, pdata, di, idx, in, + tr_lo, tr_hi); + } +} + +/* + * resolve match index to actual result/priority offset. + */ +static inline _T_simd +_F_(resolve_match_idx)(_T_simd mi) +{ + RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) != + 1 << (match_log + 2)); + return _M_I_(slli_epi32)(mi, match_log); +} + +/* + * Resolve multiple matches for the same flow based on priority. 
+ */ +static inline _T_simd +_F_(resolve_pri)(const int32_t res[], const int32_t pri[], + const uint32_t match[], _T_mask msk, uint32_t nb_trie, + uint32_t nb_skip) +{ + uint32_t i; + const uint32_t *pm; + _T_mask m; + _T_simd cp, cr, np, nr, mch; + + const _T_simd zero = _M_I_(set1_epi32)(0); + + /* get match indexes */ + mch = _M_I_(maskz_loadu_epi32)(msk, match); + mch = _F_(resolve_match_idx)(mch); + + /* read result and priority values for first trie */ + cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0])); + cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0])); + + /* + * read result and priority values for next tries and select one + * with highest priority. + */ + for (i = 1, pm = match + nb_skip; i != nb_trie; + i++, pm += nb_skip) { + + mch = _M_I_(maskz_loadu_epi32)(msk, pm); + mch = _F_(resolve_match_idx)(mch); + + nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, + sizeof(res[0])); + np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, + sizeof(pri[0])); + + m = _M_I_(cmpgt_epi32_mask)(cp, np); + cr = _M_I_(mask_mov_epi32)(nr, m, cr); + cp = _M_I_(mask_mov_epi32)(np, m, cp); + } + + return cr; +} + +/* + * Resolve num (<= _N_) matches for single category + */ +static inline void +_F_(resolve_sc)(uint32_t result[], const int32_t res[], + const int32_t pri[], const uint32_t match[], uint32_t nb_pkt, + uint32_t nb_trie, uint32_t nb_skip) +{ + _T_mask msk; + _T_simd cr; + + msk = (1 << nb_pkt) - 1; + cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip); + _M_I_(mask_storeu_epi32)(result, msk, cr); +} + +/* + * Resolve matches for single category + */ +static inline void +_F_(resolve_single_cat)(uint32_t result[], + const struct rte_acl_match_results pr[], const uint32_t match[], + uint32_t nb_pkt, uint32_t nb_trie) +{ + uint32_t j, k, n; + const int32_t *res, *pri; + _T_simd cr[2]; + + res = (const int32_t *)pr->results; + pri = pr->priority; + + for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) { + + j = k + _SIMD_MASK_BIT_; + + cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_, + nb_trie, nb_pkt); + cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_, + nb_trie, nb_pkt); + + _M_SI_(storeu)((void *)(result + k), cr[0]); + _M_SI_(storeu)((void *)(result + j), cr[1]); + } + + n = nb_pkt - k; + if (n != 0) { + if (n > _SIMD_MASK_BIT_) { + _F_(resolve_sc)(result + k, res, pri, match + k, + _SIMD_MASK_BIT_, nb_trie, nb_pkt); + k += _SIMD_MASK_BIT_; + n -= _SIMD_MASK_BIT_; + } + _F_(resolve_sc)(result + k, res, pri, match + k, n, + nb_trie, nb_pkt); + } +} diff --git a/lib/librte_acl/acl_run_avx512x16.h b/lib/librte_acl/acl_run_avx512x16.h index a5fc6b85b3..da244bc257 100644 --- a/lib/librte_acl/acl_run_avx512x16.h +++ b/lib/librte_acl/acl_run_avx512x16.h @@ -2,16 +2,57 @@ * Copyright(c) 2020 Intel Corporation */ -#define MASK16_BIT (sizeof(__mmask16) * CHAR_BIT) +/* + * Defines required by "acl_run_avx512_common.h". + * Note that all of them has to be undefined by the end + * of this file, as "acl_run_avx512_common.h" can be included several + * times from different *.h files for the same *.c. + */ + +/* + * This implementation uses 512-bit registers(zmm) and instrincts. + * So our main SIMD type is 512-bit width and each such variable can + * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel. + */ +#define _T_simd __m512i +#define _T_mask __mmask16 + +/* Naming convention for static const variables. 
*/ +#define _SC_(x) zmm_##x +#define _SV_(x) (zmm_##x.z) + +/* Naming convention for internal functions. */ +#define _F_(x) x##_avx512x16 + +/* + * Same instrincts have different syntaxis (depending on the bit-width), + * so to overcome that few macros need to be defined. + */ + +/* Naming convention for generic epi(packed integers) type instrincts. */ +#define _M_I_(x) _mm512_##x + +/* Naming convention for si(whole simd integer) type instrincts. */ +#define _M_SI_(x) _mm512_##x##_si512 + +/* Naming convention for masked gather type instrincts. */ +#define _M_MGI_(x) _mm512_##x + +/* Naming convention for gather type instrincts. */ +#define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale) -#define NUM_AVX512X16X2 (2 * MASK16_BIT) -#define MSK_AVX512X16X2 (NUM_AVX512X16X2 - 1) +/* num/mask of transitions per SIMD regs */ +#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t)) +#define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t) + +#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_) +#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1) /* num/mask of pointers per SIMD regs */ -#define ZMM_PTR_NUM (sizeof(__m512i) / sizeof(uintptr_t)) -#define ZMM_PTR_MSK RTE_LEN2MASK(ZMM_PTR_NUM, uint32_t) +#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t)) +#define _SIMD_PTR_MSK_ RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t) -static const __rte_x86_zmm_t zmm_match_mask = { +static const __rte_x86_zmm_t _SC_(match_mask) = { .u32 = { RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH, @@ -32,7 +73,7 @@ static const __rte_x86_zmm_t zmm_match_mask = { }, }; -static const __rte_x86_zmm_t zmm_index_mask = { +static const __rte_x86_zmm_t _SC_(index_mask) = { .u32 = { RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, @@ -53,7 +94,7 @@ static const __rte_x86_zmm_t zmm_index_mask = { }, }; -static const __rte_x86_zmm_t zmm_trlo_idle = { +static const __rte_x86_zmm_t _SC_(trlo_idle) = { .u32 = { RTE_ACL_IDLE_NODE, RTE_ACL_IDLE_NODE, @@ -74,7 +115,7 @@ static const __rte_x86_zmm_t zmm_trlo_idle = { }, }; -static const __rte_x86_zmm_t zmm_trhi_idle = { +static const __rte_x86_zmm_t _SC_(trhi_idle) = { .u32 = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -83,7 +124,7 @@ static const __rte_x86_zmm_t zmm_trhi_idle = { }, }; -static const __rte_x86_zmm_t zmm_shuffle_input = { +static const __rte_x86_zmm_t _SC_(shuffle_input) = { .u32 = { 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, @@ -92,7 +133,7 @@ static const __rte_x86_zmm_t zmm_shuffle_input = { }, }; -static const __rte_x86_zmm_t zmm_four_32 = { +static const __rte_x86_zmm_t _SC_(four_32) = { .u32 = { 4, 4, 4, 4, 4, 4, 4, 4, @@ -101,7 +142,7 @@ static const __rte_x86_zmm_t zmm_four_32 = { }, }; -static const __rte_x86_zmm_t zmm_idx_add = { +static const __rte_x86_zmm_t _SC_(idx_add) = { .u32 = { 0, 1, 2, 3, 4, 5, 6, 7, @@ -110,7 +151,7 @@ static const __rte_x86_zmm_t zmm_idx_add = { }, }; -static const __rte_x86_zmm_t zmm_range_base = { +static const __rte_x86_zmm_t _SC_(range_base) = { .u32 = { 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, @@ -119,16 +160,16 @@ static const __rte_x86_zmm_t zmm_range_base = { }, }; -static const __rte_x86_zmm_t zmm_pminp = { +static const __rte_x86_zmm_t _SC_(pminp) = { .u32 = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, }, }; -static const __mmask16 zmm_pmidx_msk = 0x5555; +static const _T_mask _SC_(pmidx_msk) = 0x5555; -static const __rte_x86_zmm_t zmm_pmidx[2] = { +static const __rte_x86_zmm_t 
_SC_(pmidx[2]) = { [0] = { .u32 = { 0, 0, 1, 0, 2, 0, 3, 0, @@ -148,7 +189,7 @@ static const __rte_x86_zmm_t zmm_pmidx[2] = { * gather load on a byte quantity. So we have to mimic it in SW, * by doing 8x1B scalar loads. */ -static inline ymm_t +static inline __m256i _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask) { rte_ymm_t v; @@ -156,7 +197,7 @@ _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask) static const uint32_t zero; - p.z = _mm512_mask_set1_epi64(pdata, mask ^ ZMM_PTR_MSK, + p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_, (uintptr_t)&zero); v.u32[0] = *(uint8_t *)p.u64[0]; @@ -172,369 +213,29 @@ _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask) } /* - * Calculate the address of the next transition for - * all types of nodes. Note that only DFA nodes and range - * nodes actually transition to another node. Match - * nodes not supposed to be encountered here. - * For quad range nodes: - * Calculate number of range boundaries that are less than the - * input value. Range boundaries for each node are in signed 8 bit, - * ordered from -128 to 127. - * This is effectively a popcnt of bytes that are greater than the - * input byte. - * Single nodes are processed in the same ways as quad range nodes. - */ -static __rte_always_inline __m512i -calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input, - __m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi) -{ - __mmask64 qm; - __mmask16 dfa_msk; - __m512i addr, in, node_type, r, t; - __m512i dfa_ofs, quad_ofs; - - t = _mm512_xor_si512(index_mask, index_mask); - in = _mm512_shuffle_epi8(next_input, shuffle_input); - - /* Calc node type and node addr */ - node_type = _mm512_andnot_si512(index_mask, tr_lo); - addr = _mm512_and_si512(index_mask, tr_lo); - - /* mask for DFA type(0) nodes */ - dfa_msk = _mm512_cmpeq_epi32_mask(node_type, t); - - /* DFA calculations. */ - r = _mm512_srli_epi32(in, 30); - r = _mm512_add_epi8(r, range_base); - t = _mm512_srli_epi32(in, 24); - r = _mm512_shuffle_epi8(tr_hi, r); - - dfa_ofs = _mm512_sub_epi32(t, r); - - /* QUAD/SINGLE calculations. */ - qm = _mm512_cmpgt_epi8_mask(in, tr_hi); - t = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX); - t = _mm512_lzcnt_epi32(t); - t = _mm512_srli_epi32(t, 3); - quad_ofs = _mm512_sub_epi32(four_32, t); - - /* blend DFA and QUAD/SINGLE. */ - t = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs); - - /* calculate address for next transitions. */ - addr = _mm512_add_epi32(addr, t); - return addr; -} - -/* - * Process 16 transitions in parallel. - * tr_lo contains low 32 bits for 16 transition. - * tr_hi contains high 32 bits for 16 transition. - * next_input contains up to 4 input bytes for 16 flows. + * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel. */ static __rte_always_inline __m512i -transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo, - __m512i *tr_hi) -{ - const int32_t *tr; - __m512i addr; - - tr = (const int32_t *)(uintptr_t)trans; - - /* Calculate the address (array index) for all 16 transitions. */ - addr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z, - zmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi); - - /* load lower 32 bits of 16 transactions at once. */ - *tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0])); - - next_input = _mm512_srli_epi32(next_input, CHAR_BIT); - - /* load high 32 bits of 16 transactions at once. 
*/ - *tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0])); - - return next_input; -} - -/* - * Execute first transition for up to 16 flows in parallel. - * next_input should contain one input byte for up to 16 flows. - * msk - mask of active flows. - * tr_lo contains low 32 bits for up to 16 transitions. - * tr_hi contains high 32 bits for up to 16 transitions. - */ -static __rte_always_inline void -first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input, - __mmask16 msk, __m512i *tr_lo, __m512i *tr_hi) +_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2], + uint32_t bnum) { - const int32_t *tr; - __m512i addr, root; - - tr = (const int32_t *)(uintptr_t)flow->trans; - - addr = _mm512_set1_epi32(UINT8_MAX); - root = _mm512_set1_epi32(flow->root_index); - - addr = _mm512_and_si512(next_input, addr); - addr = _mm512_add_epi32(root, addr); - - /* load lower 32 bits of 16 transactions at once. */ - *tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr, - sizeof(flow->trans[0])); - - /* load high 32 bits of 16 transactions at once. */ - *tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1), - sizeof(flow->trans[0])); -} - -/* - * Load and return next 4 input bytes for up to 16 flows in parallel. - * pdata - 8x2 pointers to flow input data - * mask - mask of active flows. - * di - data indexes for these 16 flows. - */ -static inline __m512i -get_next_bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2], - uint32_t msk, __m512i *di, uint32_t bnum) -{ - const int32_t *div; - uint32_t m[2]; - __m512i one, zero, t, p[2]; - ymm_t inp[2]; - - div = (const int32_t *)flow->data_index; - - one = _mm512_set1_epi32(1); - zero = _mm512_xor_si512(one, one); - - /* load data offsets for given indexes */ - t = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0])); - - /* increment data indexes */ - *di = _mm512_mask_add_epi32(*di, msk, *di, one); - - /* - * unsigned expand 32-bit indexes to 64-bit - * (for later pointer arithmetic), i.e: - * for (i = 0; i != 16; i++) - * p[i/8].u64[i%8] = (uint64_t)t.u32[i]; - */ - p[0] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[0].z, t); - p[1] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[1].z, t); - - p[0] = _mm512_add_epi64(p[0], pdata[0]); - p[1] = _mm512_add_epi64(p[1], pdata[1]); - - /* load input byte(s), either one or four */ - - m[0] = msk & ZMM_PTR_MSK; - m[1] = msk >> ZMM_PTR_NUM; + __m256i inp[2]; if (bnum == sizeof(uint8_t)) { inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]); inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]); } else { inp[0] = _mm512_mask_i64gather_epi32( - _mm512_castsi512_si256(zero), m[0], p[0], - NULL, sizeof(uint8_t)); + _mm512_castsi512_si256(zero), + m[0], p[0], NULL, sizeof(uint8_t)); inp[1] = _mm512_mask_i64gather_epi32( - _mm512_castsi512_si256(zero), m[1], p[1], - NULL, sizeof(uint8_t)); + _mm512_castsi512_si256(zero), + m[1], p[1], NULL, sizeof(uint8_t)); } /* squeeze input into one 512-bit register */ return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]), - zmm_pminp.z, _mm512_castsi256_si512(inp[1])); -} - -/* - * Start up to 16 new flows. - * num - number of flows to start - * msk - mask of new flows. - * pdata - pointers to flow input data - * idx - match indexed for given flows - * di - data indexes for these flows. 
- */ -static inline void -start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk, - __m512i pdata[2], __m512i *idx, __m512i *di) -{ - uint32_t n, m[2], nm[2]; - __m512i ni, nd[2]; - - /* split mask into two - one for each pdata[] */ - m[0] = msk & ZMM_PTR_MSK; - m[1] = msk >> ZMM_PTR_NUM; - - /* calculate masks for new flows */ - n = __builtin_popcount(m[0]); - nm[0] = (1 << n) - 1; - nm[1] = (1 << (num - n)) - 1; - - /* load input data pointers for new flows */ - nd[0] = _mm512_maskz_loadu_epi64(nm[0], - flow->idata + flow->num_packets); - nd[1] = _mm512_maskz_loadu_epi64(nm[1], - flow->idata + flow->num_packets + n); - - /* calculate match indexes of new flows */ - ni = _mm512_set1_epi32(flow->num_packets); - ni = _mm512_add_epi32(ni, zmm_idx_add.z); - - /* merge new and existing flows data */ - pdata[0] = _mm512_mask_expand_epi64(pdata[0], m[0], nd[0]); - pdata[1] = _mm512_mask_expand_epi64(pdata[1], m[1], nd[1]); - - /* update match and data indexes */ - *idx = _mm512_mask_expand_epi32(*idx, msk, ni); - *di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di); - - flow->num_packets += num; -} - -/* - * Process found matches for up to 16 flows. - * fmsk - mask of active flows - * rmsk - mask of found matches - * pdata - pointers to flow input data - * di - data indexes for these flows - * idx - match indexed for given flows - * tr_lo contains low 32 bits for up to 8 transitions. - * tr_hi contains high 32 bits for up to 8 transitions. - */ -static inline uint32_t -match_process_avx512x16(struct acl_flow_avx512 *flow, uint32_t *fmsk, - uint32_t *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx, - __m512i *tr_lo, __m512i *tr_hi) -{ - uint32_t n; - __m512i res; - - if (rmsk[0] == 0) - return 0; - - /* extract match indexes */ - res = _mm512_and_si512(tr_lo[0], zmm_index_mask.z); - - /* mask matched transitions to nop */ - tr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z); - tr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z); - - /* save found match indexes */ - _mm512_mask_i32scatter_epi32(flow->matches, rmsk[0], - idx[0], res, sizeof(flow->matches[0])); - - /* update masks and start new flows for matches */ - n = update_flow_mask(flow, fmsk, rmsk); - start_flow16(flow, n, rmsk[0], pdata, idx, di); - - return n; -} - -/* - * Test for matches ut to 32 (2x16) flows at once, - * if matches exist - process them and start new flows. 
- */ -static inline void -match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2], - __m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2], - __m512i tr_lo[2], __m512i tr_hi[2]) -{ - uint32_t n[2]; - uint32_t rm[2]; - - /* check for matches */ - rm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z); - rm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z); - - /* till unprocessed matches exist */ - while ((rm[0] | rm[1]) != 0) { - - /* process matches and start new flows */ - n[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0], - &di[0], &idx[0], &tr_lo[0], &tr_hi[0]); - n[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2], - &di[1], &idx[1], &tr_lo[1], &tr_hi[1]); - - /* execute first transition for new flows, if any */ - - if (n[0] != 0) { - inp[0] = get_next_bytes_avx512x16(flow, &pdata[0], - rm[0], &di[0], flow->first_load_sz); - first_trans16(flow, inp[0], rm[0], &tr_lo[0], - &tr_hi[0]); - rm[0] = _mm512_test_epi32_mask(tr_lo[0], - zmm_match_mask.z); - } - - if (n[1] != 0) { - inp[1] = get_next_bytes_avx512x16(flow, &pdata[2], - rm[1], &di[1], flow->first_load_sz); - first_trans16(flow, inp[1], rm[1], &tr_lo[1], - &tr_hi[1]); - rm[1] = _mm512_test_epi32_mask(tr_lo[1], - zmm_match_mask.z); - } - } -} - -/* - * Perform search for up to 32 flows in parallel. - * Use two sets of metadata, each serves 16 flows max. - * So in fact we perform search for 2x16 flows. - */ -static inline void -search_trie_avx512x16x2(struct acl_flow_avx512 *flow) -{ - uint32_t fm[2]; - __m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2]; - - /* first 1B load */ - start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]); - start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]); - - in[0] = get_next_bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0], - flow->first_load_sz); - in[1] = get_next_bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1], - flow->first_load_sz); - - first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]); - first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]); - - fm[0] = UINT16_MAX; - fm[1] = UINT16_MAX; - - /* match check */ - match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in, - tr_lo, tr_hi); - - while ((fm[0] | fm[1]) != 0) { - - /* load next 4B */ - - in[0] = get_next_bytes_avx512x16(flow, &pdata[0], fm[0], - &di[0], sizeof(uint32_t)); - in[1] = get_next_bytes_avx512x16(flow, &pdata[2], fm[1], - &di[1], sizeof(uint32_t)); - - /* main 4B loop */ - - in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - /* check for matches */ - match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in, - tr_lo, tr_hi); - } + _SV_(pminp), _mm512_castsi256_si512(inp[1])); } /* @@ -582,120 +283,12 @@ resolve_mcgt8_avx512x1(uint32_t result[], } } -/* - * resolve match index to actual result/priority offset. 
- */ -static inline __m512i -resolve_match_idx_avx512x16(__m512i mi) -{ - RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) != - 1 << (match_log + 2)); - return _mm512_slli_epi32(mi, match_log); -} - -/* - * Resolve multiple matches for the same flow based on priority. - */ -static inline __m512i -resolve_pri_avx512x16(const int32_t res[], const int32_t pri[], - const uint32_t match[], __mmask16 msk, uint32_t nb_trie, - uint32_t nb_skip) -{ - uint32_t i; - const uint32_t *pm; - __mmask16 m; - __m512i cp, cr, np, nr, mch; - - const __m512i zero = _mm512_set1_epi32(0); - - /* get match indexes */ - mch = _mm512_maskz_loadu_epi32(msk, match); - mch = resolve_match_idx_avx512x16(mch); - - /* read result and priority values for first trie */ - cr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0])); - cp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0])); - - /* - * read result and priority values for next tries and select one - * with highest priority. - */ - for (i = 1, pm = match + nb_skip; i != nb_trie; - i++, pm += nb_skip) { - - mch = _mm512_maskz_loadu_epi32(msk, pm); - mch = resolve_match_idx_avx512x16(mch); - - nr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, - sizeof(res[0])); - np = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, - sizeof(pri[0])); - - m = _mm512_cmpgt_epi32_mask(cp, np); - cr = _mm512_mask_mov_epi32(nr, m, cr); - cp = _mm512_mask_mov_epi32(np, m, cp); - } - - return cr; -} - -/* - * Resolve num (<= 16) matches for single category - */ -static inline void -resolve_sc_avx512x16(uint32_t result[], const int32_t res[], - const int32_t pri[], const uint32_t match[], uint32_t nb_pkt, - uint32_t nb_trie, uint32_t nb_skip) -{ - __mmask16 msk; - __m512i cr; - - msk = (1 << nb_pkt) - 1; - cr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip); - _mm512_mask_storeu_epi32(result, msk, cr); -} +#include "acl_run_avx512_common.h" /* - * Resolve matches for single category + * Perform search for up to (2 * 16) flows in parallel. + * Use two sets of metadata, each serves 16 flows max. 
*/ -static inline void -resolve_sc_avx512x16x2(uint32_t result[], - const struct rte_acl_match_results pr[], const uint32_t match[], - uint32_t nb_pkt, uint32_t nb_trie) -{ - uint32_t j, k, n; - const int32_t *res, *pri; - __m512i cr[2]; - - res = (const int32_t *)pr->results; - pri = pr->priority; - - for (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) { - - j = k + MASK16_BIT; - - cr[0] = resolve_pri_avx512x16(res, pri, match + k, UINT16_MAX, - nb_trie, nb_pkt); - cr[1] = resolve_pri_avx512x16(res, pri, match + j, UINT16_MAX, - nb_trie, nb_pkt); - - _mm512_storeu_si512(result + k, cr[0]); - _mm512_storeu_si512(result + j, cr[1]); - } - - n = nb_pkt - k; - if (n != 0) { - if (n > MASK16_BIT) { - resolve_sc_avx512x16(result + k, res, pri, match + k, - MASK16_BIT, nb_trie, nb_pkt); - k += MASK16_BIT; - n -= MASK16_BIT; - } - resolve_sc_avx512x16(result + k, res, pri, match + k, n, - nb_trie, nb_pkt); - } -} - static inline int search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data, uint32_t *results, uint32_t total_packets, uint32_t categories) @@ -711,7 +304,7 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data, acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets); /* process the trie */ - search_trie_avx512x16x2(&flow); + _F_(search_trie)(&flow); } /* resolve matches */ @@ -719,7 +312,7 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data, (ctx->trans_table + ctx->match_index); if (categories == 1) - resolve_sc_avx512x16x2(results, pr, match, total_packets, + _F_(resolve_single_cat)(results, pr, match, total_packets, ctx->num_tries); else if (categories <= RTE_ACL_MAX_CATEGORIES / 2) resolve_mcle8_avx512x1(results, pr, match, total_packets, @@ -730,3 +323,19 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data, return 0; } + +#undef _SIMD_PTR_MSK_ +#undef _SIMD_PTR_NUM_ +#undef _SIMD_FLOW_MSK_ +#undef _SIMD_FLOW_NUM_ +#undef _SIMD_MASK_MAX_ +#undef _SIMD_MASK_BIT_ +#undef _M_GI_ +#undef _M_MGI_ +#undef _M_SI_ +#undef _M_I_ +#undef _F_ +#undef _SV_ +#undef _SC_ +#undef _T_mask +#undef _T_simd diff --git a/lib/librte_acl/acl_run_avx512x8.h b/lib/librte_acl/acl_run_avx512x8.h index fedd79b9ae..61ac9d1b47 100644 --- a/lib/librte_acl/acl_run_avx512x8.h +++ b/lib/librte_acl/acl_run_avx512x8.h @@ -2,16 +2,57 @@ * Copyright(c) 2020 Intel Corporation */ -#define MASK8_BIT (sizeof(__mmask8) * CHAR_BIT) +/* + * Defines required by "acl_run_avx512_common.h". + * Note that all of them has to be undefined by the end + * of this file, as "acl_run_avx512_common.h" can be included several + * times from different *.h files for the same *.c. + */ + +/* + * This implementation uses 256-bit registers(ymm) and instrincts. + * So our main SIMD type is 256-bit width and each such variable can + * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel. + */ +#define _T_simd __m256i +#define _T_mask __mmask8 + +/* Naming convention for static const variables. */ +#define _SC_(x) ymm_##x +#define _SV_(x) (ymm_##x.y) + +/* Naming convention for internal functions. */ +#define _F_(x) x##_avx512x8 + +/* + * Same instrincts have different syntaxis (depending on the bit-width), + * so to overcome that few macros need to be defined. + */ + +/* Naming convention for generic epi(packed integers) type instrincts. */ +#define _M_I_(x) _mm256_##x + +/* Naming convention for si(whole simd integer) type instrincts. 
*/ +#define _M_SI_(x) _mm256_##x##_si256 -#define NUM_AVX512X8X2 (2 * MASK8_BIT) -#define MSK_AVX512X8X2 (NUM_AVX512X8X2 - 1) +/* Naming convention for masked gather type instrincts. */ +#define _M_MGI_(x) _mm256_m##x + +/* Naming convention for gather type instrincts. */ +#define _M_GI_(name, idx, base, scale) _mm256_##name(base, idx, scale) + +/* num/mask of transitions per SIMD regs */ +#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t)) +#define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t) + +#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_) +#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1) /* num/mask of pointers per SIMD regs */ -#define YMM_PTR_NUM (sizeof(__m256i) / sizeof(uintptr_t)) -#define YMM_PTR_MSK RTE_LEN2MASK(YMM_PTR_NUM, uint32_t) +#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t)) +#define _SIMD_PTR_MSK_ RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t) -static const rte_ymm_t ymm_match_mask = { +static const rte_ymm_t _SC_(match_mask) = { .u32 = { RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH, @@ -24,7 +65,7 @@ static const rte_ymm_t ymm_match_mask = { }, }; -static const rte_ymm_t ymm_index_mask = { +static const rte_ymm_t _SC_(index_mask) = { .u32 = { RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, @@ -37,7 +78,7 @@ static const rte_ymm_t ymm_index_mask = { }, }; -static const rte_ymm_t ymm_trlo_idle = { +static const rte_ymm_t _SC_(trlo_idle) = { .u32 = { RTE_ACL_IDLE_NODE, RTE_ACL_IDLE_NODE, @@ -50,51 +91,51 @@ static const rte_ymm_t ymm_trlo_idle = { }, }; -static const rte_ymm_t ymm_trhi_idle = { +static const rte_ymm_t _SC_(trhi_idle) = { .u32 = { 0, 0, 0, 0, 0, 0, 0, 0, }, }; -static const rte_ymm_t ymm_shuffle_input = { +static const rte_ymm_t _SC_(shuffle_input) = { .u32 = { 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, }, }; -static const rte_ymm_t ymm_four_32 = { +static const rte_ymm_t _SC_(four_32) = { .u32 = { 4, 4, 4, 4, 4, 4, 4, 4, }, }; -static const rte_ymm_t ymm_idx_add = { +static const rte_ymm_t _SC_(idx_add) = { .u32 = { 0, 1, 2, 3, 4, 5, 6, 7, }, }; -static const rte_ymm_t ymm_range_base = { +static const rte_ymm_t _SC_(range_base) = { .u32 = { 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, }, }; -static const rte_ymm_t ymm_pminp = { +static const rte_ymm_t _SC_(pminp) = { .u32 = { 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, }, }; -static const __mmask16 ymm_pmidx_msk = 0x55; +static const __mmask16 _SC_(pmidx_msk) = 0x55; -static const rte_ymm_t ymm_pmidx[2] = { +static const rte_ymm_t _SC_(pmidx[2]) = { [0] = { .u32 = { 0, 0, 1, 0, 2, 0, 3, 0, @@ -120,7 +161,7 @@ _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask) static const uint32_t zero; - p.y = _mm256_mask_set1_epi64(pdata, mask ^ YMM_PTR_MSK, + p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_, (uintptr_t)&zero); v.u32[0] = *(uint8_t *)p.u64[0]; @@ -132,483 +173,37 @@ _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask) } /* - * Calculate the address of the next transition for - * all types of nodes. Note that only DFA nodes and range - * nodes actually transition to another node. Match - * nodes not supposed to be encountered here. - * For quad range nodes: - * Calculate number of range boundaries that are less than the - * input value. Range boundaries for each node are in signed 8 bit, - * ordered from -128 to 127. - * This is effectively a popcnt of bytes that are greater than the - * input byte. - * Single nodes are processed in the same ways as quad range nodes. 
- */ -static __rte_always_inline __m256i -calc_addr8(__m256i index_mask, __m256i next_input, __m256i shuffle_input, - __m256i four_32, __m256i range_base, __m256i tr_lo, __m256i tr_hi) -{ - __mmask32 qm; - __mmask8 dfa_msk; - __m256i addr, in, node_type, r, t; - __m256i dfa_ofs, quad_ofs; - - t = _mm256_xor_si256(index_mask, index_mask); - in = _mm256_shuffle_epi8(next_input, shuffle_input); - - /* Calc node type and node addr */ - node_type = _mm256_andnot_si256(index_mask, tr_lo); - addr = _mm256_and_si256(index_mask, tr_lo); - - /* mask for DFA type(0) nodes */ - dfa_msk = _mm256_cmpeq_epi32_mask(node_type, t); - - /* DFA calculations. */ - r = _mm256_srli_epi32(in, 30); - r = _mm256_add_epi8(r, range_base); - t = _mm256_srli_epi32(in, 24); - r = _mm256_shuffle_epi8(tr_hi, r); - - dfa_ofs = _mm256_sub_epi32(t, r); - - /* QUAD/SINGLE calculations. */ - qm = _mm256_cmpgt_epi8_mask(in, tr_hi); - t = _mm256_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX); - t = _mm256_lzcnt_epi32(t); - t = _mm256_srli_epi32(t, 3); - quad_ofs = _mm256_sub_epi32(four_32, t); - - /* blend DFA and QUAD/SINGLE. */ - t = _mm256_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs); - - /* calculate address for next transitions. */ - addr = _mm256_add_epi32(addr, t); - return addr; -} - -/* - * Process 16 transitions in parallel. - * tr_lo contains low 32 bits for 16 transition. - * tr_hi contains high 32 bits for 16 transition. - * next_input contains up to 4 input bytes for 16 flows. + * Gather 4/1 input bytes for up to 8 (2*8) locations in parallel. */ static __rte_always_inline __m256i -transition8(__m256i next_input, const uint64_t *trans, __m256i *tr_lo, - __m256i *tr_hi) -{ - const int32_t *tr; - __m256i addr; - - tr = (const int32_t *)(uintptr_t)trans; - - /* Calculate the address (array index) for all 8 transitions. */ - addr = calc_addr8(ymm_index_mask.y, next_input, ymm_shuffle_input.y, - ymm_four_32.y, ymm_range_base.y, *tr_lo, *tr_hi); - - /* load lower 32 bits of 16 transactions at once. */ - *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0])); - - next_input = _mm256_srli_epi32(next_input, CHAR_BIT); - - /* load high 32 bits of 16 transactions at once. */ - *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0])); - - return next_input; -} - -/* - * Execute first transition for up to 16 flows in parallel. - * next_input should contain one input byte for up to 16 flows. - * msk - mask of active flows. - * tr_lo contains low 32 bits for up to 16 transitions. - * tr_hi contains high 32 bits for up to 16 transitions. - */ -static __rte_always_inline void -first_trans8(const struct acl_flow_avx512 *flow, __m256i next_input, - __mmask8 msk, __m256i *tr_lo, __m256i *tr_hi) +_F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2], + uint32_t bnum) { - const int32_t *tr; - __m256i addr, root; - - tr = (const int32_t *)(uintptr_t)flow->trans; - - addr = _mm256_set1_epi32(UINT8_MAX); - root = _mm256_set1_epi32(flow->root_index); - - addr = _mm256_and_si256(next_input, addr); - addr = _mm256_add_epi32(root, addr); - - /* load lower 32 bits of 16 transactions at once. */ - *tr_lo = _mm256_mmask_i32gather_epi32(*tr_lo, msk, addr, tr, - sizeof(flow->trans[0])); - - /* load high 32 bits of 16 transactions at once. */ - *tr_hi = _mm256_mmask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1), - sizeof(flow->trans[0])); -} - -/* - * Load and return next 4 input bytes for up to 16 flows in parallel. - * pdata - 8x2 pointers to flow input data - * mask - mask of active flows. 
- * di - data indexes for these 16 flows. - */ -static inline __m256i -get_next_bytes_avx512x8(const struct acl_flow_avx512 *flow, __m256i pdata[2], - uint32_t msk, __m256i *di, uint32_t bnum) -{ - const int32_t *div; - uint32_t m[2]; - __m256i one, zero, t, p[2]; __m128i inp[2]; - div = (const int32_t *)flow->data_index; - - one = _mm256_set1_epi32(1); - zero = _mm256_xor_si256(one, one); - - /* load data offsets for given indexes */ - t = _mm256_mmask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0])); - - /* increment data indexes */ - *di = _mm256_mask_add_epi32(*di, msk, *di, one); - - /* - * unsigned expand 32-bit indexes to 64-bit - * (for later pointer arithmetic), i.e: - * for (i = 0; i != 16; i++) - * p[i/8].u64[i%8] = (uint64_t)t.u32[i]; - */ - p[0] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[0].y, t); - p[1] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[1].y, t); - - p[0] = _mm256_add_epi64(p[0], pdata[0]); - p[1] = _mm256_add_epi64(p[1], pdata[1]); - - /* load input byte(s), either one or four */ - - m[0] = msk & YMM_PTR_MSK; - m[1] = msk >> YMM_PTR_NUM; - if (bnum == sizeof(uint8_t)) { inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]); inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]); } else { inp[0] = _mm256_mmask_i64gather_epi32( - _mm256_castsi256_si128(zero), m[0], p[0], - NULL, sizeof(uint8_t)); + _mm256_castsi256_si128(zero), + m[0], p[0], NULL, sizeof(uint8_t)); inp[1] = _mm256_mmask_i64gather_epi32( - _mm256_castsi256_si128(zero), m[1], p[1], - NULL, sizeof(uint8_t)); + _mm256_castsi256_si128(zero), + m[1], p[1], NULL, sizeof(uint8_t)); } - /* squeeze input into one 512-bit register */ + /* squeeze input into one 256-bit register */ return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]), - ymm_pminp.y, _mm256_castsi128_si256(inp[1])); -} - -/* - * Start up to 16 new flows. - * num - number of flows to start - * msk - mask of new flows. - * pdata - pointers to flow input data - * idx - match indexed for given flows - * di - data indexes for these flows. - */ -static inline void -start_flow8(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk, - __m256i pdata[2], __m256i *idx, __m256i *di) -{ - uint32_t n, m[2], nm[2]; - __m256i ni, nd[2]; - - m[0] = msk & YMM_PTR_MSK; - m[1] = msk >> YMM_PTR_NUM; - - n = __builtin_popcount(m[0]); - nm[0] = (1 << n) - 1; - nm[1] = (1 << (num - n)) - 1; - - /* load input data pointers for new flows */ - nd[0] = _mm256_maskz_loadu_epi64(nm[0], - flow->idata + flow->num_packets); - nd[1] = _mm256_maskz_loadu_epi64(nm[1], - flow->idata + flow->num_packets + n); - - /* calculate match indexes of new flows */ - ni = _mm256_set1_epi32(flow->num_packets); - ni = _mm256_add_epi32(ni, ymm_idx_add.y); - - /* merge new and existing flows data */ - pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]); - pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]); - - /* update match and data indexes */ - *idx = _mm256_mask_expand_epi32(*idx, msk, ni); - *di = _mm256_maskz_mov_epi32(msk ^ UINT8_MAX, *di); - - flow->num_packets += num; -} - -/* - * Process found matches for up to 16 flows. - * fmsk - mask of active flows - * rmsk - mask of found matches - * pdata - pointers to flow input data - * di - data indexes for these flows - * idx - match indexed for given flows - * tr_lo contains low 32 bits for up to 8 transitions. - * tr_hi contains high 32 bits for up to 8 transitions. 
- */ -static inline uint32_t -match_process_avx512x8(struct acl_flow_avx512 *flow, uint32_t *fmsk, - uint32_t *rmsk, __m256i pdata[2], __m256i *di, __m256i *idx, - __m256i *tr_lo, __m256i *tr_hi) -{ - uint32_t n; - __m256i res; - - if (rmsk[0] == 0) - return 0; - - /* extract match indexes */ - res = _mm256_and_si256(tr_lo[0], ymm_index_mask.y); - - /* mask matched transitions to nop */ - tr_lo[0] = _mm256_mask_mov_epi32(tr_lo[0], rmsk[0], ymm_trlo_idle.y); - tr_hi[0] = _mm256_mask_mov_epi32(tr_hi[0], rmsk[0], ymm_trhi_idle.y); - - /* save found match indexes */ - _mm256_mask_i32scatter_epi32(flow->matches, rmsk[0], - idx[0], res, sizeof(flow->matches[0])); - - /* update masks and start new flows for matches */ - n = update_flow_mask(flow, fmsk, rmsk); - start_flow8(flow, n, rmsk[0], pdata, idx, di); - - return n; -} - -/* - * Test for matches ut to 32 (2x16) flows at once, - * if matches exist - process them and start new flows. - */ -static inline void -match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2], - __m256i pdata[4], __m256i di[2], __m256i idx[2], __m256i inp[2], - __m256i tr_lo[2], __m256i tr_hi[2]) -{ - uint32_t n[2]; - uint32_t rm[2]; - - /* check for matches */ - rm[0] = _mm256_test_epi32_mask(tr_lo[0], ymm_match_mask.y); - rm[1] = _mm256_test_epi32_mask(tr_lo[1], ymm_match_mask.y); - - /* till unprocessed matches exist */ - while ((rm[0] | rm[1]) != 0) { - - /* process matches and start new flows */ - n[0] = match_process_avx512x8(flow, &fm[0], &rm[0], &pdata[0], - &di[0], &idx[0], &tr_lo[0], &tr_hi[0]); - n[1] = match_process_avx512x8(flow, &fm[1], &rm[1], &pdata[2], - &di[1], &idx[1], &tr_lo[1], &tr_hi[1]); - - /* execute first transition for new flows, if any */ - - if (n[0] != 0) { - inp[0] = get_next_bytes_avx512x8(flow, &pdata[0], - rm[0], &di[0], flow->first_load_sz); - first_trans8(flow, inp[0], rm[0], &tr_lo[0], - &tr_hi[0]); - rm[0] = _mm256_test_epi32_mask(tr_lo[0], - ymm_match_mask.y); - } - - if (n[1] != 0) { - inp[1] = get_next_bytes_avx512x8(flow, &pdata[2], - rm[1], &di[1], flow->first_load_sz); - first_trans8(flow, inp[1], rm[1], &tr_lo[1], - &tr_hi[1]); - rm[1] = _mm256_test_epi32_mask(tr_lo[1], - ymm_match_mask.y); - } - } -} - -/* - * Perform search for up to 32 flows in parallel. - * Use two sets of metadata, each serves 16 flows max. - * So in fact we perform search for 2x16 flows. 
- */ -static inline void -search_trie_avx512x8x2(struct acl_flow_avx512 *flow) -{ - uint32_t fm[2]; - __m256i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2]; - - /* first 1B load */ - start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[0], &idx[0], &di[0]); - start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[2], &idx[1], &di[1]); - - in[0] = get_next_bytes_avx512x8(flow, &pdata[0], UINT8_MAX, &di[0], - flow->first_load_sz); - in[1] = get_next_bytes_avx512x8(flow, &pdata[2], UINT8_MAX, &di[1], - flow->first_load_sz); - - first_trans8(flow, in[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]); - first_trans8(flow, in[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]); - - fm[0] = UINT8_MAX; - fm[1] = UINT8_MAX; - - /* match check */ - match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in, - tr_lo, tr_hi); - - while ((fm[0] | fm[1]) != 0) { - - /* load next 4B */ - - in[0] = get_next_bytes_avx512x8(flow, &pdata[0], fm[0], - &di[0], sizeof(uint32_t)); - in[1] = get_next_bytes_avx512x8(flow, &pdata[2], fm[1], - &di[1], sizeof(uint32_t)); - - /* main 4B loop */ - - in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); - in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); - - /* check for matches */ - match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in, - tr_lo, tr_hi); - } -} - -/* - * resolve match index to actual result/priority offset. - */ -static inline __m256i -resolve_match_idx_avx512x8(__m256i mi) -{ - RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) != - 1 << (match_log + 2)); - return _mm256_slli_epi32(mi, match_log); + _SV_(pminp), _mm256_castsi128_si256(inp[1])); } -/* - * Resolve multiple matches for the same flow based on priority. - */ -static inline __m256i -resolve_pri_avx512x8(const int32_t res[], const int32_t pri[], - const uint32_t match[], __mmask8 msk, uint32_t nb_trie, - uint32_t nb_skip) -{ - uint32_t i; - const uint32_t *pm; - __mmask16 m; - __m256i cp, cr, np, nr, mch; - - const __m256i zero = _mm256_set1_epi32(0); - - /* get match indexes */ - mch = _mm256_maskz_loadu_epi32(msk, match); - mch = resolve_match_idx_avx512x8(mch); - - /* read result and priority values for first trie */ - cr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0])); - cp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0])); - - /* - * read result and priority values for next tries and select one - * with highest priority. 
- */ - for (i = 1, pm = match + nb_skip; i != nb_trie; - i++, pm += nb_skip) { - - mch = _mm256_maskz_loadu_epi32(msk, pm); - mch = resolve_match_idx_avx512x8(mch); - - nr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, - sizeof(res[0])); - np = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, - sizeof(pri[0])); - - m = _mm256_cmpgt_epi32_mask(cp, np); - cr = _mm256_mask_mov_epi32(nr, m, cr); - cp = _mm256_mask_mov_epi32(np, m, cp); - } - - return cr; -} - -/* - * Resolve num (<= 8) matches for single category - */ -static inline void -resolve_sc_avx512x8(uint32_t result[], const int32_t res[], - const int32_t pri[], const uint32_t match[], uint32_t nb_pkt, - uint32_t nb_trie, uint32_t nb_skip) -{ - __mmask8 msk; - __m256i cr; - - msk = (1 << nb_pkt) - 1; - cr = resolve_pri_avx512x8(res, pri, match, msk, nb_trie, nb_skip); - _mm256_mask_storeu_epi32(result, msk, cr); -} +#include "acl_run_avx512_common.h" /* - * Resolve matches for single category + * Perform search for up to (2 * 8) flows in parallel. + * Use two sets of metadata, each serves 8 flows max. */ -static inline void -resolve_sc_avx512x8x2(uint32_t result[], - const struct rte_acl_match_results pr[], const uint32_t match[], - uint32_t nb_pkt, uint32_t nb_trie) -{ - uint32_t j, k, n; - const int32_t *res, *pri; - __m256i cr[2]; - - res = (const int32_t *)pr->results; - pri = pr->priority; - - for (k = 0; k != (nb_pkt & ~MSK_AVX512X8X2); k += NUM_AVX512X8X2) { - - j = k + MASK8_BIT; - - cr[0] = resolve_pri_avx512x8(res, pri, match + k, UINT8_MAX, - nb_trie, nb_pkt); - cr[1] = resolve_pri_avx512x8(res, pri, match + j, UINT8_MAX, - nb_trie, nb_pkt); - - _mm256_storeu_si256((void *)(result + k), cr[0]); - _mm256_storeu_si256((void *)(result + j), cr[1]); - } - - n = nb_pkt - k; - if (n != 0) { - if (n > MASK8_BIT) { - resolve_sc_avx512x8(result + k, res, pri, match + k, - MASK8_BIT, nb_trie, nb_pkt); - k += MASK8_BIT; - n -= MASK8_BIT; - } - resolve_sc_avx512x8(result + k, res, pri, match + k, n, - nb_trie, nb_pkt); - } -} - static inline int search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data, uint32_t *results, uint32_t total_packets, uint32_t categories) @@ -624,7 +219,7 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data, acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets); /* process the trie */ - search_trie_avx512x8x2(&flow); + _F_(search_trie)(&flow); } /* resolve matches */ @@ -632,7 +227,7 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data, (ctx->trans_table + ctx->match_index); if (categories == 1) - resolve_sc_avx512x8x2(results, pr, match, total_packets, + _F_(resolve_single_cat)(results, pr, match, total_packets, ctx->num_tries); else resolve_mcle8_avx512x1(results, pr, match, total_packets, @@ -640,3 +235,19 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data, return 0; } + +#undef _SIMD_PTR_MSK_ +#undef _SIMD_PTR_NUM_ +#undef _SIMD_FLOW_MSK_ +#undef _SIMD_FLOW_NUM_ +#undef _SIMD_MASK_MAX_ +#undef _SIMD_MASK_BIT_ +#undef _M_GI_ +#undef _M_MGI_ +#undef _M_SI_ +#undef _M_I_ +#undef _F_ +#undef _SV_ +#undef _SC_ +#undef _T_mask +#undef _T_simd
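
The deduplication above is a textual template: "acl_run_avx512_common.h" is written purely in terms of _T_simd/_T_mask and the _F_/_SC_/_SV_/_M_I_/_M_SI_/_M_MGI_/_M_GI_ macros, and each width-specific header defines those macros, includes the common header to stamp out 8-lane or 16-lane versions of every function, then undefines them so the other header can do the same inside one compilation unit. Below is a minimal stand-alone sketch of that pattern on a trivial helper. It is illustrative only: the DEFINE_ADD_FOUR macro and the add_four_* names are not part of the patch (the patch re-includes a real header rather than expanding a macro), and it assumes a compiler with AVX-512F enabled, e.g. "gcc -mavx512f".

#include <immintrin.h>
#include <stdio.h>

/*
 * Body shared by both widths; stands in for acl_run_avx512_common.h.
 * One copy of the logic, two instantiations below.
 */
#define DEFINE_ADD_FOUR \
static inline _T_simd _F_(add_four)(_T_simd v) \
{ \
	return _M_I_(add_epi32)(v, _M_I_(set1_epi32)(4)); \
}

/* 256-bit instantiation (plays the role of acl_run_avx512x8.h) */
#define _T_simd __m256i
#define _F_(x) x##_avx512x8
#define _M_I_(x) _mm256_##x
DEFINE_ADD_FOUR
#undef _M_I_
#undef _F_
#undef _T_simd

/* 512-bit instantiation (plays the role of acl_run_avx512x16.h) */
#define _T_simd __m512i
#define _F_(x) x##_avx512x16
#define _M_I_(x) _mm512_##x
DEFINE_ADD_FOUR
#undef _M_I_
#undef _F_
#undef _T_simd

int
main(void)
{
	__m256i a = add_four_avx512x8(_mm256_set1_epi32(1));
	__m512i b = add_four_avx512x16(_mm512_set1_epi32(1));

	/* every lane of both vectors now holds 1 + 4 == 5 */
	printf("%d %d\n",
		_mm_cvtsi128_si32(_mm256_castsi256_si128(a)),
		_mm_cvtsi128_si32(_mm512_castsi512_si128(b)));
	return 0;
}

Compared with keeping two hand-maintained copies, this keeps the 8-lane and 16-lane classify paths identical in logic while still compiling each into its own set of functions, with the width-specific differences (register type, intrinsic prefix, gather argument order) confined to the small macro blocks at the top of each header.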
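
The least obvious part of the shared code is the QUAD/SINGLE branch of _F_(calc_addr): the cmpgt_epi8_mask / maskz_set1_epi8 / lzcnt_epi32 / srli_epi32 sequence computes, per 32-bit lane, how many of the node's four ascending signed 8-bit range boundaries lie strictly below the current input byte, which is the transition offset described in the function's comment. A scalar reference of that per-lane computation, for illustration only (quad_offset_scalar is not a function in the patch):

#include <stdint.h>

/*
 * Per-lane equivalent of the QUAD/SINGLE offset in _F_(calc_addr):
 * bnd[] holds the node's four range boundaries as signed bytes in
 * ascending order; the result is the number of boundaries strictly
 * below the input byte, i.e. the index of the transition to take.
 */
static inline uint32_t
quad_offset_scalar(int8_t input, const int8_t bnd[4])
{
	uint32_t i, n;

	n = 0;
	for (i = 0; i != 4; i++)
		n += (input > bnd[i]);
	return n;
}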