X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_acl%2Facl_run_avx2.h;h=d06d2e8782d698dbbbdef2d9b128b2a3871f98d7;hb=52c9a533b4179e9d16dfdc03410440c39b61e5af;hp=1688c505cb27ba7c257a726f5dce4107f3805455;hpb=5dd71363bfd2b8d33d6b7aea9ab43db5792fd4e2;p=dpdk.git

diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h
index 1688c505cb..d06d2e8782 100644
--- a/lib/librte_acl/acl_run_avx2.h
+++ b/lib/librte_acl/acl_run_avx2.h
@@ -1,34 +1,5 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
  */
 
 #include "acl_run_sse.h"
@@ -73,52 +44,20 @@ static const rte_ymm_t ymm_ones_16 = {
 	},
 };
 
-static inline __attribute__((always_inline)) ymm_t
-calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,
-	ymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)
-{
-	ymm_t in, node_type, r, t;
-	ymm_t dfa_msk, dfa_ofs, quad_ofs;
-	ymm_t addr;
-
-	const ymm_t range_base = _mm256_set_epi32(
-		0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,
-		0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);
-
-	t = _mm256_xor_si256(index_mask, index_mask);
-	in = _mm256_shuffle_epi8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_type = _mm256_andnot_si256(index_mask, tr_lo);
-	addr = _mm256_and_si256(index_mask, tr_lo);
-
-	/* DFA calculations. */
-
-	dfa_msk = _mm256_cmpeq_epi32(node_type, t);
-
-	r = _mm256_srli_epi32(in, 30);
-	r = _mm256_add_epi8(r, range_base);
-
-	t = _mm256_srli_epi32(in, 24);
-	r = _mm256_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm256_sub_epi32(t, r);
-
-	/* QUAD/SINGLE caluclations. */
-
-	t = _mm256_cmpgt_epi8(in, tr_hi);
-	t = _mm256_sign_epi8(t, t);
-	t = _mm256_maddubs_epi16(t, t);
-	quad_ofs = _mm256_madd_epi16(t, ones_16);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
-
-	addr = _mm256_add_epi32(addr, t);
-	return addr;
-}
+static const rte_ymm_t ymm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
 
-static inline __attribute__((always_inline)) ymm_t
+/*
+ * Process 8 transitions in parallel.
+ * tr_lo contains low 32 bits for 8 transitions.
+ * tr_hi contains high 32 bits for 8 transitions.
+ * next_input contains up to 4 input bytes for 8 flows.
+ */
+static __rte_always_inline ymm_t
 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 {
 	const int32_t *tr;
@@ -126,8 +65,10 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 
 	tr = (const int32_t *)(uintptr_t)trans;
 
-	addr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
-		ymm_ones_16.y, *tr_lo, *tr_hi);
+	/* Calculate the address (array index) for all 8 transitions. */
+	ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
+		ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
+		*tr_lo, *tr_hi);
 
 	/* load lower 32 bits of 8 transactions at once. */
 	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
@@ -140,6 +81,11 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 	return next_input;
 }
 
+/*
+ * Process matches for 8 flows.
+ * tr_lo contains low 32 bits for 8 transitions.
+ * tr_hi contains high 32 bits for 8 transitions.
+ */
 static inline void
 acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
 	struct acl_flow_data *flows, uint32_t slot,
@@ -155,6 +101,11 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
 	l0 = _mm256_castsi256_si128(*tr_lo);
 
 	for (i = 0; i != RTE_DIM(tr) / 2; i++) {
+
+		/*
+		 * Extract low 32 bits of each transition.
+		 * That's enough to process the match.
+		 */
 		tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
 		tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
 
@@ -167,12 +118,14 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
 			ctx, parms, flows, resolve_priority_sse);
 	}
 
+	/* Collect new transitions into 2 YMM registers. */
 	t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
 	t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
 
-	lo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-	hi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+	/* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
+	ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
 
+	/* Keep transitions with NOMATCH intact. */
 	*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
 	*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
 }
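The ACL_TR_HILO() macro used in the hunk above takes the place of the two open-coded _mm256_shuffle_ps() calls: given eight 64-bit transitions packed into two YMM registers, it separates the low and high 32-bit halves so that tr_lo and tr_hi each hold one half per 32-bit lane. Below is a standalone sketch of that split; it is not part of the patch, the transition values are made up, and it assumes a GCC/Clang build with -mavx2.

	#include <stdio.h>
	#include <stdint.h>
	#include <inttypes.h>
	#include <immintrin.h>

	int main(void)
	{
		/* Eight fake 64-bit transitions with recognizable low/high halves. */
		uint64_t tr[8];
		uint32_t lo32[8], hi32[8];
		int i;

		for (i = 0; i < 8; i++)
			tr[i] = ((uint64_t)(0xa0 + i) << 32) | (uint64_t)(0x10 + i);

		/* Pack them the same way the patch does: t0 <- 0,1,4,5; t1 <- 2,3,6,7. */
		__m256i t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
		__m256i t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);

		/*
		 * shuffle_ps works per 128-bit lane: control 0x88 selects the even
		 * 32-bit elements (the low halves), 0xdd the odd ones (the high
		 * halves), so element n of lo/hi holds the halves of tr[n].
		 */
		__m256i lo = _mm256_castps_si256(_mm256_shuffle_ps(
			_mm256_castsi256_ps(t0), _mm256_castsi256_ps(t1), 0x88));
		__m256i hi = _mm256_castps_si256(_mm256_shuffle_ps(
			_mm256_castsi256_ps(t0), _mm256_castsi256_ps(t1), 0xdd));

		_mm256_storeu_si256((__m256i *)lo32, lo);
		_mm256_storeu_si256((__m256i *)hi32, hi);

		for (i = 0; i < 8; i++)
			printf("tr[%d]: lo=0x%08" PRIx32 " hi=0x%08" PRIx32 "\n",
				i, lo32[i], hi32[i]);
		return 0;
	}

The 0x88/0xdd controls perform exactly the de-interleave the removed _mm256_shuffle_ps() lines did, which is also why the patch loads transitions 0,1,4,5 into t0 and 2,3,6,7 into t1.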
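Earlier in the patch, transition8() reloads tr_lo with a 32-bit gather over the 64-bit transition table. Because the gather scale is sizeof(trans[0]), i.e. 8, index n addresses the low half of trans[n]; offsetting the int32 base pointer by one element would make the same indexes fetch the high halves (the high-half load itself is outside the hunks shown, so using tr + 1 below is an assumption). A minimal, self-contained illustration with a made-up table, again assuming an AVX2-capable build:

	#include <stdio.h>
	#include <stdint.h>
	#include <inttypes.h>
	#include <immintrin.h>

	int main(void)
	{
		/* A toy 64-bit transition table (contents are made up). */
		uint64_t trans[16];
		uint32_t lo32[8], hi32[8];
		int i;

		for (i = 0; i < 16; i++)
			trans[i] = ((uint64_t)(0xb0 + i) << 32) | (uint64_t)i;

		/* Same reinterpretation as transition8(): view the table as int32. */
		const int32_t *tr = (const int32_t *)(uintptr_t)trans;

		/* Pretend the address calculation produced these 8 array indexes. */
		__m256i addr = _mm256_setr_epi32(3, 1, 4, 1, 5, 9, 2, 6);

		/*
		 * With scale == sizeof(trans[0]) == 8, index n addresses byte n * 8,
		 * i.e. the low 32 bits of trans[n]; shifting the base by one int32
		 * makes the same indexes fetch the high 32 bits instead.
		 */
		__m256i lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
		__m256i hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));

		_mm256_storeu_si256((__m256i *)lo32, lo);
		_mm256_storeu_si256((__m256i *)hi32, hi);

		for (i = 0; i < 8; i++)
			printf("flow %d: lo=0x%08" PRIx32 " hi=0x%08" PRIx32 "\n",
				i, lo32[i], hi32[i]);
		return 0;
	}

Two 32-bit gathers keep all eight flows in single YMM registers (a 64-bit gather would cover only four lanes per register), which matches how the rest of the code consumes tr_lo and tr_hi.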
@@ -200,6 +153,9 @@ acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
 	}
 }
 
+/*
+ * Execute trie traversal for up to 16 flows in parallel.
+ */
 static inline int
 search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -225,16 +181,14 @@ search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	t1 = _mm256_set_epi64x(index_array[7], index_array[6],
 		index_array[3], index_array[2]);
 
-	tr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-	tr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
 
 	t0 = _mm256_set_epi64x(index_array[13], index_array[12],
 		index_array[9], index_array[8]);
 	t1 = _mm256_set_epi64x(index_array[15], index_array[14],
 		index_array[11], index_array[10]);
 
-	tr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-	tr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
 
 	/* Check for any matches. */
 	acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],