From: Konstantin Ananyev Date: Tue, 20 Jan 2015 18:41:02 +0000 (+0000) Subject: acl: use scalar method fastest for some cases X-Git-Tag: spdx-start~9812 X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=4269eae463d156b76cfee0dac3fa17aabde86d3a;p=dpdk.git acl: use scalar method fastest for some cases Previous improvements made the scalar method the fastest one for a tiny bunch of packets (< 4). That allows us to remove the specific vector code path for a small number of packets (search_sse_2) and always use the scalar method for such cases. Signed-off-by: Konstantin Ananyev Acked-by: Neil Horman --- diff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h index 850bc81a44..b2fc42c64a 100644 --- a/lib/librte_acl/acl_run.h +++ b/lib/librte_acl/acl_run.h @@ -40,7 +40,6 @@ #define MAX_SEARCHES_AVX16 16 #define MAX_SEARCHES_SSE8 8 #define MAX_SEARCHES_SSE4 4 -#define MAX_SEARCHES_SSE2 2 #define MAX_SEARCHES_SCALAR 2 #define GET_NEXT_4BYTES(prm, idx) \ diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c index 0a42f72838..79ebbd6cfc 100644 --- a/lib/librte_acl/acl_run_avx2.c +++ b/lib/librte_acl/acl_run_avx2.c @@ -49,6 +49,6 @@ rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data, else if (num >= MAX_SEARCHES_SSE4) return search_sse_4(ctx, data, results, num, categories); else - return search_sse_2(ctx, data, results, num, + return rte_acl_classify_scalar(ctx, data, results, num, categories); } diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c index 77b32b35dc..a5a7d361bd 100644 --- a/lib/librte_acl/acl_run_sse.c +++ b/lib/librte_acl/acl_run_sse.c @@ -42,5 +42,6 @@ rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data, else if (num >= MAX_SEARCHES_SSE4) return search_sse_4(ctx, data, results, num, categories); else - return search_sse_2(ctx, data, results, num, categories); + return rte_acl_classify_scalar(ctx, data, results, num, + categories); } diff --git 
a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h index e33e16bc1c..1b7870e4c2 100644 --- a/lib/librte_acl/acl_run_sse.h +++ b/lib/librte_acl/acl_run_sse.h @@ -45,10 +45,6 @@ static const rte_xmm_t xmm_shuffle_input = { .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}, }; -static const rte_xmm_t xmm_shuffle_input64 = { - .u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080}, -}; - static const rte_xmm_t xmm_ones_16 = { .u16 = {1, 1, 1, 1, 1, 1, 1, 1}, }; @@ -62,15 +58,6 @@ static const rte_xmm_t xmm_match_mask = { }, }; -static const rte_xmm_t xmm_match_mask64 = { - .u32 = { - RTE_ACL_NODE_MATCH, - 0, - RTE_ACL_NODE_MATCH, - 0, - }, -}; - static const rte_xmm_t xmm_index_mask = { .u32 = { RTE_ACL_NODE_INDEX, @@ -80,16 +67,6 @@ static const rte_xmm_t xmm_index_mask = { }, }; -static const rte_xmm_t xmm_index_mask64 = { - .u32 = { - RTE_ACL_NODE_INDEX, - RTE_ACL_NODE_INDEX, - 0, - 0, - }, -}; - - /* * Resolve priority for multiple results (sse version). * This consists comparing the priority of the current traversal with the @@ -160,22 +137,6 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx, *indices = MM_SET64(transition2, transition1); } -/* - * Check for a match in 2 transitions (contained in SSE register) - */ -static inline __attribute__((always_inline)) void -acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, - struct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask) -{ - xmm_t temp; - - temp = MM_AND(match_mask, *indices); - while (!MM_TESTZ(temp, temp)) { - acl_process_matches(indices, slot, ctx, parms, flows); - temp = MM_AND(match_mask, *indices); - } -} - /* * Check for any match in 4 transitions (contained in 2 SSE registers) */ @@ -460,74 +421,3 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, return 0; } - -static inline __attribute__((always_inline)) xmm_t -transition2(xmm_t next_input, const uint64_t *trans, xmm_t *indices1) -{ - uint64_t t; - 
xmm_t addr, indices2; - - indices2 = _mm_setzero_si128(); - - addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x, - xmm_ones_16.x, *indices1, indices2); - - /* Gather 64 bit transitions and pack 2 per register. */ - - t = trans[MM_CVT32(addr)]; - - /* get slot 1 */ - addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1); - *indices1 = MM_SET64(trans[MM_CVT32(addr)], t); - - return MM_SRL32(next_input, CHAR_BIT); -} - -/* - * Execute trie traversal with 2 traversals in parallel. - */ -static inline int -search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data, - uint32_t *results, uint32_t total_packets, uint32_t categories) -{ - int n; - struct acl_flow_data flows; - uint64_t index_array[MAX_SEARCHES_SSE2]; - struct completion cmplt[MAX_SEARCHES_SSE2]; - struct parms parms[MAX_SEARCHES_SSE2]; - xmm_t input, indices; - - acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, - total_packets, categories, ctx->trans_table); - - for (n = 0; n < MAX_SEARCHES_SSE2; n++) { - cmplt[n].count = 0; - index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); - } - - indices = MM_LOADU((xmm_t *) &index_array[0]); - - /* Check for any matches. */ - acl_match_check_x2(0, ctx, parms, &flows, &indices, - xmm_match_mask64.x); - - while (flows.started > 0) { - - /* Gather 4 bytes of input data for each stream. */ - input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0)); - input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1); - - /* Process the 4 bytes of input on each stream. */ - - input = transition2(input, flows.trans, &indices); - input = transition2(input, flows.trans, &indices); - input = transition2(input, flows.trans, &indices); - input = transition2(input, flows.trans, &indices); - - /* Check for any matches. */ - acl_match_check_x2(0, ctx, parms, &flows, &indices, - xmm_match_mask64.x); - } - - return 0; -}