X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_acl%2Facl_run_sse.h;h=93286a2c38b4ea7e490b63569a2573d346a30463;hb=48f9faddc65d3a45cdd7e41fef49a91cfc156d04;hp=1b7870e4c2042184645fe2868ea75418a0d791f8;hpb=4269eae463d156b76cfee0dac3fa17aabde86d3a;p=dpdk.git

diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
index 1b7870e4c2..93286a2c38 100644
--- a/lib/librte_acl/acl_run_sse.h
+++ b/lib/librte_acl/acl_run_sse.h
@@ -1,34 +1,5 @@
-/*-
- *   BSD LICENSE
- *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- *   All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
  */
 
 #include "acl_run.h"
@@ -67,6 +38,12 @@ static const rte_xmm_t xmm_index_mask = {
 	},
 };
 
+static const rte_xmm_t xmm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
+
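The four lanes of the new xmm_range_base constant (0xffffff00 + 4 * lane) are byte-shuffle controls for _mm_shuffle_epi8: control bytes with the top bit set (0xff) yield zero, while the low byte of each dword selects byte 4*lane + quadrant of the source register. The ACL_TR_CALC_ADDR macro used further down applies it when extracting the per-quadrant base byte for DFA nodes, the same trick spelled out by the calc_addr_sse() code this patch removes. A minimal standalone sketch of that selection step (all values hypothetical, not taken from the patch; compile with gcc -mssse3):

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>

int
main(void)
{
	const __m128i range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
		0xffffff04, 0xffffff00);

	/* hypothetical per-flow data: 4 bytes per lane */
	const __m128i range = _mm_set_epi32(0x44434241, 0x34333231,
		0x24232221, 0x14131211);

	/* input byte 0xc3 replicated through each dword -> quadrant 3 */
	__m128i in = _mm_set1_epi32((int)0xc3c3c3c3);

	/* quadrant index (0-3) lands in the low byte of each dword;
	 * the other three control bytes stay 0xff and so select zero */
	__m128i ctrl = _mm_add_epi8(_mm_srli_epi32(in, 30), range_base);
	__m128i r = _mm_shuffle_epi8(range, ctrl);

	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, r);
	/* prints 00000014 00000024 00000034 00000044: byte 3 of each lane */
	printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
	return 0;
}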
 /*
  * Resolve priority for multiple results (sse version).
  * This consists of comparing the priority of the current traversal with the
@@ -90,25 +67,28 @@ resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
 			(xmm_t *)(&parms[n].cmplt->priority[x]);
 
 		/* get results and priorities for completed trie */
-		results = MM_LOADU((const xmm_t *)&p[transition].results[x]);
-		priority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);
+		results = _mm_loadu_si128(
+			(const xmm_t *)&p[transition].results[x]);
+		priority = _mm_loadu_si128(
+			(const xmm_t *)&p[transition].priority[x]);
 
 		/* if this is not the first completed trie */
 		if (parms[n].cmplt->count != ctx->num_tries) {
 
 			/* get running best results and their priorities */
-			results1 = MM_LOADU(saved_results);
-			priority1 = MM_LOADU(saved_priority);
+			results1 = _mm_loadu_si128(saved_results);
+			priority1 = _mm_loadu_si128(saved_priority);
 
 			/* select results that are highest priority */
-			selector = MM_CMPGT32(priority1, priority);
-			results = MM_BLENDV8(results, results1, selector);
-			priority = MM_BLENDV8(priority, priority1, selector);
+			selector = _mm_cmpgt_epi32(priority1, priority);
+			results = _mm_blendv_epi8(results, results1, selector);
+			priority = _mm_blendv_epi8(priority, priority1,
+				selector);
 		}
 
 		/* save running best results and their priorities */
-		MM_STOREU(saved_results, results);
-		MM_STOREU(saved_priority, priority);
+		_mm_storeu_si128(saved_results, results);
+		_mm_storeu_si128(saved_priority, priority);
 	}
 }
 
@@ -122,11 +102,11 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
 	uint64_t transition1, transition2;
 
 	/* extract transition from low 64 bits. */
-	transition1 = MM_CVT64(*indices);
+	transition1 = _mm_cvtsi128_si64(*indices);
 
 	/* extract transition from high 64 bits. */
-	*indices = MM_SHUFFLE32(*indices, SHUFFLE32_SWAP64);
-	transition2 = MM_CVT64(*indices);
+	*indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64);
+	transition2 = _mm_cvtsi128_si64(*indices);
 
 	transition1 = acl_match_check(transition1, slot, ctx,
 		parms, flows, resolve_priority_sse);
@@ -134,13 +114,13 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
 		parms, flows, resolve_priority_sse);
 
 	/* update indices with new transitions. */
-	*indices = MM_SET64(transition2, transition1);
+	*indices = _mm_set_epi64x(transition2, transition1);
 }
 
 /*
  * Check for any match in 4 transitions (contained in 2 SSE registers)
  */
-static inline __attribute__((always_inline)) void
+static __rte_always_inline void
 acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
 	struct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2,
 	xmm_t match_mask)
@@ -148,138 +128,63 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
 	xmm_t temp;
 
 	/* put low 32 bits of each transition into one register */
-	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,
+	temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
 		0x88);
 	/* test for match node */
-	temp = MM_AND(match_mask, temp);
+	temp = _mm_and_si128(match_mask, temp);
 
-	while (!MM_TESTZ(temp, temp)) {
+	while (!_mm_testz_si128(temp, temp)) {
 		acl_process_matches(indices1, slot, ctx, parms, flows);
 		acl_process_matches(indices2, slot + 2, ctx, parms, flows);
 
-		temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,
+		temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1,
 					(__m128)*indices2,
 					0x88);
-		temp = MM_AND(match_mask, temp);
+		temp = _mm_and_si128(match_mask, temp);
 	}
 }
 
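In acl_match_check_x4() above, _mm_shuffle_ps with immediate 0x88 (binary 10 00 10 00) picks dwords 0 and 2 of each source, packing the low 32 bits of all four 64-bit transitions into one register; _mm_testz_si128 then turns "is any match bit set?" into a single flag test. A standalone sketch of the same packing and test (the 0x40000000 flag is only a stand-in for the real match-node bit, which this patch does not show; compile with gcc -msse4.1):

#include <stdio.h>
#include <smmintrin.h>

int
main(void)
{
	/* stand-in for the match-node flag carried in xmm_match_mask */
	const __m128i match_mask = _mm_set1_epi32(0x40000000);

	/* two 64-bit transitions per register; the one in ind2's low
	 * half has the flag set in its low dword */
	__m128i ind1 = _mm_set_epi64x(0x1111111100000010LL,
		0x2222222200000020LL);
	__m128i ind2 = _mm_set_epi64x(0x3333333300000040LL,
		0x4444444440000080LL);

	/* 0x88 selects dwords {0,2} of each source: the low halves */
	__m128i lo = _mm_castps_si128(_mm_shuffle_ps(
		_mm_castsi128_ps(ind1), _mm_castsi128_ps(ind2), 0x88));
	__m128i hit = _mm_and_si128(match_mask, lo);

	/* prints "match pending: 1" */
	printf("match pending: %d\n", !_mm_testz_si128(hit, hit));
	return 0;
}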
 /*
- * Calculate the address of the next transition for
- * all types of nodes. Note that only DFA nodes and range
- * nodes actually transition to another node. Match
- * nodes don't move.
+ * Process 4 transitions (in 2 XMM registers) in parallel
  */
-static inline __attribute__((always_inline)) xmm_t
-calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
-	xmm_t ones_16, xmm_t indices1, xmm_t indices2)
-{
-	xmm_t addr, node_types, range, temp;
-	xmm_t dfa_msk, dfa_ofs, quad_ofs;
-	xmm_t in, r, t;
-
-	const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
-		0xffffff04, 0xffffff00);
-
-	/*
-	 * Note that no transition is done for a match
-	 * node and therefore a stream freezes when
-	 * it reaches a match.
-	 */
-
-	/* Shuffle low 32 into temp and high 32 into indices2 */
-	temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);
-	range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);
-
-	t = MM_XOR(index_mask, index_mask);
-
-	/* shuffle input byte to all 4 positions of 32 bit value */
-	in = MM_SHUFFLE8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_types = MM_ANDNOT(index_mask, temp);
-	addr = MM_AND(index_mask, temp);
-
-	/*
-	 * Calc addr for DFAs - addr = dfa_index + input_byte
-	 */
-
-	/* mask for DFA type (0) nodes */
-	dfa_msk = MM_CMPEQ32(node_types, t);
-
-	r = _mm_srli_epi32(in, 30);
-	r = _mm_add_epi8(r, range_base);
-
-	t = _mm_srli_epi32(in, 24);
-	r = _mm_shuffle_epi8(range, r);
-
-	dfa_ofs = _mm_sub_epi32(t, r);
-
-	/*
-	 * Calculate number of range boundaries that are less than the
-	 * input value. Range boundaries for each node are in signed 8 bit,
-	 * ordered from -128 to 127 in the indices2 register.
-	 * This is effectively a popcnt of bytes that are greater than the
-	 * input byte.
-	 */
-
-	/* check ranges */
-	temp = MM_CMPGT8(in, range);
-
-	/* convert -1 to 1 (bytes greater than input byte) */
-	temp = MM_SIGN8(temp, temp);
-
-	/* horizontal add pairs of bytes into words */
-	temp = MM_MADD8(temp, temp);
-
-	/* horizontal add pairs of words into dwords */
-	quad_ofs = MM_MADD16(temp, ones_16);
-
-	/* mask to range type nodes */
-	temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
-
-	/* add index into node position */
-	return MM_ADD32(addr, temp);
-}
-
-/*
- * Process 4 transitions (in 2 SIMD registers) in parallel
- */
-static inline __attribute__((always_inline)) xmm_t
+static __rte_always_inline xmm_t
 transition4(xmm_t next_input, const uint64_t *trans,
 	xmm_t *indices1, xmm_t *indices2)
 {
-	xmm_t addr;
+	xmm_t addr, tr_lo, tr_hi;
 	uint64_t trans0, trans2;
 
-	/* Calculate the address (array index) for all 4 transitions. */
+	/* Shuffle low 32 into tr_lo and high 32 into tr_hi */
+	ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi);
 
-	addr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,
-		xmm_ones_16.x, *indices1, *indices2);
+	/* Calculate the address (array index) for all 4 transitions. */
+	ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input,
+		xmm_shuffle_input.x, xmm_ones_16.x, xmm_range_base.x,
+		tr_lo, tr_hi);
 
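Per 64-bit transition, the address calculation that ACL_TR_CALC_ADDR vectorises reduces to roughly the scalar logic below, mirroring the removed calc_addr_sse() above: the low 32 bits hold the node index plus type bits, the high 32 bits hold four bytes of per-node data; DFA nodes add input minus the base byte of the input's quadrant, range nodes add the count of (signed) boundary bytes the input exceeds. This is an illustrative sketch under those layout assumptions; next_index_scalar() is not a DPDK function:

#include <stdint.h>

static uint32_t
next_index_scalar(uint64_t tr, uint8_t input, uint32_t index_mask)
{
	uint32_t addr = (uint32_t)tr & index_mask;	/* node index */
	uint32_t type = (uint32_t)tr & ~index_mask;	/* node type bits */
	uint32_t ofs;
	int i;

	if (type == 0) {
		/* DFA node: subtract the base value of the input byte's
		 * quadrant (top 2 bits of the byte pick one of 4 bases) */
		uint8_t base = (uint8_t)(tr >> (32 + 8 * (input >> 6)));
		ofs = input - base;
	} else {
		/* range node: count signed 8-bit boundaries that the
		 * input byte exceeds; per the removed comment above,
		 * match nodes are encoded so the stream freezes there */
		ofs = 0;
		for (i = 0; i != 4; i++)
			if ((int8_t)input > (int8_t)(tr >> (32 + 8 * i)))
				ofs++;
	}

	return addr + ofs;
}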
 	/* Gather 64 bit transitions and pack back into 2 registers. */
-	trans0 = trans[MM_CVT32(addr)];
+	trans0 = trans[_mm_cvtsi128_si32(addr)];
 
 	/* get slot 2 */
 
 	/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);
-	trans2 = trans[MM_CVT32(addr)];
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2);
+	trans2 = trans[_mm_cvtsi128_si32(addr)];
 
 	/* get slot 1 */
 
 	/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
-	*indices1 = MM_SET64(trans[MM_CVT32(addr)], trans0);
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1);
+	*indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0);
 
 	/* get slot 3 */
 
 	/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
-	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);
-	*indices2 = MM_SET64(trans[MM_CVT32(addr)], trans2);
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3);
+	*indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2);
 
-	return MM_SRL32(next_input, CHAR_BIT);
+	return _mm_srli_epi32(next_input, CHAR_BIT);
 }
 
 /*
@@ -312,11 +217,11 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	 * indices4 contains index_array[6,7]
 	 */
 
-	indices1 = MM_LOADU((xmm_t *) &index_array[0]);
-	indices2 = MM_LOADU((xmm_t *) &index_array[2]);
+	indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);
+	indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);
 
-	indices3 = MM_LOADU((xmm_t *) &index_array[4]);
-	indices4 = MM_LOADU((xmm_t *) &index_array[6]);
+	indices3 = _mm_loadu_si128((xmm_t *) &index_array[4]);
+	indices4 = _mm_loadu_si128((xmm_t *) &index_array[6]);
 
 	/* Check for any matches. */
 	acl_match_check_x4(0, ctx, parms, &flows,
@@ -330,14 +235,14 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
 		input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));
 
-		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
-		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1);
 
-		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);
-		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2);
 
-		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);
-		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3);
 
 		/* Process the 4 bytes of input on each stream. */
 
@@ -393,8 +298,8 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 	}
 
-	indices1 = MM_LOADU((xmm_t *) &index_array[0]);
-	indices2 = MM_LOADU((xmm_t *) &index_array[2]);
+	indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);
+	indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);
 
 	/* Check for any matches. */
 	acl_match_check_x4(0, ctx, parms, &flows,
@@ -404,9 +309,9 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
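search_sse_8() above and search_sse_4() below gather input the same way: _mm_cvtsi32_si128 seeds lane 0 of an XMM register with one 32-bit word per flow, _mm_insert_epi32 fills the remaining lanes, and each transition4() call then consumes the low byte of every lane before shifting the next byte down with _mm_srli_epi32(next_input, CHAR_BIT). A standalone sketch with fabricated input words (compile with gcc -msse4.1):

#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>

int
main(void)
{
	/* stand-ins for GET_NEXT_4BYTES(parms, 0..3) */
	const int w[4] = { 0x03020100, 0x13121110, 0x23222120, 0x33323130 };

	__m128i input = _mm_cvtsi32_si128(w[0]);	/* lane 0 */
	input = _mm_insert_epi32(input, w[1], 1);
	input = _mm_insert_epi32(input, w[2], 2);
	input = _mm_insert_epi32(input, w[3], 3);

	/* each round eats the low byte of every lane, then shifts the
	 * next byte down, as transition4()'s return statement does */
	for (int i = 0; i < 4; i++) {
		uint32_t lane0 = (uint32_t)_mm_cvtsi128_si32(input);
		printf("round %d: flow0 byte %02x\n", i, lane0 & 0xff);
		input = _mm_srli_epi32(input, 8);
	}
	return 0;
}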
 		/* Gather 4 bytes of input data for each stream. */
 		input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
-		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3);
 
 		/* Process the 4 bytes of input on each stream. */
 		input = transition4(input, flows.trans, &indices1, &indices2);