From: Konstantin Ananyev Date: Tue, 6 Oct 2020 15:03:10 +0000 (+0100) Subject: acl: add 256-bit AVX512 classify method X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=b64c2295f7fc;p=dpdk.git acl: add 256-bit AVX512 classify method Introduce classify implementation that uses AVX512 specific ISA. rte_acl_classify_avx512x16() is able to process up to 16 flows in parallel. It uses 256-bit width registers/instructions only (to avoid frequency level change). Note that for now only 64-bit version is supported. Signed-off-by: Konstantin Ananyev --- diff --git a/app/test-acl/main.c b/app/test-acl/main.c index d9b65517cb..f9985b18e4 100644 --- a/app/test-acl/main.c +++ b/app/test-acl/main.c @@ -81,6 +81,10 @@ static const struct acl_alg acl_alg[] = { .name = "altivec", .alg = RTE_ACL_CLASSIFY_ALTIVEC, }, + { + .name = "avx512x16", + .alg = RTE_ACL_CLASSIFY_AVX512X16, + }, }; static struct { diff --git a/app/test/test_acl.c b/app/test/test_acl.c index aea129e678..b9d132cf18 100644 --- a/app/test/test_acl.c +++ b/app/test/test_acl.c @@ -341,6 +341,7 @@ test_classify_run(struct rte_acl_ctx *acx, struct ipv4_7tuple test_data[], RTE_ACL_CLASSIFY_AVX2, RTE_ACL_CLASSIFY_NEON, RTE_ACL_CLASSIFY_ALTIVEC, + RTE_ACL_CLASSIFY_AVX512X16, }; /* swap all bytes in the data to network order */ diff --git a/doc/guides/prog_guide/packet_classif_access_ctrl.rst b/doc/guides/prog_guide/packet_classif_access_ctrl.rst index daf03e6d7a..11f4bc841b 100644 --- a/doc/guides/prog_guide/packet_classif_access_ctrl.rst +++ b/doc/guides/prog_guide/packet_classif_access_ctrl.rst @@ -379,6 +379,10 @@ There are several implementations of classify algorithm: * **RTE_ACL_CLASSIFY_ALTIVEC**: vector implementation, can process up to 8 flows in parallel. Requires ALTIVEC support. +* **RTE_ACL_CLASSIFY_AVX512X16**: vector implementation, can process up to 16 + flows in parallel. Uses 256-bit width SIMD registers. + Requires AVX512 support. + It is purely a runtime decision which method to choose, there is no build-time difference. All implementations operates over the same internal RT structures and use similar principles. The main difference is that vector implementations can manually exploit IA SIMD instructions and process several input data flows in parallel. At startup ACL library determines the highest available classify method for the given platform and sets it as default one. Though the user has an ability to override the default classifier function for a given ACL context or perform particular search using non-default classify method. In that case it is user responsibility to make sure that given platform supports selected classify implementation. diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst index 77e2aed444..8814ae74a3 100644 --- a/doc/guides/rel_notes/release_20_11.rst +++ b/doc/guides/rel_notes/release_20_11.rst @@ -183,6 +183,11 @@ New Features * Extern objects and functions can be plugged into the pipeline. * Transaction-oriented table updates. +* **Add new AVX512 specific classify algorithms for ACL library.** + + * Added new ``RTE_ACL_CLASSIFY_AVX512X16`` vector implementation, + which can process up to 16 flows in parallel. Requires AVX512 support. + Removed Items ------------- diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h index 543ce55659..7ac0d12f08 100644 --- a/lib/librte_acl/acl.h +++ b/lib/librte_acl/acl.h @@ -76,6 +76,13 @@ struct rte_acl_bitset { * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64]. 
*/ +/* + * Each ACL RT contains an idle nomatch node: + * a SINGLE node at predefined position (RTE_ACL_DFA_SIZE) + * that points to itself. + */ +#define RTE_ACL_IDLE_NODE (RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE) + /* * Structure of a node is a set of ptrs and each ptr has a bit map * of values associated with this transition. diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c index f1b9d12f1e..e759a2ca15 100644 --- a/lib/librte_acl/acl_gen.c +++ b/lib/librte_acl/acl_gen.c @@ -496,7 +496,7 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, * highest index, that points to itself) */ - node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE; + node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_IDLE_NODE; for (n = 0; n < RTE_ACL_DFA_SIZE; n++) node_array[n] = no_match; diff --git a/lib/librte_acl/acl_run_avx512.c b/lib/librte_acl/acl_run_avx512.c index 1817f88b29..f5bc628b7c 100644 --- a/lib/librte_acl/acl_run_avx512.c +++ b/lib/librte_acl/acl_run_avx512.c @@ -4,10 +4,126 @@ #include "acl_run_sse.h" +/*sizeof(uint32_t) << match_log == sizeof(struct rte_acl_match_results)*/ +static const uint32_t match_log = 5; + +struct acl_flow_avx512 { + uint32_t num_packets; /* number of packets processed */ + uint32_t total_packets; /* max number of packets to process */ + uint32_t root_index; /* current root index */ + const uint64_t *trans; /* transition table */ + const uint32_t *data_index; /* input data indexes */ + const uint8_t **idata; /* input data */ + uint32_t *matches; /* match indexes */ +}; + +static inline void +acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx, + uint32_t trie, const uint8_t *data[], uint32_t *matches, + uint32_t total_packets) +{ + flow->num_packets = 0; + flow->total_packets = total_packets; + flow->root_index = ctx->trie[trie].root_index; + flow->trans = ctx->trans_table; + flow->data_index = ctx->trie[trie].data_index; + flow->idata = data; + flow->matches = matches; +} + +/* + * Update flow and result masks based on the number of unprocessed flows. 
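+ * fmsk - mask of currently active flows
+ * rmsk - mask of flows that have just reached a match
+ * Matched flows are removed from *fmsk, then up to
+ * (total_packets - num_packets) of them are re-enabled (and kept in
+ * *rmsk) so their lanes can be refilled with new input flows,
+ * i.e.: n = RTE_MIN(__builtin_popcount(*rmsk), total - processed).
+ * Returns the number of flows to (re)start.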
+ */ +static inline uint32_t +update_flow_mask(const struct acl_flow_avx512 *flow, uint32_t *fmsk, + uint32_t *rmsk) +{ + uint32_t i, j, k, m, n; + + fmsk[0] ^= rmsk[0]; + m = rmsk[0]; + + k = __builtin_popcount(m); + n = flow->total_packets - flow->num_packets; + + if (n < k) { + /* reduce mask */ + for (i = k - n; i != 0; i--) { + j = sizeof(m) * CHAR_BIT - 1 - __builtin_clz(m); + m ^= 1 << j; + } + } else + n = k; + + rmsk[0] = m; + fmsk[0] |= rmsk[0]; + + return n; +} + +/* + * Resolve matches for multiple categories (LE 8, use 128b instuctions/regs) + */ +static inline void +resolve_mcle8_avx512x1(uint32_t result[], + const struct rte_acl_match_results pr[], const uint32_t match[], + uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie) +{ + const int32_t *pri; + const uint32_t *pm, *res; + uint32_t i, j, k, mi, mn; + __mmask8 msk; + xmm_t cp, cr, np, nr; + + res = pr->results; + pri = pr->priority; + + for (k = 0; k != nb_pkt; k++, result += nb_cat) { + + mi = match[k] << match_log; + + for (j = 0; j != nb_cat; j += RTE_ACL_RESULTS_MULTIPLIER) { + + cr = _mm_loadu_si128((const xmm_t *)(res + mi + j)); + cp = _mm_loadu_si128((const xmm_t *)(pri + mi + j)); + + for (i = 1, pm = match + nb_pkt; i != nb_trie; + i++, pm += nb_pkt) { + + mn = j + (pm[k] << match_log); + + nr = _mm_loadu_si128((const xmm_t *)(res + mn)); + np = _mm_loadu_si128((const xmm_t *)(pri + mn)); + + msk = _mm_cmpgt_epi32_mask(cp, np); + cr = _mm_mask_mov_epi32(nr, msk, cr); + cp = _mm_mask_mov_epi32(np, msk, cp); + } + + _mm_storeu_si128((xmm_t *)(result + j), cr); + } + } +} + +#include "acl_run_avx512x8.h" + int rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data, uint32_t *results, uint32_t num, uint32_t categories) { + const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16; + + /* split huge lookup (gt 256) into series of fixed size ones */ + while (num > max_iter) { + search_avx512x8x2(ctx, data, results, max_iter, categories); + data += max_iter; + results += max_iter * categories; + num -= max_iter; + } + + /* select classify method based on number of remaining requests */ + if (num >= MAX_SEARCHES_AVX16) + return search_avx512x8x2(ctx, data, results, num, categories); if (num >= MAX_SEARCHES_SSE8) return search_sse_8(ctx, data, results, num, categories); if (num >= MAX_SEARCHES_SSE4) @@ -20,6 +136,19 @@ int rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data, uint32_t *results, uint32_t num, uint32_t categories) { + const uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16; + + /* split huge lookup (gt 256) into series of fixed size ones */ + while (num > max_iter) { + search_avx512x8x2(ctx, data, results, max_iter, categories); + data += max_iter; + results += max_iter * categories; + num -= max_iter; + } + + /* select classify method based on number of remaining requests */ + if (num >= MAX_SEARCHES_AVX16) + return search_avx512x8x2(ctx, data, results, num, categories); if (num >= MAX_SEARCHES_SSE8) return search_sse_8(ctx, data, results, num, categories); if (num >= MAX_SEARCHES_SSE4) diff --git a/lib/librte_acl/acl_run_avx512x8.h b/lib/librte_acl/acl_run_avx512x8.h new file mode 100644 index 0000000000..cfba0299ed --- /dev/null +++ b/lib/librte_acl/acl_run_avx512x8.h @@ -0,0 +1,642 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Intel Corporation + */ + +#define MASK8_BIT (sizeof(__mmask8) * CHAR_BIT) + +#define NUM_AVX512X8X2 (2 * MASK8_BIT) +#define MSK_AVX512X8X2 (NUM_AVX512X8X2 - 1) + +/* num/mask of pointers 
per SIMD regs */ +#define YMM_PTR_NUM (sizeof(__m256i) / sizeof(uintptr_t)) +#define YMM_PTR_MSK RTE_LEN2MASK(YMM_PTR_NUM, uint32_t) + +static const rte_ymm_t ymm_match_mask = { + .u32 = { + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + RTE_ACL_NODE_MATCH, + }, +}; + +static const rte_ymm_t ymm_index_mask = { + .u32 = { + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + RTE_ACL_NODE_INDEX, + }, +}; + +static const rte_ymm_t ymm_trlo_idle = { + .u32 = { + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + RTE_ACL_IDLE_NODE, + }, +}; + +static const rte_ymm_t ymm_trhi_idle = { + .u32 = { + 0, 0, 0, 0, + 0, 0, 0, 0, + }, +}; + +static const rte_ymm_t ymm_shuffle_input = { + .u32 = { + 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, + 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, + }, +}; + +static const rte_ymm_t ymm_four_32 = { + .u32 = { + 4, 4, 4, 4, + 4, 4, 4, 4, + }, +}; + +static const rte_ymm_t ymm_idx_add = { + .u32 = { + 0, 1, 2, 3, + 4, 5, 6, 7, + }, +}; + +static const rte_ymm_t ymm_range_base = { + .u32 = { + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, + }, +}; + +static const rte_ymm_t ymm_pminp = { + .u32 = { + 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0a, 0x0b, + }, +}; + +static const __mmask16 ymm_pmidx_msk = 0x55; + +static const rte_ymm_t ymm_pmidx[2] = { + [0] = { + .u32 = { + 0, 0, 1, 0, 2, 0, 3, 0, + }, + }, + [1] = { + .u32 = { + 4, 0, 5, 0, 6, 0, 7, 0, + }, + }, +}; + +/* + * unfortunately current AVX512 ISA doesn't provide ability for + * gather load on a byte quantity. So we have to mimic it in SW, + * by doing 4x1B scalar loads. + */ +static inline __m128i +_m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask) +{ + rte_xmm_t v; + rte_ymm_t p; + + static const uint32_t zero; + + p.y = _mm256_mask_set1_epi64(pdata, mask ^ YMM_PTR_MSK, + (uintptr_t)&zero); + + v.u32[0] = *(uint8_t *)p.u64[0]; + v.u32[1] = *(uint8_t *)p.u64[1]; + v.u32[2] = *(uint8_t *)p.u64[2]; + v.u32[3] = *(uint8_t *)p.u64[3]; + + return v.x; +} + +/* + * Calculate the address of the next transition for + * all types of nodes. Note that only DFA nodes and range + * nodes actually transition to another node. Match + * nodes not supposed to be encountered here. + * For quad range nodes: + * Calculate number of range boundaries that are less than the + * input value. Range boundaries for each node are in signed 8 bit, + * ordered from -128 to 127. + * This is effectively a popcnt of bytes that are greater than the + * input byte. + * Single nodes are processed in the same ways as quad range nodes. + */ +static __rte_always_inline __m256i +calc_addr8(__m256i index_mask, __m256i next_input, __m256i shuffle_input, + __m256i four_32, __m256i range_base, __m256i tr_lo, __m256i tr_hi) +{ + __mmask32 qm; + __mmask8 dfa_msk; + __m256i addr, in, node_type, r, t; + __m256i dfa_ofs, quad_ofs; + + t = _mm256_xor_si256(index_mask, index_mask); + in = _mm256_shuffle_epi8(next_input, shuffle_input); + + /* Calc node type and node addr */ + node_type = _mm256_andnot_si256(index_mask, tr_lo); + addr = _mm256_and_si256(index_mask, tr_lo); + + /* mask for DFA type(0) nodes */ + dfa_msk = _mm256_cmpeq_epi32_mask(node_type, t); + + /* DFA calculations. 
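+	 * The two top bits of the input byte select one of the four
+	 * per-node base values kept in tr_hi (via the range_base shuffle
+	 * control); dfa_ofs is then the input byte value minus that base,
+	 * i.e.: dfa_ofs = input_byte - tr_hi.u8[4 * lane + (input_byte >> 6)]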
*/ + r = _mm256_srli_epi32(in, 30); + r = _mm256_add_epi8(r, range_base); + t = _mm256_srli_epi32(in, 24); + r = _mm256_shuffle_epi8(tr_hi, r); + + dfa_ofs = _mm256_sub_epi32(t, r); + + /* QUAD/SINGLE calculations. */ + qm = _mm256_cmpgt_epi8_mask(in, tr_hi); + t = _mm256_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX); + t = _mm256_lzcnt_epi32(t); + t = _mm256_srli_epi32(t, 3); + quad_ofs = _mm256_sub_epi32(four_32, t); + + /* blend DFA and QUAD/SINGLE. */ + t = _mm256_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs); + + /* calculate address for next transitions. */ + addr = _mm256_add_epi32(addr, t); + return addr; +} + +/* + * Process 16 transitions in parallel. + * tr_lo contains low 32 bits for 16 transition. + * tr_hi contains high 32 bits for 16 transition. + * next_input contains up to 4 input bytes for 16 flows. + */ +static __rte_always_inline __m256i +transition8(__m256i next_input, const uint64_t *trans, __m256i *tr_lo, + __m256i *tr_hi) +{ + const int32_t *tr; + __m256i addr; + + tr = (const int32_t *)(uintptr_t)trans; + + /* Calculate the address (array index) for all 8 transitions. */ + addr = calc_addr8(ymm_index_mask.y, next_input, ymm_shuffle_input.y, + ymm_four_32.y, ymm_range_base.y, *tr_lo, *tr_hi); + + /* load lower 32 bits of 16 transactions at once. */ + *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0])); + + next_input = _mm256_srli_epi32(next_input, CHAR_BIT); + + /* load high 32 bits of 16 transactions at once. */ + *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0])); + + return next_input; +} + +/* + * Execute first transition for up to 16 flows in parallel. + * next_input should contain one input byte for up to 16 flows. + * msk - mask of active flows. + * tr_lo contains low 32 bits for up to 16 transitions. + * tr_hi contains high 32 bits for up to 16 transitions. + */ +static __rte_always_inline void +first_trans8(const struct acl_flow_avx512 *flow, __m256i next_input, + __mmask8 msk, __m256i *tr_lo, __m256i *tr_hi) +{ + const int32_t *tr; + __m256i addr, root; + + tr = (const int32_t *)(uintptr_t)flow->trans; + + addr = _mm256_set1_epi32(UINT8_MAX); + root = _mm256_set1_epi32(flow->root_index); + + addr = _mm256_and_si256(next_input, addr); + addr = _mm256_add_epi32(root, addr); + + /* load lower 32 bits of 16 transactions at once. */ + *tr_lo = _mm256_mmask_i32gather_epi32(*tr_lo, msk, addr, tr, + sizeof(flow->trans[0])); + + /* load high 32 bits of 16 transactions at once. */ + *tr_hi = _mm256_mmask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1), + sizeof(flow->trans[0])); +} + +/* + * Load and return next 4 input bytes for up to 16 flows in parallel. + * pdata - 8x2 pointers to flow input data + * mask - mask of active flows. + * di - data indexes for these 16 flows. 
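+ * bnum - number of bytes to load per flow (1 or 4).
+ * Returns the loaded bytes squeezed into a single 256-bit register,
+ * one 32-bit lane per flow.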
+ */ +static inline __m256i +get_next_bytes_avx512x8(const struct acl_flow_avx512 *flow, __m256i pdata[2], + uint32_t msk, __m256i *di, uint32_t bnum) +{ + const int32_t *div; + uint32_t m[2]; + __m256i one, zero, t, p[2]; + __m128i inp[2]; + + div = (const int32_t *)flow->data_index; + + one = _mm256_set1_epi32(1); + zero = _mm256_xor_si256(one, one); + + /* load data offsets for given indexes */ + t = _mm256_mmask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0])); + + /* increment data indexes */ + *di = _mm256_mask_add_epi32(*di, msk, *di, one); + + /* + * unsigned expand 32-bit indexes to 64-bit + * (for later pointer arithmetic), i.e: + * for (i = 0; i != 16; i++) + * p[i/8].u64[i%8] = (uint64_t)t.u32[i]; + */ + p[0] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[0].y, t); + p[1] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[1].y, t); + + p[0] = _mm256_add_epi64(p[0], pdata[0]); + p[1] = _mm256_add_epi64(p[1], pdata[1]); + + /* load input byte(s), either one or four */ + + m[0] = msk & YMM_PTR_MSK; + m[1] = msk >> YMM_PTR_NUM; + + if (bnum == sizeof(uint8_t)) { + inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]); + inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]); + } else { + inp[0] = _mm256_mmask_i64gather_epi32( + _mm256_castsi256_si128(zero), m[0], p[0], + NULL, sizeof(uint8_t)); + inp[1] = _mm256_mmask_i64gather_epi32( + _mm256_castsi256_si128(zero), m[1], p[1], + NULL, sizeof(uint8_t)); + } + + /* squeeze input into one 512-bit register */ + return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]), + ymm_pminp.y, _mm256_castsi128_si256(inp[1])); +} + +/* + * Start up to 16 new flows. + * num - number of flows to start + * msk - mask of new flows. + * pdata - pointers to flow input data + * idx - match indexed for given flows + * di - data indexes for these flows. + */ +static inline void +start_flow8(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk, + __m256i pdata[2], __m256i *idx, __m256i *di) +{ + uint32_t n, m[2], nm[2]; + __m256i ni, nd[2]; + + m[0] = msk & YMM_PTR_MSK; + m[1] = msk >> YMM_PTR_NUM; + + n = __builtin_popcount(m[0]); + nm[0] = (1 << n) - 1; + nm[1] = (1 << (num - n)) - 1; + + /* load input data pointers for new flows */ + nd[0] = _mm256_maskz_loadu_epi64(nm[0], + flow->idata + flow->num_packets); + nd[1] = _mm256_maskz_loadu_epi64(nm[1], + flow->idata + flow->num_packets + n); + + /* calculate match indexes of new flows */ + ni = _mm256_set1_epi32(flow->num_packets); + ni = _mm256_add_epi32(ni, ymm_idx_add.y); + + /* merge new and existing flows data */ + pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]); + pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]); + + /* update match and data indexes */ + *idx = _mm256_mask_expand_epi32(*idx, msk, ni); + *di = _mm256_maskz_mov_epi32(msk ^ UINT8_MAX, *di); + + flow->num_packets += num; +} + +/* + * Process found matches for up to 16 flows. + * fmsk - mask of active flows + * rmsk - mask of found matches + * pdata - pointers to flow input data + * di - data indexes for these flows + * idx - match indexed for given flows + * tr_lo contains low 32 bits for up to 8 transitions. + * tr_hi contains high 32 bits for up to 8 transitions. 
+ */ +static inline uint32_t +match_process_avx512x8(struct acl_flow_avx512 *flow, uint32_t *fmsk, + uint32_t *rmsk, __m256i pdata[2], __m256i *di, __m256i *idx, + __m256i *tr_lo, __m256i *tr_hi) +{ + uint32_t n; + __m256i res; + + if (rmsk[0] == 0) + return 0; + + /* extract match indexes */ + res = _mm256_and_si256(tr_lo[0], ymm_index_mask.y); + + /* mask matched transitions to nop */ + tr_lo[0] = _mm256_mask_mov_epi32(tr_lo[0], rmsk[0], ymm_trlo_idle.y); + tr_hi[0] = _mm256_mask_mov_epi32(tr_hi[0], rmsk[0], ymm_trhi_idle.y); + + /* save found match indexes */ + _mm256_mask_i32scatter_epi32(flow->matches, rmsk[0], + idx[0], res, sizeof(flow->matches[0])); + + /* update masks and start new flows for matches */ + n = update_flow_mask(flow, fmsk, rmsk); + start_flow8(flow, n, rmsk[0], pdata, idx, di); + + return n; +} + +/* + * Test for matches ut to 32 (2x16) flows at once, + * if matches exist - process them and start new flows. + */ +static inline void +match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2], + __m256i pdata[4], __m256i di[2], __m256i idx[2], __m256i inp[2], + __m256i tr_lo[2], __m256i tr_hi[2]) +{ + uint32_t n[2]; + uint32_t rm[2]; + + /* check for matches */ + rm[0] = _mm256_test_epi32_mask(tr_lo[0], ymm_match_mask.y); + rm[1] = _mm256_test_epi32_mask(tr_lo[1], ymm_match_mask.y); + + /* till unprocessed matches exist */ + while ((rm[0] | rm[1]) != 0) { + + /* process matches and start new flows */ + n[0] = match_process_avx512x8(flow, &fm[0], &rm[0], &pdata[0], + &di[0], &idx[0], &tr_lo[0], &tr_hi[0]); + n[1] = match_process_avx512x8(flow, &fm[1], &rm[1], &pdata[2], + &di[1], &idx[1], &tr_lo[1], &tr_hi[1]); + + /* execute first transition for new flows, if any */ + + if (n[0] != 0) { + inp[0] = get_next_bytes_avx512x8(flow, &pdata[0], + rm[0], &di[0], sizeof(uint8_t)); + first_trans8(flow, inp[0], rm[0], &tr_lo[0], + &tr_hi[0]); + rm[0] = _mm256_test_epi32_mask(tr_lo[0], + ymm_match_mask.y); + } + + if (n[1] != 0) { + inp[1] = get_next_bytes_avx512x8(flow, &pdata[2], + rm[1], &di[1], sizeof(uint8_t)); + first_trans8(flow, inp[1], rm[1], &tr_lo[1], + &tr_hi[1]); + rm[1] = _mm256_test_epi32_mask(tr_lo[1], + ymm_match_mask.y); + } + } +} + +/* + * Perform search for up to 32 flows in parallel. + * Use two sets of metadata, each serves 16 flows max. + * So in fact we perform search for 2x16 flows. 
+ */ +static inline void +search_trie_avx512x8x2(struct acl_flow_avx512 *flow) +{ + uint32_t fm[2]; + __m256i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2]; + + /* first 1B load */ + start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[0], &idx[0], &di[0]); + start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[2], &idx[1], &di[1]); + + in[0] = get_next_bytes_avx512x8(flow, &pdata[0], UINT8_MAX, &di[0], + sizeof(uint8_t)); + in[1] = get_next_bytes_avx512x8(flow, &pdata[2], UINT8_MAX, &di[1], + sizeof(uint8_t)); + + first_trans8(flow, in[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]); + first_trans8(flow, in[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]); + + fm[0] = UINT8_MAX; + fm[1] = UINT8_MAX; + + /* match check */ + match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in, + tr_lo, tr_hi); + + while ((fm[0] | fm[1]) != 0) { + + /* load next 4B */ + + in[0] = get_next_bytes_avx512x8(flow, &pdata[0], fm[0], + &di[0], sizeof(uint32_t)); + in[1] = get_next_bytes_avx512x8(flow, &pdata[2], fm[1], + &di[1], sizeof(uint32_t)); + + /* main 4B loop */ + + in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]); + in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]); + + /* check for matches */ + match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in, + tr_lo, tr_hi); + } +} + +/* + * resolve match index to actual result/priority offset. + */ +static inline __m256i +resolve_match_idx_avx512x8(__m256i mi) +{ + RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) != + 1 << (match_log + 2)); + return _mm256_slli_epi32(mi, match_log); +} + +/* + * Resolve multiple matches for the same flow based on priority. + */ +static inline __m256i +resolve_pri_avx512x8(const int32_t res[], const int32_t pri[], + const uint32_t match[], __mmask8 msk, uint32_t nb_trie, + uint32_t nb_skip) +{ + uint32_t i; + const uint32_t *pm; + __mmask16 m; + __m256i cp, cr, np, nr, mch; + + const __m256i zero = _mm256_set1_epi32(0); + + /* get match indexes */ + mch = _mm256_maskz_loadu_epi32(msk, match); + mch = resolve_match_idx_avx512x8(mch); + + /* read result and priority values for first trie */ + cr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0])); + cp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0])); + + /* + * read result and priority values for next tries and select one + * with highest priority. 
+ */ + for (i = 1, pm = match + nb_skip; i != nb_trie; + i++, pm += nb_skip) { + + mch = _mm256_maskz_loadu_epi32(msk, pm); + mch = resolve_match_idx_avx512x8(mch); + + nr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, + sizeof(res[0])); + np = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, + sizeof(pri[0])); + + m = _mm256_cmpgt_epi32_mask(cp, np); + cr = _mm256_mask_mov_epi32(nr, m, cr); + cp = _mm256_mask_mov_epi32(np, m, cp); + } + + return cr; +} + +/* + * Resolve num (<= 8) matches for single category + */ +static inline void +resolve_sc_avx512x8(uint32_t result[], const int32_t res[], + const int32_t pri[], const uint32_t match[], uint32_t nb_pkt, + uint32_t nb_trie, uint32_t nb_skip) +{ + __mmask8 msk; + __m256i cr; + + msk = (1 << nb_pkt) - 1; + cr = resolve_pri_avx512x8(res, pri, match, msk, nb_trie, nb_skip); + _mm256_mask_storeu_epi32(result, msk, cr); +} + +/* + * Resolve matches for single category + */ +static inline void +resolve_sc_avx512x8x2(uint32_t result[], + const struct rte_acl_match_results pr[], const uint32_t match[], + uint32_t nb_pkt, uint32_t nb_trie) +{ + uint32_t j, k, n; + const int32_t *res, *pri; + __m256i cr[2]; + + res = (const int32_t *)pr->results; + pri = pr->priority; + + for (k = 0; k != (nb_pkt & ~MSK_AVX512X8X2); k += NUM_AVX512X8X2) { + + j = k + MASK8_BIT; + + cr[0] = resolve_pri_avx512x8(res, pri, match + k, UINT8_MAX, + nb_trie, nb_pkt); + cr[1] = resolve_pri_avx512x8(res, pri, match + j, UINT8_MAX, + nb_trie, nb_pkt); + + _mm256_storeu_si256((void *)(result + k), cr[0]); + _mm256_storeu_si256((void *)(result + j), cr[1]); + } + + n = nb_pkt - k; + if (n != 0) { + if (n > MASK8_BIT) { + resolve_sc_avx512x8(result + k, res, pri, match + k, + MASK8_BIT, nb_trie, nb_pkt); + k += MASK8_BIT; + n -= MASK8_BIT; + } + resolve_sc_avx512x8(result + k, res, pri, match + k, n, + nb_trie, nb_pkt); + } +} + +static inline int +search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data, + uint32_t *results, uint32_t total_packets, uint32_t categories) +{ + uint32_t i, *pm; + const struct rte_acl_match_results *pr; + struct acl_flow_avx512 flow; + uint32_t match[ctx->num_tries * total_packets]; + + for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) { + + /* setup for next trie */ + acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets); + + /* process the trie */ + search_trie_avx512x8x2(&flow); + } + + /* resolve matches */ + pr = (const struct rte_acl_match_results *) + (ctx->trans_table + ctx->match_index); + + if (categories == 1) + resolve_sc_avx512x8x2(results, pr, match, total_packets, + ctx->num_tries); + else + resolve_mcle8_avx512x1(results, pr, match, total_packets, + categories, ctx->num_tries); + + return 0; +}