1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2020 Intel Corporation
6 * Defines required by "acl_run_avx512_common.h".
7 * Note that all of them have to be undefined by the end
8 * of this file, as "acl_run_avx512_common.h" can be included several
9 * times from different *.h files for the same *.c.
13 * This implementation uses 512-bit registers (zmm) and intrinsics.
14 * So our main SIMD type is 512-bit width and each such variable can
15 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
/*
 * Main SIMD type is 512-bit wide (16 x uint32_t lanes), with the
 * matching 16-bit lane-mask type.
 */
17 #define _T_simd __m512i
18 #define _T_mask __mmask16
20 /* Naming convention for static const variables. */
21 #define _SC_(x) zmm_##x
22 #define _SV_(x) (zmm_##x.z)
24 /* Naming convention for internal functions. */
25 #define _F_(x) x##_avx512x16
28 * The same intrinsics have different syntax (depending on the bit-width),
29 * so to overcome that a few macros need to be defined.
32 /* Naming convention for generic epi(packed integers) type intrinsics. */
33 #define _M_I_(x) _mm512_##x
35 /* Naming convention for si(whole simd integer) type intrinsics. */
36 #define _M_SI_(x) _mm512_##x##_si512
38 /* Naming convention for masked gather type intrinsics. */
39 #define _M_MGI_(x) _mm512_##x
41 /* Naming convention for gather type intrinsics. */
42 #define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale)
44 /* num/mask of transitions per SIMD regs (16 dword lanes -> 0xffff) */
45 #define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
46 #define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
/* num/mask of flows handled per iteration (two regs -> 2 * 16 = 32) */
48 #define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
49 #define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)
51 /* num/mask of pointers per SIMD regs (8 x 64-bit lanes -> 0xff) */
52 #define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
53 #define _SIMD_PTR_MSK_ RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
/*
 * Pre-built 512-bit constants used by the trie-walk code.
 * NOTE(review): the initializer bodies and closing braces of these four
 * tables are not visible in this chunk of the file — do not assume their
 * contents from the names alone.
 */
55 static const __rte_x86_zmm_t _SC_(match_mask) = {
76 static const __rte_x86_zmm_t _SC_(index_mask) = {
97 static const __rte_x86_zmm_t _SC_(trlo_idle) = {
118 static const __rte_x86_zmm_t _SC_(trhi_idle) = {
127 static const __rte_x86_zmm_t _SC_(shuffle_input) = {
/* the same 16-byte control pattern replicated in each 128-bit lane */
129 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
130 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
131 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
132 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
/* NOTE(review): initializer bodies are not visible in this chunk. */
136 static const __rte_x86_zmm_t _SC_(four_32) = {
145 static const __rte_x86_zmm_t _SC_(idx_add) = {
154 static const __rte_x86_zmm_t _SC_(range_base) = {
/* the same 16-byte pattern replicated in each 128-bit lane */
156 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
157 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
158 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
159 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
/*
 * permutex2var control used by gather_bytes(): indices 0x00-0x07 pick
 * the low 8 dwords of the first operand, 0x10-0x17 the low 8 dwords of
 * the second, packing two 256-bit gather results into one zmm.
 */
163 static const __rte_x86_zmm_t _SC_(pminp) = {
165 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
166 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
/* 0x5555 selects every even-numbered dword lane of a zmm register. */
170 static const _T_mask _SC_(pmidx_msk) = 0x5555;
172 static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
/* (i, 0) pairs — presumably dword indices widened into 64-bit lane
 * slots; the first element covers 0-7, the second 8-15.
 * NOTE(review): the inner braces delimiting the two array elements are
 * not visible in this chunk of the file. */
175 0, 0, 1, 0, 2, 0, 3, 0,
176 4, 0, 5, 0, 6, 0, 7, 0,
181 8, 0, 9, 0, 10, 0, 11, 0,
182 12, 0, 13, 0, 14, 0, 15, 0,
188 * unfortunately the current AVX512 ISA doesn't provide the ability to
189 * gather-load byte quantities. So we have to mimic it in SW,
190 * by doing 8x1B scalar loads.
/*
 * SW emulation of a byte gather: performs 8 x 1B scalar loads from the
 * pointers held in the 64-bit lanes of @pdata, returning each byte
 * zero-extended into a 32-bit lane of the result.
 * NOTE(review): the declarations of 'p' and 'v' and the return
 * statement are not visible in this chunk of the file.
 */
192 static inline __m256i
193 _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
/* zero-initialized (file-less static) fallback load target */
198 static const uint32_t zero;
/* overwrite the lanes NOT selected by @mask (mask ^ _SIMD_PTR_MSK_) —
 * presumably with the address of 'zero', so every dereference below is
 * safe; the continuation line is not visible here. */
200 p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
203 v.u32[0] = *(uint8_t *)p.u64[0];
204 v.u32[1] = *(uint8_t *)p.u64[1];
205 v.u32[2] = *(uint8_t *)p.u64[2];
206 v.u32[3] = *(uint8_t *)p.u64[3];
207 v.u32[4] = *(uint8_t *)p.u64[4];
208 v.u32[5] = *(uint8_t *)p.u64[5];
209 v.u32[6] = *(uint8_t *)p.u64[6];
210 v.u32[7] = *(uint8_t *)p.u64[7];
216 * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
/*
 * Gather input bytes for up to 16 (2 x 8) locations in parallel and
 * squeeze them into a single 512-bit register.
 * For a 1-byte quantity the gather is emulated in SW (see
 * _m512_mask_gather_epi8x8); otherwise a masked i64 gather with a
 * one-byte scale is used.
 * NOTE(review): the opening brace, the declaration of inp[] and the
 * else-branch header are not visible in this chunk of the file.
 */
218 static __rte_always_inline __m512i
219 _F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
224 if (bnum == sizeof(uint8_t)) {
225 inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
226 inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
/* base address is NULL: the p[] lanes hold absolute pointers */
228 inp[0] = _mm512_mask_i64gather_epi32(
229 _mm512_castsi512_si256(zero),
230 m[0], p[0], NULL, sizeof(uint8_t));
231 inp[1] = _mm512_mask_i64gather_epi32(
232 _mm512_castsi512_si256(zero),
233 m[1], p[1], NULL, sizeof(uint8_t));
236 /* squeeze input into one 512-bit register */
237 return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
238 _SV_(pminp), _mm512_castsi256_si512(inp[1]));
242 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
/*
 * Resolve matches when the number of categories is above 8, so a whole
 * 512-bit register (masked by @cm) is needed per packet.  For each packet
 * the first trie's result/priority vectors are taken as the current best
 * (cr/cp); each remaining trie's vectors (nr/np) are then merged in,
 * keeping per category whichever entry has the higher priority.
 * NOTE(review): the function head (return type, opening brace) and the
 * declarations of i, k, mi, cm, sm and the res/pri setup are not visible
 * in this chunk of the file.
 */
245 resolve_mcgt8_avx512x1(uint32_t result[],
246 const struct rte_acl_match_results pr[], const uint32_t match[],
247 uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
250 const uint32_t *pm, *res;
253 __m512i cp, cr, np, nr;
/* match[] index -> uint32_t offset: each entry spans 1 << 5 dwords —
 * presumably sizeof(struct rte_acl_match_results) / sizeof(uint32_t);
 * verify against the struct definition. */
255 const uint32_t match_log = 5;
/* load/store mask covering the nb_cat active category lanes */
260 cm = (1 << nb_cat) - 1;
262 for (k = 0; k != nb_pkt; k++, result += nb_cat) {
264 mi = match[k] << match_log;
266 cr = _mm512_maskz_loadu_epi32(cm, res + mi);
267 cp = _mm512_maskz_loadu_epi32(cm, pri + mi);
/* pm walks the per-trie slices of match[], nb_pkt entries apart */
269 for (i = 1, pm = match + nb_pkt; i != nb_trie;
272 mi = pm[k] << match_log;
274 nr = _mm512_maskz_loadu_epi32(cm, res + mi);
275 np = _mm512_maskz_loadu_epi32(cm, pri + mi);
/* sm: lanes where the current priority wins — keep cr/cp there,
 * take nr/np everywhere else */
277 sm = _mm512_cmpgt_epi32_mask(cp, np);
278 cr = _mm512_mask_mov_epi32(nr, sm, cr);
279 cp = _mm512_mask_mov_epi32(np, sm, cp);
/* write the winning results for this packet's categories */
282 _mm512_mask_storeu_epi32(result, cm, cr);
286 #include "acl_run_avx512_common.h"
289 * Perform search for up to (2 * 16) flows in parallel.
290 * Use two sets of metadata, each serves 16 flows max.
/*
 * Top-level entry: run every trie of @ctx over the input flows,
 * collecting one match index per (trie, packet) pair into match[], then
 * resolve those into per-category @results using the cheapest resolver
 * the @categories count allows.
 * NOTE(review): the function head (return type), the declarations of
 * i/pm, the single-category if-condition and the return statement are
 * not visible in this chunk of the file.
 */
293 search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
294 uint32_t *results, uint32_t total_packets, uint32_t categories)
297 const struct rte_acl_match_results *pr;
298 struct acl_flow_avx512 flow;
/* VLA: one match index per trie per packet */
299 uint32_t match[ctx->num_tries * total_packets];
/* pm advances by total_packets to the next trie's slice of match[] */
301 for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
303 /* setup for next trie */
304 acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
306 /* process the trie */
307 _F_(search_trie)(&flow);
310 /* resolve matches */
311 pr = (const struct rte_acl_match_results *)
312 (ctx->trans_table + ctx->match_index);
315 _F_(resolve_single_cat)(results, pr, match, total_packets,
317 else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
318 resolve_mcle8_avx512x1(results, pr, match, total_packets,
319 categories, ctx->num_tries);
321 resolve_mcgt8_avx512x1(results, pr, match, total_packets,
322 categories, ctx->num_tries);
/*
 * Undo the definitions from the top of this file (reverse order), as
 * required so "acl_run_avx512_common.h" can be re-included elsewhere.
 * NOTE(review): the remaining #undefs (_T_simd, _T_mask, _SC_, _SV_,
 * _F_, _M_*_ ...) presumably follow past the end of this chunk — verify.
 */
327 #undef _SIMD_PTR_MSK_
328 #undef _SIMD_PTR_NUM_
329 #undef _SIMD_FLOW_MSK_
330 #undef _SIMD_FLOW_NUM_
331 #undef _SIMD_MASK_MAX_
332 #undef _SIMD_MASK_BIT_