1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2020 Intel Corporation
6 * Defines required by "acl_run_avx512_common.h".
7 * Note that all of them have to be undefined by the end
8 * of this file, as "acl_run_avx512_common.h" can be included several
9 * times from different *.h files for the same *.c.
13 * This implementation uses 512-bit registers (zmm) and intrinsics.
14 * So our main SIMD type is 512-bit width and each such variable can
15 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
17 #define _T_simd __m512i
18 #define _T_mask __mmask16
20 /* Naming convention for static const variables. */
21 #define _SC_(x) zmm_##x
22 #define _SV_(x) (zmm_##x.z)
24 /* Naming convention for internal functions. */
25 #define _F_(x) x##_avx512x16
28 * The same intrinsics have different syntax (depending on the bit-width),
29 * so to overcome that a few macros need to be defined.
32 /* Naming convention for generic epi (packed integers) type intrinsics. */
33 #define _M_I_(x) _mm512_##x
35 /* Naming convention for si (whole simd integer) type intrinsics. */
36 #define _M_SI_(x) _mm512_##x##_si512
38 /* Naming convention for masked gather type intrinsics. */
39 #define _M_MGI_(x) _mm512_##x
41 /* Naming convention for gather type intrinsics. */
42 #define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale)
44 /* num/mask of transitions per SIMD regs: 16 x uint32_t lanes per zmm */
45 #define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
46 #define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
/* number of flows processed in parallel: two 16-lane register sets */
48 #define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
49 #define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)
51 /* num/mask of pointers per SIMD regs: 8 x 64-bit pointers per zmm */
52 #define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
53 #define _SIMD_PTR_MSK_ RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
55 static const __rte_x86_zmm_t _SC_(match_mask) = {
76 static const __rte_x86_zmm_t _SC_(index_mask) = {
97 static const __rte_x86_zmm_t _SC_(trlo_idle) = {
118 static const __rte_x86_zmm_t _SC_(trhi_idle) = {
127 static const __rte_x86_zmm_t _SC_(shuffle_input) = {
129 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
130 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
131 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
132 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
136 static const __rte_x86_zmm_t _SC_(four_32) = {
145 static const __rte_x86_zmm_t _SC_(idx_add) = {
154 static const __rte_x86_zmm_t _SC_(range_base) = {
156 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
157 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
158 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
159 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
163 static const __rte_x86_zmm_t _SC_(pminp) = {
165 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
166 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
/* selects the even 32-bit lanes of a zmm (bit pattern 0b0101...0101) */
170 static const _T_mask _SC_(pmidx_msk) = 0x5555;
172 static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
175 0, 0, 1, 0, 2, 0, 3, 0,
176 4, 0, 5, 0, 6, 0, 7, 0,
181 8, 0, 9, 0, 10, 0, 11, 0,
182 12, 0, 13, 0, 14, 0, 15, 0,
188 * unfortunately current AVX512 ISA doesn't provide ability for
189 * gather load on a byte quantity. So we have to mimic it in SW,
190 * by doing 8x1B scalar loads.
192 static inline __m256i
193 _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
/*
 * pdata holds 8 x 64-bit byte addresses; mask marks the active lanes.
 * NOTE(review): the local unions 'p' and 'v' are declared on lines not
 * visible in this view - presumably __rte_x86_zmm_t-style register/scalar
 * unions; confirm against the full source.
 */
198 static const uint32_t zero;
/*
 * Redirect every inactive lane (mask ^ _SIMD_PTR_MSK_) to a dummy
 * location, so the unconditional scalar loads below never touch an
 * invalid pointer. (Call is truncated here - the remaining arguments,
 * presumably (uintptr_t)&zero, are on an elided line.)
 */
200 p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
/* 8 x 1B scalar loads, each zero-extended into a 32-bit result lane */
203 v.u32[0] = *(uint8_t *)p.u64[0];
204 v.u32[1] = *(uint8_t *)p.u64[1];
205 v.u32[2] = *(uint8_t *)p.u64[2];
206 v.u32[3] = *(uint8_t *)p.u64[3];
207 v.u32[4] = *(uint8_t *)p.u64[4];
208 v.u32[5] = *(uint8_t *)p.u64[5];
209 v.u32[6] = *(uint8_t *)p.u64[6];
210 v.u32[7] = *(uint8_t *)p.u64[7];
216 * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
218 static __rte_always_inline __m512i
219 _F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
/*
 * p[2]: two sets of 8 x 64-bit absolute addresses; m[2]: per-set lane
 * masks; trailing parameter(s) and the 'inp' declaration are on lines
 * elided from this view - presumably 'uint32_t bnum' and '__m256i inp[2]'.
 */
224 if (bnum == sizeof(uint8_t)) {
/* 1 byte per location: AVX512 has no byte gather, use the SW emulation */
225 inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
226 inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
/* 4 bytes per location: HW gather; addresses are absolute (base == NULL) */
228 inp[0] = _mm512_mask_i64gather_epi32(
229 _mm512_castsi512_si256(zero),
230 m[0], p[0], NULL, sizeof(uint8_t));
231 inp[1] = _mm512_mask_i64gather_epi32(
232 _mm512_castsi512_si256(zero),
233 m[1], p[1], NULL, sizeof(uint8_t));
236 /* squeeze input into one 512-bit register */
237 return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
238 _SV_(pminp), _mm512_castsi256_si512(inp[1]));
242 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
245 resolve_mcgt8_avx512x1(uint32_t result[],
246 const struct rte_acl_match_results pr[], const uint32_t match[],
247 uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
250 const uint32_t *pm, *res;
253 __m512i cp, cr, np, nr;
/* lane mask covering the nb_cat categories (nb_cat low bits set) */
258 cm = (1 << nb_cat) - 1;
/* for each packet: seed current result/priority from trie 0 */
260 for (k = 0; k != nb_pkt; k++, result += nb_cat) {
/* scale match index into the per-entry results/priority arrays */
262 mi = match[k] << ACL_MATCH_LOG;
264 cr = _mm512_maskz_loadu_epi32(cm, res + mi);
265 cp = _mm512_maskz_loadu_epi32(cm, pri + mi);
/* merge in the remaining tries (loop header continues on elided line) */
267 for (i = 1, pm = match + nb_pkt; i != nb_trie;
270 mi = pm[k] << ACL_MATCH_LOG;
272 nr = _mm512_maskz_loadu_epi32(cm, res + mi);
273 np = _mm512_maskz_loadu_epi32(cm, pri + mi);
/*
 * per category: keep the current result where its priority is
 * greater, otherwise take the next trie's result/priority.
 */
275 sm = _mm512_cmpgt_epi32_mask(cp, np);
276 cr = _mm512_mask_mov_epi32(nr, sm, cr);
277 cp = _mm512_mask_mov_epi32(np, sm, cp);
/* store the winning result for all nb_cat categories of this packet */
280 _mm512_mask_storeu_epi32(result, cm, cr);
284 #include "acl_run_avx512_common.h"
287 * Perform search for up to (2 * 16) flows in parallel.
288 * Use two sets of metadata, each serves 16 flows max.
291 search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
292 uint32_t *results, uint32_t total_packets, uint32_t categories)
295 const struct rte_acl_match_results *pr;
296 struct acl_flow_avx512 flow;
/* VLA: one match slot per (trie, packet) pair */
297 uint32_t match[ctx->num_tries * total_packets];
/* run every trie over all packets, collecting per-trie match indexes */
299 for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
301 /* setup for next trie */
302 acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
304 /* process the trie */
305 _F_(search_trie)(&flow);
308 /* resolve matches */
309 pr = (const struct rte_acl_match_results *)
310 (ctx->trans_table + ctx->match_index);
/*
 * pick the resolver by category count; the single-category branch
 * condition is on a line elided from this view - confirm in full source.
 */
313 _F_(resolve_single_cat)(results, pr, match, total_packets,
315 else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
316 resolve_mcle8_avx512x1(results, pr, match, total_packets,
317 categories, ctx->num_tries);
319 resolve_mcgt8_avx512x1(results, pr, match, total_packets,
320 categories, ctx->num_tries);
/*
 * Undo the helper definitions, as promised in the file header, so that
 * "acl_run_avx512_common.h" can be re-included for a different width.
 * NOTE(review): the remaining macros (_T_simd, _T_mask, _F_, _M_*_, _SC_,
 * _SV_) are presumably undefined on lines past this view - confirm.
 */
325 #undef _SIMD_PTR_MSK_
326 #undef _SIMD_PTR_NUM_
327 #undef _SIMD_FLOW_MSK_
328 #undef _SIMD_FLOW_NUM_
329 #undef _SIMD_MASK_MAX_
330 #undef _SIMD_MASK_BIT_