/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#define MASK16_BIT	(sizeof(__mmask16) * CHAR_BIT)

#define NUM_AVX512X16X2	(2 * MASK16_BIT)
#define MSK_AVX512X16X2	(NUM_AVX512X16X2 - 1)

/* num/mask of pointers per SIMD regs */
#define ZMM_PTR_NUM	(sizeof(__m512i) / sizeof(uintptr_t))
#define ZMM_PTR_MSK	RTE_LEN2MASK(ZMM_PTR_NUM, uint32_t)

static const __rte_x86_zmm_t zmm_match_mask = {
	.u32 = {
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
	},
};

static const __rte_x86_zmm_t zmm_index_mask = {
	.u32 = {
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
	},
};

static const __rte_x86_zmm_t zmm_trlo_idle = {

static const __rte_x86_zmm_t zmm_trhi_idle = {

static const __rte_x86_zmm_t zmm_shuffle_input = {
	.u32 = {
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
	},
};

static const __rte_x86_zmm_t zmm_four_32 = {
	.u32 = {
		4, 4, 4, 4, 4, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4,
	},
};

static const __rte_x86_zmm_t zmm_idx_add = {
	.u32 = {
		0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15,
	},
};

static const __rte_x86_zmm_t zmm_range_base = {
	.u32 = {
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
	},
};

static const __rte_x86_zmm_t zmm_pminp = {
	.u32 = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	},
};

static const __mmask16 zmm_pmidx_msk = 0x5555;

static const __rte_x86_zmm_t zmm_pmidx[2] = {
	[0] = {
		.u32 = {
			0, 0, 1, 0, 2, 0, 3, 0,
			4, 0, 5, 0, 6, 0, 7, 0,
		},
	},
	[1] = {
		.u32 = {
			8, 0, 9, 0, 10, 0, 11, 0,
			12, 0, 13, 0, 14, 0, 15, 0,
		},
	},
};

/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load on a byte quantity. So we have to mimic it in SW,
 * by doing 8x1B scalar loads.
 */
static __rte_always_inline __m256i
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
{
	__rte_x86_ymm_t v;
	__rte_x86_zmm_t p;

	static const uint32_t zero;
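
	/*
	 * Replace the pointers of inactive lanes (mask ^ ZMM_PTR_MSK)
	 * with the address of a static zero, so all eight dereferences
	 * below are always valid, whatever the mask.
	 */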
	p.z = _mm512_mask_set1_epi64(pdata, mask ^ ZMM_PTR_MSK,
		(uintptr_t)&zero);

	v.u32[0] = *(uint8_t *)p.u64[0];
	v.u32[1] = *(uint8_t *)p.u64[1];
	v.u32[2] = *(uint8_t *)p.u64[2];
	v.u32[3] = *(uint8_t *)p.u64[3];
	v.u32[4] = *(uint8_t *)p.u64[4];
	v.u32[5] = *(uint8_t *)p.u64[5];
	v.u32[6] = *(uint8_t *)p.u64[6];
	v.u32[7] = *(uint8_t *)p.u64[7];

	return v.y;
}

/*
 * Calculate the address of the next transition for
 * all types of nodes. Note that only DFA nodes and range
 * nodes actually transition to another node. Match
 * nodes are not supposed to be encountered here.
 * For quad range nodes:
 * Calculate the number of range boundaries that are less than the
 * input value. Range boundaries for each node are signed 8-bit values,
 * ordered from -128 to 127.
 * This is effectively a popcount of the boundary bytes that are
 * less than the input byte.
 * Single nodes are processed in the same way as quad range nodes.
 */
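
/*
 * A scalar sketch of the per-flow address calculation performed by
 * calc_addr16() below (illustrative only, ad-hoc names):
 *
 *	addr = tr_lo & index_mask;
 *	if ((tr_lo & ~index_mask) == 0) {
 *		// DFA node: select a range base by the two
 *		// high bits of the input byte
 *		addr += in - tr_hi.u8[4 * lane + (in >> 6)];
 *	} else {
 *		// QUAD/SINGLE node: count boundaries below the input byte
 *		for (i = 0; i != 4; i++)
 *			addr += ((int8_t)in > tr_hi.i8[4 * lane + i]);
 *	}
 */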
static __rte_always_inline __m512i
calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input,
	__m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi)
{
	__mmask64 qm;
	__mmask16 dfa_msk;
	__m512i addr, in, node_type, r, t;
	__m512i dfa_ofs, quad_ofs;
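
	/* t = {0}: the reference value for DFA (type 0) nodes */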
	t = _mm512_xor_si512(index_mask, index_mask);
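
	/* broadcast the current input byte of each flow across its 32-bit lane */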
	in = _mm512_shuffle_epi8(next_input, shuffle_input);

	/* Calc node type and node addr */
	node_type = _mm512_andnot_si512(index_mask, tr_lo);
	addr = _mm512_and_si512(index_mask, tr_lo);

	/* mask for DFA type (0) nodes */
	dfa_msk = _mm512_cmpeq_epi32_mask(node_type, t);

	/* DFA calculations. */
	r = _mm512_srli_epi32(in, 30);
	r = _mm512_add_epi8(r, range_base);
	t = _mm512_srli_epi32(in, 24);
	r = _mm512_shuffle_epi8(tr_hi, r);

	dfa_ofs = _mm512_sub_epi32(t, r);

	/* QUAD/SINGLE calculations. */
	qm = _mm512_cmpgt_epi8_mask(in, tr_hi);
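
	/*
	 * Materialize the compare mask as 0xFF bytes; lzcnt / 8 then
	 * gives the number of boundaries that were not exceeded, and
	 * 4 minus that count is the offset.
	 */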
	t = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
	t = _mm512_lzcnt_epi32(t);
	t = _mm512_srli_epi32(t, 3);
	quad_ofs = _mm512_sub_epi32(four_32, t);

	/* blend DFA and QUAD/SINGLE. */
	t = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);

	/* calculate address for next transitions. */
	addr = _mm512_add_epi32(addr, t);

	return addr;
}

/*
 * Process 16 transitions in parallel.
 * tr_lo contains low 32 bits for 16 transitions.
 * tr_hi contains high 32 bits for 16 transitions.
 * next_input contains up to 4 input bytes for 16 flows.
 */
static __rte_always_inline __m512i
transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo,
	__m512i *tr_hi)
{
	const int32_t *tr;
	__m512i addr;

	tr = (const int32_t *)(uintptr_t)trans;
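
	/*
	 * Each transition is a 64-bit value; with 16 lanes, the 32-bit
	 * gathers below fetch its low and high halves separately
	 * (from tr and tr + 1, both with a stride of sizeof(trans[0])).
	 */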

	/* Calculate the address (array index) for all 16 transitions. */
	addr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z,
		zmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi);

	/* load lower 32 bits of 16 transitions at once. */
	*tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0]));
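
	/* shift the just-consumed byte out, so the next byte moves into the low position */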
	next_input = _mm512_srli_epi32(next_input, CHAR_BIT);

	/* load high 32 bits of 16 transitions at once. */
	*tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0]));

	return next_input;
}

/*
 * Execute first transition for up to 16 flows in parallel.
 * next_input should contain one input byte for up to 16 flows.
 * msk - mask of active flows.
 * tr_lo contains low 32 bits for up to 16 transitions.
 * tr_hi contains high 32 bits for up to 16 transitions.
 */
static __rte_always_inline void
first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input,
	__mmask16 msk, __m512i *tr_lo, __m512i *tr_hi)
{
	const int32_t *tr;
	__m512i addr, root;

	tr = (const int32_t *)(uintptr_t)flow->trans;
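
	/* the first transition index is root_index plus the input byte */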
	addr = _mm512_set1_epi32(UINT8_MAX);
	root = _mm512_set1_epi32(flow->root_index);

	addr = _mm512_and_si512(next_input, addr);
	addr = _mm512_add_epi32(root, addr);

	/* load lower 32 bits of 16 transitions at once. */
	*tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr,
		sizeof(flow->trans[0]));

	/* load high 32 bits of 16 transitions at once. */
	*tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
		sizeof(flow->trans[0]));
}

/*
 * Load and return the next 4 input bytes for up to 16 flows in parallel.
 * pdata - 8x2 pointers to flow input data
 * msk - mask of active flows.
 * di - data indexes for these 16 flows.
 */
static inline __m512i
get_next_bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2],
	uint32_t msk, __m512i *di, uint32_t bnum)
{
	const int32_t *div;
	uint32_t m[2];
	__m512i one, zero, t, p[2];
	__m256i inp[2];

	div = (const int32_t *)flow->data_index;

	one = _mm512_set1_epi32(1);
	zero = _mm512_xor_si512(one, one);

	/* load data offsets for given indexes */
	t = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));

	/* increment data indexes */
	*di = _mm512_mask_add_epi32(*di, msk, *di, one);

	/*
	 * unsigned expand 32-bit indexes to 64-bit
	 * (for later pointer arithmetic), i.e:
	 * for (i = 0; i != 16; i++)
	 *	p[i / 8].u64[i % 8] = (uint64_t)t.u32[i];
	 */
	p[0] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[0].z, t);
	p[1] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[1].z, t);
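
	/*
	 * zmm_pmidx_msk (0x5555) writes only the even 32-bit lanes, so
	 * the odd (upper) half of every 64-bit slot stays zero; this is
	 * what makes the expansion above unsigned.
	 */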

	p[0] = _mm512_add_epi64(p[0], pdata[0]);
	p[1] = _mm512_add_epi64(p[1], pdata[1]);

	/* load input byte(s), either one or four */

	m[0] = msk & ZMM_PTR_MSK;
	m[1] = msk >> ZMM_PTR_NUM;
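
	/*
	 * one byte per flow: no byte-granular gather in AVX512, so use
	 * the scalar helper above; four bytes: i64gather with a NULL
	 * base, i.e. p[] is treated as absolute addresses.
	 */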
	if (bnum == sizeof(uint8_t)) {
		inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
		inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
	} else {
		inp[0] = _mm512_mask_i64gather_epi32(
			_mm512_castsi512_si256(zero), m[0], p[0],
			NULL, sizeof(uint8_t));
		inp[1] = _mm512_mask_i64gather_epi32(
			_mm512_castsi512_si256(zero), m[1], p[1],
			NULL, sizeof(uint8_t));
	}

	/* squeeze input into one 512-bit register */
	return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
		zmm_pminp.z, _mm512_castsi256_si512(inp[1]));
}

/*
 * Start up to 16 new flows.
 * num - number of flows to start
 * msk - mask of new flows.
 * pdata - pointers to flow input data
 * idx - match indexes for given flows
 * di - data indexes for these flows.
 */
static inline void
start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
	__m512i pdata[2], __m512i *idx, __m512i *di)
{
	uint32_t n, m[2], nm[2];
	__m512i ni, nd[2];

	/* split mask into two - one for each pdata[] */
	m[0] = msk & ZMM_PTR_MSK;
	m[1] = msk >> ZMM_PTR_NUM;

	/* calculate masks for new flows */
	n = __builtin_popcount(m[0]);
	nm[0] = (1 << n) - 1;
	nm[1] = (1 << (num - n)) - 1;
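
	/*
	 * e.g. msk = 0x0305 and num = 4: m[0] = 0x05 holds two new flows,
	 * so nm[0] = 0x3 and nm[1] = 0x3; each half then expand-loads its
	 * new flows from consecutive idata[] entries.
	 */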

	/* load input data pointers for new flows */
	nd[0] = _mm512_maskz_loadu_epi64(nm[0],
		flow->idata + flow->num_packets);
	nd[1] = _mm512_maskz_loadu_epi64(nm[1],
		flow->idata + flow->num_packets + n);

	/* calculate match indexes of new flows */
	ni = _mm512_set1_epi32(flow->num_packets);
	ni = _mm512_add_epi32(ni, zmm_idx_add.z);

	/* merge new and existing flows data */
	pdata[0] = _mm512_mask_expand_epi64(pdata[0], m[0], nd[0]);
	pdata[1] = _mm512_mask_expand_epi64(pdata[1], m[1], nd[1]);

	/* update match and data indexes */
	*idx = _mm512_mask_expand_epi32(*idx, msk, ni);
	*di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di);

	flow->num_packets += num;
}

/*
 * Process found matches for up to 16 flows.
 * fmsk - mask of active flows
 * rmsk - mask of found matches
 * pdata - pointers to flow input data
 * di - data indexes for these flows
 * idx - match indexes for given flows
 * tr_lo contains low 32 bits for up to 16 transitions.
 * tr_hi contains high 32 bits for up to 16 transitions.
 */
static inline uint32_t
match_process_avx512x16(struct acl_flow_avx512 *flow, uint32_t *fmsk,
	uint32_t *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx,
	__m512i *tr_lo, __m512i *tr_hi)
{
	uint32_t n;
	__m512i res;

	if (rmsk[0] == 0)
		return 0;

	/* extract match indexes */
	res = _mm512_and_si512(tr_lo[0], zmm_index_mask.z);

	/* mask matched transitions to nop */
	tr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z);
	tr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z);

	/* save found match indexes */
	_mm512_mask_i32scatter_epi32(flow->matches, rmsk[0],
		idx[0], res, sizeof(flow->matches[0]));

	/* update masks and start new flows for matches */
	n = update_flow_mask(flow, fmsk, rmsk);
	start_flow16(flow, n, rmsk[0], pdata, idx, di);

	return n;
}

/*
 * Test for matches for up to 32 (2x16) flows at once;
 * if any matches exist, process them and start new flows.
 */
static inline void
match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
	__m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2],
	__m512i tr_lo[2], __m512i tr_hi[2])
{
	uint32_t n[2];
	uint32_t rm[2];

	/* check for matches */
	rm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z);
	rm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z);

	/* till unprocessed matches exist */
	while ((rm[0] | rm[1]) != 0) {

		/* process matches and start new flows */
		n[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0],
			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
		n[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2],
			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);

		/* execute first transition for new flows, if any */

		if (n[0] != 0) {
			inp[0] = get_next_bytes_avx512x16(flow, &pdata[0],
				rm[0], &di[0], flow->first_load_sz);
			first_trans16(flow, inp[0], rm[0], &tr_lo[0],
				&tr_hi[0]);
			rm[0] = _mm512_test_epi32_mask(tr_lo[0],
				zmm_match_mask.z);
		}

		if (n[1] != 0) {
			inp[1] = get_next_bytes_avx512x16(flow, &pdata[2],
				rm[1], &di[1], flow->first_load_sz);
			first_trans16(flow, inp[1], rm[1], &tr_lo[1],
				&tr_hi[1]);
			rm[1] = _mm512_test_epi32_mask(tr_lo[1],
				zmm_match_mask.z);
		}
	}
}

/*
 * Perform the search for up to 32 flows in parallel.
 * Use two sets of metadata, each serving up to 16 flows,
 * so in fact we perform the search for 2x16 flows.
 */
static inline void
search_trie_avx512x16x2(struct acl_flow_avx512 *flow)
{
	uint32_t fm[2];
	__m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];

	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]);
	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);

	in[0] = get_next_bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0],
		flow->first_load_sz);
	in[1] = get_next_bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1],
		flow->first_load_sz);

	first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);
	first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);

	fm[0] = UINT16_MAX;
	fm[1] = UINT16_MAX;

	/* match check for the first transition */
	match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
		tr_lo, tr_hi);

	while ((fm[0] | fm[1]) != 0) {

		/* load next 4B of input data */
		in[0] = get_next_bytes_avx512x16(flow, &pdata[0], fm[0],
			&di[0], sizeof(uint32_t));
		in[1] = get_next_bytes_avx512x16(flow, &pdata[2], fm[1],
			&di[1], sizeof(uint32_t));
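
		/* main 4B loop: each transition16() consumes one byte per flow */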
		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		/* check for matches */
		match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
			tr_lo, tr_hi);
	}
}

/*
 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
 */
static inline void
resolve_mcgt8_avx512x1(uint32_t result[],
	const struct rte_acl_match_results pr[], const uint32_t match[],
	uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
{
	const int32_t *pri;
	const uint32_t *pm, *res;
	uint32_t i, k, mi;
	__mmask16 cm, sm;
	__m512i cp, cr, np, nr;

	const uint32_t match_log = 5;
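
	/*
	 * Each rte_acl_match_results entry is 2^(match_log + 2) bytes,
	 * i.e. 2^match_log 32-bit words, so (match << match_log) turns
	 * a match index into a word offset into res[]/pri[].
	 */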

	res = pr->results;
	pri = pr->priority;

	cm = (1 << nb_cat) - 1;

	for (k = 0; k != nb_pkt; k++, result += nb_cat) {

		mi = match[k] << match_log;

		cr = _mm512_maskz_loadu_epi32(cm, res + mi);
		cp = _mm512_maskz_loadu_epi32(cm, pri + mi);

		for (i = 1, pm = match + nb_pkt; i != nb_trie;
				i++, pm += nb_pkt) {

			mi = pm[k] << match_log;

			nr = _mm512_maskz_loadu_epi32(cm, res + mi);
			np = _mm512_maskz_loadu_epi32(cm, pri + mi);

			sm = _mm512_cmpgt_epi32_mask(cp, np);
			cr = _mm512_mask_mov_epi32(nr, sm, cr);
			cp = _mm512_mask_mov_epi32(np, sm, cp);
		}

		_mm512_mask_storeu_epi32(result, cm, cr);
	}
}

/*
 * Resolve match index to the actual result/priority offset.
 */
static inline __m512i
resolve_match_idx_avx512x16(__m512i mi)
{
	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
		1 << (match_log + 2));
	return _mm512_slli_epi32(mi, match_log);
}

/*
 * Resolve multiple matches for the same flow based on priority.
 */
static inline __m512i
resolve_pri_avx512x16(const int32_t res[], const int32_t pri[],
	const uint32_t match[], __mmask16 msk, uint32_t nb_trie,
	uint32_t nb_skip)
{
	uint32_t i;
	const uint32_t *pm;
	__mmask16 m;
	__m512i cp, cr, np, nr, mch;

	const __m512i zero = _mm512_set1_epi32(0);

	/* get match indexes */
	mch = _mm512_maskz_loadu_epi32(msk, match);
	mch = resolve_match_idx_avx512x16(mch);

	/* read result and priority values for first trie */
	cr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
	cp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));

	/*
	 * read result and priority values for next tries and select one
	 * with the highest priority.
	 */
	for (i = 1, pm = match + nb_skip; i != nb_trie;
			i++, pm += nb_skip) {

		mch = _mm512_maskz_loadu_epi32(msk, pm);
		mch = resolve_match_idx_avx512x16(mch);

		nr = _mm512_mask_i32gather_epi32(zero, msk, mch, res,
			sizeof(res[0]));
		np = _mm512_mask_i32gather_epi32(zero, msk, mch, pri,
			sizeof(pri[0]));

		m = _mm512_cmpgt_epi32_mask(cp, np);
		cr = _mm512_mask_mov_epi32(nr, m, cr);
		cp = _mm512_mask_mov_epi32(np, m, cp);
	}

	return cr;
}

/*
 * Resolve num (<= 16) matches for single category
 */
static inline void
resolve_sc_avx512x16(uint32_t result[], const int32_t res[],
	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
	uint32_t nb_trie, uint32_t nb_skip)
{
	__mmask16 msk;
	__m512i cr;

	msk = (1 << nb_pkt) - 1;
	cr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip);
	_mm512_mask_storeu_epi32(result, msk, cr);
}

/*
 * Resolve matches for single category
 */
static inline void
resolve_sc_avx512x16x2(uint32_t result[],
	const struct rte_acl_match_results pr[], const uint32_t match[],
	uint32_t nb_pkt, uint32_t nb_trie)
{
	uint32_t j, k, n;
	const int32_t *res, *pri;
	__m512i cr[2];

	res = (const int32_t *)pr->results;
	pri = pr->priority;

	for (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) {

		j = k + MASK16_BIT;

		cr[0] = resolve_pri_avx512x16(res, pri, match + k, UINT16_MAX,
			nb_trie, nb_pkt);
		cr[1] = resolve_pri_avx512x16(res, pri, match + j, UINT16_MAX,
			nb_trie, nb_pkt);

		_mm512_storeu_si512(result + k, cr[0]);
		_mm512_storeu_si512(result + j, cr[1]);
	}

	n = nb_pkt - k;
	if (n != 0) {
		if (n > MASK16_BIT) {
			resolve_sc_avx512x16(result + k, res, pri, match + k,
				MASK16_BIT, nb_trie, nb_pkt);
			k += MASK16_BIT;
			n -= MASK16_BIT;
		}
		resolve_sc_avx512x16(result + k, res, pri, match + k, n,
			nb_trie, nb_pkt);
	}
}

static inline int
search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	uint32_t i, *pm;
	const struct rte_acl_match_results *pr;
	struct acl_flow_avx512 flow;
	uint32_t match[ctx->num_tries * total_packets];
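
	/* one match index per (trie, packet) pair */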
	for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

		/* setup for next trie */
		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

		/* process the trie */
		search_trie_avx512x16x2(&flow);
	}

	/* resolve matches */
	pr = (const struct rte_acl_match_results *)
		(ctx->trans_table + ctx->match_index);

	if (categories == 1)
		resolve_sc_avx512x16x2(results, pr, match, total_packets,
			ctx->num_tries);
	else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
		resolve_mcle8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);
	else
		resolve_mcgt8_avx512x1(results, pr, match, total_packets,
			categories, ctx->num_tries);

	return 0;
}