lib/librte_acl/acl_run_avx512x16.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2020 Intel Corporation
   3  */
   4
/* number of bits held by one 16-bit SIMD mask register (__mmask16) */
#define MASK16_BIT      (sizeof(__mmask16) * CHAR_BIT)

/* twice MASK16_BIT, and the corresponding (value - 1) bit mask */
#define NUM_AVX512X16X2 (2 * MASK16_BIT)
#define MSK_AVX512X16X2 (NUM_AVX512X16X2 - 1)

/* num/mask of pointers per SIMD regs */
#define ZMM_PTR_NUM     (sizeof(__m512i) / sizeof(uintptr_t))
#define ZMM_PTR_MSK     RTE_LEN2MASK(ZMM_PTR_NUM, uint32_t)
  13
  14 static const __rte_x86_zmm_t zmm_match_mask = {
  15         .u32 = {
  16                 RTE_ACL_NODE_MATCH,
  17                 RTE_ACL_NODE_MATCH,
  18                 RTE_ACL_NODE_MATCH,
  19                 RTE_ACL_NODE_MATCH,
  20                 RTE_ACL_NODE_MATCH,
  21                 RTE_ACL_NODE_MATCH,
  22                 RTE_ACL_NODE_MATCH,
  23                 RTE_ACL_NODE_MATCH,
  24                 RTE_ACL_NODE_MATCH,
  25                 RTE_ACL_NODE_MATCH,
  26                 RTE_ACL_NODE_MATCH,
  27                 RTE_ACL_NODE_MATCH,
  28                 RTE_ACL_NODE_MATCH,
  29                 RTE_ACL_NODE_MATCH,
  30                 RTE_ACL_NODE_MATCH,
  31                 RTE_ACL_NODE_MATCH,
  32         },
  33 };
  34
  35 static const __rte_x86_zmm_t zmm_index_mask = {
  36         .u32 = {
  37                 RTE_ACL_NODE_INDEX,
  38                 RTE_ACL_NODE_INDEX,
  39                 RTE_ACL_NODE_INDEX,
  40                 RTE_ACL_NODE_INDEX,
  41                 RTE_ACL_NODE_INDEX,
  42                 RTE_ACL_NODE_INDEX,
  43                 RTE_ACL_NODE_INDEX,
  44                 RTE_ACL_NODE_INDEX,
  45                 RTE_ACL_NODE_INDEX,
  46                 RTE_ACL_NODE_INDEX,
  47                 RTE_ACL_NODE_INDEX,
  48                 RTE_ACL_NODE_INDEX,
  49                 RTE_ACL_NODE_INDEX,
  50                 RTE_ACL_NODE_INDEX,
  51                 RTE_ACL_NODE_INDEX,
  52                 RTE_ACL_NODE_INDEX,
  53         },
  54 };
  55
  56 static const __rte_x86_zmm_t zmm_trlo_idle = {
  57         .u32 = {
  58                 RTE_ACL_IDLE_NODE,
  59                 RTE_ACL_IDLE_NODE,
  60                 RTE_ACL_IDLE_NODE,
  61                 RTE_ACL_IDLE_NODE,
  62                 RTE_ACL_IDLE_NODE,
  63                 RTE_ACL_IDLE_NODE,
  64                 RTE_ACL_IDLE_NODE,
  65                 RTE_ACL_IDLE_NODE,
  66                 RTE_ACL_IDLE_NODE,
  67                 RTE_ACL_IDLE_NODE,
  68                 RTE_ACL_IDLE_NODE,
  69                 RTE_ACL_IDLE_NODE,
  70                 RTE_ACL_IDLE_NODE,
  71                 RTE_ACL_IDLE_NODE,
  72                 RTE_ACL_IDLE_NODE,
  73                 RTE_ACL_IDLE_NODE,
  74         },
  75 };
  76
  77 static const __rte_x86_zmm_t zmm_trhi_idle = {
  78         .u32 = {
  79                 0, 0, 0, 0,
  80                 0, 0, 0, 0,
  81                 0, 0, 0, 0,
  82                 0, 0, 0, 0,
  83         },
  84 };
  85
  86 static const __rte_x86_zmm_t zmm_shuffle_input = {
  87         .u32 = {
  88                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  89                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  90                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  91                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  92         },
  93 };
  94
  95 static const __rte_x86_zmm_t zmm_four_32 = {
  96         .u32 = {
  97                 4, 4, 4, 4,
  98                 4, 4, 4, 4,
  99                 4, 4, 4, 4,
 100                 4, 4, 4, 4,
 101         },
 102 };
 103
 104 static const __rte_x86_zmm_t zmm_idx_add = {
 105         .u32 = {
 106                 0, 1, 2, 3,
 107                 4, 5, 6, 7,
 108                 8, 9, 10, 11,
 109                 12, 13, 14, 15,
 110         },
 111 };
 112
 113 static const __rte_x86_zmm_t zmm_range_base = {
 114         .u32 = {
 115                 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 116                 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 117                 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 118                 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 119         },
 120 };
 121
 122 static const __rte_x86_zmm_t zmm_pminp = {
 123         .u32 = {
 124                 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 125                 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
 126         },
 127 };
 128
 129 static const __mmask16 zmm_pmidx_msk = 0x5555;
 130
 131 static const __rte_x86_zmm_t zmm_pmidx[2] = {
 132         [0] = {
 133                 .u32 = {
 134                         0, 0, 1, 0, 2, 0, 3, 0,
 135                         4, 0, 5, 0, 6, 0, 7, 0,
 136                 },
 137         },
 138         [1] = {
 139                 .u32 = {
 140                         8, 0, 9, 0, 10, 0, 11, 0,
 141                         12, 0, 13, 0, 14, 0, 15, 0,
 142                 },
 143         },
 144 };
 145
 146 /*
 147  * unfortunately current AVX512 ISA doesn't provide ability for
 148  * gather load on a byte quantity. So we have to mimic it in SW,
 149  * by doing 8x1B scalar loads.
 150  */
 151 static inline ymm_t
 152 _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 153 {
 154         rte_ymm_t v;
 155         __rte_x86_zmm_t p;
 156
 157         static const uint32_t zero;
 158
 159         p.z = _mm512_mask_set1_epi64(pdata, mask ^ ZMM_PTR_MSK,
 160                 (uintptr_t)&zero);
 161
 162         v.u32[0] = *(uint8_t *)p.u64[0];
 163         v.u32[1] = *(uint8_t *)p.u64[1];
 164         v.u32[2] = *(uint8_t *)p.u64[2];
 165         v.u32[3] = *(uint8_t *)p.u64[3];
 166         v.u32[4] = *(uint8_t *)p.u64[4];
 167         v.u32[5] = *(uint8_t *)p.u64[5];
 168         v.u32[6] = *(uint8_t *)p.u64[6];
 169         v.u32[7] = *(uint8_t *)p.u64[7];
 170
 171         return v.y;
 172 }
 173
 174 /*
 175  * Calculate the address of the next transition for
 176  * all types of nodes. Note that only DFA nodes and range
 177  * nodes actually transition to another node. Match
 178  * nodes not supposed to be encountered here.
 179  * For quad range nodes:
 180  * Calculate number of range boundaries that are less than the
 181  * input value. Range boundaries for each node are in signed 8 bit,
 182  * ordered from -128 to 127.
 183  * This is effectively a popcnt of bytes that are greater than the
 184  * input byte.
 185  * Single nodes are processed in the same ways as quad range nodes.
 186  */
 187 static __rte_always_inline __m512i
 188 calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input,
 189         __m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi)
 190 {
 191         __mmask64 qm;
 192         __mmask16 dfa_msk;
 193         __m512i addr, in, node_type, r, t;
 194         __m512i dfa_ofs, quad_ofs;
 195
 196         t = _mm512_xor_si512(index_mask, index_mask);
 197         in = _mm512_shuffle_epi8(next_input, shuffle_input);
 198
 199         /* Calc node type and node addr */
 200         node_type = _mm512_andnot_si512(index_mask, tr_lo);
 201         addr = _mm512_and_si512(index_mask, tr_lo);
 202
 203         /* mask for DFA type(0) nodes */
 204         dfa_msk = _mm512_cmpeq_epi32_mask(node_type, t);
 205
 206         /* DFA calculations. */
 207         r = _mm512_srli_epi32(in, 30);
 208         r = _mm512_add_epi8(r, range_base);
 209         t = _mm512_srli_epi32(in, 24);
 210         r = _mm512_shuffle_epi8(tr_hi, r);
 211
 212         dfa_ofs = _mm512_sub_epi32(t, r);
 213
 214         /* QUAD/SINGLE calculations. */
 215         qm = _mm512_cmpgt_epi8_mask(in, tr_hi);
 216         t = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
 217         t = _mm512_lzcnt_epi32(t);
 218         t = _mm512_srli_epi32(t, 3);
 219         quad_ofs = _mm512_sub_epi32(four_32, t);
 220
 221         /* blend DFA and QUAD/SINGLE. */
 222         t = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);
 223
 224         /* calculate address for next transitions. */
 225         addr = _mm512_add_epi32(addr, t);
 226         return addr;
 227 }
 228
 229 /*
 230  * Process 16 transitions in parallel.
 231  * tr_lo contains low 32 bits for 16 transition.
 232  * tr_hi contains high 32 bits for 16 transition.
 233  * next_input contains up to 4 input bytes for 16 flows.
 234  */
 235 static __rte_always_inline __m512i
 236 transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo,
 237         __m512i *tr_hi)
 238 {
 239         const int32_t *tr;
 240         __m512i addr;
 241
 242         tr = (const int32_t *)(uintptr_t)trans;
 243
 244         /* Calculate the address (array index) for all 16 transitions. */
 245         addr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z,
 246                 zmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi);
 247
 248         /* load lower 32 bits of 16 transactions at once. */
 249         *tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0]));
 250
 251         next_input = _mm512_srli_epi32(next_input, CHAR_BIT);
 252
 253         /* load high 32 bits of 16 transactions at once. */
 254         *tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0]));
 255
 256         return next_input;
 257 }
 258
 259 /*
 260  * Execute first transition for up to 16 flows in parallel.
 261  * next_input should contain one input byte for up to 16 flows.
 262  * msk - mask of active flows.
 263  * tr_lo contains low 32 bits for up to 16 transitions.
 264  * tr_hi contains high 32 bits for up to 16 transitions.
 265  */
 266 static __rte_always_inline void
 267 first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input,
 268         __mmask16 msk, __m512i *tr_lo, __m512i *tr_hi)
 269 {
 270         const int32_t *tr;
 271         __m512i addr, root;
 272
 273         tr = (const int32_t *)(uintptr_t)flow->trans;
 274
 275         addr = _mm512_set1_epi32(UINT8_MAX);
 276         root = _mm512_set1_epi32(flow->root_index);
 277
 278         addr = _mm512_and_si512(next_input, addr);
 279         addr = _mm512_add_epi32(root, addr);
 280
 281         /* load lower 32 bits of 16 transactions at once. */
 282         *tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr,
 283                 sizeof(flow->trans[0]));
 284
 285         /* load high 32 bits of 16 transactions at once. */
 286         *tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
 287                 sizeof(flow->trans[0]));
 288 }
 289
 290 /*
 291  * Load and return next 4 input bytes for up to 16 flows in parallel.
 292  * pdata - 8x2 pointers to flow input data
 293  * mask - mask of active flows.
 294  * di - data indexes for these 16 flows.
 295  */
 296 static inline __m512i
 297 get_next_bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2],
 298         uint32_t msk, __m512i *di, uint32_t bnum)
 299 {
 300         const int32_t *div;
 301         uint32_t m[2];
 302         __m512i one, zero, t, p[2];
 303         ymm_t inp[2];
 304
 305         div = (const int32_t *)flow->data_index;
 306
 307         one = _mm512_set1_epi32(1);
 308         zero = _mm512_xor_si512(one, one);
 309
 310         /* load data offsets for given indexes */
 311         t = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));
 312
 313         /* increment data indexes */
 314         *di = _mm512_mask_add_epi32(*di, msk, *di, one);
 315
 316         /*
 317          * unsigned expand 32-bit indexes to 64-bit
 318          * (for later pointer arithmetic), i.e:
 319          * for (i = 0; i != 16; i++)
 320          *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
 321          */
 322         p[0] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[0].z, t);
 323         p[1] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[1].z, t);
 324
 325         p[0] = _mm512_add_epi64(p[0], pdata[0]);
 326         p[1] = _mm512_add_epi64(p[1], pdata[1]);
 327
 328         /* load input byte(s), either one or four */
 329
 330         m[0] = msk & ZMM_PTR_MSK;
 331         m[1] = msk >> ZMM_PTR_NUM;
 332
 333         if (bnum == sizeof(uint8_t)) {
 334                 inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
 335                 inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
 336         } else {
 337                 inp[0] = _mm512_mask_i64gather_epi32(
 338                                 _mm512_castsi512_si256(zero), m[0], p[0],
 339                                 NULL, sizeof(uint8_t));
 340                 inp[1] = _mm512_mask_i64gather_epi32(
 341                                 _mm512_castsi512_si256(zero), m[1], p[1],
 342                                 NULL, sizeof(uint8_t));
 343         }
 344
 345         /* squeeze input into one 512-bit register */
 346         return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
 347                         zmm_pminp.z, _mm512_castsi256_si512(inp[1]));
 348 }
 349
 350 /*
 351  * Start up to 16 new flows.
 352  * num - number of flows to start
 353  * msk - mask of new flows.
 354  * pdata - pointers to flow input data
 355  * idx - match indexed for given flows
 356  * di - data indexes for these flows.
 357  */
 358 static inline void
 359 start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
 360         __m512i pdata[2], __m512i *idx, __m512i *di)
 361 {
 362         uint32_t n, m[2], nm[2];
 363         __m512i ni, nd[2];
 364
 365         /* split mask into two - one for each pdata[] */
 366         m[0] = msk & ZMM_PTR_MSK;
 367         m[1] = msk >> ZMM_PTR_NUM;
 368
 369         /* calculate masks for new flows */
 370         n = __builtin_popcount(m[0]);
 371         nm[0] = (1 << n) - 1;
 372         nm[1] = (1 << (num - n)) - 1;
 373
 374         /* load input data pointers for new flows */
 375         nd[0] = _mm512_maskz_loadu_epi64(nm[0],
 376                 flow->idata + flow->num_packets);
 377         nd[1] = _mm512_maskz_loadu_epi64(nm[1],
 378                 flow->idata + flow->num_packets + n);
 379
 380         /* calculate match indexes of new flows */
 381         ni = _mm512_set1_epi32(flow->num_packets);
 382         ni = _mm512_add_epi32(ni, zmm_idx_add.z);
 383
 384         /* merge new and existing flows data */
 385         pdata[0] = _mm512_mask_expand_epi64(pdata[0], m[0], nd[0]);
 386         pdata[1] = _mm512_mask_expand_epi64(pdata[1], m[1], nd[1]);
 387
 388         /* update match and data indexes */
 389         *idx = _mm512_mask_expand_epi32(*idx, msk, ni);
 390         *di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di);
 391
 392         flow->num_packets += num;
 393 }
 394
 395 /*
 396  * Process found matches for up to 16 flows.
 397  * fmsk - mask of active flows
 398  * rmsk - mask of found matches
 399  * pdata - pointers to flow input data
 400  * di - data indexes for these flows
 401  * idx - match indexed for given flows
 402  * tr_lo contains low 32 bits for up to 8 transitions.
 403  * tr_hi contains high 32 bits for up to 8 transitions.
 404  */
 405 static inline uint32_t
 406 match_process_avx512x16(struct acl_flow_avx512 *flow, uint32_t *fmsk,
 407         uint32_t *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx,
 408         __m512i *tr_lo, __m512i *tr_hi)
 409 {
 410         uint32_t n;
 411         __m512i res;
 412
 413         if (rmsk[0] == 0)
 414                 return 0;
 415
 416         /* extract match indexes */
 417         res = _mm512_and_si512(tr_lo[0], zmm_index_mask.z);
 418
 419         /* mask  matched transitions to nop */
 420         tr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z);
 421         tr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z);
 422
 423         /* save found match indexes */
 424         _mm512_mask_i32scatter_epi32(flow->matches, rmsk[0],
 425                 idx[0], res, sizeof(flow->matches[0]));
 426
 427         /* update masks and start new flows for matches */
 428         n = update_flow_mask(flow, fmsk, rmsk);
 429         start_flow16(flow, n, rmsk[0], pdata, idx, di);
 430
 431         return n;
 432 }
 433
 434 /*
 435  * Test for matches ut to 32 (2x16) flows at once,
 436  * if matches exist - process them and start new flows.
 437  */
 438 static inline void
 439 match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
 440         __m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2],
 441         __m512i tr_lo[2], __m512i tr_hi[2])
 442 {
 443         uint32_t n[2];
 444         uint32_t rm[2];
 445
 446         /* check for matches */
 447         rm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z);
 448         rm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z);
 449
 450         /* till unprocessed matches exist */
 451         while ((rm[0] | rm[1]) != 0) {
 452
 453                 /* process matches and start new flows */
 454                 n[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0],
 455                         &di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
 456                 n[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2],
 457                         &di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
 458
 459                 /* execute first transition for new flows, if any */
 460
 461                 if (n[0] != 0) {
 462                         inp[0] = get_next_bytes_avx512x16(flow, &pdata[0],
 463                                 rm[0], &di[0], flow->first_load_sz);
 464                         first_trans16(flow, inp[0], rm[0], &tr_lo[0],
 465                                 &tr_hi[0]);
 466                         rm[0] = _mm512_test_epi32_mask(tr_lo[0],
 467                                 zmm_match_mask.z);
 468                 }
 469
 470                 if (n[1] != 0) {
 471                         inp[1] = get_next_bytes_avx512x16(flow, &pdata[2],
 472                                 rm[1], &di[1], flow->first_load_sz);
 473                         first_trans16(flow, inp[1], rm[1], &tr_lo[1],
 474                                 &tr_hi[1]);
 475                         rm[1] = _mm512_test_epi32_mask(tr_lo[1],
 476                                 zmm_match_mask.z);
 477                 }
 478         }
 479 }
 480
 481 /*
 482  * Perform search for up to 32 flows in parallel.
 483  * Use two sets of metadata, each serves 16 flows max.
 484  * So in fact we perform search for 2x16 flows.
 485  */
 486 static inline void
 487 search_trie_avx512x16x2(struct acl_flow_avx512 *flow)
 488 {
 489         uint32_t fm[2];
 490         __m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
 491
 492         /* first 1B load */
 493         start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]);
 494         start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);
 495
 496         in[0] = get_next_bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0],
 497                         flow->first_load_sz);
 498         in[1] = get_next_bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1],
 499                         flow->first_load_sz);
 500
 501         first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);
 502         first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);
 503
 504         fm[0] = UINT16_MAX;
 505         fm[1] = UINT16_MAX;
 506
 507         /* match check */
 508         match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
 509                 tr_lo, tr_hi);
 510
 511         while ((fm[0] | fm[1]) != 0) {
 512
 513                 /* load next 4B */
 514
 515                 in[0] = get_next_bytes_avx512x16(flow, &pdata[0], fm[0],
 516                         &di[0], sizeof(uint32_t));
 517                 in[1] = get_next_bytes_avx512x16(flow, &pdata[2], fm[1],
 518                         &di[1], sizeof(uint32_t));
 519
 520                 /* main 4B loop */
 521
 522                 in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 523                 in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 524
 525                 in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 526                 in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 527
 528                 in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 529                 in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 530
 531                 in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 532                 in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 533
 534                 /* check for matches */
 535                 match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
 536                         tr_lo, tr_hi);
 537         }
 538 }
 539
 540 /*
 541  * Resolve matches for multiple categories (GT 8, use 512b instuctions/regs)
 542  */
 543 static inline void
 544 resolve_mcgt8_avx512x1(uint32_t result[],
 545         const struct rte_acl_match_results pr[], const uint32_t match[],
 546         uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
 547 {
 548         const int32_t *pri;
 549         const uint32_t *pm, *res;
 550         uint32_t i, k, mi;
 551         __mmask16 cm, sm;
 552         __m512i cp, cr, np, nr;
 553
 554         const uint32_t match_log = 5;
 555
 556         res = pr->results;
 557         pri = pr->priority;
 558
 559         cm = (1 << nb_cat) - 1;
 560
 561         for (k = 0; k != nb_pkt; k++, result += nb_cat) {
 562
 563                 mi = match[k] << match_log;
 564
 565                 cr = _mm512_maskz_loadu_epi32(cm, res + mi);
 566                 cp = _mm512_maskz_loadu_epi32(cm, pri + mi);
 567
 568                 for (i = 1, pm = match + nb_pkt; i != nb_trie;
 569                                 i++, pm += nb_pkt) {
 570
 571                         mi = pm[k] << match_log;
 572
 573                         nr = _mm512_maskz_loadu_epi32(cm, res + mi);
 574                         np = _mm512_maskz_loadu_epi32(cm, pri + mi);
 575
 576                         sm = _mm512_cmpgt_epi32_mask(cp, np);
 577                         cr = _mm512_mask_mov_epi32(nr, sm, cr);
 578                         cp = _mm512_mask_mov_epi32(np, sm, cp);
 579                 }
 580
 581                 _mm512_mask_storeu_epi32(result, cm, cr);
 582         }
 583 }
 584
 585 /*
 586  * resolve match index to actual result/priority offset.
 587  */
 588 static inline __m512i
 589 resolve_match_idx_avx512x16(__m512i mi)
 590 {
 591         RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
 592                 1 << (match_log + 2));
 593         return _mm512_slli_epi32(mi, match_log);
 594 }
 595
 596 /*
 597  * Resolve multiple matches for the same flow based on priority.
 598  */
 599 static inline __m512i
 600 resolve_pri_avx512x16(const int32_t res[], const int32_t pri[],
 601         const uint32_t match[], __mmask16 msk, uint32_t nb_trie,
 602         uint32_t nb_skip)
 603 {
 604         uint32_t i;
 605         const uint32_t *pm;
 606         __mmask16 m;
 607         __m512i cp, cr, np, nr, mch;
 608
 609         const __m512i zero = _mm512_set1_epi32(0);
 610
 611         /* get match indexes */
 612         mch = _mm512_maskz_loadu_epi32(msk, match);
 613         mch = resolve_match_idx_avx512x16(mch);
 614
 615         /* read result and priority values for first trie */
 616         cr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
 617         cp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));
 618
 619         /*
 620          * read result and priority values for next tries and select one
 621          * with highest priority.
 622          */
 623         for (i = 1, pm = match + nb_skip; i != nb_trie;
 624                         i++, pm += nb_skip) {
 625
 626                 mch = _mm512_maskz_loadu_epi32(msk, pm);
 627                 mch = resolve_match_idx_avx512x16(mch);
 628
 629                 nr = _mm512_mask_i32gather_epi32(zero, msk, mch, res,
 630                         sizeof(res[0]));
 631                 np = _mm512_mask_i32gather_epi32(zero, msk, mch, pri,
 632                         sizeof(pri[0]));
 633
 634                 m = _mm512_cmpgt_epi32_mask(cp, np);
 635                 cr = _mm512_mask_mov_epi32(nr, m, cr);
 636                 cp = _mm512_mask_mov_epi32(np, m, cp);
 637         }
 638
 639         return cr;
 640 }
 641
 642 /*
 643  * Resolve num (<= 16) matches for single category
 644  */
 645 static inline void
 646 resolve_sc_avx512x16(uint32_t result[], const int32_t res[],
 647         const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
 648         uint32_t nb_trie, uint32_t nb_skip)
 649 {
 650         __mmask16 msk;
 651         __m512i cr;
 652
 653         msk = (1 << nb_pkt) - 1;
 654         cr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip);
 655         _mm512_mask_storeu_epi32(result, msk, cr);
 656 }
 657
 658 /*
 659  * Resolve matches for single category
 660  */
 661 static inline void
 662 resolve_sc_avx512x16x2(uint32_t result[],
 663         const struct rte_acl_match_results pr[], const uint32_t match[],
 664         uint32_t nb_pkt, uint32_t nb_trie)
 665 {
 666         uint32_t j, k, n;
 667         const int32_t *res, *pri;
 668         __m512i cr[2];
 669
 670         res = (const int32_t *)pr->results;
 671         pri = pr->priority;
 672
 673         for (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) {
 674
 675                 j = k + MASK16_BIT;
 676
 677                 cr[0] = resolve_pri_avx512x16(res, pri, match + k, UINT16_MAX,
 678                                 nb_trie, nb_pkt);
 679                 cr[1] = resolve_pri_avx512x16(res, pri, match + j, UINT16_MAX,
 680                                 nb_trie, nb_pkt);
 681
 682                 _mm512_storeu_si512(result + k, cr[0]);
 683                 _mm512_storeu_si512(result + j, cr[1]);
 684         }
 685
 686         n = nb_pkt - k;
 687         if (n != 0) {
 688                 if (n > MASK16_BIT) {
 689                         resolve_sc_avx512x16(result + k, res, pri, match + k,
 690                                 MASK16_BIT, nb_trie, nb_pkt);
 691                         k += MASK16_BIT;
 692                         n -= MASK16_BIT;
 693                 }
 694                 resolve_sc_avx512x16(result + k, res, pri, match + k, n,
 695                                 nb_trie, nb_pkt);
 696         }
 697 }
 698
 699 static inline int
 700 search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 701         uint32_t *results, uint32_t total_packets, uint32_t categories)
 702 {
 703         uint32_t i, *pm;
 704         const struct rte_acl_match_results *pr;
 705         struct acl_flow_avx512 flow;
 706         uint32_t match[ctx->num_tries * total_packets];
 707
 708         for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
 709
 710                 /* setup for next trie */
 711                 acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
 712
 713                 /* process the trie */
 714                 search_trie_avx512x16x2(&flow);
 715         }
 716
 717         /* resolve matches */
 718         pr = (const struct rte_acl_match_results *)
 719                 (ctx->trans_table + ctx->match_index);
 720
 721         if (categories == 1)
 722                 resolve_sc_avx512x16x2(results, pr, match, total_packets,
 723                         ctx->num_tries);
 724         else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
 725                 resolve_mcle8_avx512x1(results, pr, match, total_packets,
 726                         categories, ctx->num_tries);
 727         else
 728                 resolve_mcgt8_avx512x1(results, pr, match, total_packets,
 729                         categories, ctx->num_tries);
 730
 731         return 0;
 732 }