lib/acl/acl_run_avx512_common.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2020 Intel Corporation
   3  */
   4
   5 /*
   6  * WARNING: It is not recommended to include this file directly.
   7  * Please include "acl_run_avx512x*.h" instead.
   8  * To make this file to generate proper code an includer has to
   9  * define several macros, refer to "acl_run_avx512x*.h" for more details.
  10  */
  11
  12 /*
  13  * Calculate the address of the next transition for
  14  * all types of nodes. Note that only DFA nodes and range
  15  * nodes actually transition to another node. Match
  16  * nodes not supposed to be encountered here.
  17  * For quad range nodes:
  18  * Calculate number of range boundaries that are less than the
  19  * input value. Range boundaries for each node are in signed 8 bit,
  20  * ordered from -128 to 127.
  21  * This is effectively a popcnt of bytes that are greater than the
  22  * input byte.
  23  * Single nodes are processed in the same ways as quad range nodes.
  24  */
  25 static __rte_always_inline _T_simd
  26 _F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
  27         _T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
  28 {
  29         __mmask64 qm;
  30         _T_mask dfa_msk;
  31         _T_simd addr, in, node_type, r, t;
  32         _T_simd dfa_ofs, quad_ofs;
  33
  34         t = _M_SI_(xor)(index_mask, index_mask);
  35         in = _M_I_(shuffle_epi8)(next_input, shuffle_input);
  36
  37         /* Calc node type and node addr */
  38         node_type = _M_SI_(andnot)(index_mask, tr_lo);
  39         addr = _M_SI_(and)(index_mask, tr_lo);
  40
  41         /* mask for DFA type(0) nodes */
  42         dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);
  43
  44         /* DFA calculations. */
  45         r = _M_I_(srli_epi32)(in, 30);
  46         r = _M_I_(add_epi8)(r, range_base);
  47         t = _M_I_(srli_epi32)(in, 24);
  48         r = _M_I_(shuffle_epi8)(tr_hi, r);
  49
  50         dfa_ofs = _M_I_(sub_epi32)(t, r);
  51
  52         /* QUAD/SINGLE calculations. */
  53         qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
  54         t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
  55         t = _M_I_(lzcnt_epi32)(t);
  56         t = _M_I_(srli_epi32)(t, 3);
  57         quad_ofs = _M_I_(sub_epi32)(four_32, t);
  58
  59         /* blend DFA and QUAD/SINGLE. */
  60         t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);
  61
  62         /* calculate address for next transitions. */
  63         addr = _M_I_(add_epi32)(addr, t);
  64         return addr;
  65 }
  66
  67 /*
  68  * Process _N_ transitions in parallel.
  69  * tr_lo contains low 32 bits for _N_ transition.
  70  * tr_hi contains high 32 bits for _N_ transition.
  71  * next_input contains up to 4 input bytes for _N_ flows.
  72  */
  73 static __rte_always_inline _T_simd
  74 _F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
  75         _T_simd *tr_hi)
  76 {
  77         const int32_t *tr;
  78         _T_simd addr;
  79
  80         tr = (const int32_t *)(uintptr_t)trans;
  81
  82         /* Calculate the address (array index) for all _N_ transitions. */
  83         addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
  84                 _SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);
  85
  86         /* load lower 32 bits of _N_ transactions at once. */
  87         *tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));
  88
  89         next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);
  90
  91         /* load high 32 bits of _N_ transactions at once. */
  92         *tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));
  93
  94         return next_input;
  95 }
  96
  97 /*
  98  * Execute first transition for up to _N_ flows in parallel.
  99  * next_input should contain one input byte for up to _N_ flows.
 100  * msk - mask of active flows.
 101  * tr_lo contains low 32 bits for up to _N_ transitions.
 102  * tr_hi contains high 32 bits for up to _N_ transitions.
 103  */
 104 static __rte_always_inline void
 105 _F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
 106         _T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
 107 {
 108         const int32_t *tr;
 109         _T_simd addr, root;
 110
 111         tr = (const int32_t *)(uintptr_t)flow->trans;
 112
 113         addr = _M_I_(set1_epi32)(UINT8_MAX);
 114         root = _M_I_(set1_epi32)(flow->root_index);
 115
 116         addr = _M_SI_(and)(next_input, addr);
 117         addr = _M_I_(add_epi32)(root, addr);
 118
 119         /* load lower 32 bits of _N_ transactions at once. */
 120         *tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
 121                 sizeof(flow->trans[0]));
 122
 123         /* load high 32 bits of _N_ transactions at once. */
 124         *tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
 125                 sizeof(flow->trans[0]));
 126 }
 127
 128 /*
 129  * Load and return next 4 input bytes for up to _N_ flows in parallel.
 130  * pdata - 8x2 pointers to flow input data
 131  * mask - mask of active flows.
 132  * di - data indexes for these _N_ flows.
 133  */
 134 static inline _T_simd
 135 _F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
 136         uint32_t msk, _T_simd *di, uint32_t bnum)
 137 {
 138         const int32_t *div;
 139         uint32_t m[2];
 140         _T_simd one, zero, t, p[2];
 141
 142         div = (const int32_t *)flow->data_index;
 143
 144         one = _M_I_(set1_epi32)(1);
 145         zero = _M_SI_(xor)(one, one);
 146
 147         /* load data offsets for given indexes */
 148         t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));
 149
 150         /* increment data indexes */
 151         *di = _M_I_(mask_add_epi32)(*di, msk, *di, one);
 152
 153         /*
 154          * unsigned expand 32-bit indexes to 64-bit
 155          * (for later pointer arithmetic), i.e:
 156          * for (i = 0; i != _N_; i++)
 157          *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
 158          */
 159         p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
 160                         t);
 161         p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
 162                         t);
 163
 164         p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
 165         p[1] = _M_I_(add_epi64)(p[1], pdata[1]);
 166
 167         /* load input byte(s), either one or four */
 168
 169         m[0] = msk & _SIMD_PTR_MSK_;
 170         m[1] = msk >> _SIMD_PTR_NUM_;
 171
 172         return _F_(gather_bytes)(zero, p, m, bnum);
 173 }
 174
 175 /*
 176  * Start up to _N_ new flows.
 177  * num - number of flows to start
 178  * msk - mask of new flows.
 179  * pdata - pointers to flow input data
 180  * idx - match indexed for given flows
 181  * di - data indexes for these flows.
 182  */
 183 static inline void
 184 _F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
 185         _T_simd pdata[2], _T_simd *idx, _T_simd *di)
 186 {
 187         uint32_t n, m[2], nm[2];
 188         _T_simd ni, nd[2];
 189
 190         /* split mask into two - one for each pdata[] */
 191         m[0] = msk & _SIMD_PTR_MSK_;
 192         m[1] = msk >> _SIMD_PTR_NUM_;
 193
 194         /* calculate masks for new flows */
 195         n = __builtin_popcount(m[0]);
 196         nm[0] = (1 << n) - 1;
 197         nm[1] = (1 << (num - n)) - 1;
 198
 199         /* load input data pointers for new flows */
 200         nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
 201                         flow->idata + flow->num_packets);
 202         nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
 203                         flow->idata + flow->num_packets + n);
 204
 205         /* calculate match indexes of new flows */
 206         ni = _M_I_(set1_epi32)(flow->num_packets);
 207         ni = _M_I_(add_epi32)(ni, _SV_(idx_add));
 208
 209         /* merge new and existing flows data */
 210         pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
 211         pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);
 212
 213         /* update match and data indexes */
 214         *idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
 215         *di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);
 216
 217         flow->num_packets += num;
 218 }
 219
 220 /*
 221  * Process found matches for up to _N_ flows.
 222  * fmsk - mask of active flows
 223  * rmsk - mask of found matches
 224  * pdata - pointers to flow input data
 225  * di - data indexes for these flows
 226  * idx - match indexed for given flows
 227  * tr_lo contains low 32 bits for up to _N_ transitions.
 228  * tr_hi contains high 32 bits for up to _N_ transitions.
 229  */
 230 static inline uint32_t
 231 _F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
 232         uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
 233         _T_simd *tr_lo, _T_simd *tr_hi)
 234 {
 235         uint32_t n;
 236         _T_simd res;
 237
 238         if (rmsk[0] == 0)
 239                 return 0;
 240
 241         /* extract match indexes */
 242         res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));
 243
 244         /* mask  matched transitions to nop */
 245         tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
 246         tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));
 247
 248         /* save found match indexes */
 249         _M_I_(mask_i32scatter_epi32)((void *)flow->matches, rmsk[0], idx[0],
 250                         res, sizeof(flow->matches[0]));
 251
 252         /* update masks and start new flows for matches */
 253         n = update_flow_mask(flow, fmsk, rmsk);
 254         _F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);
 255
 256         return n;
 257 }
 258
 259 /*
 260  * Test for matches ut to (2 * _N_) flows at once,
 261  * if matches exist - process them and start new flows.
 262  */
 263 static inline void
 264 _F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
 265         _T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
 266         _T_simd tr_lo[2], _T_simd tr_hi[2])
 267 {
 268         uint32_t n[2];
 269         uint32_t rm[2];
 270
 271         /* check for matches */
 272         rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
 273         rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
 274
 275         /* till unprocessed matches exist */
 276         while ((rm[0] | rm[1]) != 0) {
 277
 278                 /* process matches and start new flows */
 279                 n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
 280                         &di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
 281                 n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
 282                         &di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
 283
 284                 /* execute first transition for new flows, if any */
 285
 286                 if (n[0] != 0) {
 287                         inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
 288                                         rm[0], &di[0], flow->first_load_sz);
 289                         _F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
 290                                         &tr_hi[0]);
 291                         rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
 292                                         _SV_(match_mask));
 293                 }
 294
 295                 if (n[1] != 0) {
 296                         inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
 297                                         rm[1], &di[1], flow->first_load_sz);
 298                         _F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
 299                                         &tr_hi[1]);
 300                         rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
 301                                         _SV_(match_mask));
 302                 }
 303         }
 304 }
 305
 306 static inline void
 307 _F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4],
 308         _T_simd tr_lo[2], _T_simd tr_hi[2])
 309 {
 310         di[0] = _M_SI_(setzero)();
 311         di[1] = _M_SI_(setzero)();
 312
 313         idx[0] = _M_SI_(setzero)();
 314         idx[1] = _M_SI_(setzero)();
 315
 316         pdata[0] = _M_SI_(setzero)();
 317         pdata[1] = _M_SI_(setzero)();
 318         pdata[2] = _M_SI_(setzero)();
 319         pdata[3] = _M_SI_(setzero)();
 320
 321         tr_lo[0] = _M_SI_(setzero)();
 322         tr_lo[1] = _M_SI_(setzero)();
 323
 324         tr_hi[0] = _M_SI_(setzero)();
 325         tr_hi[1] = _M_SI_(setzero)();
 326 }
 327
 328 /*
 329  * Perform search for up to (2 * _N_) flows in parallel.
 330  * Use two sets of metadata, each serves _N_ flows max.
 331  */
 332 static inline void
 333 _F_(search_trie)(struct acl_flow_avx512 *flow)
 334 {
 335         uint32_t fm[2];
 336         _T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
 337
 338         _F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi);
 339
 340         /* first 1B load */
 341         _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
 342                         &pdata[0], &idx[0], &di[0]);
 343         _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
 344                         &pdata[2], &idx[1], &di[1]);
 345
 346         in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
 347                         flow->first_load_sz);
 348         in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
 349                         flow->first_load_sz);
 350
 351         _F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
 352         _F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);
 353
 354         fm[0] = _SIMD_MASK_MAX_;
 355         fm[1] = _SIMD_MASK_MAX_;
 356
 357         /* match check */
 358         _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
 359
 360         while ((fm[0] | fm[1]) != 0) {
 361
 362                 /* load next 4B */
 363
 364                 in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
 365                                 &di[0], sizeof(uint32_t));
 366                 in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
 367                                 &di[1], sizeof(uint32_t));
 368
 369                 /* main 4B loop */
 370
 371                 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 372                 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 373
 374                 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 375                 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 376
 377                 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 378                 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 379
 380                 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 381                 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 382
 383                 /* check for matches */
 384                 _F_(match_check_process)(flow, fm, pdata, di, idx, in,
 385                         tr_lo, tr_hi);
 386         }
 387 }
 388
 389 /*
 390  * resolve match index to actual result/priority offset.
 391  */
 392 static inline _T_simd
 393 _F_(resolve_match_idx)(_T_simd mi)
 394 {
 395         RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
 396                 1 << (ACL_MATCH_LOG + 2));
 397         return _M_I_(slli_epi32)(mi, ACL_MATCH_LOG);
 398 }
 399
 400 /*
 401  * Resolve multiple matches for the same flow based on priority.
 402  */
 403 static inline _T_simd
 404 _F_(resolve_pri)(const int32_t res[], const int32_t pri[],
 405         const uint32_t match[], _T_mask msk, uint32_t nb_trie,
 406         uint32_t nb_skip)
 407 {
 408         uint32_t i;
 409         const uint32_t *pm;
 410         _T_mask m;
 411         _T_simd cp, cr, np, nr, mch;
 412
 413         const _T_simd zero = _M_I_(set1_epi32)(0);
 414
 415         /* get match indexes */
 416         mch = _M_I_(maskz_loadu_epi32)(msk, match);
 417         mch = _F_(resolve_match_idx)(mch);
 418
 419         /* read result and priority values for first trie */
 420         cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
 421         cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));
 422
 423         /*
 424          * read result and priority values for next tries and select one
 425          * with highest priority.
 426          */
 427         for (i = 1, pm = match + nb_skip; i != nb_trie;
 428                         i++, pm += nb_skip) {
 429
 430                 mch = _M_I_(maskz_loadu_epi32)(msk, pm);
 431                 mch = _F_(resolve_match_idx)(mch);
 432
 433                 nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
 434                                 sizeof(res[0]));
 435                 np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
 436                                 sizeof(pri[0]));
 437
 438                 m = _M_I_(cmpgt_epi32_mask)(cp, np);
 439                 cr = _M_I_(mask_mov_epi32)(nr, m, cr);
 440                 cp = _M_I_(mask_mov_epi32)(np, m, cp);
 441         }
 442
 443         return cr;
 444 }
 445
 446 /*
 447  * Resolve num (<= _N_) matches for single category
 448  */
 449 static inline void
 450 _F_(resolve_sc)(uint32_t result[], const int32_t res[],
 451         const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
 452         uint32_t nb_trie, uint32_t nb_skip)
 453 {
 454         _T_mask msk;
 455         _T_simd cr;
 456
 457         msk = (1 << nb_pkt) - 1;
 458         cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
 459         _M_I_(mask_storeu_epi32)(result, msk, cr);
 460 }
 461
 462 /*
 463  * Resolve matches for single category
 464  */
 465 static inline void
 466 _F_(resolve_single_cat)(uint32_t result[],
 467         const struct rte_acl_match_results pr[], const uint32_t match[],
 468         uint32_t nb_pkt, uint32_t nb_trie)
 469 {
 470         uint32_t j, k, n;
 471         const int32_t *res, *pri;
 472         _T_simd cr[2];
 473
 474         res = (const int32_t *)pr->results;
 475         pri = pr->priority;
 476
 477         for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {
 478
 479                 j = k + _SIMD_MASK_BIT_;
 480
 481                 cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
 482                                 nb_trie, nb_pkt);
 483                 cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
 484                                 nb_trie, nb_pkt);
 485
 486                 _M_SI_(storeu)((void *)(result + k), cr[0]);
 487                 _M_SI_(storeu)((void *)(result + j), cr[1]);
 488         }
 489
 490         n = nb_pkt - k;
 491         if (n != 0) {
 492                 if (n > _SIMD_MASK_BIT_) {
 493                         _F_(resolve_sc)(result + k, res, pri, match + k,
 494                                 _SIMD_MASK_BIT_, nb_trie, nb_pkt);
 495                         k += _SIMD_MASK_BIT_;
 496                         n -= _SIMD_MASK_BIT_;
 497                 }
 498                 _F_(resolve_sc)(result + k, res, pri, match + k, n,
 499                                 nb_trie, nb_pkt);
 500         }
 501 }