1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2020 Intel Corporation
6 * Defines required by "acl_run_avx512_common.h".
7 * Note that all of them have to be undefined by the end
8 * of this file, as "acl_run_avx512_common.h" can be included several
9 * times from different *.h files for the same *.c.
13 * This implementation uses 512-bit registers (zmm) and intrinsics.
14 * So our main SIMD type is 512-bit width and each such variable can
15 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
17 #define _T_simd __m512i
18 #define _T_mask __mmask16
20 /* Naming convention for static const variables. */
21 #define _SC_(x) zmm_##x
22 #define _SV_(x) (zmm_##x.z)
24 /* Naming convention for internal functions. */
25 #define _F_(x) x##_avx512x16
28 * The same intrinsics have different syntax (depending on the bit-width),
29 * so to overcome that a few macros need to be defined.
32 /* Naming convention for generic epi (packed integers) type intrinsics. */
33 #define _M_I_(x) _mm512_##x
35 /* Naming convention for si (whole simd integer) type intrinsics. */
36 #define _M_SI_(x) _mm512_##x##_si512
38 /* Naming convention for masked gather type intrinsics. */
39 #define _M_MGI_(x) _mm512_##x
41 /* Naming convention for gather type intrinsics. */
42 #define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale)
44 /* num/mask of transitions per SIMD regs: 16 x uint32_t lanes per zmm */
45 #define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
46 #define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
/* number of flows processed in parallel: two 16-lane register sets */
48 #define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
49 #define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)
51 /* num/mask of pointers per SIMD regs: 8 x 64-bit pointers per zmm */
52 #define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
53 #define _SIMD_PTR_MSK_ RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
55 static const __rte_x86_zmm_t _SC_(match_mask) = {
76 static const __rte_x86_zmm_t _SC_(index_mask) = {
97 static const __rte_x86_zmm_t _SC_(trlo_idle) = {
118 static const __rte_x86_zmm_t _SC_(trhi_idle) = {
127 static const __rte_x86_zmm_t _SC_(shuffle_input) = {
129 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
130 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
131 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
132 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
136 static const __rte_x86_zmm_t _SC_(four_32) = {
145 static const __rte_x86_zmm_t _SC_(idx_add) = {
154 static const __rte_x86_zmm_t _SC_(range_base) = {
156 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
157 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
158 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
159 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
163 static const __rte_x86_zmm_t _SC_(pminp) = {
165 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
166 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
/* selects the even 32-bit lanes of a zmm (bit pattern 0b0101...0101) */
170 static const _T_mask _SC_(pmidx_msk) = 0x5555;
172 static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
175 0, 0, 1, 0, 2, 0, 3, 0,
176 4, 0, 5, 0, 6, 0, 7, 0,
181 8, 0, 9, 0, 10, 0, 11, 0,
182 12, 0, 13, 0, 14, 0, 15, 0,
188 * unfortunately current AVX512 ISA doesn't provide ability for
189 * gather load on a byte quantity. So we have to mimic it in SW,
190 * by doing 8x1B scalar loads.
192 static inline __m256i
193 _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
/*
 * pdata holds 8 x 64-bit byte addresses; mask marks the active lanes.
 * NOTE(review): the local unions 'p' and 'v' are declared on lines not
 * visible in this view - presumably __rte_x86_zmm_t-style register/scalar
 * unions; confirm against the full source.
 */
198 static const uint32_t zero;
/*
 * Redirect every inactive lane (mask ^ _SIMD_PTR_MSK_) to a dummy
 * location, so the unconditional scalar loads below never touch an
 * invalid pointer. (Call is truncated here - the remaining arguments,
 * presumably (uintptr_t)&zero, are on an elided line.)
 */
200 p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
/* 8 x 1B scalar loads, each zero-extended into a 32-bit result lane */
203 v.u32[0] = *(uint8_t *)p.u64[0];
204 v.u32[1] = *(uint8_t *)p.u64[1];
205 v.u32[2] = *(uint8_t *)p.u64[2];
206 v.u32[3] = *(uint8_t *)p.u64[3];
207 v.u32[4] = *(uint8_t *)p.u64[4];
208 v.u32[5] = *(uint8_t *)p.u64[5];
209 v.u32[6] = *(uint8_t *)p.u64[6];
210 v.u32[7] = *(uint8_t *)p.u64[7];
216 * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
218 static __rte_always_inline __m512i
219 _F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
/*
 * p[2]: two sets of 8 x 64-bit absolute addresses; m[2]: per-set lane
 * masks; trailing parameter(s) and the 'inp' declaration are on lines
 * elided from this view - presumably 'uint32_t bnum' and '__m256i inp[2]'.
 */
224 if (bnum == sizeof(uint8_t)) {
/* 1 byte per location: AVX512 has no byte gather, use the SW emulation */
225 inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
226 inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
/* 4 bytes per location: HW gather; addresses are absolute (base == NULL) */
228 inp[0] = _mm512_mask_i64gather_epi32(
229 _mm512_castsi512_si256(zero),
230 m[0], p[0], NULL, sizeof(uint8_t));
231 inp[1] = _mm512_mask_i64gather_epi32(
232 _mm512_castsi512_si256(zero),
233 m[1], p[1], NULL, sizeof(uint8_t));
236 /* squeeze input into one 512-bit register */
237 return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
238 _SV_(pminp), _mm512_castsi256_si512(inp[1]));
242 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
245 resolve_mcgt8_avx512x1(uint32_t result[],
246 const struct rte_acl_match_results pr[], const uint32_t match[],
247 uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
250 const uint32_t *pm, *res;
253 __m512i cp, cr, np, nr;
/* lane mask covering the nb_cat categories (nb_cat low bits set) */
258 cm = (1 << nb_cat) - 1;
/* for each packet: seed current result/priority from trie 0 */
260 for (k = 0; k != nb_pkt; k++, result += nb_cat) {
/* scale match index into the per-entry results/priority arrays */
262 mi = match[k] << ACL_MATCH_LOG;
264 cr = _mm512_maskz_loadu_epi32(cm, res + mi);
265 cp = _mm512_maskz_loadu_epi32(cm, pri + mi);
/* merge in the remaining tries (loop header continues on elided line) */
267 for (i = 1, pm = match + nb_pkt; i != nb_trie;
270 mi = pm[k] << ACL_MATCH_LOG;
272 nr = _mm512_maskz_loadu_epi32(cm, res + mi);
273 np = _mm512_maskz_loadu_epi32(cm, pri + mi);
/*
 * per category: keep the current result where its priority is
 * greater, otherwise take the next trie's result/priority.
 */
275 sm = _mm512_cmpgt_epi32_mask(cp, np);
276 cr = _mm512_mask_mov_epi32(nr, sm, cr);
277 cp = _mm512_mask_mov_epi32(np, sm, cp);
/* store the winning result for all nb_cat categories of this packet */
280 _mm512_mask_storeu_epi32(result, cm, cr);
284 #include "acl_run_avx512_common.h"
287 * Perform search for up to (2 * 16) flows in parallel.
288 * Use two sets of metadata, each serves 16 flows max.
291 search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
292 uint32_t *results, uint32_t total_packets, uint32_t categories)
295 const struct rte_acl_match_results *pr;
296 struct acl_flow_avx512 flow;
/* VLA: one match slot per (trie, packet) pair */
297 uint32_t match[ctx->num_tries * total_packets];
/* run every trie over all packets, collecting per-trie match indexes */
299 for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
301 /* setup for next trie */
302 acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
304 /* process the trie */
305 _F_(search_trie)(&flow);
308 /* resolve matches */
309 pr = (const struct rte_acl_match_results *)
310 (ctx->trans_table + ctx->match_index);
/*
 * pick the resolver by category count; the single-category branch
 * condition is on a line elided from this view - confirm in full source.
 */
313 _F_(resolve_single_cat)(results, pr, match, total_packets,
315 else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
316 resolve_mcle8_avx512x1(results, pr, match, total_packets,
317 categories, ctx->num_tries);
319 resolve_mcgt8_avx512x1(results, pr, match, total_packets,
320 categories, ctx->num_tries);
/*
 * Undo the helper definitions, as promised in the file header, so that
 * "acl_run_avx512_common.h" can be re-included for a different width.
 * NOTE(review): the remaining macros (_T_simd, _T_mask, _F_, _M_*_, _SC_,
 * _SV_) are presumably undefined on lines past this view - confirm.
 */
325 #undef _SIMD_PTR_MSK_
326 #undef _SIMD_PTR_NUM_
327 #undef _SIMD_FLOW_MSK_
328 #undef _SIMD_FLOW_NUM_
329 #undef _SIMD_MASK_MAX_
330 #undef _SIMD_MASK_BIT_