lib/librte_acl/acl_run_avx512x8.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2020 Intel Corporation
   3  */
   4
/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */
  11
/*
 * This implementation uses 256-bit registers (ymm) and intrinsics.
 * So our main SIMD type is 256 bits wide and each such variable can
 * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel.
 */
#define _T_simd         __m256i
#define _T_mask         __mmask8

/*
 * Naming convention for static const variables.
 * _SC_(x) names the rte_ymm_t object ymm_<x>,
 * _SV_(x) accesses the .y (__m256i) member of that same object.
 */
#define _SC_(x)         ymm_##x
#define _SV_(x)         (ymm_##x.y)

/* Naming convention for internal functions: appends the _avx512x8 suffix. */
#define _F_(x)          x##_avx512x8

/*
 * The same intrinsics have different syntax (depending on the bit-width),
 * so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x)        _mm256_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x)       _mm256_##x##_si256

/*
 * Naming convention for masked gather type intrinsics.
 * Note the extra 'm' that gets inserted: _M_MGI_(x) yields _mm256_m<x>.
 */
#define _M_MGI_(x)      _mm256_m##x

/*
 * Naming convention for gather type intrinsics.
 * The macro takes (idx, base) but passes them on as (base, idx).
 */
#define _M_GI_(name, idx, base, scale)  _mm256_##name(base, idx, scale)

/*
 * num/mask of transitions per SIMD regs:
 * _SIMD_MASK_BIT_ is the number of 32-bit entries in one _T_simd (8 here),
 * _SIMD_MASK_MAX_ is RTE_LEN2MASK() applied to that number.
 */
#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_ RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

/* twice the per-register entry count, and that value minus one */
#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)

/*
 * num/mask of pointers per SIMD regs:
 * how many uintptr_t values fit into one _T_simd, and the
 * RTE_LEN2MASK() of that number.
 */
#define _SIMD_PTR_NUM_  (sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_  RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
  54
  55 static const rte_ymm_t _SC_(match_mask) = {
  56         .u32 = {
  57                 RTE_ACL_NODE_MATCH,
  58                 RTE_ACL_NODE_MATCH,
  59                 RTE_ACL_NODE_MATCH,
  60                 RTE_ACL_NODE_MATCH,
  61                 RTE_ACL_NODE_MATCH,
  62                 RTE_ACL_NODE_MATCH,
  63                 RTE_ACL_NODE_MATCH,
  64                 RTE_ACL_NODE_MATCH,
  65         },
  66 };
  67
  68 static const rte_ymm_t _SC_(index_mask) = {
  69         .u32 = {
  70                 RTE_ACL_NODE_INDEX,
  71                 RTE_ACL_NODE_INDEX,
  72                 RTE_ACL_NODE_INDEX,
  73                 RTE_ACL_NODE_INDEX,
  74                 RTE_ACL_NODE_INDEX,
  75                 RTE_ACL_NODE_INDEX,
  76                 RTE_ACL_NODE_INDEX,
  77                 RTE_ACL_NODE_INDEX,
  78         },
  79 };
  80
  81 static const rte_ymm_t _SC_(trlo_idle) = {
  82         .u32 = {
  83                 RTE_ACL_IDLE_NODE,
  84                 RTE_ACL_IDLE_NODE,
  85                 RTE_ACL_IDLE_NODE,
  86                 RTE_ACL_IDLE_NODE,
  87                 RTE_ACL_IDLE_NODE,
  88                 RTE_ACL_IDLE_NODE,
  89                 RTE_ACL_IDLE_NODE,
  90                 RTE_ACL_IDLE_NODE,
  91         },
  92 };
  93
  94 static const rte_ymm_t _SC_(trhi_idle) = {
  95         .u32 = {
  96                 0, 0, 0, 0,
  97                 0, 0, 0, 0,
  98         },
  99 };
 100
 101 static const rte_ymm_t _SC_(shuffle_input) = {
 102         .u32 = {
 103                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 104                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 105         },
 106 };
 107
 108 static const rte_ymm_t _SC_(four_32) = {
 109         .u32 = {
 110                 4, 4, 4, 4,
 111                 4, 4, 4, 4,
 112         },
 113 };
 114
 115 static const rte_ymm_t _SC_(idx_add) = {
 116         .u32 = {
 117                 0, 1, 2, 3,
 118                 4, 5, 6, 7,
 119         },
 120 };
 121
 122 static const rte_ymm_t _SC_(range_base) = {
 123         .u32 = {
 124                 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 125                 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 126         },
 127 };
 128
 129 static const rte_ymm_t _SC_(pminp) = {
 130         .u32 = {
 131                 0x00, 0x01, 0x02, 0x03,
 132                 0x08, 0x09, 0x0a, 0x0b,
 133         },
 134 };
 135
/*
 * 0x55 == 01010101b: bits 0, 2, 4 and 6 are set.
 * NOTE(review): presumably consumed together with pmidx[] by
 * "acl_run_avx512_common.h" - confirm against that header.
 */
static const __mmask16 _SC_(pmidx_msk) = 0x55;
 137
 138 static const rte_ymm_t _SC_(pmidx[2]) = {
 139         [0] = {
 140                 .u32 = {
 141                         0, 0, 1, 0, 2, 0, 3, 0,
 142                 },
 143         },
 144         [1] = {
 145                 .u32 = {
 146                         4, 0, 5, 0, 6, 0, 7, 0,
 147                 },
 148         },
 149 };
 150
 151 /*
 152  * unfortunately current AVX512 ISA doesn't provide ability for
 153  * gather load on a byte quantity. So we have to mimic it in SW,
 154  * by doing 4x1B scalar loads.
 155  */
 156 static inline __m128i
 157 _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
 158 {
 159         rte_xmm_t v;
 160         rte_ymm_t p;
 161
 162         static const uint32_t zero;
 163
 164         p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
 165                 (uintptr_t)&zero);
 166
 167         v.u32[0] = *(uint8_t *)p.u64[0];
 168         v.u32[1] = *(uint8_t *)p.u64[1];
 169         v.u32[2] = *(uint8_t *)p.u64[2];
 170         v.u32[3] = *(uint8_t *)p.u64[3];
 171
 172         return v.x;
 173 }
 174
 175 /*
 176  * Gather 4/1 input bytes for up to 8 (2*8) locations in parallel.
 177  */
 178 static __rte_always_inline __m256i
 179 _F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2],
 180         uint32_t bnum)
 181 {
 182         __m128i inp[2];
 183
 184         if (bnum == sizeof(uint8_t)) {
 185                 inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
 186                 inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
 187         } else {
 188                 inp[0] = _mm256_mmask_i64gather_epi32(
 189                                 _mm256_castsi256_si128(zero),
 190                                 m[0], p[0], NULL, sizeof(uint8_t));
 191                 inp[1] = _mm256_mmask_i64gather_epi32(
 192                                 _mm256_castsi256_si128(zero),
 193                                 m[1], p[1], NULL, sizeof(uint8_t));
 194         }
 195
 196         /* squeeze input into one 256-bit register */
 197         return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),
 198                         _SV_(pminp), _mm256_castsi128_si256(inp[1]));
 199 }
 200
 201 #include "acl_run_avx512_common.h"
 202
 203 /*
 204  * Perform search for up to (2 * 8) flows in parallel.
 205  * Use two sets of metadata, each serves 8 flows max.
 206  */
 207 static inline int
 208 search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 209         uint32_t *results, uint32_t total_packets, uint32_t categories)
 210 {
 211         uint32_t i, *pm;
 212         const struct rte_acl_match_results *pr;
 213         struct acl_flow_avx512 flow;
 214         uint32_t match[ctx->num_tries * total_packets];
 215
 216         for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
 217
 218                 /* setup for next trie */
 219                 acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
 220
 221                 /* process the trie */
 222                 _F_(search_trie)(&flow);
 223         }
 224
 225         /* resolve matches */
 226         pr = (const struct rte_acl_match_results *)
 227                 (ctx->trans_table + ctx->match_index);
 228
 229         if (categories == 1)
 230                 _F_(resolve_single_cat)(results, pr, match, total_packets,
 231                         ctx->num_tries);
 232         else
 233                 resolve_mcle8_avx512x1(results, pr, match, total_packets,
 234                         categories, ctx->num_tries);
 235
 236         return 0;
 237 }
 238
/*
 * Drop every helper macro defined at the top of this file, so that
 * "acl_run_avx512_common.h" can be included again by another header
 * with a different set of definitions.
 */
#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd