lib/librte_acl/acl_run_avx2.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #include "acl_run_sse.h"
  35
  36 static const rte_ymm_t ymm_match_mask = {
  37         .u32 = {
  38                 RTE_ACL_NODE_MATCH,
  39                 RTE_ACL_NODE_MATCH,
  40                 RTE_ACL_NODE_MATCH,
  41                 RTE_ACL_NODE_MATCH,
  42                 RTE_ACL_NODE_MATCH,
  43                 RTE_ACL_NODE_MATCH,
  44                 RTE_ACL_NODE_MATCH,
  45                 RTE_ACL_NODE_MATCH,
  46         },
  47 };
  48
  49 static const rte_ymm_t ymm_index_mask = {
  50         .u32 = {
  51                 RTE_ACL_NODE_INDEX,
  52                 RTE_ACL_NODE_INDEX,
  53                 RTE_ACL_NODE_INDEX,
  54                 RTE_ACL_NODE_INDEX,
  55                 RTE_ACL_NODE_INDEX,
  56                 RTE_ACL_NODE_INDEX,
  57                 RTE_ACL_NODE_INDEX,
  58                 RTE_ACL_NODE_INDEX,
  59         },
  60 };
  61
  62 static const rte_ymm_t ymm_shuffle_input = {
  63         .u32 = {
  64                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  65                 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  66         },
  67 };
  68
  69 static const rte_ymm_t ymm_ones_16 = {
  70         .u16 = {
  71                 1, 1, 1, 1, 1, 1, 1, 1,
  72                 1, 1, 1, 1, 1, 1, 1, 1,
  73         },
  74 };
  75
  76 static inline __attribute__((always_inline)) ymm_t
  77 calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,
  78         ymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)
  79 {
  80         ymm_t in, node_type, r, t;
  81         ymm_t dfa_msk, dfa_ofs, quad_ofs;
  82         ymm_t addr;
  83
  84         const ymm_t range_base = _mm256_set_epi32(
  85                 0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,
  86                 0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);
  87
  88         t = _mm256_xor_si256(index_mask, index_mask);
  89         in = _mm256_shuffle_epi8(next_input, shuffle_input);
  90
  91         /* Calc node type and node addr */
  92         node_type = _mm256_andnot_si256(index_mask, tr_lo);
  93         addr = _mm256_and_si256(index_mask, tr_lo);
  94
  95         /* DFA calculations. */
  96
  97         dfa_msk = _mm256_cmpeq_epi32(node_type, t);
  98
  99         r = _mm256_srli_epi32(in, 30);
 100         r = _mm256_add_epi8(r, range_base);
 101
 102         t = _mm256_srli_epi32(in, 24);
 103         r = _mm256_shuffle_epi8(tr_hi, r);
 104
 105         dfa_ofs = _mm256_sub_epi32(t, r);
 106
 107         /* QUAD/SINGLE caluclations. */
 108
 109         t = _mm256_cmpgt_epi8(in, tr_hi);
 110         t = _mm256_sign_epi8(t, t);
 111         t = _mm256_maddubs_epi16(t, t);
 112         quad_ofs = _mm256_madd_epi16(t, ones_16);
 113
 114         /* blend DFA and QUAD/SINGLE. */
 115         t = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
 116
 117         addr = _mm256_add_epi32(addr, t);
 118         return addr;
 119 }
 120
 121 static inline __attribute__((always_inline)) ymm_t
 122 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
 123 {
 124         const int32_t *tr;
 125         ymm_t addr;
 126
 127         tr = (const int32_t *)(uintptr_t)trans;
 128
 129         addr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
 130                 ymm_ones_16.y, *tr_lo, *tr_hi);
 131
 132         /* load lower 32 bits of 8 transactions at once. */
 133         *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
 134
 135         next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
 136
 137         /* load high 32 bits of 8 transactions at once. */
 138         *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
 139
 140         return next_input;
 141 }
 142
 143 static inline void
 144 acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
 145         struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
 146         ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
 147 {
 148         ymm_t t0, t1;
 149         ymm_t lo, hi;
 150         xmm_t l0, l1;
 151         uint32_t i;
 152         uint64_t tr[MAX_SEARCHES_SSE8];
 153
 154         l1 = _mm256_extracti128_si256(*tr_lo, 1);
 155         l0 = _mm256_castsi256_si128(*tr_lo);
 156
 157         for (i = 0; i != RTE_DIM(tr) / 2; i++) {
 158                 tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
 159                 tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
 160
 161                 l0 = _mm_srli_si128(l0, sizeof(uint32_t));
 162                 l1 = _mm_srli_si128(l1, sizeof(uint32_t));
 163
 164                 tr[i] = acl_match_check(tr[i], slot + i,
 165                         ctx, parms, flows, resolve_priority_sse);
 166                 tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,
 167                         ctx, parms, flows, resolve_priority_sse);
 168         }
 169
 170         t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
 171         t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
 172
 173         lo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
 174         hi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
 175
 176         *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
 177         *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
 178 }
 179
 180 static inline void
 181 acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
 182         struct acl_flow_data *flows, uint32_t slot,
 183         ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
 184 {
 185         uint32_t msk;
 186         ymm_t matches, temp;
 187
 188         /* test for match node */
 189         temp = _mm256_and_si256(match_mask, *tr_lo);
 190         matches = _mm256_cmpeq_epi32(temp, match_mask);
 191         msk = _mm256_movemask_epi8(matches);
 192
 193         while (msk != 0) {
 194
 195                 acl_process_matches_avx2x8(ctx, parms, flows, slot,
 196                         matches, tr_lo, tr_hi);
 197                 temp = _mm256_and_si256(match_mask, *tr_lo);
 198                 matches = _mm256_cmpeq_epi32(temp, match_mask);
 199                 msk = _mm256_movemask_epi8(matches);
 200         }
 201 }
 202
 203 static inline int
 204 search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
 205         uint32_t *results, uint32_t total_packets, uint32_t categories)
 206 {
 207         uint32_t n;
 208         struct acl_flow_data flows;
 209         uint64_t index_array[MAX_SEARCHES_AVX16];
 210         struct completion cmplt[MAX_SEARCHES_AVX16];
 211         struct parms parms[MAX_SEARCHES_AVX16];
 212         ymm_t input[2], tr_lo[2], tr_hi[2];
 213         ymm_t t0, t1;
 214
 215         acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 216                 total_packets, categories, ctx->trans_table);
 217
 218         for (n = 0; n < RTE_DIM(cmplt); n++) {
 219                 cmplt[n].count = 0;
 220                 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 221         }
 222
 223         t0 = _mm256_set_epi64x(index_array[5], index_array[4],
 224                 index_array[1], index_array[0]);
 225         t1 = _mm256_set_epi64x(index_array[7], index_array[6],
 226                 index_array[3], index_array[2]);
 227
 228         tr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
 229         tr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
 230
 231         t0 = _mm256_set_epi64x(index_array[13], index_array[12],
 232                 index_array[9], index_array[8]);
 233         t1 = _mm256_set_epi64x(index_array[15], index_array[14],
 234                 index_array[11], index_array[10]);
 235
 236         tr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
 237         tr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
 238
 239          /* Check for any matches. */
 240         acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
 241                 ymm_match_mask.y);
 242         acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
 243                 ymm_match_mask.y);
 244
 245         while (flows.started > 0) {
 246
 247                 uint32_t in[MAX_SEARCHES_SSE8];
 248
 249                 /* Gather 4 bytes of input data for first 8 flows. */
 250                 in[0] = GET_NEXT_4BYTES(parms, 0);
 251                 in[4] = GET_NEXT_4BYTES(parms, 4);
 252                 in[1] = GET_NEXT_4BYTES(parms, 1);
 253                 in[5] = GET_NEXT_4BYTES(parms, 5);
 254                 in[2] = GET_NEXT_4BYTES(parms, 2);
 255                 in[6] = GET_NEXT_4BYTES(parms, 6);
 256                 in[3] = GET_NEXT_4BYTES(parms, 3);
 257                 in[7] = GET_NEXT_4BYTES(parms, 7);
 258                 input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
 259                         in[3], in[2], in[1], in[0]);
 260
 261                 /* Gather 4 bytes of input data for last 8 flows. */
 262                 in[0] = GET_NEXT_4BYTES(parms, 8);
 263                 in[4] = GET_NEXT_4BYTES(parms, 12);
 264                 in[1] = GET_NEXT_4BYTES(parms, 9);
 265                 in[5] = GET_NEXT_4BYTES(parms, 13);
 266                 in[2] = GET_NEXT_4BYTES(parms, 10);
 267                 in[6] = GET_NEXT_4BYTES(parms, 14);
 268                 in[3] = GET_NEXT_4BYTES(parms, 11);
 269                 in[7] = GET_NEXT_4BYTES(parms, 15);
 270                 input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
 271                         in[3], in[2], in[1], in[0]);
 272
 273                 input[0] = transition8(input[0], flows.trans,
 274                         &tr_lo[0], &tr_hi[0]);
 275                 input[1] = transition8(input[1], flows.trans,
 276                         &tr_lo[1], &tr_hi[1]);
 277
 278                 input[0] = transition8(input[0], flows.trans,
 279                         &tr_lo[0], &tr_hi[0]);
 280                 input[1] = transition8(input[1], flows.trans,
 281                         &tr_lo[1], &tr_hi[1]);
 282
 283                 input[0] = transition8(input[0], flows.trans,
 284                         &tr_lo[0], &tr_hi[0]);
 285                 input[1] = transition8(input[1], flows.trans,
 286                         &tr_lo[1], &tr_hi[1]);
 287
 288                 input[0] = transition8(input[0], flows.trans,
 289                         &tr_lo[0], &tr_hi[0]);
 290                 input[1] = transition8(input[1], flows.trans,
 291                         &tr_lo[1], &tr_hi[1]);
 292
 293                  /* Check for any matches. */
 294                 acl_match_check_avx2x8(ctx, parms, &flows, 0,
 295                         &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
 296                 acl_match_check_avx2x8(ctx, parms, &flows, 8,
 297                         &tr_lo[1], &tr_hi[1], ymm_match_mask.y);
 298         }
 299
 300         return 0;
 301 }