/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
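
/*
 * Added note: this file depends on the ACL run-time definitions (struct
 * parms, struct acl_flow_data, struct completion, acl_set_flow(),
 * acl_start_next_trie(), acl_match_check(), GET_NEXT_4BYTES, the
 * MAX_SEARCHES_SSE* sizes and the MM_*()/xmm_t SIMD wrappers).  The exact
 * header split differs between DPDK releases, so the includes below are an
 * assumption, not part of the original fragment.
 */
#include <rte_acl.h>
#include "acl_run.h"
#include "acl_vect.h"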
enum {
	SHUFFLE32_SLOT1 = 0xe5,
	SHUFFLE32_SLOT2 = 0xe6,
	SHUFFLE32_SLOT3 = 0xe7,
	SHUFFLE32_SWAP64 = 0x4e,
};
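
/*
 * Added note: the immediates above are _mm_shuffle_epi32()-style selectors
 * (two bits per destination dword).  0xe5/0xe6/0xe7 copy 32-bit slot 1/2/3
 * into dword 0 while leaving the remaining dwords as {x1, x2, x3}, and
 * 0x4e (binary 01 00 11 10) swaps the two 64-bit halves of the register so
 * the upper transition can be read with a 64-bit move.
 */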
static const rte_xmm_t mm_type_quad_range = {
	.u32 = {RTE_ACL_NODE_QRANGE, RTE_ACL_NODE_QRANGE,
		RTE_ACL_NODE_QRANGE, RTE_ACL_NODE_QRANGE},
};

static const rte_xmm_t mm_type_quad_range64 = {
	.u32 = {RTE_ACL_NODE_QRANGE, RTE_ACL_NODE_QRANGE, 0, 0},
};

static const rte_xmm_t mm_shuffle_input = {
	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
};

static const rte_xmm_t mm_shuffle_input64 = {
	.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},
};

static const rte_xmm_t mm_ones_16 = {
	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
};

static const rte_xmm_t mm_bytes = {
	.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
};

static const rte_xmm_t mm_bytes64 = {
	.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
};

static const rte_xmm_t mm_match_mask = {
	.u32 = {RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH},
};

static const rte_xmm_t mm_match_mask64 = {
	.u32 = {RTE_ACL_NODE_MATCH, 0, RTE_ACL_NODE_MATCH, 0},
};

static const rte_xmm_t mm_index_mask = {
	.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX},
};

static const rte_xmm_t mm_index_mask64 = {
	.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, 0, 0},
};
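
/*
 * Added note: rte_xmm_t is a union over one SSE register, so each constant
 * can be written lane-by-lane (.u32/.u16) and read back as a whole register
 * through the .m member used below.  The *_mask constants select the
 * node-index and match-node type bits held in the low 32 bits of each
 * packed transition; the *64 variants only populate the lanes used by the
 * 2-wide code path.
 */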
/*
 * Resolve priority for multiple results (sse version).
 * This consists of comparing the priority of the current traversal with the
 * running set of results for the packet.
 * For each result, keep a running array of the result (rule number) and
 * its priority for each category.
 */
static inline void
resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
	struct parms *parms, const struct rte_acl_match_results *p,
	uint32_t categories)
{
	uint32_t x;
	xmm_t results, priority, results1, priority1, selector;
	xmm_t *saved_results, *saved_priority;

	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {

		saved_results = (xmm_t *)(&parms[n].cmplt->results[x]);
		saved_priority =
			(xmm_t *)(&parms[n].cmplt->priority[x]);

		/* get results and priorities for completed trie */
		results = MM_LOADU((const xmm_t *)&p[transition].results[x]);
		priority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);

		/* if this is not the first completed trie */
		if (parms[n].cmplt->count != ctx->num_tries) {

			/* get running best results and their priorities */
			results1 = MM_LOADU(saved_results);
			priority1 = MM_LOADU(saved_priority);

			/* select results that are highest priority */
			selector = MM_CMPGT32(priority1, priority);
			results = MM_BLENDV8(results, results1, selector);
			priority = MM_BLENDV8(priority, priority1, selector);
		}

		/* save running best results and their priorities */
		MM_STOREU(saved_results, results);
		MM_STOREU(saved_priority, priority);
	}
}
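
/*
 * Added note: MM_CMPGT32() (presumably _mm_cmpgt_epi32) yields all-ones in
 * every 32-bit lane where the previously saved priority is strictly greater
 * than the new trie's priority; MM_BLENDV8 then keeps the saved result and
 * priority for those lanes and takes the new values everywhere else, so the
 * running arrays always hold the highest-priority rule per category.
 */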
/*
 * Extract transitions from an XMM register and check for any matches
 */
static void
acl_process_matches(xmm_t *indicies, int slot, const struct rte_acl_ctx *ctx,
	struct parms *parms, struct acl_flow_data *flows)
{
	uint64_t transition1, transition2;

	/* extract transition from low 64 bits. */
	transition1 = MM_CVT64(*indicies);

	/* extract transition from high 64 bits. */
	*indicies = MM_SHUFFLE32(*indicies, SHUFFLE32_SWAP64);
	transition2 = MM_CVT64(*indicies);

	transition1 = acl_match_check(transition1, slot, ctx,
		parms, flows, resolve_priority_sse);
	transition2 = acl_match_check(transition2, slot + 1, ctx,
		parms, flows, resolve_priority_sse);

	/* update indicies with new transitions. */
	*indicies = MM_SET64(transition2, transition1);
}
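
/*
 * Added note: MM_CVT64() (presumably _mm_cvtsi128_si64) only reads the low
 * 64 bits of the register, which is why the SHUFFLE32_SWAP64 shuffle above
 * is needed to bring the second transition down before it can be extracted;
 * MM_SET64(hi, lo) then packs the two checked transitions back in their
 * original order.
 */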
/*
 * Check for a match in 2 transitions (contained in SSE register)
 */
static inline void
acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
	struct acl_flow_data *flows, xmm_t *indicies, xmm_t match_mask)
{
	xmm_t temp;

	temp = MM_AND(match_mask, *indicies);
	while (!MM_TESTZ(temp, temp)) {
		acl_process_matches(indicies, slot, ctx, parms, flows);
		temp = MM_AND(match_mask, *indicies);
	}
}
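
/*
 * Added note: the loop re-tests after every acl_process_matches() call
 * because acl_match_check() replaces a matched lane with the start
 * transition of that flow's next trie (or an idle transition), while the
 * other lane may still be sitting on a match node.
 */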
/*
 * Check for any match in 4 transitions (contained in 2 SSE registers)
 */
static inline void
acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
	struct acl_flow_data *flows, xmm_t *indicies1, xmm_t *indicies2,
	xmm_t match_mask)
{
	xmm_t temp;

	/* put low 32 bits of each transition into one register */
	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
		0x88);
	/* test for match node */
	temp = MM_AND(match_mask, temp);

	while (!MM_TESTZ(temp, temp)) {
		acl_process_matches(indicies1, slot, ctx, parms, flows);
		acl_process_matches(indicies2, slot + 2, ctx, parms, flows);

		temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
					(__m128)*indicies2,
					0x88);
		temp = MM_AND(match_mask, temp);
	}
}
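
/*
 * Added note: the 0x88 selector for MM_SHUFFLEPS picks dwords 0 and 2 of
 * each source register, i.e. the low 32 bits of all four 64-bit transitions,
 * which is where the match-node type bit lives; a single MM_TESTZ over the
 * packed dwords then checks all four lanes at once.
 */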
/*
 * Calculate the address of the next transition for
 * all types of nodes. Note that only DFA nodes and range
 * nodes actually transition to another node. Match
 * nodes don't move.
 */
static inline xmm_t
acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
	xmm_t *indicies1, xmm_t *indicies2)
{
	xmm_t addr, node_types, temp;

	/*
	 * Note that no transition is done for a match
	 * node and therefore a stream freezes when
	 * it reaches a match.
	 */

	/* Shuffle low 32 into temp and high 32 into indicies2 */
	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
		0x88);
	*indicies2 = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
		(__m128)*indicies2, 0xdd);

	/* Calc node type and node addr */
	node_types = MM_ANDNOT(index_mask, temp);
	addr = MM_AND(index_mask, temp);

	/*
	 * Calc addr for DFAs - addr = dfa_index + input_byte
	 */

	/* mask for DFA type (0) nodes */
	temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));

	/* add input byte to DFA position */
	temp = MM_AND(temp, bytes);
	temp = MM_AND(temp, next_input);
	addr = MM_ADD32(addr, temp);

	/*
	 * Calc addr for Range nodes -> range_index + range(input)
	 */
	node_types = MM_CMPEQ32(node_types, type_quad_range);

	/*
	 * Calculate number of range boundaries that are less than the
	 * input value. Range boundaries for each node are in signed 8 bit,
	 * ordered from -128 to 127 in the indicies2 register.
	 * This is effectively a popcnt of bytes that are greater than the
	 * input byte.
	 */

	/* shuffle input byte to all 4 positions of 32 bit value */
	temp = MM_SHUFFLE8(next_input, shuffle_input);

	/* check ranges */
	temp = MM_CMPGT8(temp, *indicies2);

	/* convert -1 to 1 (bytes greater than input byte) */
	temp = MM_SIGN8(temp, temp);

	/* horizontal add pairs of bytes into words */
	temp = MM_MADD8(temp, temp);

	/* horizontal add pairs of words into dwords */
	temp = MM_MADD16(temp, ones_16);

	/* mask to range type nodes */
	temp = MM_AND(temp, node_types);

	/* add index into node position */
	return MM_ADD32(addr, temp);
}
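
/*
 * Added worked example (illustrative values only): for a QRANGE node whose
 * packed boundaries in indicies2 are {-128, 10, 50, 127} (signed, ascending)
 * and an input byte of 60, MM_CMPGT8 sets 0xff for the three boundaries
 * below 60, MM_SIGN8 turns each 0xff into 1, and the two multiply-add steps
 * sum them per 32-bit lane, so 3 is added to the node's base address and
 * the fourth range's transition is selected.
 */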
/*
 * Process 4 transitions (in 2 SIMD registers) in parallel
 */
static inline xmm_t
transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
	const uint64_t *trans, xmm_t *indicies1, xmm_t *indicies2)
{
	xmm_t addr;
	uint64_t trans0, trans2;

	/* Calculate the address (array index) for all 4 transitions. */
	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
		bytes, type_quad_range, indicies1, indicies2);

	/* Gather 64 bit transitions and pack back into 2 registers. */
	trans0 = trans[MM_CVT32(addr)];

	/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);
	trans2 = trans[MM_CVT32(addr)];

	/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
	*indicies1 = MM_SET64(trans[MM_CVT32(addr)], trans0);

	/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);
	*indicies2 = MM_SET64(trans[MM_CVT32(addr)], trans2);

	return MM_SRL32(next_input, 8);
}
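
/*
 * Added note: MM_CVT32() always reads dword 0, so the SHUFFLE32_SLOT*
 * shuffles rotate slots 2, 1 and 3 of addr into dword 0 in turn; the two
 * MM_SET64() calls then repack the fetched 64-bit transitions as pairs
 * {0,1} and {2,3}.  The MM_SRL32 on the way out discards the byte just
 * consumed so the next call sees the following input byte of each stream.
 */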
/*
 * Execute trie traversal with 8 traversals in parallel
 */
static inline int
search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_SSE8];
	struct completion cmplt[MAX_SEARCHES_SSE8];
	struct parms parms[MAX_SEARCHES_SSE8];
	xmm_t input0, input1;
	xmm_t indicies1, indicies2, indicies3, indicies4;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	/*
	 * indicies1 contains index_array[0,1]
	 * indicies2 contains index_array[2,3]
	 * indicies3 contains index_array[4,5]
	 * indicies4 contains index_array[6,7]
	 */

	indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
	indicies2 = MM_LOADU((xmm_t *) &index_array[2]);

	indicies3 = MM_LOADU((xmm_t *) &index_array[4]);
	indicies4 = MM_LOADU((xmm_t *) &index_array[6]);

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows,
		&indicies1, &indicies2, mm_match_mask.m);
	acl_match_check_x4(4, ctx, parms, &flows,
		&indicies3, &indicies4, mm_match_mask.m);

	while (flows.started > 0) {

		/* Gather 4 bytes of input data for each stream. */
		input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
			0);
		input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
			0);

		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);

		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);
		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);

		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);
		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);

		/* Process the 4 bytes of input on each stream. */

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		/* Check for any matches. */
		acl_match_check_x4(0, ctx, parms, &flows,
			&indicies1, &indicies2, mm_match_mask.m);
		acl_match_check_x4(4, ctx, parms, &flows,
			&indicies3, &indicies4, mm_match_mask.m);
	}

	return 0;
}
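
/*
 * Added note: each loop iteration gathers exactly 4 input bytes per stream
 * and then runs transition4() four times on each half (streams 0-3 and
 * 4-7), consuming one byte per call, before re-checking all eight lanes
 * for match nodes.
 */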
/*
 * Execute trie traversal with 4 traversals in parallel
 */
static inline int
search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_SSE4];
	struct completion cmplt[MAX_SEARCHES_SSE4];
	struct parms parms[MAX_SEARCHES_SSE4];
	xmm_t input, indicies1, indicies2;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
	indicies2 = MM_LOADU((xmm_t *) &index_array[2]);

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows,
		&indicies1, &indicies2, mm_match_mask.m);

	while (flows.started > 0) {

		/* Gather 4 bytes of input data for each stream. */
		input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);

		/* Process the 4 bytes of input on each stream. */
		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		/* Check for any matches. */
		acl_match_check_x4(0, ctx, parms, &flows,
			&indicies1, &indicies2, mm_match_mask.m);
	}

	return 0;
}
static inline xmm_t
transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
	const uint64_t *trans, xmm_t *indicies1)
{
	uint64_t t;
	xmm_t addr, indicies2;

	indicies2 = MM_XOR(ones_16, ones_16);

	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
		bytes, type_quad_range, indicies1, &indicies2);

	/* Gather 64 bit transitions and pack 2 per register. */
	t = trans[MM_CVT32(addr)];

	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
	*indicies1 = MM_SET64(trans[MM_CVT32(addr)], t);

	return MM_SRL32(next_input, 8);
}
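
/*
 * Added note: transition2() zeroes indicies2 before calling acl_calc_addr(),
 * so only the two 64-bit transitions in *indicies1 are live; the caller
 * therefore passes the *64 variants of the constant masks, whose upper
 * lanes are zero.
 */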
/*
 * Execute trie traversal with 2 traversals in parallel.
 */
static inline int
search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_SSE2];
	struct completion cmplt[MAX_SEARCHES_SSE2];
	struct parms parms[MAX_SEARCHES_SSE2];
	xmm_t input, indicies;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < MAX_SEARCHES_SSE2; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	indicies = MM_LOADU((xmm_t *) &index_array[0]);

	/* Check for any matches. */
	acl_match_check_x2(0, ctx, parms, &flows, &indicies, mm_match_mask64.m);

	while (flows.started > 0) {

		/* Gather 4 bytes of input data for each stream. */
		input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);

		/* Process the 4 bytes of input on each stream. */

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		/* Check for any matches. */
		acl_match_check_x2(0, ctx, parms, &flows, &indicies,
			mm_match_mask64.m);
	}

	return 0;
}
int
rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t num, uint32_t categories)
{
	if (categories != 1 &&
		((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
		return -EINVAL;

	if (likely(num >= MAX_SEARCHES_SSE8))
		return search_sse_8(ctx, data, results, num, categories);
	else if (num >= MAX_SEARCHES_SSE4)
		return search_sse_4(ctx, data, results, num, categories);
	else
		return search_sse_2(ctx, data, results, num, categories);
}
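
/*
 * Added usage sketch (assumes an already built rte_acl_ctx; BURST and the
 * buffer names are illustrative, and applications normally go through the
 * generic rte_acl_classify() entry point rather than calling this directly):
 *
 *	const uint8_t *bufs[BURST];
 *	uint32_t res[BURST];
 *	// fill bufs[] with one flattened search key per slot
 *	int ret = rte_acl_classify_sse(ctx, bufs, res, BURST, 1);
 *	// on success res[i] holds the matching rule's userdata, 0 if no match
 *
 * categories must be 1 or a multiple of RTE_ACL_RESULTS_MULTIPLIER,
 * otherwise -EINVAL is returned.
 */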