lib/librte_acl/acl_run.c

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #include <rte_acl.h>
  35 #include "acl_vect.h"
  36 #include "acl.h"
  37
/* Number of trie traversals each search routine keeps in flight at once. */
#define MAX_SEARCHES_SSE8       8
#define MAX_SEARCHES_SSE4       4
#define MAX_SEARCHES_SSE2       2
#define MAX_SEARCHES_SCALAR     2

/*
 * Fetch the next 32 bits of input for slot idx of the parms array.
 * The current data_index entry supplies the byte offset into data, and
 * the slot's data_index pointer is advanced by one entry as a side effect,
 * so every expansion consumes one offset.
 * NOTE(review): the load goes through an int32_t pointer cast; nothing here
 * checks the alignment of the resulting address.
 */
#define GET_NEXT_4BYTES(prm, idx)       \
        (*((const int32_t *)((prm)[(idx)].data + *(prm)[idx].data_index++)))


/* Every bit of a 32-bit transition word that is not a node-type bit. */
#define RTE_ACL_NODE_INDEX      ((uint32_t)~RTE_ACL_NODE_TYPE)

/*
 * Byte-replicated constants; not referenced by the SSE code in this part of
 * the file (presumably used by the scalar search path further down).
 */
#define SCALAR_QRANGE_MULT      0x01010101
#define SCALAR_QRANGE_MASK      0x7f7f7f7f
#define SCALAR_QRANGE_MIN       0x80808080

/*
 * Immediate operands passed to MM_SHUFFLE32.
 * SHUFFLE32_SLOTn copies 32-bit lane n into lane 0 and leaves lanes 1-3
 * where they are; SHUFFLE32_SWAP64 exchanges the low and high 64-bit halves.
 */
enum {
        SHUFFLE32_SLOT1 = 0xe5,
        SHUFFLE32_SLOT2 = 0xe6,
        SHUFFLE32_SLOT3 = 0xe7,
        SHUFFLE32_SWAP64 = 0x4e,
};
  59
/*
 * Structure to manage N parallel trie traversals.
 * The runtime trie traversal routines can process 8, 4, or 2 tries
 * in parallel. Each packet may require multiple trie traversals (up to 4).
 * This structure is used to fill the slots (0 to n-1) for parallel processing
 * with the trie traversals needed for each packet.
 */
struct acl_flow_data {
        uint32_t            num_packets;
        /*
         * number of packets whose tries have all been handed to a slot;
         * also the index of the next packet to start
         */
        uint32_t            started;
        /* number of trie traversals in progress */
        uint32_t            trie;
        /* current trie index (0 to N-1) */
        uint32_t            cmplt_size;
        /* number of entries in cmplt_array */
        uint32_t            total_packets;
        /* maximum number of packets to process */
        uint32_t            categories;
        /* number of result categories per packet. */
        const uint64_t     *trans;
        /* transition table; also the base used to locate the match results */
        const uint8_t     **data;
        /* input buffers, one pointer per packet */
        uint32_t           *results;
        /* output array, categories entries per packet */
        struct completion  *last_cmplt;
        /* completion handed out for the packet currently being started */
        struct completion  *cmplt_array;
        /* pool of completion structures, cmplt_size entries long */
};
  85
/*
 * Structure to maintain running results for
 * a single packet (up to 4 tries).
 * An entry whose count is zero is free; alloc_completion() claims it by
 * storing the number of tries, and each finished trie decrements the count.
 */
struct completion {
        uint32_t *results;                          /* running results. */
        int32_t   priority[RTE_ACL_MAX_CATEGORIES]; /* running priorities. */
        uint32_t  count;                            /* num of remaining tries */
} __attribute__((aligned(XMM_SIZE)));
  96
/*
 * One parms structure for each slot in the search engine.
 * A slot describes the traversal currently assigned to it; idle slots
 * point both data and data_index at the global idle array.
 */
struct parms {
        const uint8_t              *data;
        /* input data for this packet */
        const uint32_t             *data_index;
        /*
         * data indirection for this trie: cursor into a table of byte
         * offsets into data, advanced each time input is fetched
         */
        struct completion          *cmplt;
        /* completion data for this packet */
};
 108
/*
 * Define a global idle node for unused engine slots.
 * The array has static storage and no initializer, so every entry is zero.
 * Idle slots use it both as their input data and as their data_index table.
 */
static const uint32_t idle[UINT8_MAX + 1];
 113
/*
 * Constant SIMD operands. The plain tables serve the 4- and 8-stream search
 * routines; the "64" tables are passed by search_sse_2, which tracks only
 * two streams.
 */

/* QRANGE node type in every 32-bit lane, compared against the node type bits. */
static const rte_xmm_t mm_type_quad_range = {
        .u32 = {
                RTE_ACL_NODE_QRANGE,
                RTE_ACL_NODE_QRANGE,
                RTE_ACL_NODE_QRANGE,
                RTE_ACL_NODE_QRANGE,
        },
};

/* Two-stream variant: only lanes 0 and 1 carry the QRANGE type. */
static const rte_xmm_t mm_type_quad_range64 = {
        .u32 = {
                RTE_ACL_NODE_QRANGE,
                RTE_ACL_NODE_QRANGE,
                0,
                0,
        },
};

/*
 * Control for MM_SHUFFLE8: replicates the low byte of each 32-bit lane
 * into all four bytes of that lane.
 */
static const rte_xmm_t mm_shuffle_input = {
        .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
};

/*
 * Two-stream variant. NOTE(review): the 0x80 control bytes presumably zero
 * lanes 2 and 3 (pshufb behaviour) - confirm against the MM_SHUFFLE8
 * definition.
 */
static const rte_xmm_t mm_shuffle_input64 = {
        .u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},
};

/*
 * 16-bit ones: multiplier for MM_MADD16 when summing word pairs. Also used
 * as the starting value when the input registers are assembled.
 */
static const rte_xmm_t mm_ones_16 = {
        .u16 = {1, 1, 1, 1, 1, 1, 1, 1},
};

/* Keeps the low byte of each 32-bit lane (the current input byte). */
static const rte_xmm_t mm_bytes = {
        .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
};

/* Two-stream variant: byte mask in lanes 0 and 1 only. */
static const rte_xmm_t mm_bytes64 = {
        .u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
};

/* Match flag in each lane; applied to the packed low words of 4 transitions. */
static const rte_xmm_t mm_match_mask = {
        .u32 = {
                RTE_ACL_NODE_MATCH,
                RTE_ACL_NODE_MATCH,
                RTE_ACL_NODE_MATCH,
                RTE_ACL_NODE_MATCH,
        },
};

/*
 * Match flag in lanes 0 and 2, i.e. the low 32 bits of each of the two
 * 64-bit transitions held in one register.
 */
static const rte_xmm_t mm_match_mask64 = {
        .u32 = {
                RTE_ACL_NODE_MATCH,
                0,
                RTE_ACL_NODE_MATCH,
                0,
        },
};

/* Selects the index (non-type) bits of each packed transition word. */
static const rte_xmm_t mm_index_mask = {
        .u32 = {
                RTE_ACL_NODE_INDEX,
                RTE_ACL_NODE_INDEX,
                RTE_ACL_NODE_INDEX,
                RTE_ACL_NODE_INDEX,
        },
};

/* Two-stream variant: index mask in lanes 0 and 1 only. */
static const rte_xmm_t mm_index_mask64 = {
        .u32 = {
                RTE_ACL_NODE_INDEX,
                RTE_ACL_NODE_INDEX,
                0,
                0,
        },
};
 187
 188 /*
 189  * Allocate a completion structure to manage the tries for a packet.
 190  */
 191 static inline struct completion *
 192 alloc_completion(struct completion *p, uint32_t size, uint32_t tries,
 193         uint32_t *results)
 194 {
 195         uint32_t n;
 196
 197         for (n = 0; n < size; n++) {
 198
 199                 if (p[n].count == 0) {
 200
 201                         /* mark as allocated and set number of tries. */
 202                         p[n].count = tries;
 203                         p[n].results = results;
 204                         return &(p[n]);
 205                 }
 206         }
 207
 208         /* should never get here */
 209         return NULL;
 210 }
 211
 212 /*
 213  * Resolve priority for a single result trie.
 214  */
 215 static inline void
 216 resolve_single_priority(uint64_t transition, int n,
 217         const struct rte_acl_ctx *ctx, struct parms *parms,
 218         const struct rte_acl_match_results *p)
 219 {
 220         if (parms[n].cmplt->count == ctx->num_tries ||
 221                         parms[n].cmplt->priority[0] <=
 222                         p[transition].priority[0]) {
 223
 224                 parms[n].cmplt->priority[0] = p[transition].priority[0];
 225                 parms[n].cmplt->results[0] = p[transition].results[0];
 226         }
 227
 228         parms[n].cmplt->count--;
 229 }
 230
 231 /*
 232  * Resolve priority for multiple results. This consists comparing
 233  * the priority of the current traversal with the running set of
 234  * results for the packet. For each result, keep a running array of
 235  * the result (rule number) and its priority for each category.
 236  */
 237 static inline void
 238 resolve_priority(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
 239         struct parms *parms, const struct rte_acl_match_results *p,
 240         uint32_t categories)
 241 {
 242         uint32_t x;
 243         xmm_t results, priority, results1, priority1, selector;
 244         xmm_t *saved_results, *saved_priority;
 245
 246         for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
 247
 248                 saved_results = (xmm_t *)(&parms[n].cmplt->results[x]);
 249                 saved_priority =
 250                         (xmm_t *)(&parms[n].cmplt->priority[x]);
 251
 252                 /* get results and priorities for completed trie */
 253                 results = MM_LOADU((const xmm_t *)&p[transition].results[x]);
 254                 priority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);
 255
 256                 /* if this is not the first completed trie */
 257                 if (parms[n].cmplt->count != ctx->num_tries) {
 258
 259                         /* get running best results and their priorities */
 260                         results1 = MM_LOADU(saved_results);
 261                         priority1 = MM_LOADU(saved_priority);
 262
 263                         /* select results that are highest priority */
 264                         selector = MM_CMPGT32(priority1, priority);
 265                         results = MM_BLENDV8(results, results1, selector);
 266                         priority = MM_BLENDV8(priority, priority1, selector);
 267                 }
 268
 269                 /* save running best results and their priorities */
 270                 MM_STOREU(saved_results, results);
 271                 MM_STOREU(saved_priority, priority);
 272         }
 273
 274         /* Count down completed tries for this search request */
 275         parms[n].cmplt->count--;
 276 }
 277
 278 /*
 279  * Routine to fill a slot in the parallel trie traversal array (parms) from
 280  * the list of packets (flows).
 281  */
 282 static inline uint64_t
 283 acl_start_next_trie(struct acl_flow_data *flows, struct parms *parms, int n,
 284         const struct rte_acl_ctx *ctx)
 285 {
 286         uint64_t transition;
 287
 288         /* if there are any more packets to process */
 289         if (flows->num_packets < flows->total_packets) {
 290                 parms[n].data = flows->data[flows->num_packets];
 291                 parms[n].data_index = ctx->trie[flows->trie].data_index;
 292
 293                 /* if this is the first trie for this packet */
 294                 if (flows->trie == 0) {
 295                         flows->last_cmplt = alloc_completion(flows->cmplt_array,
 296                                 flows->cmplt_size, ctx->num_tries,
 297                                 flows->results +
 298                                 flows->num_packets * flows->categories);
 299                 }
 300
 301                 /* set completion parameters and starting index for this slot */
 302                 parms[n].cmplt = flows->last_cmplt;
 303                 transition =
 304                         flows->trans[parms[n].data[*parms[n].data_index++] +
 305                         ctx->trie[flows->trie].root_index];
 306
 307                 /*
 308                  * if this is the last trie for this packet,
 309                  * then setup next packet.
 310                  */
 311                 flows->trie++;
 312                 if (flows->trie >= ctx->num_tries) {
 313                         flows->trie = 0;
 314                         flows->num_packets++;
 315                 }
 316
 317                 /* keep track of number of active trie traversals */
 318                 flows->started++;
 319
 320         /* no more tries to process, set slot to an idle position */
 321         } else {
 322                 transition = ctx->idle;
 323                 parms[n].data = (const uint8_t *)idle;
 324                 parms[n].data_index = idle;
 325         }
 326         return transition;
 327 }
 328
 329 /*
 330  * Detect matches. If a match node transition is found, then this trie
 331  * traversal is complete and fill the slot with the next trie
 332  * to be processed.
 333  */
 334 static inline uint64_t
 335 acl_match_check_transition(uint64_t transition, int slot,
 336         const struct rte_acl_ctx *ctx, struct parms *parms,
 337         struct acl_flow_data *flows)
 338 {
 339         const struct rte_acl_match_results *p;
 340
 341         p = (const struct rte_acl_match_results *)
 342                 (flows->trans + ctx->match_index);
 343
 344         if (transition & RTE_ACL_NODE_MATCH) {
 345
 346                 /* Remove flags from index and decrement active traversals */
 347                 transition &= RTE_ACL_NODE_INDEX;
 348                 flows->started--;
 349
 350                 /* Resolve priorities for this trie and running results */
 351                 if (flows->categories == 1)
 352                         resolve_single_priority(transition, slot, ctx,
 353                                 parms, p);
 354                 else
 355                         resolve_priority(transition, slot, ctx, parms, p,
 356                                 flows->categories);
 357
 358                 /* Fill the slot with the next trie or idle trie */
 359                 transition = acl_start_next_trie(flows, parms, slot, ctx);
 360
 361         } else if (transition == ctx->idle) {
 362                 /* reset indirection table for idle slots */
 363                 parms[slot].data_index = idle;
 364         }
 365
 366         return transition;
 367 }
 368
 369 /*
 370  * Extract transitions from an XMM register and check for any matches
 371  */
 372 static void
 373 acl_process_matches(xmm_t *indicies, int slot, const struct rte_acl_ctx *ctx,
 374         struct parms *parms, struct acl_flow_data *flows)
 375 {
 376         uint64_t transition1, transition2;
 377
 378         /* extract transition from low 64 bits. */
 379         transition1 = MM_CVT64(*indicies);
 380
 381         /* extract transition from high 64 bits. */
 382         *indicies = MM_SHUFFLE32(*indicies, SHUFFLE32_SWAP64);
 383         transition2 = MM_CVT64(*indicies);
 384
 385         transition1 = acl_match_check_transition(transition1, slot, ctx,
 386                 parms, flows);
 387         transition2 = acl_match_check_transition(transition2, slot + 1, ctx,
 388                 parms, flows);
 389
 390         /* update indicies with new transitions. */
 391         *indicies = MM_SET64(transition2, transition1);
 392 }
 393
 394 /*
 395  * Check for a match in 2 transitions (contained in SSE register)
 396  */
 397 static inline void
 398 acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
 399         struct acl_flow_data *flows, xmm_t *indicies, xmm_t match_mask)
 400 {
 401         xmm_t temp;
 402
 403         temp = MM_AND(match_mask, *indicies);
 404         while (!MM_TESTZ(temp, temp)) {
 405                 acl_process_matches(indicies, slot, ctx, parms, flows);
 406                 temp = MM_AND(match_mask, *indicies);
 407         }
 408 }
 409
 410 /*
 411  * Check for any match in 4 transitions (contained in 2 SSE registers)
 412  */
 413 static inline void
 414 acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
 415         struct acl_flow_data *flows, xmm_t *indicies1, xmm_t *indicies2,
 416         xmm_t match_mask)
 417 {
 418         xmm_t temp;
 419
 420         /* put low 32 bits of each transition into one register */
 421         temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
 422                 0x88);
 423         /* test for match node */
 424         temp = MM_AND(match_mask, temp);
 425
 426         while (!MM_TESTZ(temp, temp)) {
 427                 acl_process_matches(indicies1, slot, ctx, parms, flows);
 428                 acl_process_matches(indicies2, slot + 2, ctx, parms, flows);
 429
 430                 temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
 431                                         (__m128)*indicies2,
 432                                         0x88);
 433                 temp = MM_AND(match_mask, temp);
 434         }
 435 }
 436
 437 /*
 438  * Calculate the address of the next transition for
 439  * all types of nodes. Note that only DFA nodes and range
 440  * nodes actually transition to another node. Match
 441  * nodes don't move.
 442  */
 443 static inline xmm_t
 444 acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 445         xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
 446         xmm_t *indicies1, xmm_t *indicies2)
 447 {
 448         xmm_t addr, node_types, temp;
 449
 450         /*
 451          * Note that no transition is done for a match
 452          * node and therefore a stream freezes when
 453          * it reaches a match.
 454          */
 455
 456         /* Shuffle low 32 into temp and high 32 into indicies2 */
 457         temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
 458                 0x88);
 459         *indicies2 = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
 460                 (__m128)*indicies2, 0xdd);
 461
 462         /* Calc node type and node addr */
 463         node_types = MM_ANDNOT(index_mask, temp);
 464         addr = MM_AND(index_mask, temp);
 465
 466         /*
 467          * Calc addr for DFAs - addr = dfa_index + input_byte
 468          */
 469
 470         /* mask for DFA type (0) nodes */
 471         temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));
 472
 473         /* add input byte to DFA position */
 474         temp = MM_AND(temp, bytes);
 475         temp = MM_AND(temp, next_input);
 476         addr = MM_ADD32(addr, temp);
 477
 478         /*
 479          * Calc addr for Range nodes -> range_index + range(input)
 480          */
 481         node_types = MM_CMPEQ32(node_types, type_quad_range);
 482
 483         /*
 484          * Calculate number of range boundaries that are less than the
 485          * input value. Range boundaries for each node are in signed 8 bit,
 486          * ordered from -128 to 127 in the indicies2 register.
 487          * This is effectively a popcnt of bytes that are greater than the
 488          * input byte.
 489          */
 490
 491         /* shuffle input byte to all 4 positions of 32 bit value */
 492         temp = MM_SHUFFLE8(next_input, shuffle_input);
 493
 494         /* check ranges */
 495         temp = MM_CMPGT8(temp, *indicies2);
 496
 497         /* convert -1 to 1 (bytes greater than input byte */
 498         temp = MM_SIGN8(temp, temp);
 499
 500         /* horizontal add pairs of bytes into words */
 501         temp = MM_MADD8(temp, temp);
 502
 503         /* horizontal add pairs of words into dwords */
 504         temp = MM_MADD16(temp, ones_16);
 505
 506         /* mask to range type nodes */
 507         temp = MM_AND(temp, node_types);
 508
 509         /* add index into node position */
 510         return MM_ADD32(addr, temp);
 511 }
 512
 513 /*
 514  * Process 4 transitions (in 2 SIMD registers) in parallel
 515  */
 516 static inline xmm_t
 517 transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 518         xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
 519         const uint64_t *trans, xmm_t *indicies1, xmm_t *indicies2)
 520 {
 521         xmm_t addr;
 522         uint64_t trans0, trans2;
 523
 524          /* Calculate the address (array index) for all 4 transitions. */
 525
 526         addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
 527                 bytes, type_quad_range, indicies1, indicies2);
 528
 529          /* Gather 64 bit transitions and pack back into 2 registers. */
 530
 531         trans0 = trans[MM_CVT32(addr)];
 532
 533         /* get slot 2 */
 534
 535         /* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
 536         addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);
 537         trans2 = trans[MM_CVT32(addr)];
 538
 539         /* get slot 1 */
 540
 541         /* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
 542         addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
 543         *indicies1 = MM_SET64(trans[MM_CVT32(addr)], trans0);
 544
 545         /* get slot 3 */
 546
 547         /* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
 548         addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);
 549         *indicies2 = MM_SET64(trans[MM_CVT32(addr)], trans2);
 550
 551         return MM_SRL32(next_input, 8);
 552 }
 553
 554 static inline void
 555 acl_set_flow(struct acl_flow_data *flows, struct completion *cmplt,
 556         uint32_t cmplt_size, const uint8_t **data, uint32_t *results,
 557         uint32_t data_num, uint32_t categories, const uint64_t *trans)
 558 {
 559         flows->num_packets = 0;
 560         flows->started = 0;
 561         flows->trie = 0;
 562         flows->last_cmplt = NULL;
 563         flows->cmplt_array = cmplt;
 564         flows->total_packets = data_num;
 565         flows->categories = categories;
 566         flows->cmplt_size = cmplt_size;
 567         flows->data = data;
 568         flows->results = results;
 569         flows->trans = trans;
 570 }
 571
 572 /*
 573  * Execute trie traversal with 8 traversals in parallel
 574  */
 575 static inline void
 576 search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 577         uint32_t *results, uint32_t total_packets, uint32_t categories)
 578 {
 579         int n;
 580         struct acl_flow_data flows;
 581         uint64_t index_array[MAX_SEARCHES_SSE8];
 582         struct completion cmplt[MAX_SEARCHES_SSE8];
 583         struct parms parms[MAX_SEARCHES_SSE8];
 584         xmm_t input0, input1;
 585         xmm_t indicies1, indicies2, indicies3, indicies4;
 586
 587         acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 588                 total_packets, categories, ctx->trans_table);
 589
 590         for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
 591                 cmplt[n].count = 0;
 592                 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 593         }
 594
 595         /*
 596          * indicies1 contains index_array[0,1]
 597          * indicies2 contains index_array[2,3]
 598          * indicies3 contains index_array[4,5]
 599          * indicies4 contains index_array[6,7]
 600          */
 601
 602         indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
 603         indicies2 = MM_LOADU((xmm_t *) &index_array[2]);
 604
 605         indicies3 = MM_LOADU((xmm_t *) &index_array[4]);
 606         indicies4 = MM_LOADU((xmm_t *) &index_array[6]);
 607
 608          /* Check for any matches. */
 609         acl_match_check_x4(0, ctx, parms, &flows,
 610                 &indicies1, &indicies2, mm_match_mask.m);
 611         acl_match_check_x4(4, ctx, parms, &flows,
 612                 &indicies3, &indicies4, mm_match_mask.m);
 613
 614         while (flows.started > 0) {
 615
 616                 /* Gather 4 bytes of input data for each stream. */
 617                 input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
 618                         0);
 619                 input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
 620                         0);
 621
 622                 input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
 623                 input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);
 624
 625                 input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);
 626                 input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);
 627
 628                 input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);
 629                 input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);
 630
 631                  /* Process the 4 bytes of input on each stream. */
 632
 633                 input0 = transition4(mm_index_mask.m, input0,
 634                         mm_shuffle_input.m, mm_ones_16.m,
 635                         mm_bytes.m, mm_type_quad_range.m,
 636                         flows.trans, &indicies1, &indicies2);
 637
 638                 input1 = transition4(mm_index_mask.m, input1,
 639                         mm_shuffle_input.m, mm_ones_16.m,
 640                         mm_bytes.m, mm_type_quad_range.m,
 641                         flows.trans, &indicies3, &indicies4);
 642
 643                 input0 = transition4(mm_index_mask.m, input0,
 644                         mm_shuffle_input.m, mm_ones_16.m,
 645                         mm_bytes.m, mm_type_quad_range.m,
 646                         flows.trans, &indicies1, &indicies2);
 647
 648                 input1 = transition4(mm_index_mask.m, input1,
 649                         mm_shuffle_input.m, mm_ones_16.m,
 650                         mm_bytes.m, mm_type_quad_range.m,
 651                         flows.trans, &indicies3, &indicies4);
 652
 653                 input0 = transition4(mm_index_mask.m, input0,
 654                         mm_shuffle_input.m, mm_ones_16.m,
 655                         mm_bytes.m, mm_type_quad_range.m,
 656                         flows.trans, &indicies1, &indicies2);
 657
 658                 input1 = transition4(mm_index_mask.m, input1,
 659                         mm_shuffle_input.m, mm_ones_16.m,
 660                         mm_bytes.m, mm_type_quad_range.m,
 661                         flows.trans, &indicies3, &indicies4);
 662
 663                 input0 = transition4(mm_index_mask.m, input0,
 664                         mm_shuffle_input.m, mm_ones_16.m,
 665                         mm_bytes.m, mm_type_quad_range.m,
 666                         flows.trans, &indicies1, &indicies2);
 667
 668                 input1 = transition4(mm_index_mask.m, input1,
 669                         mm_shuffle_input.m, mm_ones_16.m,
 670                         mm_bytes.m, mm_type_quad_range.m,
 671                         flows.trans, &indicies3, &indicies4);
 672
 673                  /* Check for any matches. */
 674                 acl_match_check_x4(0, ctx, parms, &flows,
 675                         &indicies1, &indicies2, mm_match_mask.m);
 676                 acl_match_check_x4(4, ctx, parms, &flows,
 677                         &indicies3, &indicies4, mm_match_mask.m);
 678         }
 679 }
 680
 681 /*
 682  * Execute trie traversal with 4 traversals in parallel
 683  */
 684 static inline void
 685 search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 686          uint32_t *results, int total_packets, uint32_t categories)
 687 {
 688         int n;
 689         struct acl_flow_data flows;
 690         uint64_t index_array[MAX_SEARCHES_SSE4];
 691         struct completion cmplt[MAX_SEARCHES_SSE4];
 692         struct parms parms[MAX_SEARCHES_SSE4];
 693         xmm_t input, indicies1, indicies2;
 694
 695         acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 696                 total_packets, categories, ctx->trans_table);
 697
 698         for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
 699                 cmplt[n].count = 0;
 700                 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 701         }
 702
 703         indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
 704         indicies2 = MM_LOADU((xmm_t *) &index_array[2]);
 705
 706         /* Check for any matches. */
 707         acl_match_check_x4(0, ctx, parms, &flows,
 708                 &indicies1, &indicies2, mm_match_mask.m);
 709
 710         while (flows.started > 0) {
 711
 712                 /* Gather 4 bytes of input data for each stream. */
 713                 input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
 714                 input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
 715                 input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
 716                 input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
 717
 718                 /* Process the 4 bytes of input on each stream. */
 719                 input = transition4(mm_index_mask.m, input,
 720                         mm_shuffle_input.m, mm_ones_16.m,
 721                         mm_bytes.m, mm_type_quad_range.m,
 722                         flows.trans, &indicies1, &indicies2);
 723
 724                  input = transition4(mm_index_mask.m, input,
 725                         mm_shuffle_input.m, mm_ones_16.m,
 726                         mm_bytes.m, mm_type_quad_range.m,
 727                         flows.trans, &indicies1, &indicies2);
 728
 729                  input = transition4(mm_index_mask.m, input,
 730                         mm_shuffle_input.m, mm_ones_16.m,
 731                         mm_bytes.m, mm_type_quad_range.m,
 732                         flows.trans, &indicies1, &indicies2);
 733
 734                  input = transition4(mm_index_mask.m, input,
 735                         mm_shuffle_input.m, mm_ones_16.m,
 736                         mm_bytes.m, mm_type_quad_range.m,
 737                         flows.trans, &indicies1, &indicies2);
 738
 739                 /* Check for any matches. */
 740                 acl_match_check_x4(0, ctx, parms, &flows,
 741                         &indicies1, &indicies2, mm_match_mask.m);
 742         }
 743 }
 744
 745 static inline xmm_t
 746 transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
 747         xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
 748         const uint64_t *trans, xmm_t *indicies1)
 749 {
 750         uint64_t t;
 751         xmm_t addr, indicies2;
 752
 753         indicies2 = MM_XOR(ones_16, ones_16);
 754
 755         addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
 756                 bytes, type_quad_range, indicies1, &indicies2);
 757
 758         /* Gather 64 bit transitions and pack 2 per register. */
 759
 760         t = trans[MM_CVT32(addr)];
 761
 762         /* get slot 1 */
 763         addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
 764         *indicies1 = MM_SET64(trans[MM_CVT32(addr)], t);
 765
 766         return MM_SRL32(next_input, 8);
 767 }
 768
 769 /*
 770  * Execute trie traversal with 2 traversals in parallel.
 771  */
 772 static inline void
 773 search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 774         uint32_t *results, uint32_t total_packets, uint32_t categories)
 775 {
 776         int n;
 777         struct acl_flow_data flows;
 778         uint64_t index_array[MAX_SEARCHES_SSE2];
 779         struct completion cmplt[MAX_SEARCHES_SSE2];
 780         struct parms parms[MAX_SEARCHES_SSE2];
 781         xmm_t input, indicies;
 782
 783         acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 784                 total_packets, categories, ctx->trans_table);
 785
 786         for (n = 0; n < MAX_SEARCHES_SSE2; n++) {
 787                 cmplt[n].count = 0;
 788                 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 789         }
 790
 791         indicies = MM_LOADU((xmm_t *) &index_array[0]);
 792
 793         /* Check for any matches. */
 794         acl_match_check_x2(0, ctx, parms, &flows, &indicies, mm_match_mask64.m);
 795
 796         while (flows.started > 0) {
 797
 798                 /* Gather 4 bytes of input data for each stream. */
 799                 input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
 800                 input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
 801
 802                 /* Process the 4 bytes of input on each stream. */
 803
 804                 input = transition2(mm_index_mask64.m, input,
 805                         mm_shuffle_input64.m, mm_ones_16.m,
 806                         mm_bytes64.m, mm_type_quad_range64.m,
 807                         flows.trans, &indicies);
 808
 809                 input = transition2(mm_index_mask64.m, input,
 810                         mm_shuffle_input64.m, mm_ones_16.m,
 811                         mm_bytes64.m, mm_type_quad_range64.m,
 812                         flows.trans, &indicies);
 813
 814                 input = transition2(mm_index_mask64.m, input,
 815                         mm_shuffle_input64.m, mm_ones_16.m,
 816                         mm_bytes64.m, mm_type_quad_range64.m,
 817                         flows.trans, &indicies);
 818
 819                 input = transition2(mm_index_mask64.m, input,
 820                         mm_shuffle_input64.m, mm_ones_16.m,
 821                         mm_bytes64.m, mm_type_quad_range64.m,
 822                         flows.trans, &indicies);
 823
 824                 /* Check for any matches. */
 825                 acl_match_check_x2(0, ctx, parms, &flows, &indicies,
 826                         mm_match_mask64.m);
 827         }
 828 }
 829
/*
 * When processing a transition (see scalar_transition() below), rather
 * than using an if/else construct, the offset is calculated for both DFA
 * and QRANGE nodes and then conditionally added to the address based on
 * the node type. This is done to avoid branch mispredictions. Since the
 * offset is a rather simple calculation, it is more efficient to do the
 * calculation and a conditional move than to take a conditional branch
 * to determine which calculation to do.
 */
 839 static inline uint32_t
 840 scan_forward(uint32_t input, uint32_t max)
 841 {
 842         return (input == 0) ? max : rte_bsf32(input);
 843 }
 844
 845 static inline uint64_t
 846 scalar_transition(const uint64_t *trans_table, uint64_t transition,
 847         uint8_t input)
 848 {
 849         uint32_t addr, index, ranges, x, a, b, c;
 850
 851         /* break transition into component parts */
 852         ranges = transition >> (sizeof(index) * CHAR_BIT);
 853
 854         /* calc address for a QRANGE node */
 855         c = input * SCALAR_QRANGE_MULT;
 856         a = ranges | SCALAR_QRANGE_MIN;
 857         index = transition & ~RTE_ACL_NODE_INDEX;
 858         a -= (c & SCALAR_QRANGE_MASK);
 859         b = c & SCALAR_QRANGE_MIN;
 860         addr = transition ^ index;
 861         a &= SCALAR_QRANGE_MIN;
 862         a ^= (ranges ^ b) & (a ^ b);
 863         x = scan_forward(a, 32) >> 3;
 864         addr += (index == RTE_ACL_NODE_DFA) ? input : x;
 865
 866         /* pickup next transition */
 867         transition = *(trans_table + addr);
 868         return transition;
 869 }
 870
 871 int
 872 rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data,
 873         uint32_t *results, uint32_t num, uint32_t categories)
 874 {
 875         int n;
 876         uint64_t transition0, transition1;
 877         uint32_t input0, input1;
 878         struct acl_flow_data flows;
 879         uint64_t index_array[MAX_SEARCHES_SCALAR];
 880         struct completion cmplt[MAX_SEARCHES_SCALAR];
 881         struct parms parms[MAX_SEARCHES_SCALAR];
 882
 883         if (categories != 1 &&
 884                 ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
 885                 return -EINVAL;
 886
 887         acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, num,
 888                 categories, ctx->trans_table);
 889
 890         for (n = 0; n < MAX_SEARCHES_SCALAR; n++) {
 891                 cmplt[n].count = 0;
 892                 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 893         }
 894
 895         transition0 = index_array[0];
 896         transition1 = index_array[1];
 897
 898         while (flows.started > 0) {
 899
 900                 input0 = GET_NEXT_4BYTES(parms, 0);
 901                 input1 = GET_NEXT_4BYTES(parms, 1);
 902
 903                 for (n = 0; n < 4; n++) {
 904                         if (likely((transition0 & RTE_ACL_NODE_MATCH) == 0))
 905                                 transition0 = scalar_transition(flows.trans,
 906                                         transition0, (uint8_t)input0);
 907
 908                         input0 >>= CHAR_BIT;
 909
 910                         if (likely((transition1 & RTE_ACL_NODE_MATCH) == 0))
 911                                 transition1 = scalar_transition(flows.trans,
 912                                         transition1, (uint8_t)input1);
 913
 914                         input1 >>= CHAR_BIT;
 915
 916                 }
 917                 if ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
 918                         transition0 = acl_match_check_transition(transition0,
 919                                 0, ctx, parms, &flows);
 920                         transition1 = acl_match_check_transition(transition1,
 921                                 1, ctx, parms, &flows);
 922
 923                 }
 924         }
 925         return 0;
 926 }
 927
 928 int
 929 rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data,
 930         uint32_t *results, uint32_t num, uint32_t categories)
 931 {
 932         if (categories != 1 &&
 933                 ((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
 934                 return -EINVAL;
 935
 936         if (likely(num >= MAX_SEARCHES_SSE8))
 937                 search_sse_8(ctx, data, results, num, categories);
 938         else if (num >= MAX_SEARCHES_SSE4)
 939                 search_sse_4(ctx, data, results, num, categories);
 940         else
 941                 search_sse_2(ctx, data, results, num, categories);
 942
 943         return 0;
 944 }