/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
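
/*
 * Added note: this file depends on the ACL run-time definitions (struct
 * parms, struct acl_flow_data, struct completion, acl_set_flow(),
 * acl_start_next_trie(), acl_match_check(), GET_NEXT_4BYTES, the
 * MAX_SEARCHES_SSE* sizes and the MM_*()/xmm_t SIMD wrappers).  The exact
 * header split differs between DPDK releases, so the includes below are an
 * assumption, not part of the original fragment.
 */
#include <rte_acl.h>
#include "acl_run.h"
#include "acl_vect.h"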
enum {
	SHUFFLE32_SLOT1 = 0xe5,
	SHUFFLE32_SLOT2 = 0xe6,
	SHUFFLE32_SLOT3 = 0xe7,
	SHUFFLE32_SWAP64 = 0x4e,
};
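
/*
 * Added note: the immediates above are _mm_shuffle_epi32()-style selectors
 * (two bits per destination dword).  0xe5/0xe6/0xe7 copy 32-bit slot 1/2/3
 * into dword 0 while leaving the remaining dwords as {x1, x2, x3}, and
 * 0x4e (binary 01 00 11 10) swaps the two 64-bit halves of the register so
 * the upper transition can be read with a 64-bit move.
 */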
static const rte_xmm_t mm_type_quad_range = {
	.u32 = {RTE_ACL_NODE_QRANGE, RTE_ACL_NODE_QRANGE,
		RTE_ACL_NODE_QRANGE, RTE_ACL_NODE_QRANGE},
};

static const rte_xmm_t mm_type_quad_range64 = {
	.u32 = {RTE_ACL_NODE_QRANGE, RTE_ACL_NODE_QRANGE, 0, 0},
};

static const rte_xmm_t mm_shuffle_input = {
	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
};

static const rte_xmm_t mm_shuffle_input64 = {
	.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},
};

static const rte_xmm_t mm_ones_16 = {
	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
};

static const rte_xmm_t mm_bytes = {
	.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
};

static const rte_xmm_t mm_bytes64 = {
	.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
};

static const rte_xmm_t mm_match_mask = {
	.u32 = {RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH,
		RTE_ACL_NODE_MATCH, RTE_ACL_NODE_MATCH},
};

static const rte_xmm_t mm_match_mask64 = {
	.u32 = {RTE_ACL_NODE_MATCH, 0, RTE_ACL_NODE_MATCH, 0},
};

static const rte_xmm_t mm_index_mask = {
	.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX},
};

static const rte_xmm_t mm_index_mask64 = {
	.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX, 0, 0},
};
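
/*
 * Added note: rte_xmm_t is a union over one SSE register, so each constant
 * can be written lane-by-lane (.u32/.u16) and read back as a whole register
 * through the .m member used below.  The *_mask constants select the
 * node-index and match-node type bits held in the low 32 bits of each
 * packed transition; the *64 variants only populate the lanes used by the
 * 2-wide code path.
 */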
/*
 * Resolve priority for multiple results (sse version).
 * This consists of comparing the priority of the current traversal with the
 * running set of results for the packet.
 * For each result, keep a running array of the result (rule number) and
 * its priority for each category.
 */
static inline void
resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
	struct parms *parms, const struct rte_acl_match_results *p,
	uint32_t categories)
{
	uint32_t x;
	xmm_t results, priority, results1, priority1, selector;
	xmm_t *saved_results, *saved_priority;

	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {

		saved_results = (xmm_t *)(&parms[n].cmplt->results[x]);
		saved_priority =
			(xmm_t *)(&parms[n].cmplt->priority[x]);

		/* get results and priorities for completed trie */
		results = MM_LOADU((const xmm_t *)&p[transition].results[x]);
		priority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);

		/* if this is not the first completed trie */
		if (parms[n].cmplt->count != ctx->num_tries) {

			/* get running best results and their priorities */
			results1 = MM_LOADU(saved_results);
			priority1 = MM_LOADU(saved_priority);

			/* select results that are highest priority */
			selector = MM_CMPGT32(priority1, priority);
			results = MM_BLENDV8(results, results1, selector);
			priority = MM_BLENDV8(priority, priority1, selector);
		}

		/* save running best results and their priorities */
		MM_STOREU(saved_results, results);
		MM_STOREU(saved_priority, priority);
	}
}
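
/*
 * Added note: MM_CMPGT32() (presumably _mm_cmpgt_epi32) yields all-ones in
 * every 32-bit lane where the previously saved priority is strictly greater
 * than the new trie's priority; MM_BLENDV8 then keeps the saved result and
 * priority for those lanes and takes the new values everywhere else, so the
 * running arrays always hold the highest-priority rule per category.
 */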
/*
 * Extract transitions from an XMM register and check for any matches
 */
static void
acl_process_matches(xmm_t *indicies, int slot, const struct rte_acl_ctx *ctx,
	struct parms *parms, struct acl_flow_data *flows)
{
	uint64_t transition1, transition2;

	/* extract transition from low 64 bits. */
	transition1 = MM_CVT64(*indicies);

	/* extract transition from high 64 bits. */
	*indicies = MM_SHUFFLE32(*indicies, SHUFFLE32_SWAP64);
	transition2 = MM_CVT64(*indicies);

	transition1 = acl_match_check(transition1, slot, ctx,
		parms, flows, resolve_priority_sse);
	transition2 = acl_match_check(transition2, slot + 1, ctx,
		parms, flows, resolve_priority_sse);

	/* update indicies with new transitions. */
	*indicies = MM_SET64(transition2, transition1);
}
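
/*
 * Added note: MM_CVT64() (presumably _mm_cvtsi128_si64) only reads the low
 * 64 bits of the register, which is why the SHUFFLE32_SWAP64 shuffle above
 * is needed to bring the second transition down before it can be extracted;
 * MM_SET64(hi, lo) then packs the two checked transitions back in their
 * original order.
 */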
/*
 * Check for a match in 2 transitions (contained in SSE register)
 */
static inline void
acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
	struct acl_flow_data *flows, xmm_t *indicies, xmm_t match_mask)
{
	xmm_t temp;

	temp = MM_AND(match_mask, *indicies);
	while (!MM_TESTZ(temp, temp)) {
		acl_process_matches(indicies, slot, ctx, parms, flows);
		temp = MM_AND(match_mask, *indicies);
	}
}
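
/*
 * Added note: the loop re-tests after every acl_process_matches() call
 * because acl_match_check() replaces a matched lane with the start
 * transition of that flow's next trie (or an idle transition), while the
 * other lane may still be sitting on a match node.
 */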
/*
 * Check for any match in 4 transitions (contained in 2 SSE registers)
 */
static inline void
acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
	struct acl_flow_data *flows, xmm_t *indicies1, xmm_t *indicies2,
	xmm_t match_mask)
{
	xmm_t temp;

	/* put low 32 bits of each transition into one register */
	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
		0x88);
	/* test for match node */
	temp = MM_AND(match_mask, temp);

	while (!MM_TESTZ(temp, temp)) {
		acl_process_matches(indicies1, slot, ctx, parms, flows);
		acl_process_matches(indicies2, slot + 2, ctx, parms, flows);

		temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
					(__m128)*indicies2,
					0x88);
		temp = MM_AND(match_mask, temp);
	}
}
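
/*
 * Added note: the 0x88 selector for MM_SHUFFLEPS picks dwords 0 and 2 of
 * each source register, i.e. the low 32 bits of all four 64-bit transitions,
 * which is where the match-node type bit lives; a single MM_TESTZ over the
 * packed dwords then checks all four lanes at once.
 */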
/*
 * Calculate the address of the next transition for
 * all types of nodes. Note that only DFA nodes and range
 * nodes actually transition to another node. Match
 * nodes don't move.
 */
static inline xmm_t
acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
	xmm_t *indicies1, xmm_t *indicies2)
{
	xmm_t addr, node_types, temp;

	/*
	 * Note that no transition is done for a match
	 * node and therefore a stream freezes when
	 * it reaches a match.
	 */

	/* Shuffle low 32 into temp and high 32 into indicies2 */
	temp = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1, (__m128)*indicies2,
		0x88);
	*indicies2 = (xmm_t)MM_SHUFFLEPS((__m128)*indicies1,
		(__m128)*indicies2, 0xdd);

	/* Calc node type and node addr */
	node_types = MM_ANDNOT(index_mask, temp);
	addr = MM_AND(index_mask, temp);

	/*
	 * Calc addr for DFAs - addr = dfa_index + input_byte
	 */

	/* mask for DFA type (0) nodes */
	temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));

	/* add input byte to DFA position */
	temp = MM_AND(temp, bytes);
	temp = MM_AND(temp, next_input);
	addr = MM_ADD32(addr, temp);

	/*
	 * Calc addr for Range nodes -> range_index + range(input)
	 */
	node_types = MM_CMPEQ32(node_types, type_quad_range);

	/*
	 * Calculate number of range boundaries that are less than the
	 * input value. Range boundaries for each node are in signed 8 bit,
	 * ordered from -128 to 127 in the indicies2 register.
	 * This is effectively a popcnt of bytes that are greater than the
	 * input byte.
	 */

	/* shuffle input byte to all 4 positions of 32 bit value */
	temp = MM_SHUFFLE8(next_input, shuffle_input);

	/* check ranges */
	temp = MM_CMPGT8(temp, *indicies2);

	/* convert -1 to 1 (bytes greater than input byte) */
	temp = MM_SIGN8(temp, temp);

	/* horizontal add pairs of bytes into words */
	temp = MM_MADD8(temp, temp);

	/* horizontal add pairs of words into dwords */
	temp = MM_MADD16(temp, ones_16);

	/* mask to range type nodes */
	temp = MM_AND(temp, node_types);

	/* add index into node position */
	return MM_ADD32(addr, temp);
}
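
/*
 * Added worked example (illustrative values only): for a QRANGE node whose
 * packed boundaries in indicies2 are {-128, 10, 50, 127} (signed, ascending)
 * and an input byte of 60, MM_CMPGT8 sets 0xff for the three boundaries
 * below 60, MM_SIGN8 turns each 0xff into 1, and the two multiply-add steps
 * sum them per 32-bit lane, so 3 is added to the node's base address and
 * the fourth range's transition is selected.
 */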
/*
 * Process 4 transitions (in 2 SIMD registers) in parallel
 */
static inline xmm_t
transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
	const uint64_t *trans, xmm_t *indicies1, xmm_t *indicies2)
{
	xmm_t addr;
	uint64_t trans0, trans2;

	/* Calculate the address (array index) for all 4 transitions. */
	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
		bytes, type_quad_range, indicies1, indicies2);

	/* Gather 64 bit transitions and pack back into 2 registers. */
	trans0 = trans[MM_CVT32(addr)];

	/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);
	trans2 = trans[MM_CVT32(addr)];

	/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
	*indicies1 = MM_SET64(trans[MM_CVT32(addr)], trans0);

	/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);
	*indicies2 = MM_SET64(trans[MM_CVT32(addr)], trans2);

	return MM_SRL32(next_input, 8);
}
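
/*
 * Added note: MM_CVT32() always reads dword 0, so the SHUFFLE32_SLOT*
 * shuffles rotate slots 2, 1 and 3 of addr into dword 0 in turn; the two
 * MM_SET64() calls then repack the fetched 64-bit transitions as pairs
 * {0,1} and {2,3}.  The MM_SRL32 on the way out discards the byte just
 * consumed so the next call sees the following input byte of each stream.
 */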
/*
 * Execute trie traversal with 8 traversals in parallel
 */
static inline int
search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_SSE8];
	struct completion cmplt[MAX_SEARCHES_SSE8];
	struct parms parms[MAX_SEARCHES_SSE8];
	xmm_t input0, input1;
	xmm_t indicies1, indicies2, indicies3, indicies4;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	/*
	 * indicies1 contains index_array[0,1]
	 * indicies2 contains index_array[2,3]
	 * indicies3 contains index_array[4,5]
	 * indicies4 contains index_array[6,7]
	 */

	indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
	indicies2 = MM_LOADU((xmm_t *) &index_array[2]);

	indicies3 = MM_LOADU((xmm_t *) &index_array[4]);
	indicies4 = MM_LOADU((xmm_t *) &index_array[6]);

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows,
		&indicies1, &indicies2, mm_match_mask.m);
	acl_match_check_x4(4, ctx, parms, &flows,
		&indicies3, &indicies4, mm_match_mask.m);

	while (flows.started > 0) {

		/* Gather 4 bytes of input data for each stream. */
		input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
			0);
		input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
			0);

		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);

		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);
		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);

		input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);
		input1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);

		/* Process the 4 bytes of input on each stream. */

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		input0 = transition4(mm_index_mask.m, input0,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input1 = transition4(mm_index_mask.m, input1,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies3, &indicies4);

		/* Check for any matches. */
		acl_match_check_x4(0, ctx, parms, &flows,
			&indicies1, &indicies2, mm_match_mask.m);
		acl_match_check_x4(4, ctx, parms, &flows,
			&indicies3, &indicies4, mm_match_mask.m);
	}

	return 0;
}
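
/*
 * Added note: each loop iteration gathers exactly 4 input bytes per stream
 * and then runs transition4() four times on each half (streams 0-3 and
 * 4-7), consuming one byte per call, before re-checking all eight lanes
 * for match nodes.
 */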
/*
 * Execute trie traversal with 4 traversals in parallel
 */
static inline int
search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_SSE4];
	struct completion cmplt[MAX_SEARCHES_SSE4];
	struct parms parms[MAX_SEARCHES_SSE4];
	xmm_t input, indicies1, indicies2;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	indicies1 = MM_LOADU((xmm_t *) &index_array[0]);
	indicies2 = MM_LOADU((xmm_t *) &index_array[2]);

	/* Check for any matches. */
	acl_match_check_x4(0, ctx, parms, &flows,
		&indicies1, &indicies2, mm_match_mask.m);

	while (flows.started > 0) {

		/* Gather 4 bytes of input data for each stream. */
		input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);

		/* Process the 4 bytes of input on each stream. */
		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		input = transition4(mm_index_mask.m, input,
			mm_shuffle_input.m, mm_ones_16.m,
			mm_bytes.m, mm_type_quad_range.m,
			flows.trans, &indicies1, &indicies2);

		/* Check for any matches. */
		acl_match_check_x4(0, ctx, parms, &flows,
			&indicies1, &indicies2, mm_match_mask.m);
	}

	return 0;
}
static inline xmm_t
transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
	xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
	const uint64_t *trans, xmm_t *indicies1)
{
	uint64_t t;
	xmm_t addr, indicies2;

	indicies2 = MM_XOR(ones_16, ones_16);

	addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
		bytes, type_quad_range, indicies1, &indicies2);

	/* Gather 64 bit transitions and pack 2 per register. */
	t = trans[MM_CVT32(addr)];

	addr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);
	*indicies1 = MM_SET64(trans[MM_CVT32(addr)], t);

	return MM_SRL32(next_input, 8);
}
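
/*
 * Added note: transition2() zeroes indicies2 before calling acl_calc_addr(),
 * so only the two 64-bit transitions in *indicies1 are live; the caller
 * therefore passes the *64 variants of the constant masks, whose upper
 * lanes are zero.
 */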
/*
 * Execute trie traversal with 2 traversals in parallel.
 */
static inline int
search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t total_packets, uint32_t categories)
{
	int n;
	struct acl_flow_data flows;
	uint64_t index_array[MAX_SEARCHES_SSE2];
	struct completion cmplt[MAX_SEARCHES_SSE2];
	struct parms parms[MAX_SEARCHES_SSE2];
	xmm_t input, indicies;

	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
		total_packets, categories, ctx->trans_table);

	for (n = 0; n < MAX_SEARCHES_SSE2; n++) {
		cmplt[n].count = 0;
		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
	}

	indicies = MM_LOADU((xmm_t *) &index_array[0]);

	/* Check for any matches. */
	acl_match_check_x2(0, ctx, parms, &flows, &indicies, mm_match_mask64.m);

	while (flows.started > 0) {

		/* Gather 4 bytes of input data for each stream. */
		input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
		input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);

		/* Process the 4 bytes of input on each stream. */

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		input = transition2(mm_index_mask64.m, input,
			mm_shuffle_input64.m, mm_ones_16.m,
			mm_bytes64.m, mm_type_quad_range64.m,
			flows.trans, &indicies);

		/* Check for any matches. */
		acl_match_check_x2(0, ctx, parms, &flows, &indicies,
			mm_match_mask64.m);
	}

	return 0;
}
int
rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
	uint32_t *results, uint32_t num, uint32_t categories)
{
	if (categories != 1 &&
		((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
		return -EINVAL;

	if (likely(num >= MAX_SEARCHES_SSE8))
		return search_sse_8(ctx, data, results, num, categories);
	else if (num >= MAX_SEARCHES_SSE4)
		return search_sse_4(ctx, data, results, num, categories);
	else
		return search_sse_2(ctx, data, results, num, categories);
}
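
/*
 * Added usage sketch (assumes an already built rte_acl_ctx; BURST and the
 * buffer names are illustrative, and applications normally go through the
 * generic rte_acl_classify() entry point rather than calling this directly):
 *
 *	const uint8_t *bufs[BURST];
 *	uint32_t res[BURST];
 *	// fill bufs[] with one flattened search key per slot
 *	int ret = rte_acl_classify_sse(ctx, bufs, res, BURST, 1);
 *	// on success res[i] holds the matching rule's userdata, 0 if no match
 *
 * categories must be 1 or a multiple of RTE_ACL_RESULTS_MULTIPLIER,
 * otherwise -EINVAL is returned.
 */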