#define RTE_ACL_DFA_MAX UINT8_MAX
#define RTE_ACL_DFA_SIZE (UINT8_MAX + 1)
+#define RTE_ACL_DFA_GR64_SIZE 64
+#define RTE_ACL_DFA_GR64_NUM (RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE)
+#define RTE_ACL_DFA_GR64_BIT \
+ (CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)
+
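+/*
+ * A full DFA node holds RTE_ACL_DFA_SIZE (256) transitions. To shrink it,
+ * those transitions are viewed as RTE_ACL_DFA_GR64_NUM (4) groups of
+ * RTE_ACL_DFA_GR64_SIZE (64) entries each, and byte-identical groups are
+ * stored only once. RTE_ACL_DFA_GR64_BIT (8) is the number of bits per
+ * group available in the upper 32 bits of a transition word to record
+ * each group's placement within the compressed node.
+ */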
typedef int bits_t;
#define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))
/* number of ranges (transitions w/ consecutive bits) */
int32_t id;
struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
- char transitions[RTE_ACL_QUAD_SIZE];
- /* boundaries for ranged node */
+ union {
+ char transitions[RTE_ACL_QUAD_SIZE];
+ /* boundaries for ranged node */
+ uint8_t dfa_gr64[RTE_ACL_DFA_GR64_NUM];
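+ /* DFA: stored-copy index for each group of 64 transitions */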
+ };
struct rte_acl_node *next;
/* free list link or pointer to duplicate node during merge */
struct rte_acl_node *prev;
} while (0)
struct acl_node_counters {
- int match;
- int match_used;
- int single;
- int quad;
- int quad_vectors;
- int dfa;
- int smallest_match;
+ int32_t match;
+ int32_t match_used;
+ int32_t single;
+ int32_t quad;
+ int32_t quad_vectors;
+ int32_t dfa;
+ int32_t dfa_gr64;
+ int32_t smallest_match;
};
struct rte_acl_indices {
static void
acl_gen_log_stats(const struct rte_acl_ctx *ctx,
- const struct acl_node_counters *counts)
+ const struct acl_node_counters *counts,
+ const struct rte_acl_indices *indices)
{
RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
"runtime memory footprint on socket %d:\n"
"single nodes/bytes used: %d/%zu\n"
- "quad nodes/bytes used: %d/%zu\n"
- "DFA nodes/bytes used: %d/%zu\n"
+ "quad nodes/vectors/bytes used: %d/%d/%zu\n"
+ "DFA nodes/group64/bytes used: %d/%d/%zu\n"
"match nodes/bytes used: %d/%zu\n"
"total: %zu bytes\n",
ctx->name, ctx->socket_id,
counts->single, counts->single * sizeof(uint64_t),
- counts->quad, counts->quad_vectors * sizeof(uint64_t),
- counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t),
+ counts->quad, counts->quad_vectors,
+ (indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
+ counts->dfa, counts->dfa_gr64,
+ indices->dfa_index * sizeof(uint64_t),
counts->match,
counts->match * sizeof(struct rte_acl_match_results),
ctx->mem_sz);
}
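+
+/*
+ * Build the node_index for a compressed DFA node.
+ * Low 32 bits: offset of the node in the transition table plus node type.
+ * Byte i of the upper 32 bits: (i - dfa_gr64[i]) * 64, i.e. how far input
+ * bytes from group i have to be shifted down to hit the stored copy of
+ * that group. For example, with dfa_gr64 = {0, 1, 1, 2} group 2 reuses the
+ * copy stored for group 1, its byte holds 64, and at run-time inputs
+ * 128-191 become offsets 64-127 within the node.
+ */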
+static uint64_t
+acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
+{
+ uint64_t idx;
+ uint32_t i;
+
+ idx = 0;
+ for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+ RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
+ RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
+ idx |= (i - node->dfa_gr64[i]) <<
+ (6 + RTE_ACL_DFA_GR64_BIT * i);
+ }
+
+ return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
+}
+
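+/*
+ * Copy a fully expanded 256-entry transition array (src) into the
+ * compressed layout (dst): logical group i is written to slot dfa_gr64[i],
+ * so duplicate groups share a single 64-entry copy.
+ */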
+static void
+acl_dfa_fill_gr64(const struct rte_acl_node *node,
+ const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
+{
+ uint32_t i;
+
+ for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+ memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
+ src + i * RTE_ACL_DFA_GR64_SIZE,
+ RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));
+ }
+}
+
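+/*
+ * Assign each of the four 64-entry groups of an expanded DFA node a slot:
+ * a group that is byte-identical to an earlier one reuses that group's
+ * slot, otherwise it gets the next free slot. Returns the number of
+ * distinct groups (the node's fanout).
+ */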
+static uint32_t
+acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],
+ uint8_t gr64[RTE_ACL_DFA_GR64_NUM])
+{
+ uint32_t i, j, k;
+
+ k = 0;
+ for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {
+ gr64[i] = i;
+ for (j = 0; j != i; j++) {
+ if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,
+ array_ptr + j * RTE_ACL_DFA_GR64_SIZE,
+ RTE_ACL_DFA_GR64_SIZE *
+ sizeof(array_ptr[0])) == 0)
+ break;
+ }
+ gr64[i] = (j != i) ? gr64[j] : k++;
+ }
+
+ return k;
+}
+
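+/*
+ * Expand a node's transition bitsets into a full 256-entry DFA array.
+ * Inputs with no transition are set to no_match; the rest are set to the
+ * child's node_index when resolved is non-zero, or to the pointer slot
+ * index otherwise. Returns the number of ranges (runs of consecutive set
+ * bits) found.
+ */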
+static uint32_t
+acl_node_fill_dfa(const struct rte_acl_node *node,
+ uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)
+{
+ uint32_t n, x;
+ uint32_t ranges, last_bit;
+ struct rte_acl_node *child;
+ struct rte_acl_bitset *bits;
+
+ ranges = 0;
+ last_bit = 0;
+
+ for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+ dfa[n] = no_match;
+
+ for (x = 0; x < node->num_ptrs; x++) {
+
+ child = node->ptrs[x].ptr;
+ if (child == NULL)
+ continue;
+
+ bits = &node->ptrs[x].values;
+ for (n = 0; n < RTE_ACL_DFA_SIZE; n++) {
+
+ if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
+ (1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
+
+ dfa[n] = resolved ? child->node_index : x;
+ ranges += (last_bit == 0);
+ last_bit = 1;
+ } else {
+ last_bit = 0;
+ }
+ }
+ }
+
+ return ranges;
+}
+
/*
* Counts the number of groups of sequential bits that are
 * either 0 or 1, as specified by the zero_one parameter. This is used to
 * determine whether a node's transitions will fit in a quad-range node.
*/
static int
acl_count_trie_types(struct acl_node_counters *counts,
- struct rte_acl_node *node, int match, int force_dfa)
+ struct rte_acl_node *node, uint64_t no_match, int match, int force_dfa)
{
uint32_t n;
int num_ptrs;
+ uint64_t dfa[RTE_ACL_DFA_SIZE];
/* skip if this node has been counted */
if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
} else {
counts->dfa++;
node->node_type = RTE_ACL_NODE_DFA;
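+ /*
+ * fanout = number of distinct 64-entry groups in this node;
+ * counts->dfa_gr64 accumulates them to size the run-time table.
+ */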
+ if (force_dfa != 0) {
+ /* always expand to the full set of 64-entry groups. */
+ for (n = 0; n != RTE_DIM(node->dfa_gr64); n++)
+ node->dfa_gr64[n] = n;
+ node->fanout = n;
+ } else {
+ acl_node_fill_dfa(node, dfa, no_match, 0);
+ node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);
+ }
+ counts->dfa_gr64 += node->fanout;
}
/*
for (n = 0; n < node->num_ptrs; n++) {
if (node->ptrs[n].ptr != NULL)
match = acl_count_trie_types(counts, node->ptrs[n].ptr,
- match, 0);
+ no_match, match, 0);
}
return match;
acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
int resolved)
{
- uint32_t n, x;
- int m, ranges, last_bit;
- struct rte_acl_node *child;
- struct rte_acl_bitset *bits;
+ uint32_t x;
+ int32_t m;
uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
- ranges = 0;
- last_bit = 0;
-
- for (n = 0; n < RTE_DIM(dfa); n++)
- dfa[n] = no_match;
-
- for (x = 0; x < node->num_ptrs; x++) {
-
- child = node->ptrs[x].ptr;
- if (child == NULL)
- continue;
-
- bits = &node->ptrs[x].values;
- for (n = 0; n < RTE_DIM(dfa); n++) {
-
- if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
- (1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
-
- dfa[n] = resolved ? child->node_index : x;
- ranges += (last_bit == 0);
- last_bit = 1;
- } else {
- last_bit = 0;
- }
- }
- }
+ acl_node_fill_dfa(node, dfa, no_match, resolved);
/*
* Rather than going from 0 to 256, the range count and
RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
- for (n = 0; n < RTE_DIM(dfa); n++)
- node_array[n] = dfa[n];
+ acl_dfa_fill_gr64(node, dfa, node_array);
}
}
acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
uint64_t no_match, struct rte_acl_indices *index, int num_categories)
{
- uint32_t n, *qtrp;
+ uint32_t n, sz, *qtrp;
uint64_t *array_ptr;
struct rte_acl_match_results *match;
switch (node->node_type) {
case RTE_ACL_NODE_DFA:
- node->node_index = index->dfa_index | node->node_type;
array_ptr = &node_array[index->dfa_index];
- index->dfa_index += RTE_ACL_DFA_SIZE;
- for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+ node->node_index = acl_dfa_gen_idx(node, index->dfa_index);
+ sz = node->fanout * RTE_ACL_DFA_GR64_SIZE;
+ index->dfa_index += sz;
+ for (n = 0; n < sz; n++)
array_ptr[n] = no_match;
break;
case RTE_ACL_NODE_SINGLE:
break;
case RTE_ACL_NODE_QRANGE:
array_ptr = &node_array[index->quad_index];
- acl_add_ptrs(node, array_ptr, no_match, 0);
+ acl_add_ptrs(node, array_ptr, no_match, 0);
qtrp = (uint32_t *)node->transitions;
node->node_index = qtrp[0];
node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
acl_calc_counts_indices(struct acl_node_counters *counts,
struct rte_acl_indices *indices, struct rte_acl_trie *trie,
struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
- int match_num)
+ int match_num, uint64_t no_match)
{
uint32_t n;
for (n = 0; n < num_tries; n++) {
counts->smallest_match = INT32_MAX;
match_num = acl_count_trie_types(counts, node_bld_trie[n].trie,
- match_num, 1);
+ no_match, match_num, 1);
trie[n].smallest = counts->smallest_match;
}
indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
indices->quad_index = indices->dfa_index +
- counts->dfa * RTE_ACL_DFA_SIZE;
+ counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;
indices->single_index = indices->quad_index + counts->quad_vectors;
indices->match_index = indices->single_index + counts->single + 1;
indices->match_index = RTE_ALIGN(indices->match_index,
struct acl_node_counters counts;
struct rte_acl_indices indices;
+ no_match = RTE_ACL_NODE_MATCH;
+
/* Fill counts and indices arrays from the nodes. */
match_num = acl_calc_counts_indices(&counts, &indices, trie,
- node_bld_trie, num_tries, match_num);
+ node_bld_trie, num_tries, match_num, no_match);
/* Allocate runtime memory (align to cache boundary) */
total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +
*/
node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;
- no_match = RTE_ACL_NODE_MATCH;
for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
node_array[n] = no_match;
+ /* NOMATCH result at index 0 */
match = ((struct rte_acl_match_results *)(node_array + match_index));
memset(match, 0, sizeof(*match));
ctx->trans_table = node_array;
memcpy(ctx->trie, trie, sizeof(ctx->trie));
- acl_gen_log_stats(ctx, &counts);
+ acl_gen_log_stats(ctx, &counts, &indices);
return 0;
}
}
}
-/*
- * When processing the transition, rather than using if/else
- * construct, the offset is calculated for DFA and QRANGE and
- * then conditionally added to the address based on node type.
- * This is done to avoid branch mis-predictions. Since the
- * offset is rather simple calculation it is more efficient
- * to do the calculation and do a condition move rather than
- * a conditional branch to determine which calculation to do.
- */
static inline uint32_t
scan_forward(uint32_t input, uint32_t max)
{
/* break transition into component parts */
ranges = transition >> (sizeof(index) * CHAR_BIT);
-
- /* calc address for a QRANGE node */
- c = input * SCALAR_QRANGE_MULT;
- a = ranges | SCALAR_QRANGE_MIN;
index = transition & ~RTE_ACL_NODE_INDEX;
- a -= (c & SCALAR_QRANGE_MASK);
- b = c & SCALAR_QRANGE_MIN;
addr = transition ^ index;
- a &= SCALAR_QRANGE_MIN;
- a ^= (ranges ^ b) & (a ^ b);
- x = scan_forward(a, 32) >> 3;
- addr += (index == RTE_ACL_NODE_DFA) ? input : x;
+
+ if (index != RTE_ACL_NODE_DFA) {
+ /* calc address for a QRANGE/SINGLE node */
+ c = (uint32_t)input * SCALAR_QRANGE_MULT;
+ a = ranges | SCALAR_QRANGE_MIN;
+ a -= (c & SCALAR_QRANGE_MASK);
+ b = c & SCALAR_QRANGE_MIN;
+ a &= SCALAR_QRANGE_MIN;
+ a ^= (ranges ^ b) & (a ^ b);
+ x = scan_forward(a, 32) >> 3;
+ } else {
+ /* calc address for a DFA node */
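+ /*
+ * Byte (input / 64) of the transition's upper 32 bits holds
+ * (group - dfa_gr64[group]) * 64; subtracting it from the
+ * input byte yields the position of that group's stored copy
+ * inside the compressed node.
+ */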
+ x = ranges >> (input /
+ RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT);
+ x &= UINT8_MAX;
+ x = input - x;
+ }
+
+ addr += x;
/* pickup next transition */
transition = *(trans_table + addr);
SHUFFLE32_SWAP64 = 0x4e,
};
-static const rte_xmm_t mm_type_quad_range = {
- .u32 = {
- RTE_ACL_NODE_QRANGE,
- RTE_ACL_NODE_QRANGE,
- RTE_ACL_NODE_QRANGE,
- RTE_ACL_NODE_QRANGE,
- },
-};
-
-static const rte_xmm_t mm_type_quad_range64 = {
- .u32 = {
- RTE_ACL_NODE_QRANGE,
- RTE_ACL_NODE_QRANGE,
- 0,
- 0,
- },
-};
-
static const rte_xmm_t mm_shuffle_input = {
.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
};
.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
};
-static const rte_xmm_t mm_bytes = {
- .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},
-};
-
-static const rte_xmm_t mm_bytes64 = {
- .u32 = {UINT8_MAX, UINT8_MAX, 0, 0},
-};
-
static const rte_xmm_t mm_match_mask = {
.u32 = {
RTE_ACL_NODE_MATCH,
*/
static inline xmm_t
acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
- xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
- xmm_t *indices1, xmm_t *indices2)
+ xmm_t ones_16, xmm_t indices1, xmm_t indices2)
{
- xmm_t addr, node_types, temp;
+ xmm_t addr, node_types, range, temp;
+ xmm_t dfa_msk, dfa_ofs, quad_ofs;
+ xmm_t in, r, t;
+
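+ /*
+ * range_base: per-lane byte-shuffle selectors. After adding the input's
+ * group number (0-3), lane k picks byte 'group' of its own 32-bit element
+ * of 'range'; the 0xff bytes force the other bytes of the lane to zero.
+ */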
+ const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,
+ 0xffffff04, 0xffffff00);
/*
* Note that no transition is done for a match
*/
- /* Shuffle low 32 into temp and high 32 into indices2 */
+ /* Shuffle low 32 into temp and high 32 into range */
- temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,
- 0x88);
- *indices2 = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,
- (__m128)*indices2, 0xdd);
+ temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);
+ range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);
+
+ t = MM_XOR(index_mask, index_mask);
+
+ /* shuffle input byte to all 4 positions of 32 bit value */
+ in = MM_SHUFFLE8(next_input, shuffle_input);
/* Calc node type and node addr */
node_types = MM_ANDNOT(index_mask, temp);
*/
/* mask for DFA type (0) nodes */
- temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));
+ dfa_msk = MM_CMPEQ32(node_types, t);
- /* add input byte to DFA position */
- temp = MM_AND(temp, bytes);
- temp = MM_AND(temp, next_input);
- addr = MM_ADD32(addr, temp);
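+ /*
+ * DFA offset: in >> 30 gives each input byte's group number,
+ * range_base + group selects the matching base byte from
+ * 'range', and subtracting that from the input byte (in >> 24)
+ * gives the position inside the compressed node, matching the
+ * scalar calculation.
+ */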
+ r = _mm_srli_epi32(in, 30);
+ r = _mm_add_epi8(r, range_base);
- /*
- * Calc addr for Range nodes -> range_index + range(input)
- */
- node_types = MM_CMPEQ32(node_types, type_quad_range);
+ t = _mm_srli_epi32(in, 24);
+ r = _mm_shuffle_epi8(range, r);
+
+ dfa_ofs = _mm_sub_epi32(t, r);
/*
* Calculate number of range boundaries that are less than the
* input byte.
*/
- /* shuffle input byte to all 4 positions of 32 bit value */
- temp = MM_SHUFFLE8(next_input, shuffle_input);
-
/* check ranges */
- temp = MM_CMPGT8(temp, *indices2);
+ temp = MM_CMPGT8(in, range);
/* convert -1 to 1 (bytes greater than input byte) */
temp = MM_SIGN8(temp, temp);
temp = MM_MADD8(temp, temp);
/* horizontal add pairs of words into dwords */
- temp = MM_MADD16(temp, ones_16);
+ quad_ofs = MM_MADD16(temp, ones_16);
- /* mask to range type nodes */
+ /* select QUAD/SINGLE or DFA offset based on node type */
- temp = MM_AND(temp, node_types);
+ temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
/* add index into node position */
return MM_ADD32(addr, temp);
*/
static inline xmm_t
transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
- xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
- const uint64_t *trans, xmm_t *indices1, xmm_t *indices2)
+ xmm_t ones_16, const uint64_t *trans,
+ xmm_t *indices1, xmm_t *indices2)
{
xmm_t addr;
uint64_t trans0, trans2;
/* Calculate the address (array index) for all 4 transitions. */
addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
- bytes, type_quad_range, indices1, indices2);
+ *indices1, *indices2);
/* Gather 64 bit transitions and pack back into 2 registers. */
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4);
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4);
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4);
input0 = transition4(mm_index_mask.m, input0,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input1 = transition4(mm_index_mask.m, input1,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices3, &indices4);
/* Check for any matches. */
/* Process the 4 bytes of input on each stream. */
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
input = transition4(mm_index_mask.m, input,
mm_shuffle_input.m, mm_ones_16.m,
- mm_bytes.m, mm_type_quad_range.m,
flows.trans, &indices1, &indices2);
/* Check for any matches. */
static inline xmm_t
transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,
- xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,
- const uint64_t *trans, xmm_t *indices1)
+ xmm_t ones_16, const uint64_t *trans, xmm_t *indices1)
{
uint64_t t;
xmm_t addr, indices2;
indices2 = MM_XOR(ones_16, ones_16);
addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,
- bytes, type_quad_range, indices1, &indices2);
+ *indices1, indices2);
/* Gather 64 bit transitions and pack 2 per register. */
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
- mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices);
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
- mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices);
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
- mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices);
input = transition2(mm_index_mask64.m, input,
mm_shuffle_input64.m, mm_ones_16.m,
- mm_bytes64.m, mm_type_quad_range64.m,
flows.trans, &indices);
/* Check for any matches. */