4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include "acl_run_sse.h"
/*
 * Mask handed to acl_match_check_avx2x8() as match_mask: a 32-bit lane of
 * tr_lo is treated as a match when every bit of this mask is set in it.
 * NOTE(review): the initializer rows are not visible in this excerpt.
 */
36 static const rte_ymm_t ymm_match_mask = {
/*
 * Mask handed to calc_addr_avx2() as index_mask: (tr_lo & mask) is used as
 * the node address and (tr_lo & ~mask) as the node type.
 * NOTE(review): the initializer rows are not visible in this excerpt.
 */
49 static const rte_ymm_t ymm_index_mask = {
/*
 * Byte-shuffle control used by calc_addr_avx2() with _mm256_shuffle_epi8().
 * Within each 128-bit half, 32-bit element k is filled with four copies of
 * source byte 4*k, i.e. the low byte of every 32-bit input element is
 * replicated across that element.
 */
62 static const rte_ymm_t ymm_shuffle_input = {
64 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
65 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
/*
 * Sixteen values of 1. Passed to calc_addr_avx2() as ones_16, where it is
 * the multiplier for _mm256_madd_epi16(), so that instruction reduces to a
 * plain sum of adjacent 16-bit pairs.
 */
69 static const rte_ymm_t ymm_ones_16 = {
71 1, 1, 1, 1, 1, 1, 1, 1,
72 1, 1, 1, 1, 1, 1, 1, 1,
/*
 * Compute the address of the next transition for eight flows at once
 * (one flow per 32-bit lane).
 *
 * index_mask    - splits tr_lo into node address (masked bits) and node
 *                 type (remaining bits).
 * next_input    - next input for each flow; the byte consumed is chosen by
 *                 shuffle_input.
 * shuffle_input - byte-shuffle control applied to next_input.
 * ones_16       - multiplier for the final pair-wise add (callers in this
 *                 file pass a vector of 16-bit ones).
 * tr_lo, tr_hi  - low and high 32-bit halves of the current transitions.
 *
 * NOTE(review): the declaration of addr and the return statement are not
 * visible in this excerpt; addr is presumably the value returned.
 */
76 static inline __attribute__((always_inline)) ymm_t
77 calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,
78 ymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)
80 ymm_t in, node_type, r, t;
81 ymm_t dfa_msk, dfa_ofs, quad_ofs;
/*
 * Added byte-wise to a small per-lane index: the low byte becomes
 * (index + 4 * element number) and the upper three bytes become 0xff,
 * which makes _mm256_shuffle_epi8() write zeros to those positions.
 */
84 const ymm_t range_base = _mm256_set_epi32(
85 0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,
86 0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);
/* x ^ x yields an all-zero vector, compared against node_type below. */
88 t = _mm256_xor_si256(index_mask, index_mask);
89 in = _mm256_shuffle_epi8(next_input, shuffle_input);
91 /* Calc node type and node addr */
92 node_type = _mm256_andnot_si256(index_mask, tr_lo);
93 addr = _mm256_and_si256(index_mask, tr_lo);
95 /* DFA calculations. */
/* Lanes whose node type bits are all zero take the DFA offset. */
97 dfa_msk = _mm256_cmpeq_epi32(node_type, t);
/*
 * Top two bits of each 32-bit lane of in pick one of the four bytes of
 * the matching tr_hi element; the picked byte is zero-extended to 32 bits.
 */
99 r = _mm256_srli_epi32(in, 30);
100 r = _mm256_add_epi8(r, range_base);
/* Top byte of each 32-bit lane of in, as a 32-bit value. */
102 t = _mm256_srli_epi32(in, 24);
103 r = _mm256_shuffle_epi8(tr_hi, r);
105 dfa_ofs = _mm256_sub_epi32(t, r);
107 /* QUAD/SINGLE calculations. */
/*
 * Signed byte compare gives 0xff where in > tr_hi; sign_epi8 turns 0xff
 * into 1; the two multiply-add steps then sum those bytes, so each 32-bit
 * lane ends up holding the number of tr_hi bytes that the input exceeds
 * (given ones_16 holds all ones).
 */
109 t = _mm256_cmpgt_epi8(in, tr_hi);
110 t = _mm256_sign_epi8(t, t);
111 t = _mm256_maddubs_epi16(t, t);
112 quad_ofs = _mm256_madd_epi16(t, ones_16);
114 /* blend DFA and QUAD/SINGLE. */
115 t = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
/* Next transition address = node address + per-lane offset. */
117 addr = _mm256_add_epi32(addr, t);
/*
 * Advance eight flows by one transition.
 *
 * next_input - input for the eight flows, one 32-bit lane per flow.
 * trans      - transition table of 64-bit entries.
 * tr_lo      - in: low 32 bits of the current transitions;
 *              out: low 32 bits of the transitions just loaded.
 * tr_hi      - same as tr_lo, for the high 32 bits.
 *
 * next_input is shifted right by CHAR_BIT in every lane.
 * NOTE(review): the local declarations and the return statement are not
 * visible in this excerpt; callers assign the result back to their input
 * vector, so the shifted next_input is presumably what is returned.
 */
121 static inline __attribute__((always_inline)) ymm_t
122 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
/* View the 64-bit table as 32-bit words for the gathers below. */
127 tr = (const int32_t *)(uintptr_t)trans;
129 addr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
130 ymm_ones_16.y, *tr_lo, *tr_hi);
/*
 * The gather scale is sizeof(trans[0]), so each index in addr selects one
 * whole 64-bit table entry; tr addresses its first 32-bit word and tr + 1
 * its second.
 */
132 /* load lower 32 bits of 8 transitions at once. */
133 *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
/* Drop the low byte of every lane, moving the following byte into place. */
135 next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
137 /* load high 32 bits of 8 transitions at once. */
138 *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
/*
 * Run acl_match_check() on the eight transitions held in *tr_lo and write
 * the values it returns back into *tr_lo / *tr_hi for the lanes selected
 * by matches.
 *
 * ctx, parms, flows - forwarded unchanged to acl_match_check().
 * slot              - index of the first of the eight flows; lane i is
 *                     checked as slot + i.
 * matches           - per-lane blend mask choosing which lanes receive the
 *                     new transition values.
 * tr_lo, tr_hi      - in/out low and high 32-bit halves of the transitions.
 *
 * NOTE(review): the remaining local declarations and the closing braces
 * are not visible in this excerpt.
 */
144 acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
145 struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
146 ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
/* NOTE(review): indices 0..7 are used below, so this assumes at least 8 entries. */
152 uint64_t tr[MAX_SEARCHES_SSE8];
/* Split *tr_lo into its upper (l1) and lower (l0) 128-bit halves. */
154 l1 = _mm256_extracti128_si256(*tr_lo, 1);
155 l0 = _mm256_castsi256_si128(*tr_lo);
157 for (i = 0; i != RTE_DIM(tr) / 2; i++) {
/* Take the lowest 32-bit element of each half, zero-extended to 64 bits. */
158 tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
159 tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
/* Shift both halves down by one 32-bit element for the next iteration. */
161 l0 = _mm_srli_si128(l0, sizeof(uint32_t));
162 l1 = _mm_srli_si128(l1, sizeof(uint32_t));
164 tr[i] = acl_match_check(tr[i], slot + i,
165 ctx, parms, flows, resolve_priority_sse);
166 tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,
167 ctx, parms, flows, resolve_priority_sse);
/*
 * Repack the eight 64-bit values. t0 holds tr[0], tr[1] in its low half
 * and tr[4], tr[5] in its high half; t1 holds tr[2], tr[3] and tr[6],
 * tr[7].
 */
170 t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
171 t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
/*
 * 0x88 picks the even 32-bit elements (low halves of each 64-bit value),
 * 0xdd the odd ones (high halves), leaving lanes in order tr[0]..tr[7].
 */
173 lo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
174 hi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
/* Replace only the lanes flagged in matches; the others keep their value. */
176 *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
177 *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
/*
 * Test eight transitions for the match condition and hand them to
 * acl_process_matches_avx2x8(), then repeat the test on the updated
 * transitions.
 *
 * ctx, parms, flows - forwarded to acl_process_matches_avx2x8().
 * slot              - index of the first of the eight flows.
 * tr_lo, tr_hi      - in/out low and high 32-bit halves of the transitions.
 * match_mask        - a lane matches when (tr_lo & match_mask) equals
 *                     match_mask.
 *
 * NOTE(review): the local declarations and the loop or branch header that
 * guards the call below are not visible in this excerpt; msk is presumably
 * the value it tests.
 */
181 acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
182 struct acl_flow_data *flows, uint32_t slot,
183 ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
188 /* test for match node */
189 temp = _mm256_and_si256(match_mask, *tr_lo);
190 matches = _mm256_cmpeq_epi32(temp, match_mask);
/* One bit per byte of matches; non-zero when at least one lane matched. */
191 msk = _mm256_movemask_epi8(matches);
195 acl_process_matches_avx2x8(ctx, parms, flows, slot,
196 matches, tr_lo, tr_hi);
/* Re-evaluate the match condition on the transitions just written back. */
197 temp = _mm256_and_si256(match_mask, *tr_lo);
198 matches = _mm256_cmpeq_epi32(temp, match_mask);
199 msk = _mm256_movemask_epi8(matches);
/*
 * Walk the transition table for sixteen flows in parallel, handled as two
 * groups of eight (slots 0..7 and 8..15).
 *
 * ctx           - ACL context; ctx->trans_table is given to acl_set_flow().
 * data          - input buffers, forwarded to acl_set_flow().
 * results       - result array, forwarded to acl_set_flow().
 * total_packets - forwarded to acl_set_flow().
 * categories    - forwarded to acl_set_flow().
 *
 * NOTE(review): the return type, some local declarations and the end of
 * the function are not visible in this excerpt.
 */
204 search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
205 uint32_t *results, uint32_t total_packets, uint32_t categories)
208 struct acl_flow_data flows;
/* NOTE(review): indices 0..15 are used below, so this assumes at least 16 entries. */
209 uint64_t index_array[MAX_SEARCHES_AVX16];
210 struct completion cmplt[MAX_SEARCHES_AVX16];
211 struct parms parms[MAX_SEARCHES_AVX16];
/* Element 0 serves slots 0..7, element 1 serves slots 8..15. */
212 ymm_t input[2], tr_lo[2], tr_hi[2];
215 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
216 total_packets, categories, ctx->trans_table);
/* Obtain the starting transition for every slot. */
218 for (n = 0; n < RTE_DIM(cmplt); n++) {
220 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
/*
 * Split the 64-bit start transitions of slots 0..7 into vectors of low
 * (0x88: even 32-bit elements) and high (0xdd: odd elements) halves, with
 * lanes in slot order.
 */
223 t0 = _mm256_set_epi64x(index_array[5], index_array[4],
224 index_array[1], index_array[0]);
225 t1 = _mm256_set_epi64x(index_array[7], index_array[6],
226 index_array[3], index_array[2]);
228 tr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
229 tr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
/* Same split for slots 8..15. */
231 t0 = _mm256_set_epi64x(index_array[13], index_array[12],
232 index_array[9], index_array[8]);
233 t1 = _mm256_set_epi64x(index_array[15], index_array[14],
234 index_array[11], index_array[10]);
236 tr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
237 tr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
239 /* Check for any matches. */
240 acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
242 acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
/* Keep going while flows.started is non-zero. */
245 while (flows.started > 0) {
/* NOTE(review): indices 0..7 are used below, so this assumes at least 8 entries. */
247 uint32_t in[MAX_SEARCHES_SSE8];
249 /* Gather 4 bytes of input data for first 8 flows. */
250 in[0] = GET_NEXT_4BYTES(parms, 0);
251 in[4] = GET_NEXT_4BYTES(parms, 4);
252 in[1] = GET_NEXT_4BYTES(parms, 1);
253 in[5] = GET_NEXT_4BYTES(parms, 5);
254 in[2] = GET_NEXT_4BYTES(parms, 2);
255 in[6] = GET_NEXT_4BYTES(parms, 6);
256 in[3] = GET_NEXT_4BYTES(parms, 3);
257 in[7] = GET_NEXT_4BYTES(parms, 7);
/* _mm256_set_epi32 takes arguments high to low, so lane k holds in[k]. */
258 input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
259 in[3], in[2], in[1], in[0]);
261 /* Gather 4 bytes of input data for last 8 flows. */
262 in[0] = GET_NEXT_4BYTES(parms, 8);
263 in[4] = GET_NEXT_4BYTES(parms, 12);
264 in[1] = GET_NEXT_4BYTES(parms, 9);
265 in[5] = GET_NEXT_4BYTES(parms, 13);
266 in[2] = GET_NEXT_4BYTES(parms, 10);
267 in[6] = GET_NEXT_4BYTES(parms, 14);
268 in[3] = GET_NEXT_4BYTES(parms, 11);
269 in[7] = GET_NEXT_4BYTES(parms, 15);
270 input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
271 in[3], in[2], in[1], in[0]);
/*
 * Four transition steps per group, one for each of the 4 input bytes just
 * gathered; the calls for the two groups are interleaved.
 */
273 input[0] = transition8(input[0], flows.trans,
274 &tr_lo[0], &tr_hi[0]);
275 input[1] = transition8(input[1], flows.trans,
276 &tr_lo[1], &tr_hi[1]);
278 input[0] = transition8(input[0], flows.trans,
279 &tr_lo[0], &tr_hi[0]);
280 input[1] = transition8(input[1], flows.trans,
281 &tr_lo[1], &tr_hi[1]);
283 input[0] = transition8(input[0], flows.trans,
284 &tr_lo[0], &tr_hi[0]);
285 input[1] = transition8(input[1], flows.trans,
286 &tr_lo[1], &tr_hi[1]);
288 input[0] = transition8(input[0], flows.trans,
289 &tr_lo[0], &tr_hi[0]);
290 input[1] = transition8(input[1], flows.trans,
291 &tr_lo[1], &tr_hi[1]);
293 /* Check for any matches. */
294 acl_match_check_avx2x8(ctx, parms, &flows, 0,
295 &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
296 acl_match_check_avx2x8(ctx, parms, &flows, 8,
297 &tr_lo[1], &tr_hi[1], ymm_match_mask.y);