acl: deduplicate some SSE and AVX2 code

[dpdk.git] / lib / librte_acl / acl_run_avx2.h
diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h

index 1688c50..b01a46a 100644 (file)
--- a/lib/librte_acl/acl_run_avx2.h
+++ b/lib/librte_acl/acl_run_avx2.h
@@ -73,51 +73,19 @@ static const rte_ymm_t ymm_ones_16 = {
         },
  };
  
-static inline __attribute__((always_inline)) ymm_t
-calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,
-       ymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)
-{
-       ymm_t in, node_type, r, t;
-       ymm_t dfa_msk, dfa_ofs, quad_ofs;
-       ymm_t addr;
-
-       const ymm_t range_base = _mm256_set_epi32(
-               0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,
-               0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);
-
-       t = _mm256_xor_si256(index_mask, index_mask);
-       in = _mm256_shuffle_epi8(next_input, shuffle_input);
-
-       /* Calc node type and node addr */
-       node_type = _mm256_andnot_si256(index_mask, tr_lo);
-       addr = _mm256_and_si256(index_mask, tr_lo);
-
-       /* DFA calculations. */
-
-       dfa_msk = _mm256_cmpeq_epi32(node_type, t);
-
-       r = _mm256_srli_epi32(in, 30);
-       r = _mm256_add_epi8(r, range_base);
-
-       t = _mm256_srli_epi32(in, 24);
-       r = _mm256_shuffle_epi8(tr_hi, r);
-
-       dfa_ofs = _mm256_sub_epi32(t, r);
-
-       /* QUAD/SINGLE caluclations. */
-
-       t = _mm256_cmpgt_epi8(in, tr_hi);
-       t = _mm256_sign_epi8(t, t);
-       t = _mm256_maddubs_epi16(t, t);
-       quad_ofs = _mm256_madd_epi16(t, ones_16);
-
-       /* blend DFA and QUAD/SINGLE. */
-       t = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);
-
-       addr = _mm256_add_epi32(addr, t);
-       return addr;
-}
+static const rte_ymm_t ymm_range_base = {
+       .u32 = {
+               0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+               0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+       },
+};
  
+/*
+ * Process 8 transitions in parallel.
+ * tr_lo contains low 32 bits for 8 transition.
+ * tr_hi contains high 32 bits for 8 transition.
+ * next_input contains up to 4 input bytes for 8 flows.
+ */
  static inline __attribute__((always_inline)) ymm_t
  transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
  {
@@ -126,8 +94,10 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
  
         tr = (const int32_t *)(uintptr_t)trans;
  
-       addr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
-               ymm_ones_16.y, *tr_lo, *tr_hi);
+       /* Calculate the address (array index) for all 8 transitions. */
+       ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
+               ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
+               *tr_lo, *tr_hi);
  
         /* load lower 32 bits of 8 transactions at once. */
         *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
@@ -140,6 +110,11 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
         return next_input;
  }
  
+/*
+ * Process matches for  8 flows.
+ * tr_lo contains low 32 bits for 8 transition.
+ * tr_hi contains high 32 bits for 8 transition.
+ */
  static inline void
  acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
         struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
@@ -155,6 +130,11 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
         l0 = _mm256_castsi256_si128(*tr_lo);
  
         for (i = 0; i != RTE_DIM(tr) / 2; i++) {
+
+               /*
+                * Extract low 32bits of each transition.
+                * That's enough to process the match.
+                */
                 tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
                 tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
  
@@ -167,12 +147,14 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
                         ctx, parms, flows, resolve_priority_sse);
         }
  
+       /* Collect new transitions into 2 YMM registers. */
         t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
         t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
  
-       lo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-       hi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+       /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
+       ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
  
+       /* Keep transitions wth NOMATCH intact. */
         *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
         *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
  }
@@ -200,6 +182,9 @@ acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
         }
  }
  
+/*
+ * Execute trie traversal for up to 16 flows in parallel.
+ */
  static inline int
  search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
         uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -225,16 +210,14 @@ search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
         t1 = _mm256_set_epi64x(index_array[7], index_array[6],
                 index_array[3], index_array[2]);
  
-       tr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-       tr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+       ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
  
         t0 = _mm256_set_epi64x(index_array[13], index_array[12],
                 index_array[9], index_array[8]);
         t1 = _mm256_set_epi64x(index_array[15], index_array[14],
                 index_array[11], index_array[10]);
  
-       tr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);
-       tr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);
+       ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
  
          /* Check for any matches. */
         acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],