eal: introduce ymm type for AVX 256-bit
authorKonstantin Ananyev <konstantin.ananyev@intel.com>
Tue, 20 Jan 2015 18:40:59 +0000 (18:40 +0000)
committerThomas Monjalon <thomas.monjalon@6wind.com>
Wed, 28 Jan 2015 16:11:25 +0000 (17:11 +0100)
New data type to manipulate 256 bit AVX values.
Rename field in the rte_xmm to keep common naming across SSE/AVX fields.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
examples/l3fwd/main.c
lib/librte_acl/acl_run_sse.c
lib/librte_acl/rte_acl_osdep_alone.h
lib/librte_eal/common/include/rte_common_vect.h
lib/librte_lpm/rte_lpm.h

index 918f2cb..6f7d7d4 100644 (file)
@@ -1170,7 +1170,7 @@ processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag,
        if (likely(flag != 0)) {
                rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
        } else {
-               dst.m = dip;
+               dst.x = dip;
                dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid);
                dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid);
                dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid);
index 09e32be..4605b58 100644 (file)
@@ -359,16 +359,16 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
         /* Check for any matches. */
        acl_match_check_x4(0, ctx, parms, &flows,
-               &indices1, &indices2, mm_match_mask.m);
+               &indices1, &indices2, mm_match_mask.x);
        acl_match_check_x4(4, ctx, parms, &flows,
-               &indices3, &indices4, mm_match_mask.m);
+               &indices3, &indices4, mm_match_mask.x);
 
        while (flows.started > 0) {
 
                /* Gather 4 bytes of input data for each stream. */
-               input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0),
+               input0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0),
                        0);
-               input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4),
+               input1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4),
                        0);
 
                input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);
@@ -382,43 +382,43 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
                 /* Process the 4 bytes of input on each stream. */
 
-               input0 = transition4(mm_index_mask.m, input0,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input0 = transition4(mm_index_mask.x, input0,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-               input1 = transition4(mm_index_mask.m, input1,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input1 = transition4(mm_index_mask.x, input1,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices3, &indices4);
 
-               input0 = transition4(mm_index_mask.m, input0,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input0 = transition4(mm_index_mask.x, input0,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-               input1 = transition4(mm_index_mask.m, input1,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input1 = transition4(mm_index_mask.x, input1,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices3, &indices4);
 
-               input0 = transition4(mm_index_mask.m, input0,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input0 = transition4(mm_index_mask.x, input0,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-               input1 = transition4(mm_index_mask.m, input1,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input1 = transition4(mm_index_mask.x, input1,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices3, &indices4);
 
-               input0 = transition4(mm_index_mask.m, input0,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input0 = transition4(mm_index_mask.x, input0,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-               input1 = transition4(mm_index_mask.m, input1,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input1 = transition4(mm_index_mask.x, input1,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices3, &indices4);
 
                 /* Check for any matches. */
                acl_match_check_x4(0, ctx, parms, &flows,
-                       &indices1, &indices2, mm_match_mask.m);
+                       &indices1, &indices2, mm_match_mask.x);
                acl_match_check_x4(4, ctx, parms, &flows,
-                       &indices3, &indices4, mm_match_mask.m);
+                       &indices3, &indices4, mm_match_mask.x);
        }
 
        return 0;
@@ -451,36 +451,36 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
        /* Check for any matches. */
        acl_match_check_x4(0, ctx, parms, &flows,
-               &indices1, &indices2, mm_match_mask.m);
+               &indices1, &indices2, mm_match_mask.x);
 
        while (flows.started > 0) {
 
                /* Gather 4 bytes of input data for each stream. */
-               input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
+               input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
                input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
                input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);
                input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);
 
                /* Process the 4 bytes of input on each stream. */
-               input = transition4(mm_index_mask.m, input,
-                       mm_shuffle_input.m, mm_ones_16.m,
+               input = transition4(mm_index_mask.x, input,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-                input = transition4(mm_index_mask.m, input,
-                       mm_shuffle_input.m, mm_ones_16.m,
+                input = transition4(mm_index_mask.x, input,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-                input = transition4(mm_index_mask.m, input,
-                       mm_shuffle_input.m, mm_ones_16.m,
+                input = transition4(mm_index_mask.x, input,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
-                input = transition4(mm_index_mask.m, input,
-                       mm_shuffle_input.m, mm_ones_16.m,
+                input = transition4(mm_index_mask.x, input,
+                       mm_shuffle_input.x, mm_ones_16.x,
                        flows.trans, &indices1, &indices2);
 
                /* Check for any matches. */
                acl_match_check_x4(0, ctx, parms, &flows,
-                       &indices1, &indices2, mm_match_mask.m);
+                       &indices1, &indices2, mm_match_mask.x);
        }
 
        return 0;
@@ -534,35 +534,35 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,
        indices = MM_LOADU((xmm_t *) &index_array[0]);
 
        /* Check for any matches. */
-       acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.m);
+       acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.x);
 
        while (flows.started > 0) {
 
                /* Gather 4 bytes of input data for each stream. */
-               input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0);
+               input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);
                input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);
 
                /* Process the 4 bytes of input on each stream. */
 
-               input = transition2(mm_index_mask64.m, input,
-                       mm_shuffle_input64.m, mm_ones_16.m,
+               input = transition2(mm_index_mask64.x, input,
+                       mm_shuffle_input64.x, mm_ones_16.x,
                        flows.trans, &indices);
 
-               input = transition2(mm_index_mask64.m, input,
-                       mm_shuffle_input64.m, mm_ones_16.m,
+               input = transition2(mm_index_mask64.x, input,
+                       mm_shuffle_input64.x, mm_ones_16.x,
                        flows.trans, &indices);
 
-               input = transition2(mm_index_mask64.m, input,
-                       mm_shuffle_input64.m, mm_ones_16.m,
+               input = transition2(mm_index_mask64.x, input,
+                       mm_shuffle_input64.x, mm_ones_16.x,
                        flows.trans, &indices);
 
-               input = transition2(mm_index_mask64.m, input,
-                       mm_shuffle_input64.m, mm_ones_16.m,
+               input = transition2(mm_index_mask64.x, input,
+                       mm_shuffle_input64.x, mm_ones_16.x,
                        flows.trans, &indices);
 
                /* Check for any matches. */
                acl_match_check_x2(0, ctx, parms, &flows, &indices,
-                       mm_match_mask64.m);
+                       mm_match_mask64.x);
        }
 
        return 0;
index 2a99860..58c4f6a 100644 (file)
 #include <smmintrin.h>
 #endif
 
+#if defined(__AVX__)
+#include <immintrin.h>
+#endif
+
 #else
 
 #include <x86intrin.h>
@@ -128,8 +132,8 @@ typedef __m128i xmm_t;
 #define        XMM_SIZE        (sizeof(xmm_t))
 #define        XMM_MASK        (XMM_SIZE - 1)
 
-typedef union rte_mmsse {
-       xmm_t    m;
+typedef union rte_xmm {
+       xmm_t    x;
        uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
        uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
        uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
@@ -137,6 +141,33 @@ typedef union rte_mmsse {
        double   pd[XMM_SIZE / sizeof(double)];
 } rte_xmm_t;
 
+#ifdef __AVX__
+
+typedef __m256i ymm_t;
+
+#define        YMM_SIZE        (sizeof(ymm_t))
+#define        YMM_MASK        (YMM_SIZE - 1)
+
+typedef union rte_ymm {
+       ymm_t    y;
+       xmm_t    x[YMM_SIZE / sizeof(xmm_t)];
+       uint8_t  u8[YMM_SIZE / sizeof(uint8_t)];
+       uint16_t u16[YMM_SIZE / sizeof(uint16_t)];
+       uint32_t u32[YMM_SIZE / sizeof(uint32_t)];
+       uint64_t u64[YMM_SIZE / sizeof(uint64_t)];
+       double   pd[YMM_SIZE / sizeof(double)];
+} rte_ymm_t;
+
+#endif /* __AVX__ */
+
+#ifdef RTE_ARCH_I686
+#define _mm_cvtsi128_si64(a) ({ \
+       rte_xmm_t m;            \
+       m.x = (a);              \
+       (m.u64[0]);             \
+})
+#endif
+
 /*
  * rte_cycles related.
  */
index 95bf4b1..617470b 100644 (file)
 #include <smmintrin.h>
 #endif
 
+#if defined(__AVX__)
+#include <immintrin.h>
+#endif
+
 #else
 
 #include <x86intrin.h>
@@ -70,7 +74,7 @@ typedef __m128i xmm_t;
 #define        XMM_MASK        (XMM_SIZE - 1)
 
 typedef union rte_xmm {
-       xmm_t    m;
+       xmm_t    x;
        uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
        uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
        uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
@@ -78,10 +82,29 @@ typedef union rte_xmm {
        double   pd[XMM_SIZE / sizeof(double)];
 } rte_xmm_t;
 
+#ifdef __AVX__
+
+typedef __m256i ymm_t;
+
+#define        YMM_SIZE        (sizeof(ymm_t))
+#define        YMM_MASK        (YMM_SIZE - 1)
+
+typedef union rte_ymm {
+       ymm_t    y;
+       xmm_t    x[YMM_SIZE / sizeof(xmm_t)];
+       uint8_t  u8[YMM_SIZE / sizeof(uint8_t)];
+       uint16_t u16[YMM_SIZE / sizeof(uint16_t)];
+       uint32_t u32[YMM_SIZE / sizeof(uint32_t)];
+       uint64_t u64[YMM_SIZE / sizeof(uint64_t)];
+       double   pd[YMM_SIZE / sizeof(double)];
+} rte_ymm_t;
+
+#endif /* __AVX__ */
+
 #ifdef RTE_ARCH_I686
 #define _mm_cvtsi128_si64(a) ({ \
        rte_xmm_t m;            \
-       m.m = (a);              \
+       m.x = (a);              \
        (m.u64[0]);             \
 })
 #endif
index 62d7736..586300b 100644 (file)
@@ -420,7 +420,7 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
        tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
 
        /* get 4 indexes for tbl8[]. */
-       i8.m = _mm_and_si128(ip, mask8);
+       i8.x = _mm_and_si128(ip, mask8);
 
        pt = (uint64_t)tbl[0] |
                (uint64_t)tbl[1] << 16 |