From: Konstantin Ananyev Date: Tue, 20 Jan 2015 18:40:59 +0000 (+0000) Subject: eal: introduce ymm type for AVX 256-bit X-Git-Tag: spdx-start~9815 X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=da826b7135a411c05ae574565a65bca26b48f8ac;p=dpdk.git eal: introduce ymm type for AVX 256-bit New data type to manipulate 256 bit AVX values. Rename field in the rte_xmm to keep common naming across SSE/AVX fields. Signed-off-by: Konstantin Ananyev Acked-by: Neil Horman --- diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 918f2cbb69..6f7d7d4017 100644 --- a/examples/l3fwd/main.c +++ b/examples/l3fwd/main.c @@ -1170,7 +1170,7 @@ processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag, if (likely(flag != 0)) { rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid); } else { - dst.m = dip; + dst.x = dip; dprt[0] = get_dst_port(qconf, pkt[0], dst.u32[0], portid); dprt[1] = get_dst_port(qconf, pkt[1], dst.u32[1], portid); dprt[2] = get_dst_port(qconf, pkt[2], dst.u32[2], portid); diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c index 09e32bee6e..4605b58e4b 100644 --- a/lib/librte_acl/acl_run_sse.c +++ b/lib/librte_acl/acl_run_sse.c @@ -359,16 +359,16 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data, /* Check for any matches. */ acl_match_check_x4(0, ctx, parms, &flows, - &indices1, &indices2, mm_match_mask.m); + &indices1, &indices2, mm_match_mask.x); acl_match_check_x4(4, ctx, parms, &flows, - &indices3, &indices4, mm_match_mask.m); + &indices3, &indices4, mm_match_mask.x); while (flows.started > 0) { /* Gather 4 bytes of input data for each stream. */ - input0 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), + input0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0); - input1 = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 4), + input1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4), 0); input0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1); @@ -382,43 +382,43 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data, /* Process the 4 bytes of input on each stream. */ - input0 = transition4(mm_index_mask.m, input0, - mm_shuffle_input.m, mm_ones_16.m, + input0 = transition4(mm_index_mask.x, input0, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input1 = transition4(mm_index_mask.m, input1, - mm_shuffle_input.m, mm_ones_16.m, + input1 = transition4(mm_index_mask.x, input1, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices3, &indices4); - input0 = transition4(mm_index_mask.m, input0, - mm_shuffle_input.m, mm_ones_16.m, + input0 = transition4(mm_index_mask.x, input0, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input1 = transition4(mm_index_mask.m, input1, - mm_shuffle_input.m, mm_ones_16.m, + input1 = transition4(mm_index_mask.x, input1, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices3, &indices4); - input0 = transition4(mm_index_mask.m, input0, - mm_shuffle_input.m, mm_ones_16.m, + input0 = transition4(mm_index_mask.x, input0, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input1 = transition4(mm_index_mask.m, input1, - mm_shuffle_input.m, mm_ones_16.m, + input1 = transition4(mm_index_mask.x, input1, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices3, &indices4); - input0 = transition4(mm_index_mask.m, input0, - mm_shuffle_input.m, mm_ones_16.m, + input0 = transition4(mm_index_mask.x, input0, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input1 = transition4(mm_index_mask.m, input1, - mm_shuffle_input.m, mm_ones_16.m, + input1 = transition4(mm_index_mask.x, input1, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices3, &indices4); /* Check for any matches. */ acl_match_check_x4(0, ctx, parms, &flows, - &indices1, &indices2, mm_match_mask.m); + &indices1, &indices2, mm_match_mask.x); acl_match_check_x4(4, ctx, parms, &flows, - &indices3, &indices4, mm_match_mask.m); + &indices3, &indices4, mm_match_mask.x); } return 0; @@ -451,36 +451,36 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, /* Check for any matches. */ acl_match_check_x4(0, ctx, parms, &flows, - &indices1, &indices2, mm_match_mask.m); + &indices1, &indices2, mm_match_mask.x); while (flows.started > 0) { /* Gather 4 bytes of input data for each stream. */ - input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0); + input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0); input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1); input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2); input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3); /* Process the 4 bytes of input on each stream. */ - input = transition4(mm_index_mask.m, input, - mm_shuffle_input.m, mm_ones_16.m, + input = transition4(mm_index_mask.x, input, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input = transition4(mm_index_mask.m, input, - mm_shuffle_input.m, mm_ones_16.m, + input = transition4(mm_index_mask.x, input, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input = transition4(mm_index_mask.m, input, - mm_shuffle_input.m, mm_ones_16.m, + input = transition4(mm_index_mask.x, input, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); - input = transition4(mm_index_mask.m, input, - mm_shuffle_input.m, mm_ones_16.m, + input = transition4(mm_index_mask.x, input, + mm_shuffle_input.x, mm_ones_16.x, flows.trans, &indices1, &indices2); /* Check for any matches. */ acl_match_check_x4(0, ctx, parms, &flows, - &indices1, &indices2, mm_match_mask.m); + &indices1, &indices2, mm_match_mask.x); } return 0; @@ -534,35 +534,35 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data, indices = MM_LOADU((xmm_t *) &index_array[0]); /* Check for any matches. */ - acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.m); + acl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.x); while (flows.started > 0) { /* Gather 4 bytes of input data for each stream. */ - input = MM_INSERT32(mm_ones_16.m, GET_NEXT_4BYTES(parms, 0), 0); + input = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0); input = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1); /* Process the 4 bytes of input on each stream. */ - input = transition2(mm_index_mask64.m, input, - mm_shuffle_input64.m, mm_ones_16.m, + input = transition2(mm_index_mask64.x, input, + mm_shuffle_input64.x, mm_ones_16.x, flows.trans, &indices); - input = transition2(mm_index_mask64.m, input, - mm_shuffle_input64.m, mm_ones_16.m, + input = transition2(mm_index_mask64.x, input, + mm_shuffle_input64.x, mm_ones_16.x, flows.trans, &indices); - input = transition2(mm_index_mask64.m, input, - mm_shuffle_input64.m, mm_ones_16.m, + input = transition2(mm_index_mask64.x, input, + mm_shuffle_input64.x, mm_ones_16.x, flows.trans, &indices); - input = transition2(mm_index_mask64.m, input, - mm_shuffle_input64.m, mm_ones_16.m, + input = transition2(mm_index_mask64.x, input, + mm_shuffle_input64.x, mm_ones_16.x, flows.trans, &indices); /* Check for any matches. */ acl_match_check_x2(0, ctx, parms, &flows, &indices, - mm_match_mask64.m); + mm_match_mask64.x); } return 0; diff --git a/lib/librte_acl/rte_acl_osdep_alone.h b/lib/librte_acl/rte_acl_osdep_alone.h index 2a99860723..58c4f6aff4 100644 --- a/lib/librte_acl/rte_acl_osdep_alone.h +++ b/lib/librte_acl/rte_acl_osdep_alone.h @@ -57,6 +57,10 @@ #include #endif +#if defined(__AVX__) +#include +#endif + #else #include @@ -128,8 +132,8 @@ typedef __m128i xmm_t; #define XMM_SIZE (sizeof(xmm_t)) #define XMM_MASK (XMM_SIZE - 1) -typedef union rte_mmsse { - xmm_t m; +typedef union rte_xmm { + xmm_t x; uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; @@ -137,6 +141,33 @@ typedef union rte_mmsse { double pd[XMM_SIZE / sizeof(double)]; } rte_xmm_t; +#ifdef __AVX__ + +typedef __m256i ymm_t; + +#define YMM_SIZE (sizeof(ymm_t)) +#define YMM_MASK (YMM_SIZE - 1) + +typedef union rte_ymm { + ymm_t y; + xmm_t x[YMM_SIZE / sizeof(xmm_t)]; + uint8_t u8[YMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[YMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[YMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[YMM_SIZE / sizeof(uint64_t)]; + double pd[YMM_SIZE / sizeof(double)]; +} rte_ymm_t; + +#endif /* __AVX__ */ + +#ifdef RTE_ARCH_I686 +#define _mm_cvtsi128_si64(a) ({ \ + rte_xmm_t m; \ + m.x = (a); \ + (m.u64[0]); \ +}) +#endif + /* * rte_cycles related. */ diff --git a/lib/librte_eal/common/include/rte_common_vect.h b/lib/librte_eal/common/include/rte_common_vect.h index 95bf4b1a34..617470b89b 100644 --- a/lib/librte_eal/common/include/rte_common_vect.h +++ b/lib/librte_eal/common/include/rte_common_vect.h @@ -54,6 +54,10 @@ #include #endif +#if defined(__AVX__) +#include +#endif + #else #include @@ -70,7 +74,7 @@ typedef __m128i xmm_t; #define XMM_MASK (XMM_SIZE - 1) typedef union rte_xmm { - xmm_t m; + xmm_t x; uint8_t u8[XMM_SIZE / sizeof(uint8_t)]; uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; @@ -78,10 +82,29 @@ typedef union rte_xmm { double pd[XMM_SIZE / sizeof(double)]; } rte_xmm_t; +#ifdef __AVX__ + +typedef __m256i ymm_t; + +#define YMM_SIZE (sizeof(ymm_t)) +#define YMM_MASK (YMM_SIZE - 1) + +typedef union rte_ymm { + ymm_t y; + xmm_t x[YMM_SIZE / sizeof(xmm_t)]; + uint8_t u8[YMM_SIZE / sizeof(uint8_t)]; + uint16_t u16[YMM_SIZE / sizeof(uint16_t)]; + uint32_t u32[YMM_SIZE / sizeof(uint32_t)]; + uint64_t u64[YMM_SIZE / sizeof(uint64_t)]; + double pd[YMM_SIZE / sizeof(double)]; +} rte_ymm_t; + +#endif /* __AVX__ */ + #ifdef RTE_ARCH_I686 #define _mm_cvtsi128_si64(a) ({ \ rte_xmm_t m; \ - m.m = (a); \ + m.x = (a); \ (m.u64[0]); \ }) #endif diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index 62d7736ec6..586300b4db 100644 --- a/lib/librte_lpm/rte_lpm.h +++ b/lib/librte_lpm/rte_lpm.h @@ -420,7 +420,7 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4], tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32]; /* get 4 indexes for tbl8[]. */ - i8.m = _mm_and_si128(ip, mask8); + i8.x = _mm_and_si128(ip, mask8); pt = (uint64_t)tbl[0] | (uint64_t)tbl[1] << 16 |