X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_eal%2Fcommon%2Finclude%2Farch%2Fx86%2Frte_memcpy.h;h=ba44c4a32835e123327dbef03aa5f9de2e7266a9;hb=2a1991799ea11c9fe10cffd3f25f06a418e37f35;hp=fee954ada4dd26747519715fde2d4b0a3047311d;hpb=9484092baad32c5fbd181670df894c30838988b1;p=dpdk.git diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h index fee954ada4..ba44c4a328 100644 --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -1,34 +1,5 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation */ #ifndef _RTE_MEMCPY_X86_64_H_ @@ -44,6 +15,8 @@ #include #include #include +#include +#include #ifdef __cplusplus extern "C" { @@ -64,11 +37,13 @@ extern "C" { * @return * Pointer to the destination data. */ -static inline void * -rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline)); +static __rte_always_inline void * +rte_memcpy(void *dst, const void *src, size_t n); #ifdef RTE_MACHINE_CPUFLAG_AVX512F +#define ALIGNMENT_MASK 0x3F + /** * AVX512 implementation below */ @@ -77,7 +52,7 @@ rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline)); * Copy 16 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; @@ -90,7 +65,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src) * Copy 32 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { __m256i ymm0; @@ -103,7 +78,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src) * Copy 64 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { __m512i zmm0; @@ -116,7 +91,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov64(dst + 0 * 64, src + 0 * 64); @@ -127,7 +102,7 @@ rte_mov128(uint8_t *dst, const uint8_t *src) * Copy 256 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { rte_mov64(dst + 0 * 64, src + 0 * 64); @@ -140,7 +115,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src) * Copy 128-byte blocks from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) { __m512i zmm0, zmm1; @@ -188,8 +163,8 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) } } -static inline void * -rte_memcpy(void *dst, const void *src, size_t n) +static __rte_always_inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) { uintptr_t dstu = (uintptr_t)dst; uintptr_t srcu = (uintptr_t)src; @@ -306,7 +281,9 @@ COPY_BLOCK_128_BACK63: goto COPY_BLOCK_128_BACK63; } -#elif RTE_MACHINE_CPUFLAG_AVX2 +#elif defined RTE_MACHINE_CPUFLAG_AVX2 + +#define ALIGNMENT_MASK 0x1F /** * AVX2 implementation below @@ -316,7 +293,7 @@ COPY_BLOCK_128_BACK63: * Copy 16 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; @@ -329,7 +306,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src) * Copy 32 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { __m256i ymm0; @@ -342,7 +319,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src) * Copy 64 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); @@ -353,7 +330,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); @@ -363,76 +340,31 @@ rte_mov128(uint8_t *dst, const uint8_t *src) } /** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} - -/** - * Copy 64-byte blocks from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n) -{ - __m256i ymm0, ymm1; - - while (n >= 64) { - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); - n -= 64; - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); - src = (const uint8_t *)src + 64; - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); - dst = (uint8_t *)dst + 64; - } -} - -/** - * Copy 256-byte blocks from one location to another, + * Copy 128-byte blocks from one location to another, * locations should not overlap. */ -static inline void -rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n) +static __rte_always_inline void +rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) { - __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; + __m256i ymm0, ymm1, ymm2, ymm3; - while (n >= 256) { + while (n >= 128) { ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); - n -= 256; + n -= 128; ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); - ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32)); - ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32)); - ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32)); - ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32)); - src = (const uint8_t *)src + 256; + src = (const uint8_t *)src + 128; _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6); - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7); - dst = (uint8_t *)dst + 256; + dst = (uint8_t *)dst + 128; } } -static inline void * -rte_memcpy(void *dst, const void *src, size_t n) +static __rte_always_inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) { uintptr_t dstu = (uintptr_t)dst; uintptr_t srcu = (uintptr_t)src; @@ -466,92 +398,86 @@ rte_memcpy(void *dst, const void *src, size_t n) } /** - * Fast way when copy size doesn't exceed 512 bytes + * Fast way when copy size doesn't exceed 256 bytes */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + return ret; + } + if (n <= 48) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); return ret; } - if (n <= 512) { - if (n >= 256) { - n -= 256; - rte_mov256((uint8_t *)dst, (const uint8_t *)src); - src = (const uint8_t *)src + 256; - dst = (uint8_t *)dst + 256; - } + if (n <= 256) { if (n >= 128) { n -= 128; rte_mov128((uint8_t *)dst, (const uint8_t *)src); src = (const uint8_t *)src + 128; dst = (uint8_t *)dst + 128; } +COPY_BLOCK_128_BACK31: if (n >= 64) { n -= 64; rte_mov64((uint8_t *)dst, (const uint8_t *)src); src = (const uint8_t *)src + 64; dst = (uint8_t *)dst + 64; } -COPY_BLOCK_64_BACK31: if (n > 32) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); return ret; } if (n > 0) { - rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); } return ret; } /** - * Make store aligned when copy size exceeds 512 bytes + * Make store aligned when copy size exceeds 256 bytes */ - dstofss = 32 - ((uintptr_t)dst & 0x1F); - n -= dstofss; - rte_mov32((uint8_t *)dst, (const uint8_t *)src); - src = (const uint8_t *)src + dstofss; - dst = (uint8_t *)dst + dstofss; + dstofss = (uintptr_t)dst & 0x1F; + if (dstofss > 0) { + dstofss = 32 - dstofss; + n -= dstofss; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } /** - * Copy 256-byte blocks. - * Use copy block function for better instruction order control, - * which is important when load is unaligned. + * Copy 128-byte blocks */ - rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n); + rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); bits = n; - n = n & 255; + n = n & 127; bits -= n; src = (const uint8_t *)src + bits; dst = (uint8_t *)dst + bits; - /** - * Copy 64-byte blocks. - * Use copy block function for better instruction order control, - * which is important when load is unaligned. - */ - if (n >= 64) { - rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n); - bits = n; - n = n & 63; - bits -= n; - src = (const uint8_t *)src + bits; - dst = (uint8_t *)dst + bits; - } - /** * Copy whatever left */ - goto COPY_BLOCK_64_BACK31; + goto COPY_BLOCK_128_BACK31; } #else /* RTE_MACHINE_CPUFLAG */ +#define ALIGNMENT_MASK 0x0F + /** * SSE & AVX implementation below */ @@ -560,7 +486,7 @@ COPY_BLOCK_64_BACK31: * Copy 16 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; @@ -573,7 +499,7 @@ rte_mov16(uint8_t *dst, const uint8_t *src) * Copy 32 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); @@ -584,7 +510,7 @@ rte_mov32(uint8_t *dst, const uint8_t *src) * Copy 64 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); @@ -597,7 +523,7 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. */ -static inline void +static __rte_always_inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); @@ -647,8 +573,8 @@ rte_mov256(uint8_t *dst, const uint8_t *src) * - __m128i ~ must be pre-defined */ #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \ -({ \ - int tmp; \ +__extension__ ({ \ + size_t tmp; \ while (len >= 128 + 16 - offset) { \ xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \ len -= 128; \ @@ -708,7 +634,7 @@ rte_mov256(uint8_t *dst, const uint8_t *src) * - __m128i ~ used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined */ #define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \ -({ \ +__extension__ ({ \ switch (offset) { \ case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ @@ -729,8 +655,8 @@ rte_mov256(uint8_t *dst, const uint8_t *src) } \ }) -static inline void * -rte_memcpy(void *dst, const void *src, size_t n) +static __rte_always_inline void * +rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; uintptr_t dstu = (uintptr_t)dst; @@ -832,11 +758,14 @@ COPY_BLOCK_64_BACK15: * unaligned copy functions require up to 15 bytes * backwards access. */ - dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16; - n -= dstofss; - rte_mov32((uint8_t *)dst, (const uint8_t *)src); - src = (const uint8_t *)src + dstofss; - dst = (uint8_t *)dst + dstofss; + dstofss = (uintptr_t)dst & 0x0F; + if (dstofss > 0) { + dstofss = 16 - dstofss + 16; + n -= dstofss; + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + src = (const uint8_t *)src + dstofss; + dst = (uint8_t *)dst + dstofss; + } srcofs = ((uintptr_t)src & 0x0F); /** @@ -871,6 +800,75 @@ COPY_BLOCK_64_BACK15: #endif /* RTE_MACHINE_CPUFLAG */ +static __rte_always_inline void * +rte_memcpy_aligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + + /* Copy size <= 16 bytes */ + if (n < 16) { + if (n & 0x01) { + *(uint8_t *)dst = *(const uint8_t *)src; + src = (const uint8_t *)src + 1; + dst = (uint8_t *)dst + 1; + } + if (n & 0x02) { + *(uint16_t *)dst = *(const uint16_t *)src; + src = (const uint16_t *)src + 1; + dst = (uint16_t *)dst + 1; + } + if (n & 0x04) { + *(uint32_t *)dst = *(const uint32_t *)src; + src = (const uint32_t *)src + 1; + dst = (uint32_t *)dst + 1; + } + if (n & 0x08) + *(uint64_t *)dst = *(const uint64_t *)src; + + return ret; + } + + /* Copy 16 <= size <= 32 bytes */ + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + rte_mov16((uint8_t *)dst - 16 + n, + (const uint8_t *)src - 16 + n); + + return ret; + } + + /* Copy 32 < size <= 64 bytes */ + if (n <= 64) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + rte_mov32((uint8_t *)dst - 32 + n, + (const uint8_t *)src - 32 + n); + + return ret; + } + + /* Copy 64 bytes blocks */ + for (; n >= 64; n -= 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + dst = (uint8_t *)dst + 64; + src = (const uint8_t *)src + 64; + } + + /* Copy whatever left */ + rte_mov64((uint8_t *)dst - 64 + n, + (const uint8_t *)src - 64 + n); + + return ret; +} + +static __rte_always_inline void * +rte_memcpy(void *dst, const void *src, size_t n) +{ + if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) + return rte_memcpy_aligned(dst, src, n); + else + return rte_memcpy_generic(dst, src, n); +} + #ifdef __cplusplus } #endif