eal/arm64: optimize memcpy

author Herbert Guan <herbert.guan@arm.com>

Fri, 19 Jan 2018 06:10:36 +0000 (14:10 +0800)

committer Thomas Monjalon <thomas@monjalon.net>

Sat, 20 Jan 2018 16:47:25 +0000 (17:47 +0100)
author Herbert Guan <herbert.guan@arm.com>
Fri, 19 Jan 2018 06:10:36 +0000 (14:10 +0800)
committer Thomas Monjalon <thomas@monjalon.net>
Sat, 20 Jan 2018 16:47:25 +0000 (17:47 +0100)
diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp

index 781c8542356c752bdfb7ecc9360a6e85be0f949f..790e716263e8fa946373c8d9a2802833c4b8bb9c 100644 (file)
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -17,6 +17,18 @@ CONFIG_RTE_FORCE_INTRINSICS=y
  # to address minimum DMA alignment across all arm64 implementations.
  CONFIG_RTE_CACHE_LINE_SIZE=128
  
+# Accelerate rte_memcpy. Be sure to run unit test (memcpy_perf_autotest)
+# to determine the best threshold in code. Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+#CONFIG_RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
+#CONFIG_RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512
+# Leave the RTE_ARM64_MEMCPY_xxx options below commented out unless there is
+# a strong reason to change them.
+#CONFIG_RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK=n
+#CONFIG_RTE_ARM64_MEMCPY_ALIGN_MASK=0xF
+#CONFIG_RTE_ARM64_MEMCPY_STRICT_ALIGN=n
+
  CONFIG_RTE_LIBRTE_FM10K_PMD=n
  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
  CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h

index f408aaa732c8b15e9e7e0799a3a95ab5d7b81719..beb97a71ef0cefc62892b7e0981fd3746b3c1e27 100644 (file)
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -14,6 +14,317 @@ extern "C" {
  
  #include "generic/rte_memcpy.h"
  
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" to be defined to activate it. It
+ * does not always provide better performance than memcpy(), so users need to
+ * run the unit test "memcpy_perf_autotest" and customize the parameters in the
+ * customization section below for best performance.
+ *
+ * The compiler version also affects rte_memcpy() performance. It has been
+ * observed that, on some platforms and with the same code, binaries compiled
+ * with GCC 7.2.0 can perform better than binaries compiled with GCC 4.8.5.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+/*
+ * Low address bits tested to classify a pointer as unaligned. Defaults to
+ * (RTE_CACHE_LINE_SIZE / 8) - 1 and may be overridden at build time.
+ */
+#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
+#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
+#endif
+
+/*
+ * Evaluates to non-zero when the copy is classified as unaligned.
+ * The mask is parenthesized at each use so that an overriding definition
+ * written as an unparenthesized expression still binds as one operand of '&'.
+ */
+#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
+/* Only a misaligned src makes the copy count as unaligned */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+       ((uintptr_t)(src) & (RTE_ARM64_MEMCPY_ALIGN_MASK))
+#else
+/* A misaligned dst or src makes the copy count as unaligned */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+       (((uintptr_t)(dst) | (uintptr_t)(src)) & \
+       (RTE_ARM64_MEMCPY_ALIGN_MASK))
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ *
+ * When a threshold is not defined, the corresponding macro only tests the
+ * alignment class and places no upper bound on the size.
+ * The size argument and the threshold are parenthesized so that expression
+ * arguments and expression-valued overrides compare as single operands.
+ */
+#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
+#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
+(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+(n) <= (size_t)(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD))
+#else
+#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
+(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
+#endif
+#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
+#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
+(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+(n) <= (size_t)(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
+#else
+#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
+(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
+#endif
+/*
+ * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
+ * With neither threshold defined it is the constant 1, i.e. the hand-written
+ * copy routines are always selected.
+ */
+#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
+|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
+#define USE_RTE_MEMCPY(dst, src, n) \
+(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
+#else
+#define USE_RTE_MEMCPY(dst, src, n) (1)
+#endif
+/**************************************
+ * End of customization section
+ **************************************/
+
+/*
+ * Build-time advisory: when compiling with GCC and
+ * RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK is not defined, emit a #warning if
+ * GCC_VERSION is below 50400 (the message names GCC 5.4.0 as the suggested
+ * minimum). Compilation still proceeds; only a warning is produced.
+ */
+#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+/*
+ * Copy 16 bytes from src to dst with one 128-bit load followed by one
+ * 128-bit store. Both pointers are accessed through __uint128_t.
+ */
+static __rte_always_inline
+void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+       const __uint128_t word = *(const __uint128_t *)src;
+
+       *(__uint128_t *)dst = word;
+}
+
+/*
+ * Copy 32 bytes from src to dst as two 128-bit words. Both words are read
+ * into locals before the first one is written out.
+ */
+static __rte_always_inline
+void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+       const __uint128_t *in = (const __uint128_t *)src;
+       __uint128_t *out = (__uint128_t *)dst;
+       const __uint128_t w0 = in[0];
+       const __uint128_t w1 = in[1];
+
+       out[0] = w0;
+       out[1] = w1;
+}
+
+/*
+ * Copy 48 bytes from src to dst as three 128-bit words. All three words are
+ * read into locals before the first one is written out.
+ */
+static __rte_always_inline
+void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+       const __uint128_t *in = (const __uint128_t *)src;
+       __uint128_t *out = (__uint128_t *)dst;
+       const __uint128_t w0 = in[0];
+       const __uint128_t w1 = in[1];
+       const __uint128_t w2 = in[2];
+
+       out[0] = w0;
+       out[1] = w1;
+       out[2] = w2;
+}
+
+/*
+ * Copy 64 bytes from src to dst as four 128-bit words. All four words are
+ * read into locals before the first one is written out.
+ */
+static __rte_always_inline
+void rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+       const __uint128_t *in = (const __uint128_t *)src;
+       __uint128_t *out = (__uint128_t *)dst;
+       const __uint128_t w0 = in[0];
+       const __uint128_t w1 = in[1];
+       const __uint128_t w2 = in[2];
+       const __uint128_t w3 = in[3];
+
+       out[0] = w0;
+       out[1] = w1;
+       out[2] = w2;
+       out[3] = w3;
+}
+
+static __rte_always_inline
+void rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+       __uint128_t *dst128 = (__uint128_t *)dst;
+       const __uint128_t *src128 = (const __uint128_t *)src;
+       /*
+        * Copy 128 bytes from src to dst as eight 128-bit words.
+        * Words 0-3 are loaded up front; the loads of words 4-7 are then
+        * interleaved with the stores of words 0-3, and words 4-7 are stored
+        * last.
+        * Keep below declaration & copy sequence for optimized instructions.
+        * NOTE(review): dst and src are dereferenced as __uint128_t whatever
+        * their alignment; presumably this relies on the target permitting
+        * unaligned accesses - confirm.
+        */
+       const __uint128_t
+               x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+       dst128[0] = x0;
+       __uint128_t x4 = src128[4];
+       dst128[1] = x1;
+       __uint128_t x5 = src128[5];
+       dst128[2] = x2;
+       __uint128_t x6 = src128[6];
+       dst128[3] = x3;
+       __uint128_t x7 = src128[7];
+       dst128[4] = x4;
+       dst128[5] = x5;
+       dst128[6] = x6;
+       dst128[7] = x7;
+}
+
+/*
+ * Copy 256 bytes from src to dst as two consecutive 128-byte copies:
+ * first the lower half, then the upper half.
+ */
+static __rte_always_inline
+void rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+       const size_t half = 128;
+
+       rte_mov128(&dst[0], &src[0]);
+       rte_mov128(&dst[half], &src[half]);
+}
+
+/*
+ * Copy n bytes from src to dst; rte_memcpy() calls this only for n < 16.
+ *
+ * The highest set bit among bits 3..0 of n picks the access width (8, 4, 2
+ * or 1 bytes). For widths above one byte, two accesses are made: one at the
+ * start of the buffer and one ending at its last byte; they may overlap.
+ * Nothing is copied when none of those bits is set (e.g. n == 0).
+ */
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+       if (n & 0x08) {
+               /* 8 to 15 bytes: two 64-bit accesses */
+               const size_t tail = n - 8;
+
+               *(uint64_t *)dst = *(const uint64_t *)src;
+               *(uint64_t *)(dst + tail) = *(const uint64_t *)(src + tail);
+               return;
+       }
+       if (n & 0x04) {
+               /* 4 to 7 bytes: two 32-bit accesses */
+               const size_t tail = n - 4;
+
+               *(uint32_t *)dst = *(const uint32_t *)src;
+               *(uint32_t *)(dst + tail) = *(const uint32_t *)(src + tail);
+               return;
+       }
+       if (n & 0x02) {
+               /* 2 to 3 bytes: two 16-bit accesses */
+               const size_t tail = n - 2;
+
+               *(uint16_t *)dst = *(const uint16_t *)src;
+               *(uint16_t *)(dst + tail) = *(const uint16_t *)(src + tail);
+               return;
+       }
+       if (n & 0x01) {
+               /* exactly one byte */
+               dst[0] = src[0];
+       }
+}
+
+/*
+ * Copy n bytes from src to dst; the callers in this file pass
+ * 16 <= n < 128.
+ *
+ * A fixed-size head copy starts at the beginning of the buffer, and a
+ * fixed-size tail copy ends exactly at the last byte; the two may overlap.
+ */
+static __rte_always_inline
+void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+       uint8_t *const dst_end = dst + n;
+       const uint8_t *const src_end = src + n;
+
+       if (n >= 64) {
+               /* 64-byte head, then a tail sized by what is left over */
+               rte_mov64(dst, src);
+               if (n > 48 + 64)
+                       rte_mov64(dst_end - 64, src_end - 64);
+               else if (n > 32 + 64)
+                       rte_mov48(dst_end - 48, src_end - 48);
+               else if (n > 16 + 64)
+                       rte_mov32(dst_end - 32, src_end - 32);
+               else if (n > 64)
+                       rte_mov16(dst_end - 16, src_end - 16);
+               return;
+       }
+
+       if (n == 16) {
+               /* a single 16-byte copy covers everything */
+               rte_mov16(dst, src);
+               return;
+       }
+
+       /* head of 16, 32 or 48 bytes, always followed by a 16-byte tail */
+       if (n <= 32)
+               rte_mov16(dst, src);
+       else if (n <= 48)
+               rte_mov32(dst, src);
+       else
+               rte_mov48(dst, src);
+       rte_mov16(dst_end - 16, src_end - 16);
+}
+
+/*
+ * Copy n bytes from src to dst; rte_memcpy() calls this only for n >= 128.
+ *
+ * Whole 128-byte chunks are copied first (the loop body always runs at
+ * least once). Any remainder of 1..127 bytes is then finished either by a
+ * single fixed-size copy ending at the last byte (it may re-copy bytes
+ * already written) or, above 64 bytes, by rte_memcpy_ge16_lt128().
+ */
+static __rte_always_inline
+void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+       do {
+               rte_mov128(dst, src);
+               dst += 128;
+               src += 128;
+               n -= 128;
+       } while (likely(n >= 128));
+
+       if (likely(n != 0)) {
+               uint8_t *const dst_end = dst + n;
+               const uint8_t *const src_end = src + n;
+
+               if (n > 64)
+                       rte_memcpy_ge16_lt128(dst, src, n);
+               else if (n > 48)
+                       rte_mov64(dst_end - 64, src_end - 64);
+               else if (n > 32)
+                       rte_mov48(dst_end - 48, src_end - 48);
+               else if (n > 16)
+                       rte_mov32(dst_end - 32, src_end - 32);
+               else
+                       rte_mov16(dst_end - 16, src_end - 16);
+       }
+}
+
+/*
+ * Copy n bytes from src to dst; rte_memcpy() calls this only for
+ * 16 <= n < 64.
+ *
+ * Exactly 16 bytes take one copy. Otherwise a 16-, 32- or 48-byte head copy
+ * is followed by a 16-byte copy ending at the last byte; the two may overlap.
+ */
+static __rte_always_inline
+void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+       if (n == 16) {
+               rte_mov16(dst, src);
+               return;
+       }
+
+       if (n <= 32)
+               rte_mov16(dst, src);
+       else if (n <= 48)
+               rte_mov32(dst, src);
+       else
+               rte_mov48(dst, src);
+
+       /* tail: the last 16 bytes of the buffer */
+       rte_mov16(dst + n - 16, src + n - 16);
+}
+
+/*
+ * Copy n bytes from src to dst; rte_memcpy() calls this only for n >= 64.
+ *
+ * Whole 64-byte chunks are copied first (the loop body always runs at least
+ * once). Any remainder of 1..63 bytes is finished by one fixed-size copy
+ * that ends at the last byte and may re-copy bytes already written.
+ */
+static __rte_always_inline
+void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+       do {
+               rte_mov64(dst, src);
+               dst += 64;
+               src += 64;
+               n -= 64;
+       } while (likely(n >= 64));
+
+       if (likely(n != 0)) {
+               uint8_t *const dst_end = dst + n;
+               const uint8_t *const src_end = src + n;
+
+               if (n > 48)
+                       rte_mov64(dst_end - 64, src_end - 64);
+               else if (n > 32)
+                       rte_mov48(dst_end - 48, src_end - 48);
+               else if (n > 16)
+                       rte_mov32(dst_end - 32, src_end - 32);
+               else
+                       rte_mov16(dst_end - 16, src_end - 16);
+       }
+}
+
+/*
+ * rte_memcpy(): copy n bytes from src to dst and return dst.
+ *
+ * Two variants are provided, selected by RTE_CACHE_LINE_SIZE. They differ
+ * only in the size (128 or 64 bytes) at which the bulk path starts:
+ *   - n < 16           : rte_memcpy_lt16()
+ *   - 16 <= n < limit  : rte_memcpy_ge16_lt128() / rte_memcpy_ge16_lt64()
+ *   - n >= limit       : prefetch src and dst, then the hand-written bulk
+ *                        copy when USE_RTE_MEMCPY() allows it, otherwise
+ *                        the C library memcpy()
+ */
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+       uint8_t *const d = (uint8_t *)dst;
+       const uint8_t *const s = (const uint8_t *)src;
+
+       if (n < 16) {
+               rte_memcpy_lt16(d, s, n);
+       } else if (n < 128) {
+               rte_memcpy_ge16_lt128(d, s, n);
+       } else {
+               __builtin_prefetch(src, 0, 0);
+               __builtin_prefetch(dst, 1, 0);
+               if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+                       rte_memcpy_ge128(d, s, n);
+               } else {
+                       return memcpy(dst, src, n);
+               }
+       }
+       return dst;
+}
+
+#else
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+       uint8_t *const d = (uint8_t *)dst;
+       const uint8_t *const s = (const uint8_t *)src;
+
+       if (n < 16) {
+               rte_memcpy_lt16(d, s, n);
+       } else if (n < 64) {
+               rte_memcpy_ge16_lt64(d, s, n);
+       } else {
+               __builtin_prefetch(src, 0, 0);
+               __builtin_prefetch(dst, 1, 0);
+               if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+                       rte_memcpy_ge64(d, s, n);
+               } else {
+                       return memcpy(dst, src, n);
+               }
+       }
+       return dst;
+}
+#endif /* RTE_CACHE_LINE_SIZE >= 128 */
+
+#else
  static inline void
  rte_mov16(uint8_t *dst, const uint8_t *src)
  {
@@ -52,6 +363,8 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
  
  #define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
  
+#endif /* RTE_ARCH_ARM64_MEMCPY */
+
  #ifdef __cplusplus
  }
  #endif
author	Herbert Guan <herbert.guan@arm.com>
	Fri, 19 Jan 2018 06:10:36 +0000 (14:10 +0800)
committer	Thomas Monjalon <thomas@monjalon.net>
	Sat, 20 Jan 2018 16:47:25 +0000 (17:47 +0100)
config/common_armv8a_linuxapp		patch \| blob \| history
lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h		patch \| blob \| history