eal/arm64: add 128-bit atomic compare exchange
author Phil Yang <phil.yang@arm.com>
Fri, 18 Oct 2019 11:21:28 +0000 (19:21 +0800)
committer David Marchand <david.marchand@redhat.com>
Mon, 21 Oct 2019 08:06:13 +0000 (10:06 +0200)
This patch adds an implementation of the 128-bit atomic compare
exchange API on aarch64. The operation is performed with pairs of
64-bit 'ldxp/stxp' instructions. On platforms with the LSE atomic
extension, it is implemented with 'casp' instructions instead, for
better performance.

Since the '__ARM_FEATURE_ATOMICS' macro is only defined by GCC 9 and
newer, this patch adds a new config flag 'RTE_ARM_FEATURE_ATOMICS' to
enable the 'casp' version with older compilers.
For octeontx2, we make sure that the LSE (and other) extensions are
enabled even if the compiler does not know of the octeontx2 target
CPU.

Since the code uses the x0 register directly and cas_op_name() and
rte_atomic128_cmp_exchange() are inline functions, the compiler may,
depending on register pressure in the parent function, clobber x0 and
thereby break the aarch64 ABI. Define the CAS operations as
__rte_noinline functions to avoid such an ABI break [1].

1: https://git.dpdk.org/dpdk/commit/?id=5b40ec6b9662
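
For illustration, a minimal caller-side sketch of the new API (not
part of this patch; the helper name is hypothetical). The retry loop
relies on rte_atomic128_cmp_exchange() refreshing 'exp' with the
current value of the destination on failure:

    #include <rte_atomic.h>

    /* Lock-free increment of both halves of a 128-bit counter. */
    static void
    inc128(rte_int128_t *counter)
    {
            rte_int128_t exp = *counter; /* may be torn; CAS corrects it */
            rte_int128_t des;

            do {
                    des.val[0] = exp.val[0] + 1;
                    des.val[1] = exp.val[1] + 1;
                    /* On failure, exp is reloaded from *counter. */
            } while (rte_atomic128_cmp_exchange(counter, &exp, &des,
                            0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED) == 0);
    }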

Suggested-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Tested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
Reviewed-by: David Marchand <david.marchand@redhat.com>
config/arm/meson.build
config/common_base
config/defconfig_arm64-octeontx2-linuxapp-gcc
config/defconfig_arm64-thunderx2-linuxapp-gcc
lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
lib/librte_eal/common/include/generic/rte_atomic.h

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e1630eae14c62c48cb111d0f83e38d65b1..d9f9811f7e58ac8b2f2a31aa5f558fea009105ef 100644
@@ -71,11 +71,13 @@ flags_thunderx2_extra = [
        ['RTE_CACHE_LINE_SIZE', 64],
        ['RTE_MAX_NUMA_NODES', 2],
        ['RTE_MAX_LCORE', 256],
+       ['RTE_ARM_FEATURE_ATOMICS', true],
        ['RTE_USE_C11_MEM_MODEL', true]]
 flags_octeontx2_extra = [
        ['RTE_MACHINE', '"octeontx2"'],
        ['RTE_MAX_NUMA_NODES', 1],
        ['RTE_MAX_LCORE', 24],
+       ['RTE_ARM_FEATURE_ATOMICS', true],
        ['RTE_EAL_IGB_UIO', false],
        ['RTE_USE_C11_MEM_MODEL', true]]
 
@@ -96,7 +98,7 @@ machine_args_cavium = [
        ['0xa2', ['-mcpu=thunderxt81'], flags_thunderx_extra],
        ['0xa3', ['-mcpu=thunderxt83'], flags_thunderx_extra],
        ['0xaf', ['-march=armv8.1-a+crc+crypto','-mcpu=thunderx2t99'], flags_thunderx2_extra],
-       ['0xb2', ['-mcpu=octeontx2'], flags_octeontx2_extra]]
+       ['0xb2', ['-march=armv8.2-a+crc+crypto+lse','-mcpu=octeontx2'], flags_octeontx2_extra]]
 
 ## Arm implementer ID (ARM DDI 0487C.a, Section G7.2.106, Page G7-5321)
 impl_generic = ['Generic armv8', flags_generic, machine_args_generic]
diff --git a/config/common_base b/config/common_base
index e843a21604d1f3b2a707cd600a0fa146d092d442..232315920285dda0d180426a3a1d41abf5359882 100644
@@ -120,6 +120,9 @@ CONFIG_RTE_USE_LIBBSD=n
 CONFIG_RTE_ENABLE_AVX=y
 CONFIG_RTE_ENABLE_AVX512=n
 
+# Use ARM LSE ATOMIC instructions
+CONFIG_RTE_ARM_FEATURE_ATOMICS=n
+
 # Default driver path (or "" to disable)
 CONFIG_RTE_EAL_PMD_PATH=""
 
diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc b/config/defconfig_arm64-octeontx2-linuxapp-gcc
index f20da2442306c0c97acdf9562aeb25ee4a64c843..7687dbec8d8bc7944e30c0d6fc8eb0cd44a95fc5 100644
@@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
 CONFIG_RTE_CACHE_LINE_SIZE=128
 CONFIG_RTE_MAX_NUMA_NODES=1
 CONFIG_RTE_MAX_LCORE=24
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
 
 # Doesn't support NUMA
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc b/config/defconfig_arm64-thunderx2-linuxapp-gcc
index cc5c64ba0236ad5f0887b9ad621f4ad36e9c0f9c..af4a89c480fd9303633d15b11098702f102bab58 100644
@@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
 CONFIG_RTE_CACHE_LINE_SIZE=64
 CONFIG_RTE_MAX_NUMA_NODES=2
 CONFIG_RTE_MAX_LCORE=256
+CONFIG_RTE_ARM_FEATURE_ATOMICS=y
diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4447081f7139312636bd57162e346ae3f8..859ae129d8d1459387d90e8d618c27161b6a00da 100644
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2015 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_ATOMIC_ARM64_H_
@@ -14,6 +15,9 @@ extern "C" {
 #endif
 
 #include "generic/rte_atomic.h"
+#include <rte_branch_prediction.h>
+#include <rte_compat.h>
+#include <rte_debug.h>
 
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
@@ -40,6 +44,148 @@ extern "C" {
 
 #define rte_cio_rmb() dmb(oshld)
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
+static __rte_noinline rte_int128_t                                          \
+cas_op_name(rte_int128_t *dst, rte_int128_t old, rte_int128_t updated)      \
+{                                                                           \
+       /* The caspX instructions' register pair must start from an
+        * even-numbered register at operand 1.
+        * So, specify the registers for the local variables here.
+        */                                                                 \
+       register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
+       register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
+       register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
+       register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
+       asm volatile(                                                       \
+               op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
+               : [old0] "+r" (x0),                                         \
+               [old1] "+r" (x1)                                            \
+               : [upd0] "r" (x2),                                          \
+               [upd1] "r" (x3),                                            \
+               [dst] "r" (dst)                                             \
+               : "memory");                                                \
+       old.val[0] = x0;                                                    \
+       old.val[1] = x1;                                                    \
+       return old;                                                         \
+}
+
+__ATOMIC128_CAS_OP(__cas_128_relaxed, "casp")
+__ATOMIC128_CAS_OP(__cas_128_acquire, "caspa")
+__ATOMIC128_CAS_OP(__cas_128_release, "caspl")
+__ATOMIC128_CAS_OP(__cas_128_acq_rel, "caspal")
+
+#undef __ATOMIC128_CAS_OP
+
+#endif
+
+__rte_experimental
+static inline int
+rte_atomic128_cmp_exchange(rte_int128_t *dst, rte_int128_t *exp,
+               const rte_int128_t *src, unsigned int weak, int success,
+               int failure)
+{
+       /* Always do strong CAS */
+       RTE_SET_USED(weak);
+       /* Ignore the memory ordering for failure; the memory order for
+        * success must be equal to or stronger than it.
+        */
+       RTE_SET_USED(failure);
+       /* Find invalid memory order */
+       RTE_ASSERT(success == __ATOMIC_RELAXED ||
+               success == __ATOMIC_ACQUIRE ||
+               success == __ATOMIC_RELEASE ||
+               success == __ATOMIC_ACQ_REL ||
+               success == __ATOMIC_SEQ_CST);
+
+       rte_int128_t expected = *exp;
+       rte_int128_t desired = *src;
+       rte_int128_t old;
+
+#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
+       if (success == __ATOMIC_RELAXED)
+               old = __cas_128_relaxed(dst, expected, desired);
+       else if (success == __ATOMIC_ACQUIRE)
+               old = __cas_128_acquire(dst, expected, desired);
+       else if (success == __ATOMIC_RELEASE)
+               old = __cas_128_release(dst, expected, desired);
+       else
+               old = __cas_128_acq_rel(dst, expected, desired);
+#else
+#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
+#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
+               (mo) == __ATOMIC_SEQ_CST)
+
+       int ldx_mo = __HAS_ACQ(success) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED;
+       int stx_mo = __HAS_RLS(success) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED;
+
+#undef __HAS_ACQ
+#undef __HAS_RLS
+
+       uint32_t ret = 1;
+
+       /* ldx128 cannot guarantee atomicity on its own;
+        * src or old must be written back to verify that the ldx128
+        * read was atomic.
+        */
+       do {
+
+#define __LOAD_128(op_string, src, dst) { \
+       asm volatile(                     \
+               op_string " %0, %1, %2"   \
+               : "=&r" (dst.val[0]),     \
+                 "=&r" (dst.val[1])      \
+               : "Q" (src->val[0])       \
+               : "memory"); }
+
+               if (ldx_mo == __ATOMIC_RELAXED)
+                       __LOAD_128("ldxp", dst, old)
+               else
+                       __LOAD_128("ldaxp", dst, old)
+
+#undef __LOAD_128
+
+#define __STORE_128(op_string, dst, src, ret) { \
+       asm volatile(                           \
+               op_string " %w0, %1, %2, %3"    \
+               : "=&r" (ret)                   \
+               : "r" (src.val[0]),             \
+                 "r" (src.val[1]),             \
+                 "Q" (dst->val[0])             \
+               : "memory"); }
+
+               if (likely(old.int128 == expected.int128)) {
+                       if (stx_mo == __ATOMIC_RELAXED)
+                               __STORE_128("stxp", dst, desired, ret)
+                       else
+                               __STORE_128("stlxp", dst, desired, ret)
+               } else {
+                       /* In the failure case (since 'weak' is ignored and only
+                        * weak == 0 is implemented), expected should contain
+                        * the atomically read value of dst. This means, 'old'
+                        * needs to be stored back to ensure it was read
+                        * atomically.
+                        */
+                       if (stx_mo == __ATOMIC_RELAXED)
+                               __STORE_128("stxp", dst, old, ret)
+                       else
+                               __STORE_128("stlxp", dst, old, ret)
+               }
+
+#undef __STORE_128
+
+       } while (unlikely(ret));
+#endif
+
+       /* Unconditionally updating expected removes an 'if' statement.
+        * expected should already be in register if not in the cache.
+        */
+       *exp = old;
+
+       return (old.int128 == expected.int128);
+}
+
 #ifdef __cplusplus
 }
 #endif
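
Aside: the write-back on the failure path of the ldxp/stxp fallback
above is what makes a torn-free 128-bit read possible through this
primitive. A minimal sketch, assuming only the API added by this
patch (the helper name is hypothetical):

    /* Emulate an atomic 128-bit load with the CAS primitive. Even when
     * the compare fails, *exp is refreshed with a value that was read
     * atomically, so the returned snapshot is never torn.
     */
    static inline rte_int128_t
    atomic128_load_sketch(rte_int128_t *src)
    {
            rte_int128_t snapshot = {.val = {0, 0} };

            /* The exchange succeeds only if *src happens to equal zero
             * (and then stores that same zero value back); either way,
             * snapshot holds a consistent copy of *src on return.
             */
            rte_atomic128_cmp_exchange(src, &snapshot, &snapshot, 0,
                            __ATOMIC_RELAXED, __ATOMIC_RELAXED);
            return snapshot;
    }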
diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index 1335d92f4c31fb4c0d5efcdfb21b48e9e762fd4e..cfe7067ddd0639b756e06fe5f9da41259cb0c479 100644
@@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
-/**
- * 128-bit integer structure.
- */
-RTE_STD_C11
-typedef struct {
-       RTE_STD_C11
-       union {
-               uint64_t val[2];
-               __extension__ __int128 int128;
-       };
-} __rte_aligned(16) rte_int128_t;
-
 __rte_experimental
 static inline int
 rte_atomic128_cmp_exchange(rte_int128_t *dst,
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index 24ff7dcae634d1e25b68f7cd8594a9ff3d51bc8d..e6ab15a9733ce5e3f2ca85dd7e1cae86eda842ef 100644
@@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 
 /*------------------------ 128 bit atomic operations -------------------------*/
 
+/**
+ * 128-bit integer structure.
+ */
+RTE_STD_C11
+typedef struct {
+       RTE_STD_C11
+       union {
+               uint64_t val[2];
+#ifdef RTE_ARCH_64
+               __extension__ __int128 int128;
+#endif
+       };
+} __rte_aligned(16) rte_int128_t;
+
 #ifdef __DOXYGEN__
 
 /**
@@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
  *     *exp = *dst
  * @endcode
  *
- * @note This function is currently only available for the x86-64 platform.
+ * @note This function is currently available for the x86-64 and aarch64
+ * platforms.
  *
  * @note The success and failure arguments must be one of the __ATOMIC_* values
  * defined in the C++11 standard. For details on their behavior, refer to the