lib/librte_eal/arm/include/rte_vect.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2015 Cavium, Inc
   3  */
   4
   5 #ifndef _RTE_VECT_ARM_H_
   6 #define _RTE_VECT_ARM_H_
   7
   8 #include <stdint.h>
   9 #include "generic/rte_vect.h"
  10 #include "rte_debug.h"
  11 #include "arm_neon.h"
  12
  13 #ifdef __cplusplus
  14 extern "C" {
  15 #endif
  16
  17 typedef int32x4_t xmm_t;
  18
  19 #define XMM_SIZE        (sizeof(xmm_t))
  20 #define XMM_MASK        (XMM_SIZE - 1)
  21
  22 typedef union rte_xmm {
  23         xmm_t    x;
  24         uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
  25         uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
  26         uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
  27         uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
  28         double   pd[XMM_SIZE / sizeof(double)];
  29 } __rte_aligned(16) rte_xmm_t;
  30
  31 #ifdef RTE_ARCH_ARM
  32 /* NEON intrinsic vqtbl1q_u8() is not supported in ARMv7-A(AArch32) */
  33 static __inline uint8x16_t
  34 vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  35 {
  36         uint8_t i, pos;
  37         rte_xmm_t rte_a, rte_b, rte_ret;
  38
  39         vst1q_u8(rte_a.u8, a);
  40         vst1q_u8(rte_b.u8, b);
  41
  42         for (i = 0; i < 16; i++) {
  43                 pos = rte_b.u8[i];
  44                 if (pos < 16)
  45                         rte_ret.u8[i] = rte_a.u8[pos];
  46                 else
  47                         rte_ret.u8[i] = 0;
  48         }
  49
  50         return vld1q_u8(rte_ret.u8);
  51 }
  52
  53 static inline uint16_t
  54 vaddvq_u16(uint16x8_t a)
  55 {
  56         uint32x4_t m = vpaddlq_u16(a);
  57         uint64x2_t n = vpaddlq_u32(m);
  58         uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
  59
  60         return vget_lane_u32((uint32x2_t)o, 0);
  61 }
  62
  63 #endif
  64
  65 #if defined(RTE_ARCH_ARM) || \
  66 (defined(RTE_ARCH_ARM64) && RTE_CC_IS_GNU && (GCC_VERSION < 70000))
  67 /* NEON intrinsic vcopyq_laneq_u32() is not supported in ARMv7-A(AArch32)
  68  * On AArch64, this intrinsic is supported since GCC version 7.
  69  */
  70 static inline uint32x4_t
  71 vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
  72                  uint32x4_t b, const int lane_b)
  73 {
  74         return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
  75 }
  76 #endif
  77
  78 #if defined(RTE_ARCH_ARM64)
  79 #if RTE_CC_IS_GNU && (GCC_VERSION < 70000)
  80
  81 #if (GCC_VERSION < 40900)
  82 typedef uint64_t poly64_t;
  83 typedef uint64x2_t poly64x2_t;
  84 typedef uint8_t poly128_t __attribute__((vector_size(16), aligned(16)));
  85
  86 static inline uint32x4_t
  87 vceqzq_u32(uint32x4_t a)
  88 {
  89         return (a == 0);
  90 }
  91 #endif
  92
  93 /* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
  94 static inline uint64x2_t
  95 vreinterpretq_u64_p128(poly128_t x)
  96 {
  97         return (uint64x2_t)x;
  98 }
  99
 100 /* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
 101 static inline poly64x2_t
 102 vreinterpretq_p64_u64(uint64x2_t x)
 103 {
 104         return (poly64x2_t)x;
 105 }
 106
 107 /* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
 108 static inline poly64_t
 109 vgetq_lane_p64(poly64x2_t x, const int lane)
 110 {
 111         RTE_ASSERT(lane >= 0 && lane <= 1);
 112
 113         poly64_t *p = (poly64_t *)&x;
 114
 115         return p[lane];
 116 }
 117 #endif
 118 #endif
 119
 120 /*
 121  * If (0 <= index <= 15), then call the ASIMD ext instruction on the
 122  * 128 bit regs v0 and v1 with the appropriate index.
 123  *
 124  * Else returns a zero vector.
 125  */
 126 static inline uint8x16_t
 127 vextract(uint8x16_t v0, uint8x16_t v1, const int index)
 128 {
 129         switch (index) {
 130         case 0: return vextq_u8(v0, v1, 0);
 131         case 1: return vextq_u8(v0, v1, 1);
 132         case 2: return vextq_u8(v0, v1, 2);
 133         case 3: return vextq_u8(v0, v1, 3);
 134         case 4: return vextq_u8(v0, v1, 4);
 135         case 5: return vextq_u8(v0, v1, 5);
 136         case 6: return vextq_u8(v0, v1, 6);
 137         case 7: return vextq_u8(v0, v1, 7);
 138         case 8: return vextq_u8(v0, v1, 8);
 139         case 9: return vextq_u8(v0, v1, 9);
 140         case 10: return vextq_u8(v0, v1, 10);
 141         case 11: return vextq_u8(v0, v1, 11);
 142         case 12: return vextq_u8(v0, v1, 12);
 143         case 13: return vextq_u8(v0, v1, 13);
 144         case 14: return vextq_u8(v0, v1, 14);
 145         case 15: return vextq_u8(v0, v1, 15);
 146         }
 147         return vdupq_n_u8(0);
 148 }
 149
 150 /**
 151  * Shifts right 128 bit register by specified number of bytes
 152  *
 153  * Value of shift parameter must be in range 0 - 16
 154  */
 155 static inline uint64x2_t
 156 vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
 157 {
 158         return vreinterpretq_u64_u8(vextract(
 159                                 vreinterpretq_u8_u64(reg),
 160                                 vdupq_n_u8(0),
 161                                 shift));
 162 }
 163
 164 /**
 165  * Shifts left 128 bit register by specified number of bytes
 166  *
 167  * Value of shift parameter must be in range 0 - 16
 168  */
 169 static inline uint64x2_t
 170 vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
 171 {
 172         return vreinterpretq_u64_u8(vextract(
 173                                 vdupq_n_u8(0),
 174                                 vreinterpretq_u8_u64(reg),
 175                                 16 - shift));
 176 }
 177
 178 #ifdef __cplusplus
 179 }
 180 #endif
 181
 182 #endif