lib/librte_eal/common/include/arch/arm/rte_vect.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2015 Cavium, Inc
   3  */
   4
   5 #ifndef _RTE_VECT_ARM_H_
   6 #define _RTE_VECT_ARM_H_
   7
   8 #include <stdint.h>
   9 #include "generic/rte_vect.h"
  10 #include "rte_debug.h"
  11 #include "arm_neon.h"
  12
  13 #ifdef __cplusplus
  14 extern "C" {
  15 #endif
  16
/* Vector type used by this header: a NEON vector of four signed 32-bit lanes. */
typedef int32x4_t xmm_t;

/* Size of one xmm_t in bytes. */
#define XMM_SIZE        (sizeof(xmm_t))
/* XMM_SIZE - 1; presumably used as an offset/alignment mask - confirm at the call sites. */
#define XMM_MASK        (XMM_SIZE - 1)
  21
/*
 * Overlay of one xmm_t with arrays of fixed-width integers and doubles.
 * Every array is sized from XMM_SIZE so that it spans the same number of
 * bytes as the vector, which allows element-wise access to its contents.
 * The union is aligned to 16 bytes.
 */
typedef union rte_xmm {
        xmm_t    x;                              /* the vector itself */
        uint8_t  u8[XMM_SIZE / sizeof(uint8_t)]; /* contents viewed as bytes */
        uint16_t u16[XMM_SIZE / sizeof(uint16_t)]; /* viewed as 16-bit words */
        uint32_t u32[XMM_SIZE / sizeof(uint32_t)]; /* viewed as 32-bit words */
        uint64_t u64[XMM_SIZE / sizeof(uint64_t)]; /* viewed as 64-bit words */
        double   pd[XMM_SIZE / sizeof(double)];  /* viewed as doubles */
} __attribute__((aligned(16))) rte_xmm_t;
  30
  31 #ifdef RTE_ARCH_ARM
  32 /* NEON intrinsic vqtbl1q_u8() is not supported in ARMv7-A(AArch32) */
  33 static __inline uint8x16_t
  34 vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  35 {
  36         uint8_t i, pos;
  37         rte_xmm_t rte_a, rte_b, rte_ret;
  38
  39         vst1q_u8(rte_a.u8, a);
  40         vst1q_u8(rte_b.u8, b);
  41
  42         for (i = 0; i < 16; i++) {
  43                 pos = rte_b.u8[i];
  44                 if (pos < 16)
  45                         rte_ret.u8[i] = rte_a.u8[pos];
  46                 else
  47                         rte_ret.u8[i] = 0;
  48         }
  49
  50         return vld1q_u8(rte_ret.u8);
  51 }
  52
  53 static inline uint16_t
  54 vaddvq_u16(uint16x8_t a)
  55 {
  56         uint32x4_t m = vpaddlq_u16(a);
  57         uint64x2_t n = vpaddlq_u32(m);
  58         uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
  59
  60         return vget_lane_u32((uint32x2_t)o, 0);
  61 }
  62
  63 #endif
  64
  65 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
  66 static inline uint32x4_t
  67 vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
  68                  uint32x4_t b, const int lane_b)
  69 {
  70         return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
  71 }
  72 #endif
  73
  74 #if defined(RTE_ARCH_ARM64)
  75 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
  76
  77 #if (GCC_VERSION < 40900)
/*
 * Substitute definitions of the polynomial types, used only when the
 * enclosing GCC_VERSION < 40900 check is true.
 */
/* 64-bit polynomial scalar, represented as a plain uint64_t. */
typedef uint64_t poly64_t;
/* Pair of 64-bit polynomials, represented as a uint64x2_t vector. */
typedef uint64x2_t poly64x2_t;
/* 128-bit polynomial, represented as a 16-byte, 16-byte-aligned uint8_t vector. */
typedef uint8_t poly128_t __attribute__((vector_size(16), aligned(16)));
  81 #endif
  82
  83 /* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
  84 static inline uint64x2_t
  85 vreinterpretq_u64_p128(poly128_t x)
  86 {
  87         return (uint64x2_t)x;
  88 }
  89
  90 /* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
  91 static inline poly64x2_t
  92 vreinterpretq_p64_u64(uint64x2_t x)
  93 {
  94         return (poly64x2_t)x;
  95 }
  96
  97 /* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
  98 static inline poly64_t
  99 vgetq_lane_p64(poly64x2_t x, const int lane)
 100 {
 101         RTE_ASSERT(lane >= 0 && lane <= 1);
 102
 103         poly64_t *p = (poly64_t *)&x;
 104
 105         return p[lane];
 106 }
 107 #endif
 108 #endif
 109
 110 /*
 111  * If (0 <= index <= 15), then call the ASIMD ext instruction on the
 112  * 128 bit regs v0 and v1 with the appropriate index.
 113  *
 114  * Else returns a zero vector.
 115  */
 116 static inline uint8x16_t
 117 vextract(uint8x16_t v0, uint8x16_t v1, const int index)
 118 {
 119         switch (index) {
 120         case 0: return vextq_u8(v0, v1, 0);
 121         case 1: return vextq_u8(v0, v1, 1);
 122         case 2: return vextq_u8(v0, v1, 2);
 123         case 3: return vextq_u8(v0, v1, 3);
 124         case 4: return vextq_u8(v0, v1, 4);
 125         case 5: return vextq_u8(v0, v1, 5);
 126         case 6: return vextq_u8(v0, v1, 6);
 127         case 7: return vextq_u8(v0, v1, 7);
 128         case 8: return vextq_u8(v0, v1, 8);
 129         case 9: return vextq_u8(v0, v1, 9);
 130         case 10: return vextq_u8(v0, v1, 10);
 131         case 11: return vextq_u8(v0, v1, 11);
 132         case 12: return vextq_u8(v0, v1, 12);
 133         case 13: return vextq_u8(v0, v1, 13);
 134         case 14: return vextq_u8(v0, v1, 14);
 135         case 15: return vextq_u8(v0, v1, 15);
 136         }
 137         return vdupq_n_u8(0);
 138 }
 139
 140 /**
 141  * Shifts right 128 bit register by specified number of bytes
 142  *
 143  * Value of shift parameter must be in range 0 - 16
 144  */
 145 static inline uint64x2_t
 146 vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
 147 {
 148         return vreinterpretq_u64_u8(vextract(
 149                                 vreinterpretq_u8_u64(reg),
 150                                 vdupq_n_u8(0),
 151                                 shift));
 152 }
 153
 154 /**
 155  * Shifts left 128 bit register by specified number of bytes
 156  *
 157  * Value of shift parameter must be in range 0 - 16
 158  */
 159 static inline uint64x2_t
 160 vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
 161 {
 162         return vreinterpretq_u64_u8(vextract(
 163                                 vdupq_n_u8(0),
 164                                 vreinterpretq_u8_u64(reg),
 165                                 16 - shift));
 166 }
 167
 168 #ifdef __cplusplus
 169 }
 170 #endif
 171
 172 #endif