/*
 *   BSD LICENSE
 *
 *   Copyright(c) 2015 RehiveTech. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of RehiveTech nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _RTE_MEMCPY_ARM32_H_
#define _RTE_MEMCPY_ARM32_H_

#include <stdint.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

#include "generic/rte_memcpy.h"

#ifdef __ARM_NEON_FP

/* ARM NEON Intrinsics are used to copy data */
#include <arm_neon.h>

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vst1q_u8(dst, vld1q_u8(src));
}

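/*
 * rte_mov16() uses the vld1q_u8()/vst1q_u8() intrinsics to move a single
 * 128-bit q register. The larger copies below use inline assembly instead:
 * each "vld1.8"/"vst1.8" pair moves a range of 64-bit d registers, the
 * "+r" constraints mark src/dst as read-write pointer operands, and the
 * clobber list names "memory" plus every d register the block touches, so
 * the compiler neither caches memory contents across the asm nor keeps
 * live values in those registers.
 */
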
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	asm volatile (
		"vld1.8 {d0-d3}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		: : "memory", "d0", "d1", "d2", "d3");
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d5}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d5}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5");
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d7}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d7}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	asm volatile ("pld [%0, #64]" : : "r" (src));
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d7}, [%0]!\n\t"
		"vld1.8 {d8-d11}, [%0]!\n\t"
		"vld1.8 {d12-d15}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d7}, [%1]!\n\t"
		"vst1.8 {d8-d11}, [%1]!\n\t"
		"vst1.8 {d12-d15}, [%1]\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15");
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	asm volatile ("pld [%0, #64]" : : "r" (src));
	asm volatile ("pld [%0, #128]" : : "r" (src));
	asm volatile ("pld [%0, #192]" : : "r" (src));
	asm volatile ("pld [%0, #256]" : : "r" (src));
	asm volatile ("pld [%0, #320]" : : "r" (src));
	asm volatile ("pld [%0, #384]" : : "r" (src));
	asm volatile ("pld [%0, #448]" : : "r" (src));
	asm volatile (
		"vld1.8 {d0-d3}, [%0]!\n\t"
		"vld1.8 {d4-d7}, [%0]!\n\t"
		"vld1.8 {d8-d11}, [%0]!\n\t"
		"vld1.8 {d12-d15}, [%0]!\n\t"
		"vld1.8 {d16-d19}, [%0]!\n\t"
		"vld1.8 {d20-d23}, [%0]!\n\t"
		"vld1.8 {d24-d27}, [%0]!\n\t"
		"vld1.8 {d28-d31}, [%0]\n\t"
		"vst1.8 {d0-d3}, [%1]!\n\t"
		"vst1.8 {d4-d7}, [%1]!\n\t"
		"vst1.8 {d8-d11}, [%1]!\n\t"
		"vst1.8 {d12-d15}, [%1]!\n\t"
		"vst1.8 {d16-d19}, [%1]!\n\t"
		"vst1.8 {d20-d23}, [%1]!\n\t"
		"vst1.8 {d24-d27}, [%1]!\n\t"
		"vst1.8 {d28-d31}, [%1]!\n\t"
		: "+r" (src), "+r" (dst)
		:
		: "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
		  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
		  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
}

#define rte_memcpy(dst, src, n)	\
	({ (__builtin_constant_p(n)) ?	\
	memcpy((dst), (src), (n)) :	\
	rte_memcpy_func((dst), (src), (n)); })

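/*
 * Illustrative usage (pkt_len stands for any size known only at run time):
 *
 *     rte_memcpy(dst, src, 64);       constant size, resolves to memcpy(),
 *                                     which the compiler can expand inline
 *     rte_memcpy(dst, src, pkt_len);  runtime size, dispatches to
 *                                     rte_memcpy_func()
 */
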
static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using NEON registers so do it manually. */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08) {
			/* ARMv7 can not handle unaligned access to long long
			 * (uint64_t). Therefore two uint32_t operations are
			 * used.
			 */
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
			*(uint32_t *)dst = *(const uint32_t *)src;
		}
		return ret;
	}

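	/*
	 * Example of the bit decomposition above: n = 13 (0b1101) takes the
	 * 0x01, 0x04 and 0x08 branches, copying 1 + 4 + 8 = 13 bytes with
	 * scalar stores only.
	 */
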
	/* Special fast cases for <= 128 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

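	/*
	 * Each fast case copies a fixed-size head plus a tail aligned to the
	 * end of the buffer, so the two regions may overlap: for n = 25 the
	 * first rte_mov16() writes bytes 0-15 and the second bytes 9-24,
	 * rewriting bytes 9-15 harmlessly.
	 */
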
	/*
	 * For large copies (> 128 bytes), this combination of 256, 64 and 16
	 * byte copies was found to be faster than doing 128 and 32 byte
	 * copies as well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch
	 * statement encourages the compiler to use a jump table. To get
	 * incrementing integers, we shift the 2 relevant bits to the LSB
	 * position to first get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	default:
		break;
	}

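	/*
	 * Worked example: if n = 200 remains after the 256-byte loop, then
	 * n >> 6 = 3 and 3 - 3 = 0, so control enters case 0x00 and falls
	 * through all three 64-byte copies, leaving n = 8 for the 16-byte
	 * switch (which does nothing) and the final tail copy below.
	 */
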
	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	default:
		break;
	}

	/* Copy any remaining bytes, without going beyond end of buffers */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#else

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}

static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

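/*
 * In this non-NEON branch every helper simply forwards to the libc
 * memcpy(), so callers see the same rte_memcpy()/rte_mov*() API whether or
 * not the target provides NEON.
 */
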
#endif /* __ARM_NEON_FP */

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_ARM32_H_ */