lib/eal/ppc/include/rte_memcpy.h

   1 /*
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  * Copyright (C) IBM Corporation 2014,2021
   4  */
   5
   6 #ifndef _RTE_MEMCPY_PPC_64_H_
   7 #define _RTE_MEMCPY_PPC_64_H_
   8
   9 #include <stdint.h>
  10 #include <string.h>
  11
  12 #include "rte_altivec.h"
  13 #include "rte_common.h"
  14
  15 #ifdef __cplusplus
  16 extern "C" {
  17 #endif
  18
  19 #include "generic/rte_memcpy.h"
  20
  21 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
  22 #pragma GCC diagnostic push
  23 #pragma GCC diagnostic ignored "-Warray-bounds"
  24 #endif
  25
  26 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
  27 #pragma GCC diagnostic push
  28 #pragma GCC diagnostic ignored "-Wstringop-overflow"
  29 #endif
  30
  31 static inline void
  32 rte_mov16(uint8_t *dst, const uint8_t *src)
  33 {
  34         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  35 }
  36
  37 static inline void
  38 rte_mov32(uint8_t *dst, const uint8_t *src)
  39 {
  40         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  41         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  42 }
  43
  44 static inline void
  45 rte_mov48(uint8_t *dst, const uint8_t *src)
  46 {
  47         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  48         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  49         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  50 }
  51
  52 static inline void
  53 rte_mov64(uint8_t *dst, const uint8_t *src)
  54 {
  55         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  56         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  57         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  58         vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
  59 }
  60
  61 static inline void
  62 rte_mov128(uint8_t *dst, const uint8_t *src)
  63 {
  64         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  65         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  66         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  67         vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
  68         vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
  69         vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
  70         vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
  71         vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
  72 }
  73
  74 static inline void
  75 rte_mov256(uint8_t *dst, const uint8_t *src)
  76 {
  77         rte_mov128(dst, src);
  78         rte_mov128(dst + 128, src + 128);
  79 }
  80
  81 #define rte_memcpy(dst, src, n)              \
  82         __extension__ ({                     \
  83         (__builtin_constant_p(n)) ?          \
  84         memcpy((dst), (src), (n)) :          \
  85         rte_memcpy_func((dst), (src), (n)); })
  86
  87 static inline void *
  88 rte_memcpy_func(void *dst, const void *src, size_t n)
  89 {
  90         void *ret = dst;
  91
  92         /* We can't copy < 16 bytes using XMM registers so do it manually. */
  93         if (n < 16) {
  94                 if (n & 0x01) {
  95                         *(uint8_t *)dst = *(const uint8_t *)src;
  96                         dst = (uint8_t *)dst + 1;
  97                         src = (const uint8_t *)src + 1;
  98                 }
  99                 if (n & 0x02) {
 100                         *(uint16_t *)dst = *(const uint16_t *)src;
 101                         dst = (uint16_t *)dst + 1;
 102                         src = (const uint16_t *)src + 1;
 103                 }
 104                 if (n & 0x04) {
 105                         *(uint32_t *)dst = *(const uint32_t *)src;
 106                         dst = (uint32_t *)dst + 1;
 107                         src = (const uint32_t *)src + 1;
 108                 }
 109                 if (n & 0x08)
 110                         *(uint64_t *)dst = *(const uint64_t *)src;
 111                 return ret;
 112         }
 113
 114         /* Special fast cases for <= 128 bytes */
 115         if (n <= 32) {
 116                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 117                 rte_mov16((uint8_t *)dst - 16 + n,
 118                         (const uint8_t *)src - 16 + n);
 119                 return ret;
 120         }
 121
 122         if (n <= 64) {
 123                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 124                 rte_mov32((uint8_t *)dst - 32 + n,
 125                         (const uint8_t *)src - 32 + n);
 126                 return ret;
 127         }
 128
 129         if (n <= 128) {
 130                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 131                 rte_mov64((uint8_t *)dst - 64 + n,
 132                         (const uint8_t *)src - 64 + n);
 133                 return ret;
 134         }
 135
 136         /*
 137          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 138          * copies was found to be faster than doing 128 and 32 byte copies as
 139          * well.
 140          */
 141         for ( ; n >= 256; n -= 256) {
 142                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 143                 dst = (uint8_t *)dst + 256;
 144                 src = (const uint8_t *)src + 256;
 145         }
 146
 147         /*
 148          * We split the remaining bytes (which will be less than 256) into
 149          * 64byte (2^6) chunks.
 150          * Using incrementing integers in the case labels of a switch statement
 151          * encourages the compiler to use a jump table. To get incrementing
 152          * integers, we shift the 2 relevant bits to the LSB position to first
 153          * get decrementing integers, and then subtract.
 154          */
 155         switch (3 - (n >> 6)) {
 156         case 0x00:
 157                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 158                 n -= 64;
 159                 dst = (uint8_t *)dst + 64;
 160                 src = (const uint8_t *)src + 64;      /* fallthrough */
 161         case 0x01:
 162                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 163                 n -= 64;
 164                 dst = (uint8_t *)dst + 64;
 165                 src = (const uint8_t *)src + 64;      /* fallthrough */
 166         case 0x02:
 167                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 168                 n -= 64;
 169                 dst = (uint8_t *)dst + 64;
 170                 src = (const uint8_t *)src + 64;      /* fallthrough */
 171         default:
 172                 ;
 173         }
 174
 175         /*
 176          * We split the remaining bytes (which will be less than 64) into
 177          * 16byte (2^4) chunks, using the same switch structure as above.
 178          */
 179         switch (3 - (n >> 4)) {
 180         case 0x00:
 181                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 182                 n -= 16;
 183                 dst = (uint8_t *)dst + 16;
 184                 src = (const uint8_t *)src + 16;      /* fallthrough */
 185         case 0x01:
 186                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 187                 n -= 16;
 188                 dst = (uint8_t *)dst + 16;
 189                 src = (const uint8_t *)src + 16;      /* fallthrough */
 190         case 0x02:
 191                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 192                 n -= 16;
 193                 dst = (uint8_t *)dst + 16;
 194                 src = (const uint8_t *)src + 16;      /* fallthrough */
 195         default:
 196                 ;
 197         }
 198
 199         /* Copy any remaining bytes, without going beyond end of buffers */
 200         if (n != 0)
 201                 rte_mov16((uint8_t *)dst - 16 + n,
 202                         (const uint8_t *)src - 16 + n);
 203         return ret;
 204 }
 205
 206 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 207 #pragma GCC diagnostic pop
 208 #endif
 209
 210 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 90000)
 211 #pragma GCC diagnostic pop
 212 #endif
 213
 214 #ifdef __cplusplus
 215 }
 216 #endif
 217
 218 #endif /* _RTE_MEMCPY_PPC_64_H_ */