lib/eal/ppc/include/rte_memcpy.h

   1 /*
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  * Copyright (C) IBM Corporation 2014.
   4  */
   5
   6 #ifndef _RTE_MEMCPY_PPC_64_H_
   7 #define _RTE_MEMCPY_PPC_64_H_
   8
   9 #include <stdint.h>
  10 #include <string.h>
  11
  12 #include "rte_altivec.h"
  13 #include "rte_common.h"
  14
  15 #ifdef __cplusplus
  16 extern "C" {
  17 #endif
  18
  19 #include "generic/rte_memcpy.h"
  20
  21 #if (GCC_VERSION >= 90000 && GCC_VERSION < 90400)
  22 #pragma GCC diagnostic push
  23 #pragma GCC diagnostic ignored "-Warray-bounds"
  24 #endif
  25
  26 static inline void
  27 rte_mov16(uint8_t *dst, const uint8_t *src)
  28 {
  29         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  30 }
  31
  32 static inline void
  33 rte_mov32(uint8_t *dst, const uint8_t *src)
  34 {
  35         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  36         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  37 }
  38
  39 static inline void
  40 rte_mov48(uint8_t *dst, const uint8_t *src)
  41 {
  42         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  43         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  44         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  45 }
  46
  47 static inline void
  48 rte_mov64(uint8_t *dst, const uint8_t *src)
  49 {
  50         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  51         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  52         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  53         vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
  54 }
  55
  56 static inline void
  57 rte_mov128(uint8_t *dst, const uint8_t *src)
  58 {
  59         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  60         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  61         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  62         vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
  63         vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
  64         vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
  65         vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
  66         vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
  67 }
  68
  69 static inline void
  70 rte_mov256(uint8_t *dst, const uint8_t *src)
  71 {
  72         rte_mov128(dst, src);
  73         rte_mov128(dst + 128, src + 128);
  74 }
  75
/*
 * Copy n bytes from src to dst.
 *
 * Selects the implementation with __builtin_constant_p(n): when the compiler
 * can prove n is a compile-time constant the copy goes through memcpy(),
 * otherwise it goes through rte_memcpy_func(). The macro is a GNU statement
 * expression (hence __extension__) and evaluates to the value returned by
 * whichever call was selected.
 *
 * NOTE(review): the constant-size branch presumably exists so the compiler
 * can expand fixed-size copies itself -- confirm before changing.
 */
#define rte_memcpy(dst, src, n)              \
        __extension__ ({                     \
        (__builtin_constant_p(n)) ?          \
        memcpy((dst), (src), (n)) :          \
        rte_memcpy_func((dst), (src), (n)); })
  81
  82 static inline void *
  83 rte_memcpy_func(void *dst, const void *src, size_t n)
  84 {
  85         void *ret = dst;
  86
  87         /* We can't copy < 16 bytes using XMM registers so do it manually. */
  88         if (n < 16) {
  89                 if (n & 0x01) {
  90                         *(uint8_t *)dst = *(const uint8_t *)src;
  91                         dst = (uint8_t *)dst + 1;
  92                         src = (const uint8_t *)src + 1;
  93                 }
  94                 if (n & 0x02) {
  95                         *(uint16_t *)dst = *(const uint16_t *)src;
  96                         dst = (uint16_t *)dst + 1;
  97                         src = (const uint16_t *)src + 1;
  98                 }
  99                 if (n & 0x04) {
 100                         *(uint32_t *)dst = *(const uint32_t *)src;
 101                         dst = (uint32_t *)dst + 1;
 102                         src = (const uint32_t *)src + 1;
 103                 }
 104                 if (n & 0x08)
 105                         *(uint64_t *)dst = *(const uint64_t *)src;
 106                 return ret;
 107         }
 108
 109         /* Special fast cases for <= 128 bytes */
 110         if (n <= 32) {
 111                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 112                 rte_mov16((uint8_t *)dst - 16 + n,
 113                         (const uint8_t *)src - 16 + n);
 114                 return ret;
 115         }
 116
 117         if (n <= 64) {
 118                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 119                 rte_mov32((uint8_t *)dst - 32 + n,
 120                         (const uint8_t *)src - 32 + n);
 121                 return ret;
 122         }
 123
 124         if (n <= 128) {
 125                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 126                 rte_mov64((uint8_t *)dst - 64 + n,
 127                         (const uint8_t *)src - 64 + n);
 128                 return ret;
 129         }
 130
 131         /*
 132          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 133          * copies was found to be faster than doing 128 and 32 byte copies as
 134          * well.
 135          */
 136         for ( ; n >= 256; n -= 256) {
 137                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 138                 dst = (uint8_t *)dst + 256;
 139                 src = (const uint8_t *)src + 256;
 140         }
 141
 142         /*
 143          * We split the remaining bytes (which will be less than 256) into
 144          * 64byte (2^6) chunks.
 145          * Using incrementing integers in the case labels of a switch statement
 146          * encourages the compiler to use a jump table. To get incrementing
 147          * integers, we shift the 2 relevant bits to the LSB position to first
 148          * get decrementing integers, and then subtract.
 149          */
 150         switch (3 - (n >> 6)) {
 151         case 0x00:
 152                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 153                 n -= 64;
 154                 dst = (uint8_t *)dst + 64;
 155                 src = (const uint8_t *)src + 64;      /* fallthrough */
 156         case 0x01:
 157                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 158                 n -= 64;
 159                 dst = (uint8_t *)dst + 64;
 160                 src = (const uint8_t *)src + 64;      /* fallthrough */
 161         case 0x02:
 162                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 163                 n -= 64;
 164                 dst = (uint8_t *)dst + 64;
 165                 src = (const uint8_t *)src + 64;      /* fallthrough */
 166         default:
 167                 ;
 168         }
 169
 170         /*
 171          * We split the remaining bytes (which will be less than 64) into
 172          * 16byte (2^4) chunks, using the same switch structure as above.
 173          */
 174         switch (3 - (n >> 4)) {
 175         case 0x00:
 176                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 177                 n -= 16;
 178                 dst = (uint8_t *)dst + 16;
 179                 src = (const uint8_t *)src + 16;      /* fallthrough */
 180         case 0x01:
 181                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 182                 n -= 16;
 183                 dst = (uint8_t *)dst + 16;
 184                 src = (const uint8_t *)src + 16;      /* fallthrough */
 185         case 0x02:
 186                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 187                 n -= 16;
 188                 dst = (uint8_t *)dst + 16;
 189                 src = (const uint8_t *)src + 16;      /* fallthrough */
 190         default:
 191                 ;
 192         }
 193
 194         /* Copy any remaining bytes, without going beyond end of buffers */
 195         if (n != 0)
 196                 rte_mov16((uint8_t *)dst - 16 + n,
 197                         (const uint8_t *)src - 16 + n);
 198         return ret;
 199 }
 200
 201 #if (GCC_VERSION >= 90000 && GCC_VERSION < 90400)
 202 #pragma GCC diagnostic pop
 203 #endif
 204
 205 #ifdef __cplusplus
 206 }
 207 #endif
 208
 209 #endif /* _RTE_MEMCPY_PPC_64_H_ */