lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h

   1 /*
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  * Copyright (C) IBM Corporation 2014.
   4  */
   5
   6 #ifndef _RTE_MEMCPY_PPC_64_H_
   7 #define _RTE_MEMCPY_PPC_64_H_
   8
   9 #include <stdint.h>
  10 #include <string.h>
  11 /* To include altivec.h, the GCC version must be >= 4.8. */
  12 #include <altivec.h>
  13
  14 #ifdef __cplusplus
  15 extern "C" {
  16 #endif
  17
  18 #include "generic/rte_memcpy.h"
  19
  20 static inline void
  21 rte_mov16(uint8_t *dst, const uint8_t *src)
  22 {
  23         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  24 }
  25
  26 static inline void
  27 rte_mov32(uint8_t *dst, const uint8_t *src)
  28 {
  29         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  30         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  31 }
  32
  33 static inline void
  34 rte_mov48(uint8_t *dst, const uint8_t *src)
  35 {
  36         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  37         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  38         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  39 }
  40
  41 static inline void
  42 rte_mov64(uint8_t *dst, const uint8_t *src)
  43 {
  44         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  45         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  46         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  47         vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
  48 }
  49
  50 static inline void
  51 rte_mov128(uint8_t *dst, const uint8_t *src)
  52 {
  53         vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
  54         vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
  55         vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
  56         vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
  57         vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
  58         vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
  59         vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
  60         vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
  61 }
  62
  63 static inline void
  64 rte_mov256(uint8_t *dst, const uint8_t *src)
  65 {
  66         rte_mov128(dst, src);
  67         rte_mov128(dst + 128, src + 128);
  68 }
  69
  70 #define rte_memcpy(dst, src, n)              \
  71         __extension__ ({                     \
  72         (__builtin_constant_p(n)) ?          \
  73         memcpy((dst), (src), (n)) :          \
  74         rte_memcpy_func((dst), (src), (n)); })
  75
  76 static inline void *
  77 rte_memcpy_func(void *dst, const void *src, size_t n)
  78 {
  79         void *ret = dst;
  80
  81         /* We can't copy < 16 bytes using XMM registers so do it manually. */
  82         if (n < 16) {
  83                 if (n & 0x01) {
  84                         *(uint8_t *)dst = *(const uint8_t *)src;
  85                         dst = (uint8_t *)dst + 1;
  86                         src = (const uint8_t *)src + 1;
  87                 }
  88                 if (n & 0x02) {
  89                         *(uint16_t *)dst = *(const uint16_t *)src;
  90                         dst = (uint16_t *)dst + 1;
  91                         src = (const uint16_t *)src + 1;
  92                 }
  93                 if (n & 0x04) {
  94                         *(uint32_t *)dst = *(const uint32_t *)src;
  95                         dst = (uint32_t *)dst + 1;
  96                         src = (const uint32_t *)src + 1;
  97                 }
  98                 if (n & 0x08)
  99                         *(uint64_t *)dst = *(const uint64_t *)src;
 100                 return ret;
 101         }
 102
 103         /* Special fast cases for <= 128 bytes */
 104         if (n <= 32) {
 105                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 106                 rte_mov16((uint8_t *)dst - 16 + n,
 107                         (const uint8_t *)src - 16 + n);
 108                 return ret;
 109         }
 110
 111         if (n <= 64) {
 112                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 113                 rte_mov32((uint8_t *)dst - 32 + n,
 114                         (const uint8_t *)src - 32 + n);
 115                 return ret;
 116         }
 117
 118         if (n <= 128) {
 119                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 120                 rte_mov64((uint8_t *)dst - 64 + n,
 121                         (const uint8_t *)src - 64 + n);
 122                 return ret;
 123         }
 124
 125         /*
 126          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 127          * copies was found to be faster than doing 128 and 32 byte copies as
 128          * well.
 129          */
 130         for ( ; n >= 256; n -= 256) {
 131                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 132                 dst = (uint8_t *)dst + 256;
 133                 src = (const uint8_t *)src + 256;
 134         }
 135
 136         /*
 137          * We split the remaining bytes (which will be less than 256) into
 138          * 64byte (2^6) chunks.
 139          * Using incrementing integers in the case labels of a switch statement
 140          * encourages the compiler to use a jump table. To get incrementing
 141          * integers, we shift the 2 relevant bits to the LSB position to first
 142          * get decrementing integers, and then subtract.
 143          */
 144         switch (3 - (n >> 6)) {
 145         case 0x00:
 146                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 147                 n -= 64;
 148                 dst = (uint8_t *)dst + 64;
 149                 src = (const uint8_t *)src + 64;      /* fallthrough */
 150         case 0x01:
 151                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 152                 n -= 64;
 153                 dst = (uint8_t *)dst + 64;
 154                 src = (const uint8_t *)src + 64;      /* fallthrough */
 155         case 0x02:
 156                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 157                 n -= 64;
 158                 dst = (uint8_t *)dst + 64;
 159                 src = (const uint8_t *)src + 64;      /* fallthrough */
 160         default:
 161                 ;
 162         }
 163
 164         /*
 165          * We split the remaining bytes (which will be less than 64) into
 166          * 16byte (2^4) chunks, using the same switch structure as above.
 167          */
 168         switch (3 - (n >> 4)) {
 169         case 0x00:
 170                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 171                 n -= 16;
 172                 dst = (uint8_t *)dst + 16;
 173                 src = (const uint8_t *)src + 16;      /* fallthrough */
 174         case 0x01:
 175                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 176                 n -= 16;
 177                 dst = (uint8_t *)dst + 16;
 178                 src = (const uint8_t *)src + 16;      /* fallthrough */
 179         case 0x02:
 180                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 181                 n -= 16;
 182                 dst = (uint8_t *)dst + 16;
 183                 src = (const uint8_t *)src + 16;      /* fallthrough */
 184         default:
 185                 ;
 186         }
 187
 188         /* Copy any remaining bytes, without going beyond end of buffers */
 189         if (n != 0)
 190                 rte_mov16((uint8_t *)dst - 16 + n,
 191                         (const uint8_t *)src - 16 + n);
 192         return ret;
 193 }
 194
 195 #ifdef __cplusplus
 196 }
 197 #endif
 198
 199 #endif /* _RTE_MEMCPY_PPC_64_H_ */