lib/librte_eal/common/include/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  *  version: DPDK.L.1.2.3-3
  34  */
  35
  36 #ifndef _RTE_MEMCPY_H_
  37 #define _RTE_MEMCPY_H_
  38
  39 /**
  40  * @file
  41  *
  42  * Functions for SSE implementation of memcpy().
  43  */
  44
  45 #include <stdint.h>
  46 #include <string.h>
  47
  48 #ifdef __cplusplus
  49 extern "C" {
  50 #endif
  51
  52 /**
  53  * Copy 16 bytes from one location to another using optimised SSE
  54  * instructions. The locations should not overlap.
  55  *
  56  * @param dst
  57  *   Pointer to the destination of the data.
  58  * @param src
  59  *   Pointer to the source data.
  60  */
  61 static inline void
  62 rte_mov16(uint8_t *dst, const uint8_t *src)
  63 {
  64         asm volatile ("movdqu (%[src]), %%xmm0\n\t"
  65                       "movdqu %%xmm0, (%[dst])\n\t"
  66                       :
  67                       : [src] "r" (src),
  68                         [dst] "r"(dst)
  69                       : "xmm0", "memory");
  70 }
  71
  72 /**
  73  * Copy 32 bytes from one location to another using optimised SSE
  74  * instructions. The locations should not overlap.
  75  *
  76  * @param dst
  77  *   Pointer to the destination of the data.
  78  * @param src
  79  *   Pointer to the source data.
  80  */
  81 static inline void
  82 rte_mov32(uint8_t *dst, const uint8_t *src)
  83 {
  84         asm volatile ("movdqu (%[src]), %%xmm0\n\t"
  85                       "movdqu 16(%[src]), %%xmm1\n\t"
  86                       "movdqu %%xmm0, (%[dst])\n\t"
  87                       "movdqu %%xmm1, 16(%[dst])"
  88                       :
  89                       : [src] "r" (src),
  90                         [dst] "r"(dst)
  91                       : "xmm0", "xmm1", "memory");
  92 }
  93
  94 /**
  95  * Copy 48 bytes from one location to another using optimised SSE
  96  * instructions. The locations should not overlap.
  97  *
  98  * @param dst
  99  *   Pointer to the destination of the data.
 100  * @param src
 101  *   Pointer to the source data.
 102  */
 103 static inline void
 104 rte_mov48(uint8_t *dst, const uint8_t *src)
 105 {
 106         asm volatile ("movdqu (%[src]), %%xmm0\n\t"
 107                       "movdqu 16(%[src]), %%xmm1\n\t"
 108                       "movdqu 32(%[src]), %%xmm2\n\t"
 109                       "movdqu %%xmm0, (%[dst])\n\t"
 110                       "movdqu %%xmm1, 16(%[dst])\n\t"
 111                       "movdqu %%xmm2, 32(%[dst])"
 112                       :
 113                       : [src] "r" (src),
 114                         [dst] "r"(dst)
 115                       : "xmm0", "xmm1", "memory");
 116 }
 117
 118 /**
 119  * Copy 64 bytes from one location to another using optimised SSE
 120  * instructions. The locations should not overlap.
 121  *
 122  * @param dst
 123  *   Pointer to the destination of the data.
 124  * @param src
 125  *   Pointer to the source data.
 126  */
 127 static inline void
 128 rte_mov64(uint8_t *dst, const uint8_t *src)
 129 {
 130         asm volatile ("movdqu (%[src]), %%xmm0\n\t"
 131                       "movdqu 16(%[src]), %%xmm1\n\t"
 132                       "movdqu 32(%[src]), %%xmm2\n\t"
 133                       "movdqu 48(%[src]), %%xmm3\n\t"
 134                       "movdqu %%xmm0, (%[dst])\n\t"
 135                       "movdqu %%xmm1, 16(%[dst])\n\t"
 136                       "movdqu %%xmm2, 32(%[dst])\n\t"
 137                       "movdqu %%xmm3, 48(%[dst])"
 138                       :
 139                       : [src] "r" (src),
 140                         [dst] "r"(dst)
 141                       : "xmm0", "xmm1", "xmm2", "xmm3","memory");
 142 }
 143
 144 /**
 145  * Copy 128 bytes from one location to another using optimised SSE
 146  * instructions. The locations should not overlap.
 147  *
 148  * @param dst
 149  *   Pointer to the destination of the data.
 150  * @param src
 151  *   Pointer to the source data.
 152  */
 153 static inline void
 154 rte_mov128(uint8_t *dst, const uint8_t *src)
 155 {
 156         asm volatile ("movdqu (%[src]), %%xmm0\n\t"
 157                       "movdqu 16(%[src]), %%xmm1\n\t"
 158                       "movdqu 32(%[src]), %%xmm2\n\t"
 159                       "movdqu 48(%[src]), %%xmm3\n\t"
 160                       "movdqu 64(%[src]), %%xmm4\n\t"
 161                       "movdqu 80(%[src]), %%xmm5\n\t"
 162                       "movdqu 96(%[src]), %%xmm6\n\t"
 163                       "movdqu 112(%[src]), %%xmm7\n\t"
 164                       "movdqu %%xmm0, (%[dst])\n\t"
 165                       "movdqu %%xmm1, 16(%[dst])\n\t"
 166                       "movdqu %%xmm2, 32(%[dst])\n\t"
 167                       "movdqu %%xmm3, 48(%[dst])\n\t"
 168                       "movdqu %%xmm4, 64(%[dst])\n\t"
 169                       "movdqu %%xmm5, 80(%[dst])\n\t"
 170                       "movdqu %%xmm6, 96(%[dst])\n\t"
 171                       "movdqu %%xmm7, 112(%[dst])"
 172                       :
 173                       : [src] "r" (src),
 174                         [dst] "r"(dst)
 175                       : "xmm0", "xmm1", "xmm2", "xmm3",
 176                         "xmm4", "xmm5", "xmm6", "xmm7", "memory");
 177 }
 178
 179 /**
 180  * Copy 256 bytes from one location to another using optimised SSE
 181  * instructions. The locations should not overlap.
 182  *
 183  * @param dst
 184  *   Pointer to the destination of the data.
 185  * @param src
 186  *   Pointer to the source data.
 187  */
 188 static inline void
 189 rte_mov256(uint8_t *dst, const uint8_t *src)
 190 {
 191         /*
 192          * There are 16XMM registers, but this function does not use
 193          * them all so that it can still be compiled as 32bit
 194          * code. The performance increase was neglible if all 16
 195          * registers were used.
 196          */
 197         rte_mov128(dst, src);
 198         rte_mov128(dst + 128, src + 128);
 199 }
 200
 201 #ifdef RTE_MEMCPY_BUILTIN_CONSTANT_P
 202 /**
 203  * Choose between compiler built-in implementation of memcpy or DPDK
 204  * implementation depending if size is a compile-time constant
 205  */
 206 #define rte_memcpy(dst, src, n) \
 207         (__builtin_constant_p (n) ? \
 208         memcpy(dst, src, n) : rte_memcpy_func(dst, src, n))
 209 #else
 210 /**
 211  * Always use DPDK implementation.
 212  */
 213 #define rte_memcpy rte_memcpy_func
 214 #endif
 215
 216 /**
 217  * Copy bytes from one location to another. The locations must not overlap.
 218  *
 219  * @param dst
 220  *   Pointer to the destination of the data.
 221  * @param src
 222  *   Pointer to the source data.
 223  * @param n
 224  *   Number of bytes to copy.
 225  * @return
 226  *   Pointer to the destination data.
 227  */
 228 static inline void *
 229 rte_memcpy_func(void *dst, const void *src, size_t n)
 230 {
 231         void *ret = dst;
 232
 233         /* We can't copy < 16 bytes using XMM registers so do it manually. */
 234         if (n < 16) {
 235                 if (n & 0x01) {
 236                         *(uint8_t *)dst = *(const uint8_t *)src;
 237                         dst = (uint8_t *)dst + 1;
 238                         src = (const uint8_t *)src + 1;
 239                 }
 240                 if (n & 0x02) {
 241                         *(uint16_t *)dst = *(const uint16_t *)src;
 242                         dst = (uint16_t *)dst + 1;
 243                         src = (const uint16_t *)src + 1;
 244                 }
 245                 if (n & 0x04) {
 246                         /*
 247                          * NOTE: doing this as a 32bit copy causes "strict
 248                          * aliasing" compile errors, but worked fine for 64bit
 249                          * copy below, for unknown reasons.
 250                          */
 251                         *(uint16_t *)dst = *(const uint16_t *)src;
 252                         *((uint16_t *)dst + 1) = *((const uint16_t *)src + 1);
 253                         dst = (uint32_t *)dst + 1;
 254                         src = (const uint32_t *)src + 1;
 255                 }
 256                 if (n & 0x08) {
 257                         *(uint64_t *)dst = *(const uint64_t *)src;
 258                 }
 259                 return ret;
 260         }
 261
 262         /* Special fast cases for <= 128 bytes */
 263         if (n <= 32) {
 264                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 265                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 266                 return ret;
 267         }
 268
 269         if (n <= 64) {
 270                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 271                 rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 272                 return ret;
 273         }
 274
 275         if (n <= 128) {
 276                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 277                 rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
 278                 return ret;
 279         }
 280
 281         /*
 282          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 283          * copies was found to be faster than doing 128 and 32 byte copies as
 284          * well.
 285          */
 286         for ( ; n >= 256; n -= 256) {
 287                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 288                 dst = (uint8_t *)dst + 256;
 289                 src = (const uint8_t *)src + 256;
 290         }
 291
 292         /*
 293          * We split the remaining bytes (which will be less than 256) into
 294          * 64byte (2^6) chunks.
 295          * Using incrementing integers in the case labels of a switch statement
 296          * enourages the compiler to use a jump table. To get incrementing
 297          * integers, we shift the 2 relevant bits to the LSB position to first
 298          * get decrementing integers, and then subtract.
 299          */
 300         switch (3 - (n >> 6)) {
 301         case 0x00:
 302                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 303                 n -= 64;
 304                 dst = (uint8_t *)dst + 64;
 305                 src = (const uint8_t *)src + 64;      /* fallthrough */
 306         case 0x01:
 307                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 308                 n -= 64;
 309                 dst = (uint8_t *)dst + 64;
 310                 src = (const uint8_t *)src + 64;      /* fallthrough */
 311         case 0x02:
 312                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 313                 n -= 64;
 314                 dst = (uint8_t *)dst + 64;
 315                 src = (const uint8_t *)src + 64;      /* fallthrough */
 316         default:
 317                 ;
 318         }
 319
 320         /*
 321          * We split the remaining bytes (which will be less than 64) into
 322          * 16byte (2^4) chunks, using the same switch structure as above.
 323          */
 324         switch (3 - (n >> 4)) {
 325         case 0x00:
 326                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 327                 n -= 16;
 328                 dst = (uint8_t *)dst + 16;
 329                 src = (const uint8_t *)src + 16;      /* fallthrough */
 330         case 0x01:
 331                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 332                 n -= 16;
 333                 dst = (uint8_t *)dst + 16;
 334                 src = (const uint8_t *)src + 16;      /* fallthrough */
 335         case 0x02:
 336                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 337                 n -= 16;
 338                 dst = (uint8_t *)dst + 16;
 339                 src = (const uint8_t *)src + 16;      /* fallthrough */
 340         default:
 341                 ;
 342         }
 343
 344         /* Copy any remaining bytes, without going beyond end of buffers */
 345         if (n != 0) {
 346                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 347         }
 348         return ret;
 349 }
 350
 351 #ifdef __cplusplus
 352 }
 353 #endif
 354
 355 #endif /* _RTE_MEMCPY_H_ */