lib/librte_eal/common/include/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  */
  34
  35 #ifndef _RTE_MEMCPY_H_
  36 #define _RTE_MEMCPY_H_
  37
  38 /**
  39  * @file
  40  *
  41  * Functions for SSE implementation of memcpy().
  42  */
  43
  44 #include <stdint.h>
  45 #include <string.h>
  46 #include <emmintrin.h>
  47
  48 #ifdef __cplusplus
  49 extern "C" {
  50 #endif
  51
  52 #ifdef __INTEL_COMPILER
  53 #pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
  54 #endif
  55
  56 /**
  57  * Copy 16 bytes from one location to another using optimised SSE
  58  * instructions. The locations should not overlap.
  59  *
  60  * @param dst
  61  *   Pointer to the destination of the data.
  62  * @param src
  63  *   Pointer to the source data.
  64  */
  65 static inline void
  66 rte_mov16(uint8_t *dst, const uint8_t *src)
  67 {
  68         __m128i reg_a;
  69         asm volatile (
  70                 "movdqu (%[src]), %[reg_a]\n\t"
  71                 "movdqu %[reg_a], (%[dst])\n\t"
  72                 : [reg_a] "=x" (reg_a)
  73                 : [src] "r" (src),
  74                   [dst] "r"(dst)
  75                 : "memory"
  76         );
  77 }
  78
  79 /**
  80  * Copy 32 bytes from one location to another using optimised SSE
  81  * instructions. The locations should not overlap.
  82  *
  83  * @param dst
  84  *   Pointer to the destination of the data.
  85  * @param src
  86  *   Pointer to the source data.
  87  */
  88 static inline void
  89 rte_mov32(uint8_t *dst, const uint8_t *src)
  90 {
  91         __m128i reg_a, reg_b;
  92         asm volatile (
  93                 "movdqu (%[src]), %[reg_a]\n\t"
  94                 "movdqu 16(%[src]), %[reg_b]\n\t"
  95                 "movdqu %[reg_a], (%[dst])\n\t"
  96                 "movdqu %[reg_b], 16(%[dst])\n\t"
  97                 : [reg_a] "=x" (reg_a),
  98                   [reg_b] "=x" (reg_b)
  99                 : [src] "r" (src),
 100                   [dst] "r"(dst)
 101                 : "memory"
 102         );
 103 }
 104
 105 /**
 106  * Copy 48 bytes from one location to another using optimised SSE
 107  * instructions. The locations should not overlap.
 108  *
 109  * @param dst
 110  *   Pointer to the destination of the data.
 111  * @param src
 112  *   Pointer to the source data.
 113  */
 114 static inline void
 115 rte_mov48(uint8_t *dst, const uint8_t *src)
 116 {
 117         __m128i reg_a, reg_b, reg_c;
 118         asm volatile (
 119                 "movdqu (%[src]), %[reg_a]\n\t"
 120                 "movdqu 16(%[src]), %[reg_b]\n\t"
 121                 "movdqu 32(%[src]), %[reg_c]\n\t"
 122                 "movdqu %[reg_a], (%[dst])\n\t"
 123                 "movdqu %[reg_b], 16(%[dst])\n\t"
 124                 "movdqu %[reg_c], 32(%[dst])\n\t"
 125                 : [reg_a] "=x" (reg_a),
 126                   [reg_b] "=x" (reg_b),
 127                   [reg_c] "=x" (reg_c)
 128                 : [src] "r" (src),
 129                   [dst] "r"(dst)
 130                 : "memory"
 131         );
 132 }
 133
 134 /**
 135  * Copy 64 bytes from one location to another using optimised SSE
 136  * instructions. The locations should not overlap.
 137  *
 138  * @param dst
 139  *   Pointer to the destination of the data.
 140  * @param src
 141  *   Pointer to the source data.
 142  */
 143 static inline void
 144 rte_mov64(uint8_t *dst, const uint8_t *src)
 145 {
 146         __m128i reg_a, reg_b, reg_c, reg_d;
 147         asm volatile (
 148                 "movdqu (%[src]), %[reg_a]\n\t"
 149                 "movdqu 16(%[src]), %[reg_b]\n\t"
 150                 "movdqu 32(%[src]), %[reg_c]\n\t"
 151                 "movdqu 48(%[src]), %[reg_d]\n\t"
 152                 "movdqu %[reg_a], (%[dst])\n\t"
 153                 "movdqu %[reg_b], 16(%[dst])\n\t"
 154                 "movdqu %[reg_c], 32(%[dst])\n\t"
 155                 "movdqu %[reg_d], 48(%[dst])\n\t"
 156                 : [reg_a] "=x" (reg_a),
 157                   [reg_b] "=x" (reg_b),
 158                   [reg_c] "=x" (reg_c),
 159                   [reg_d] "=x" (reg_d)
 160                 : [src] "r" (src),
 161                   [dst] "r"(dst)
 162                 : "memory"
 163         );
 164 }
 165
 166 /**
 167  * Copy 128 bytes from one location to another using optimised SSE
 168  * instructions. The locations should not overlap.
 169  *
 170  * @param dst
 171  *   Pointer to the destination of the data.
 172  * @param src
 173  *   Pointer to the source data.
 174  */
 175 static inline void
 176 rte_mov128(uint8_t *dst, const uint8_t *src)
 177 {
 178         __m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h;
 179         asm volatile (
 180                 "movdqu (%[src]), %[reg_a]\n\t"
 181                 "movdqu 16(%[src]), %[reg_b]\n\t"
 182                 "movdqu 32(%[src]), %[reg_c]\n\t"
 183                 "movdqu 48(%[src]), %[reg_d]\n\t"
 184                 "movdqu 64(%[src]), %[reg_e]\n\t"
 185                 "movdqu 80(%[src]), %[reg_f]\n\t"
 186                 "movdqu 96(%[src]), %[reg_g]\n\t"
 187                 "movdqu 112(%[src]), %[reg_h]\n\t"
 188                 "movdqu %[reg_a], (%[dst])\n\t"
 189                 "movdqu %[reg_b], 16(%[dst])\n\t"
 190                 "movdqu %[reg_c], 32(%[dst])\n\t"
 191                 "movdqu %[reg_d], 48(%[dst])\n\t"
 192                 "movdqu %[reg_e], 64(%[dst])\n\t"
 193                 "movdqu %[reg_f], 80(%[dst])\n\t"
 194                 "movdqu %[reg_g], 96(%[dst])\n\t"
 195                 "movdqu %[reg_h], 112(%[dst])\n\t"
 196                 : [reg_a] "=x" (reg_a),
 197                   [reg_b] "=x" (reg_b),
 198                   [reg_c] "=x" (reg_c),
 199                   [reg_d] "=x" (reg_d),
 200                   [reg_e] "=x" (reg_e),
 201                   [reg_f] "=x" (reg_f),
 202                   [reg_g] "=x" (reg_g),
 203                   [reg_h] "=x" (reg_h)
 204                 : [src] "r" (src),
 205                   [dst] "r"(dst)
 206                 : "memory"
 207         );
 208 }
 209
 210 #ifdef __INTEL_COMPILER
 211 #pragma warning(enable:593)
 212 #endif
 213
 214 /**
 215  * Copy 256 bytes from one location to another using optimised SSE
 216  * instructions. The locations should not overlap.
 217  *
 218  * @param dst
 219  *   Pointer to the destination of the data.
 220  * @param src
 221  *   Pointer to the source data.
 222  */
 223 static inline void
 224 rte_mov256(uint8_t *dst, const uint8_t *src)
 225 {
 226         rte_mov128(dst, src);
 227         rte_mov128(dst + 128, src + 128);
 228 }
 229
 230 /**
 231  * Copy bytes from one location to another. The locations must not overlap.
 232  *
 233  * @note This is implemented as a macro, so it's address should not be taken
 234  * and care is needed as parameter expressions may be evaluated multiple times.
 235  *
 236  * @param dst
 237  *   Pointer to the destination of the data.
 238  * @param src
 239  *   Pointer to the source data.
 240  * @param n
 241  *   Number of bytes to copy.
 242  * @return
 243  *   Pointer to the destination data.
 244  */
 245 #define rte_memcpy(dst, src, n)              \
 246         ((__builtin_constant_p(n)) ?          \
 247         memcpy((dst), (src), (n)) :          \
 248         rte_memcpy_func((dst), (src), (n)))
 249
 250 /*
 251  * memcpy() function used by rte_memcpy macro
 252  */
 253 static inline void *
 254 rte_memcpy_func(void *dst, const void *src, size_t n) __attribute__((always_inline));
 255
 256 static inline void *
 257 rte_memcpy_func(void *dst, const void *src, size_t n)
 258 {
 259         void *ret = dst;
 260
 261         /* We can't copy < 16 bytes using XMM registers so do it manually. */
 262         if (n < 16) {
 263                 if (n & 0x01) {
 264                         *(uint8_t *)dst = *(const uint8_t *)src;
 265                         dst = (uint8_t *)dst + 1;
 266                         src = (const uint8_t *)src + 1;
 267                 }
 268                 if (n & 0x02) {
 269                         *(uint16_t *)dst = *(const uint16_t *)src;
 270                         dst = (uint16_t *)dst + 1;
 271                         src = (const uint16_t *)src + 1;
 272                 }
 273                 if (n & 0x04) {
 274                         *(uint32_t *)dst = *(const uint32_t *)src;
 275                         dst = (uint32_t *)dst + 1;
 276                         src = (const uint32_t *)src + 1;
 277                 }
 278                 if (n & 0x08) {
 279                         *(uint64_t *)dst = *(const uint64_t *)src;
 280                 }
 281                 return ret;
 282         }
 283
 284         /* Special fast cases for <= 128 bytes */
 285         if (n <= 32) {
 286                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 287                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 288                 return ret;
 289         }
 290
 291         if (n <= 64) {
 292                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 293                 rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 294                 return ret;
 295         }
 296
 297         if (n <= 128) {
 298                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 299                 rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
 300                 return ret;
 301         }
 302
 303         /*
 304          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 305          * copies was found to be faster than doing 128 and 32 byte copies as
 306          * well.
 307          */
 308         for ( ; n >= 256; n -= 256) {
 309                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 310                 dst = (uint8_t *)dst + 256;
 311                 src = (const uint8_t *)src + 256;
 312         }
 313
 314         /*
 315          * We split the remaining bytes (which will be less than 256) into
 316          * 64byte (2^6) chunks.
 317          * Using incrementing integers in the case labels of a switch statement
 318          * enourages the compiler to use a jump table. To get incrementing
 319          * integers, we shift the 2 relevant bits to the LSB position to first
 320          * get decrementing integers, and then subtract.
 321          */
 322         switch (3 - (n >> 6)) {
 323         case 0x00:
 324                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 325                 n -= 64;
 326                 dst = (uint8_t *)dst + 64;
 327                 src = (const uint8_t *)src + 64;      /* fallthrough */
 328         case 0x01:
 329                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 330                 n -= 64;
 331                 dst = (uint8_t *)dst + 64;
 332                 src = (const uint8_t *)src + 64;      /* fallthrough */
 333         case 0x02:
 334                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 335                 n -= 64;
 336                 dst = (uint8_t *)dst + 64;
 337                 src = (const uint8_t *)src + 64;      /* fallthrough */
 338         default:
 339                 ;
 340         }
 341
 342         /*
 343          * We split the remaining bytes (which will be less than 64) into
 344          * 16byte (2^4) chunks, using the same switch structure as above.
 345          */
 346         switch (3 - (n >> 4)) {
 347         case 0x00:
 348                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 349                 n -= 16;
 350                 dst = (uint8_t *)dst + 16;
 351                 src = (const uint8_t *)src + 16;      /* fallthrough */
 352         case 0x01:
 353                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 354                 n -= 16;
 355                 dst = (uint8_t *)dst + 16;
 356                 src = (const uint8_t *)src + 16;      /* fallthrough */
 357         case 0x02:
 358                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 359                 n -= 16;
 360                 dst = (uint8_t *)dst + 16;
 361                 src = (const uint8_t *)src + 16;      /* fallthrough */
 362         default:
 363                 ;
 364         }
 365
 366         /* Copy any remaining bytes, without going beyond end of buffers */
 367         if (n != 0) {
 368                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 369         }
 370         return ret;
 371 }
 372
 373 #ifdef __cplusplus
 374 }
 375 #endif
 376
 377 #endif /* _RTE_MEMCPY_H_ */