lib/librte_eal/common/include/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifndef _RTE_MEMCPY_H_
  35 #define _RTE_MEMCPY_H_
  36
  37 /**
  38  * @file
  39  *
  40  * Functions for SSE implementation of memcpy().
  41  */
  42
  43 #include <stdint.h>
  44 #include <string.h>
  45 #include <emmintrin.h>
  46
  47 #ifdef __cplusplus
  48 extern "C" {
  49 #endif
  50
  51 #ifdef __INTEL_COMPILER
  52 #pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
  53 #endif
  54
  55 /**
  56  * Copy 16 bytes from one location to another using optimised SSE
  57  * instructions. The locations should not overlap.
  58  *
  59  * @param dst
  60  *   Pointer to the destination of the data.
  61  * @param src
  62  *   Pointer to the source data.
  63  */
  64 static inline void
  65 rte_mov16(uint8_t *dst, const uint8_t *src)
  66 {
  67         __m128i reg_a;
  68         asm volatile (
  69                 "movdqu (%[src]), %[reg_a]\n\t"
  70                 "movdqu %[reg_a], (%[dst])\n\t"
  71                 : [reg_a] "=x" (reg_a)
  72                 : [src] "r" (src),
  73                   [dst] "r"(dst)
  74                 : "memory"
  75         );
  76 }
  77
  78 /**
  79  * Copy 32 bytes from one location to another using optimised SSE
  80  * instructions. The locations should not overlap.
  81  *
  82  * @param dst
  83  *   Pointer to the destination of the data.
  84  * @param src
  85  *   Pointer to the source data.
  86  */
  87 static inline void
  88 rte_mov32(uint8_t *dst, const uint8_t *src)
  89 {
  90         __m128i reg_a, reg_b;
  91         asm volatile (
  92                 "movdqu (%[src]), %[reg_a]\n\t"
  93                 "movdqu 16(%[src]), %[reg_b]\n\t"
  94                 "movdqu %[reg_a], (%[dst])\n\t"
  95                 "movdqu %[reg_b], 16(%[dst])\n\t"
  96                 : [reg_a] "=x" (reg_a),
  97                   [reg_b] "=x" (reg_b)
  98                 : [src] "r" (src),
  99                   [dst] "r"(dst)
 100                 : "memory"
 101         );
 102 }
 103
 104 /**
 105  * Copy 48 bytes from one location to another using optimised SSE
 106  * instructions. The locations should not overlap.
 107  *
 108  * @param dst
 109  *   Pointer to the destination of the data.
 110  * @param src
 111  *   Pointer to the source data.
 112  */
 113 static inline void
 114 rte_mov48(uint8_t *dst, const uint8_t *src)
 115 {
 116         __m128i reg_a, reg_b, reg_c;
 117         asm volatile (
 118                 "movdqu (%[src]), %[reg_a]\n\t"
 119                 "movdqu 16(%[src]), %[reg_b]\n\t"
 120                 "movdqu 32(%[src]), %[reg_c]\n\t"
 121                 "movdqu %[reg_a], (%[dst])\n\t"
 122                 "movdqu %[reg_b], 16(%[dst])\n\t"
 123                 "movdqu %[reg_c], 32(%[dst])\n\t"
 124                 : [reg_a] "=x" (reg_a),
 125                   [reg_b] "=x" (reg_b),
 126                   [reg_c] "=x" (reg_c)
 127                 : [src] "r" (src),
 128                   [dst] "r"(dst)
 129                 : "memory"
 130         );
 131 }
 132
 133 /**
 134  * Copy 64 bytes from one location to another using optimised SSE
 135  * instructions. The locations should not overlap.
 136  *
 137  * @param dst
 138  *   Pointer to the destination of the data.
 139  * @param src
 140  *   Pointer to the source data.
 141  */
 142 static inline void
 143 rte_mov64(uint8_t *dst, const uint8_t *src)
 144 {
 145         __m128i reg_a, reg_b, reg_c, reg_d;
 146         asm volatile (
 147                 "movdqu (%[src]), %[reg_a]\n\t"
 148                 "movdqu 16(%[src]), %[reg_b]\n\t"
 149                 "movdqu 32(%[src]), %[reg_c]\n\t"
 150                 "movdqu 48(%[src]), %[reg_d]\n\t"
 151                 "movdqu %[reg_a], (%[dst])\n\t"
 152                 "movdqu %[reg_b], 16(%[dst])\n\t"
 153                 "movdqu %[reg_c], 32(%[dst])\n\t"
 154                 "movdqu %[reg_d], 48(%[dst])\n\t"
 155                 : [reg_a] "=x" (reg_a),
 156                   [reg_b] "=x" (reg_b),
 157                   [reg_c] "=x" (reg_c),
 158                   [reg_d] "=x" (reg_d)
 159                 : [src] "r" (src),
 160                   [dst] "r"(dst)
 161                 : "memory"
 162         );
 163 }
 164
 165 /**
 166  * Copy 128 bytes from one location to another using optimised SSE
 167  * instructions. The locations should not overlap.
 168  *
 169  * @param dst
 170  *   Pointer to the destination of the data.
 171  * @param src
 172  *   Pointer to the source data.
 173  */
 174 static inline void
 175 rte_mov128(uint8_t *dst, const uint8_t *src)
 176 {
 177         __m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h;
 178         asm volatile (
 179                 "movdqu (%[src]), %[reg_a]\n\t"
 180                 "movdqu 16(%[src]), %[reg_b]\n\t"
 181                 "movdqu 32(%[src]), %[reg_c]\n\t"
 182                 "movdqu 48(%[src]), %[reg_d]\n\t"
 183                 "movdqu 64(%[src]), %[reg_e]\n\t"
 184                 "movdqu 80(%[src]), %[reg_f]\n\t"
 185                 "movdqu 96(%[src]), %[reg_g]\n\t"
 186                 "movdqu 112(%[src]), %[reg_h]\n\t"
 187                 "movdqu %[reg_a], (%[dst])\n\t"
 188                 "movdqu %[reg_b], 16(%[dst])\n\t"
 189                 "movdqu %[reg_c], 32(%[dst])\n\t"
 190                 "movdqu %[reg_d], 48(%[dst])\n\t"
 191                 "movdqu %[reg_e], 64(%[dst])\n\t"
 192                 "movdqu %[reg_f], 80(%[dst])\n\t"
 193                 "movdqu %[reg_g], 96(%[dst])\n\t"
 194                 "movdqu %[reg_h], 112(%[dst])\n\t"
 195                 : [reg_a] "=x" (reg_a),
 196                   [reg_b] "=x" (reg_b),
 197                   [reg_c] "=x" (reg_c),
 198                   [reg_d] "=x" (reg_d),
 199                   [reg_e] "=x" (reg_e),
 200                   [reg_f] "=x" (reg_f),
 201                   [reg_g] "=x" (reg_g),
 202                   [reg_h] "=x" (reg_h)
 203                 : [src] "r" (src),
 204                   [dst] "r"(dst)
 205                 : "memory"
 206         );
 207 }
 208
 209 #ifdef __INTEL_COMPILER
 210 #pragma warning(enable:593)
 211 #endif
 212
 213 /**
 214  * Copy 256 bytes from one location to another using optimised SSE
 215  * instructions. The locations should not overlap.
 216  *
 217  * @param dst
 218  *   Pointer to the destination of the data.
 219  * @param src
 220  *   Pointer to the source data.
 221  */
 222 static inline void
 223 rte_mov256(uint8_t *dst, const uint8_t *src)
 224 {
 225         rte_mov128(dst, src);
 226         rte_mov128(dst + 128, src + 128);
 227 }
 228
 229 /**
 230  * Copy bytes from one location to another. The locations must not overlap.
 231  *
 232  * @note This is implemented as a macro, so it's address should not be taken
 233  * and care is needed as parameter expressions may be evaluated multiple times.
 234  *
 235  * @param dst
 236  *   Pointer to the destination of the data.
 237  * @param src
 238  *   Pointer to the source data.
 239  * @param n
 240  *   Number of bytes to copy.
 241  * @return
 242  *   Pointer to the destination data.
 243  */
 244 #define rte_memcpy(dst, src, n)              \
 245         ((__builtin_constant_p(n)) ?          \
 246         memcpy((dst), (src), (n)) :          \
 247         rte_memcpy_func((dst), (src), (n)))
 248
 249 /*
 250  * memcpy() function used by rte_memcpy macro
 251  */
 252 static inline void *
 253 rte_memcpy_func(void *dst, const void *src, size_t n) __attribute__((always_inline));
 254
 255 static inline void *
 256 rte_memcpy_func(void *dst, const void *src, size_t n)
 257 {
 258         void *ret = dst;
 259
 260         /* We can't copy < 16 bytes using XMM registers so do it manually. */
 261         if (n < 16) {
 262                 if (n & 0x01) {
 263                         *(uint8_t *)dst = *(const uint8_t *)src;
 264                         dst = (uint8_t *)dst + 1;
 265                         src = (const uint8_t *)src + 1;
 266                 }
 267                 if (n & 0x02) {
 268                         *(uint16_t *)dst = *(const uint16_t *)src;
 269                         dst = (uint16_t *)dst + 1;
 270                         src = (const uint16_t *)src + 1;
 271                 }
 272                 if (n & 0x04) {
 273                         *(uint32_t *)dst = *(const uint32_t *)src;
 274                         dst = (uint32_t *)dst + 1;
 275                         src = (const uint32_t *)src + 1;
 276                 }
 277                 if (n & 0x08) {
 278                         *(uint64_t *)dst = *(const uint64_t *)src;
 279                 }
 280                 return ret;
 281         }
 282
 283         /* Special fast cases for <= 128 bytes */
 284         if (n <= 32) {
 285                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 286                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 287                 return ret;
 288         }
 289
 290         if (n <= 64) {
 291                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 292                 rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 293                 return ret;
 294         }
 295
 296         if (n <= 128) {
 297                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 298                 rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
 299                 return ret;
 300         }
 301
 302         /*
 303          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 304          * copies was found to be faster than doing 128 and 32 byte copies as
 305          * well.
 306          */
 307         for ( ; n >= 256; n -= 256) {
 308                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 309                 dst = (uint8_t *)dst + 256;
 310                 src = (const uint8_t *)src + 256;
 311         }
 312
 313         /*
 314          * We split the remaining bytes (which will be less than 256) into
 315          * 64byte (2^6) chunks.
 316          * Using incrementing integers in the case labels of a switch statement
 317          * enourages the compiler to use a jump table. To get incrementing
 318          * integers, we shift the 2 relevant bits to the LSB position to first
 319          * get decrementing integers, and then subtract.
 320          */
 321         switch (3 - (n >> 6)) {
 322         case 0x00:
 323                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 324                 n -= 64;
 325                 dst = (uint8_t *)dst + 64;
 326                 src = (const uint8_t *)src + 64;      /* fallthrough */
 327         case 0x01:
 328                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 329                 n -= 64;
 330                 dst = (uint8_t *)dst + 64;
 331                 src = (const uint8_t *)src + 64;      /* fallthrough */
 332         case 0x02:
 333                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 334                 n -= 64;
 335                 dst = (uint8_t *)dst + 64;
 336                 src = (const uint8_t *)src + 64;      /* fallthrough */
 337         default:
 338                 ;
 339         }
 340
 341         /*
 342          * We split the remaining bytes (which will be less than 64) into
 343          * 16byte (2^4) chunks, using the same switch structure as above.
 344          */
 345         switch (3 - (n >> 4)) {
 346         case 0x00:
 347                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 348                 n -= 16;
 349                 dst = (uint8_t *)dst + 16;
 350                 src = (const uint8_t *)src + 16;      /* fallthrough */
 351         case 0x01:
 352                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 353                 n -= 16;
 354                 dst = (uint8_t *)dst + 16;
 355                 src = (const uint8_t *)src + 16;      /* fallthrough */
 356         case 0x02:
 357                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 358                 n -= 16;
 359                 dst = (uint8_t *)dst + 16;
 360                 src = (const uint8_t *)src + 16;      /* fallthrough */
 361         default:
 362                 ;
 363         }
 364
 365         /* Copy any remaining bytes, without going beyond end of buffers */
 366         if (n != 0) {
 367                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 368         }
 369         return ret;
 370 }
 371
 372 #ifdef __cplusplus
 373 }
 374 #endif
 375
 376 #endif /* _RTE_MEMCPY_H_ */