lib/librte_eal/common/include/arch/x86_64/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifndef _RTE_MEMCPY_X86_64_H_
  35 #define _RTE_MEMCPY_X86_64_H_
  36
  37 #include <stdint.h>
  38 #include <string.h>
  39 #include <emmintrin.h>
  40
  41 #ifdef __cplusplus
  42 extern "C" {
  43 #endif
  44
  45 #include "generic/rte_memcpy.h"
  46
  47 #ifdef __INTEL_COMPILER
  48 #pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
  49 #endif
  50
  51 static inline void
  52 rte_mov16(uint8_t *dst, const uint8_t *src)
  53 {
  54         __m128i reg_a;
  55         asm volatile (
  56                 "movdqu (%[src]), %[reg_a]\n\t"
  57                 "movdqu %[reg_a], (%[dst])\n\t"
  58                 : [reg_a] "=x" (reg_a)
  59                 : [src] "r" (src),
  60                   [dst] "r"(dst)
  61                 : "memory"
  62         );
  63 }
  64
  65 static inline void
  66 rte_mov32(uint8_t *dst, const uint8_t *src)
  67 {
  68         __m128i reg_a, reg_b;
  69         asm volatile (
  70                 "movdqu (%[src]), %[reg_a]\n\t"
  71                 "movdqu 16(%[src]), %[reg_b]\n\t"
  72                 "movdqu %[reg_a], (%[dst])\n\t"
  73                 "movdqu %[reg_b], 16(%[dst])\n\t"
  74                 : [reg_a] "=x" (reg_a),
  75                   [reg_b] "=x" (reg_b)
  76                 : [src] "r" (src),
  77                   [dst] "r"(dst)
  78                 : "memory"
  79         );
  80 }
  81
  82 static inline void
  83 rte_mov48(uint8_t *dst, const uint8_t *src)
  84 {
  85         __m128i reg_a, reg_b, reg_c;
  86         asm volatile (
  87                 "movdqu (%[src]), %[reg_a]\n\t"
  88                 "movdqu 16(%[src]), %[reg_b]\n\t"
  89                 "movdqu 32(%[src]), %[reg_c]\n\t"
  90                 "movdqu %[reg_a], (%[dst])\n\t"
  91                 "movdqu %[reg_b], 16(%[dst])\n\t"
  92                 "movdqu %[reg_c], 32(%[dst])\n\t"
  93                 : [reg_a] "=x" (reg_a),
  94                   [reg_b] "=x" (reg_b),
  95                   [reg_c] "=x" (reg_c)
  96                 : [src] "r" (src),
  97                   [dst] "r"(dst)
  98                 : "memory"
  99         );
 100 }
 101
 102 static inline void
 103 rte_mov64(uint8_t *dst, const uint8_t *src)
 104 {
 105         __m128i reg_a, reg_b, reg_c, reg_d;
 106         asm volatile (
 107                 "movdqu (%[src]), %[reg_a]\n\t"
 108                 "movdqu 16(%[src]), %[reg_b]\n\t"
 109                 "movdqu 32(%[src]), %[reg_c]\n\t"
 110                 "movdqu 48(%[src]), %[reg_d]\n\t"
 111                 "movdqu %[reg_a], (%[dst])\n\t"
 112                 "movdqu %[reg_b], 16(%[dst])\n\t"
 113                 "movdqu %[reg_c], 32(%[dst])\n\t"
 114                 "movdqu %[reg_d], 48(%[dst])\n\t"
 115                 : [reg_a] "=x" (reg_a),
 116                   [reg_b] "=x" (reg_b),
 117                   [reg_c] "=x" (reg_c),
 118                   [reg_d] "=x" (reg_d)
 119                 : [src] "r" (src),
 120                   [dst] "r"(dst)
 121                 : "memory"
 122         );
 123 }
 124
 125 static inline void
 126 rte_mov128(uint8_t *dst, const uint8_t *src)
 127 {
 128         __m128i reg_a, reg_b, reg_c, reg_d, reg_e, reg_f, reg_g, reg_h;
 129         asm volatile (
 130                 "movdqu (%[src]), %[reg_a]\n\t"
 131                 "movdqu 16(%[src]), %[reg_b]\n\t"
 132                 "movdqu 32(%[src]), %[reg_c]\n\t"
 133                 "movdqu 48(%[src]), %[reg_d]\n\t"
 134                 "movdqu 64(%[src]), %[reg_e]\n\t"
 135                 "movdqu 80(%[src]), %[reg_f]\n\t"
 136                 "movdqu 96(%[src]), %[reg_g]\n\t"
 137                 "movdqu 112(%[src]), %[reg_h]\n\t"
 138                 "movdqu %[reg_a], (%[dst])\n\t"
 139                 "movdqu %[reg_b], 16(%[dst])\n\t"
 140                 "movdqu %[reg_c], 32(%[dst])\n\t"
 141                 "movdqu %[reg_d], 48(%[dst])\n\t"
 142                 "movdqu %[reg_e], 64(%[dst])\n\t"
 143                 "movdqu %[reg_f], 80(%[dst])\n\t"
 144                 "movdqu %[reg_g], 96(%[dst])\n\t"
 145                 "movdqu %[reg_h], 112(%[dst])\n\t"
 146                 : [reg_a] "=x" (reg_a),
 147                   [reg_b] "=x" (reg_b),
 148                   [reg_c] "=x" (reg_c),
 149                   [reg_d] "=x" (reg_d),
 150                   [reg_e] "=x" (reg_e),
 151                   [reg_f] "=x" (reg_f),
 152                   [reg_g] "=x" (reg_g),
 153                   [reg_h] "=x" (reg_h)
 154                 : [src] "r" (src),
 155                   [dst] "r"(dst)
 156                 : "memory"
 157         );
 158 }
 159
 160 #ifdef __INTEL_COMPILER
 161 #pragma warning(enable:593)
 162 #endif
 163
 164 static inline void
 165 rte_mov256(uint8_t *dst, const uint8_t *src)
 166 {
 167         rte_mov128(dst, src);
 168         rte_mov128(dst + 128, src + 128);
 169 }
 170
 171 #define rte_memcpy(dst, src, n)              \
 172         ((__builtin_constant_p(n)) ?          \
 173         memcpy((dst), (src), (n)) :          \
 174         rte_memcpy_func((dst), (src), (n)))
 175
 176 static inline void *
 177 rte_memcpy_func(void *dst, const void *src, size_t n)
 178 {
 179         void *ret = dst;
 180
 181         /* We can't copy < 16 bytes using XMM registers so do it manually. */
 182         if (n < 16) {
 183                 if (n & 0x01) {
 184                         *(uint8_t *)dst = *(const uint8_t *)src;
 185                         dst = (uint8_t *)dst + 1;
 186                         src = (const uint8_t *)src + 1;
 187                 }
 188                 if (n & 0x02) {
 189                         *(uint16_t *)dst = *(const uint16_t *)src;
 190                         dst = (uint16_t *)dst + 1;
 191                         src = (const uint16_t *)src + 1;
 192                 }
 193                 if (n & 0x04) {
 194                         *(uint32_t *)dst = *(const uint32_t *)src;
 195                         dst = (uint32_t *)dst + 1;
 196                         src = (const uint32_t *)src + 1;
 197                 }
 198                 if (n & 0x08) {
 199                         *(uint64_t *)dst = *(const uint64_t *)src;
 200                 }
 201                 return ret;
 202         }
 203
 204         /* Special fast cases for <= 128 bytes */
 205         if (n <= 32) {
 206                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 207                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 208                 return ret;
 209         }
 210
 211         if (n <= 64) {
 212                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 213                 rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 214                 return ret;
 215         }
 216
 217         if (n <= 128) {
 218                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 219                 rte_mov64((uint8_t *)dst - 64 + n, (const uint8_t *)src - 64 + n);
 220                 return ret;
 221         }
 222
 223         /*
 224          * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
 225          * copies was found to be faster than doing 128 and 32 byte copies as
 226          * well.
 227          */
 228         for ( ; n >= 256; n -= 256) {
 229                 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 230                 dst = (uint8_t *)dst + 256;
 231                 src = (const uint8_t *)src + 256;
 232         }
 233
 234         /*
 235          * We split the remaining bytes (which will be less than 256) into
 236          * 64byte (2^6) chunks.
 237          * Using incrementing integers in the case labels of a switch statement
 238          * enourages the compiler to use a jump table. To get incrementing
 239          * integers, we shift the 2 relevant bits to the LSB position to first
 240          * get decrementing integers, and then subtract.
 241          */
 242         switch (3 - (n >> 6)) {
 243         case 0x00:
 244                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 245                 n -= 64;
 246                 dst = (uint8_t *)dst + 64;
 247                 src = (const uint8_t *)src + 64;      /* fallthrough */
 248         case 0x01:
 249                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 250                 n -= 64;
 251                 dst = (uint8_t *)dst + 64;
 252                 src = (const uint8_t *)src + 64;      /* fallthrough */
 253         case 0x02:
 254                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 255                 n -= 64;
 256                 dst = (uint8_t *)dst + 64;
 257                 src = (const uint8_t *)src + 64;      /* fallthrough */
 258         default:
 259                 ;
 260         }
 261
 262         /*
 263          * We split the remaining bytes (which will be less than 64) into
 264          * 16byte (2^4) chunks, using the same switch structure as above.
 265          */
 266         switch (3 - (n >> 4)) {
 267         case 0x00:
 268                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 269                 n -= 16;
 270                 dst = (uint8_t *)dst + 16;
 271                 src = (const uint8_t *)src + 16;      /* fallthrough */
 272         case 0x01:
 273                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 274                 n -= 16;
 275                 dst = (uint8_t *)dst + 16;
 276                 src = (const uint8_t *)src + 16;      /* fallthrough */
 277         case 0x02:
 278                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 279                 n -= 16;
 280                 dst = (uint8_t *)dst + 16;
 281                 src = (const uint8_t *)src + 16;      /* fallthrough */
 282         default:
 283                 ;
 284         }
 285
 286         /* Copy any remaining bytes, without going beyond end of buffers */
 287         if (n != 0) {
 288                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 289         }
 290         return ret;
 291 }
 292
 293 #ifdef __cplusplus
 294 }
 295 #endif
 296
 297 #endif /* _RTE_MEMCPY_X86_64_H_ */