/*-
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdint.h>
#include <string.h>
#include <rte_vect.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This may be implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
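/*
 * Usage sketch (illustrative, not part of the original header): rte_memcpy()
 * is a drop-in replacement for memcpy() for non-overlapping buffers, e.g.:
 *
 *     uint8_t pkt[2048], scratch[2048];
 *     rte_memcpy(scratch, pkt, 64);        // copy a 64-byte header
 *     rte_memcpy(scratch + 64, pkt, 128);  // returns scratch + 64
 *
 * The buffer names above are hypothetical; the only requirements are that
 * dst and src do not overlap and that both provide at least n valid bytes.
 */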
#ifdef RTE_MACHINE_CPUFLAG_AVX512F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    __m256i ymm0;

    ymm0 = _mm256_loadu_si256((const __m256i *)src);
    _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    __m512i zmm0;

    zmm0 = _mm512_loadu_si512((const void *)src);
    _mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov64(dst + 0 * 64, src + 0 * 64);
    rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov64(dst + 0 * 64, src + 0 * 64);
    rte_mov64(dst + 1 * 64, src + 1 * 64);
    rte_mov64(dst + 2 * 64, src + 2 * 64);
    rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m512i zmm0, zmm1;

    while (n >= 128) {
        zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
        n -= 128;
        zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
        src = src + 128;
        _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
        _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
        dst = dst + 128;
    }
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

    while (n >= 512) {
        zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
        n -= 512;
        zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
        zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
        zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
        zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
        zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
        zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
        zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
        src = src + 512;
        _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
        _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
        _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
        _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
        _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
        _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
        _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
        _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
        dst = dst + 512;
    }
}
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst;
    size_t dstofss;
    size_t bits;

    /**
     * Copy less than 16 bytes
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }
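    /*
     * Worked example (illustrative): for n == 13 (binary 1101) the branches
     * above copy 1 byte, then 4 bytes, then 8 bytes, advancing the cursors
     * between steps, so every length in [0, 15] is handled without a loop.
     */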
    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n,
                  (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov32((uint8_t *)dst - 32 + n,
                  (const uint8_t *)src - 32 + n);
        return ret;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
COPY_BLOCK_128_BACK63:
        if (n > 64) {
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            rte_mov64((uint8_t *)dst - 64 + n,
                      (const uint8_t *)src - 64 + n);
            return ret;
        }
        if (n > 0)
            rte_mov64((uint8_t *)dst - 64 + n,
                      (const uint8_t *)src - 64 + n);
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes
     */
    dstofss = ((uintptr_t)dst & 0x3F);
    if (dstofss > 0) {
        dstofss = 64 - dstofss;
        n -= dstofss;
        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
        src = (const uint8_t *)src + dstofss;
        dst = (uint8_t *)dst + dstofss;
    }
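    /*
     * Worked example (illustrative numbers, not from the original source):
     * if dst == 0x100027, then dst & 0x3F == 0x27 and dstofss == 64 - 0x27
     * == 25.  rte_mov64() copies the first 64 bytes unconditionally, the
     * pointers advance by 25 bytes, and every store issued by the block
     * loops below starts on a 64-byte boundary.
     */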
    /**
     * Copy 512-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
    bits = n;
    n = n & 511;
    bits -= n;
    src = (const uint8_t *)src + bits;
    dst = (uint8_t *)dst + bits;

    /**
     * Copy 128-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    if (n >= 128) {
        rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
        bits = n;
        n = n & 127;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;
    }

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_128_BACK63;
}
#elif defined RTE_MACHINE_CPUFLAG_AVX2

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    __m256i ymm0;

    ymm0 = _mm256_loadu_si256((const __m256i *)src);
    _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
    rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
    rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
    rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
    rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
    rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
    rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
    rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
    rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
    rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
    rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
    rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
}

/**
 * Copy 64-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m256i ymm0, ymm1;

    while (n >= 64) {
        ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
        n -= 64;
        ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
        src = (const uint8_t *)src + 64;
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
        dst = (uint8_t *)dst + 64;
    }
}

/**
 * Copy 256-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

    while (n >= 256) {
        ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
        n -= 256;
        ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
        ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
        ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
        ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
        ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
        ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
        ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
        src = (const uint8_t *)src + 256;
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
        dst = (uint8_t *)dst + 256;
    }
}
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst;
    size_t dstofss;
    size_t bits;

    /**
     * Copy less than 16 bytes
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }

    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
        return ret;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
        if (n >= 64) {
            n -= 64;
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 64;
            dst = (uint8_t *)dst + 64;
        }
COPY_BLOCK_64_BACK31:
        if (n > 32) {
            rte_mov32((uint8_t *)dst, (const uint8_t *)src);
            rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
            return ret;
        }
        if (n > 0) {
            rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
        }
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes
     */
    dstofss = 32 - ((uintptr_t)dst & 0x1F);
    n -= dstofss;
    rte_mov32((uint8_t *)dst, (const uint8_t *)src);
    src = (const uint8_t *)src + dstofss;
    dst = (uint8_t *)dst + dstofss;

    /**
     * Copy 256-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
    bits = n;
    n = n & 255;
    bits -= n;
    src = (const uint8_t *)src + bits;
    dst = (uint8_t *)dst + bits;
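    /*
     * Worked example (illustrative numbers): if n == 1000 on entry to the
     * block copy, rte_mov256blocks() copies 768 bytes (three 256-byte
     * blocks).  Afterwards n & 255 == 232 and bits == 768, so the pointers
     * skip past the bytes already copied, leaving the 232-byte tail for the
     * 64-byte block loop and the trailing COPY_BLOCK_64_BACK31 code.
     */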
    /**
     * Copy 64-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    if (n >= 64) {
        rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
        bits = n;
        n = n & 63;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;
    }

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_64_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
    rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
    rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
    rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
    rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
    rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
    rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
    rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
    rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
    rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
    rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
    rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
    rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
    rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
    rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
    rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
    rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
    rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
    rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
    rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
    rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
    rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
    rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
    int tmp; \
    while (len >= 128 + 16 - offset) { \
        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
        len -= 128; \
        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
        src = (const uint8_t *)src + 128; \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128; \
    } \
    tmp = len; \
    len = ((len - 16 + offset) & 127) + 16 - offset; \
    tmp -= len; \
    src = (const uint8_t *)src + tmp; \
    dst = (uint8_t *)dst + tmp; \
    if (len >= 32 + 16 - offset) { \
        while (len >= 32 + 16 - offset) { \
            xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32; \
            xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32; \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 31) + 16 - offset; \
        tmp -= len; \
        src = (const uint8_t *)src + tmp; \
        dst = (uint8_t *)dst + tmp; \
    } \
})
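/*
 * Worked example (illustrative, offset = 5): when invoked from rte_memcpy()
 * below, src & 0x0F == offset, so src - 5 is 16-byte aligned.  xmm0 then
 * holds bytes src[-5..10] and xmm1 holds bytes src[11..26], and
 * _mm_alignr_epi8(xmm1, xmm0, 5) shifts the concatenated 32-byte pair right
 * by 5 bytes, yielding exactly src[0..15]: each unaligned 16-byte store is
 * rebuilt from two loads taken at aligned addresses.
 */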
/**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires immediate value for shift count.
 * Requirements:
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
    switch (offset) { \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break; \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break; \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break; \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break; \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break; \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break; \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break; \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break; \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break; \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break; \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break; \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break; \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break; \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break; \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break; \
    default:; \
    } \
})
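/*
 * Usage sketch (illustrative): rte_memcpy() below first aligns dst, derives
 * srcofs = (uintptr_t)src & 0x0F, and then calls
 * MOVEUNALIGNED_LEFT47(dst, src, n, srcofs), so that each expansion of
 * MOVEUNALIGNED_LEFT47_IMM receives the offset as a compile-time constant,
 * which _mm_alignr_epi8() requires for its shift count.
 */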
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst;
    size_t dstofss;
    size_t srcofs;

    /**
     * Copy less than 16 bytes
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }

    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 48) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 128) {
        goto COPY_BLOCK_128_BACK15;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
COPY_BLOCK_255_BACK15:
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
COPY_BLOCK_128_BACK15:
        if (n >= 64) {
            n -= 64;
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 64;
            dst = (uint8_t *)dst + 64;
        }
COPY_BLOCK_64_BACK15:
        if (n >= 32) {
            n -= 32;
            rte_mov32((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 32;
            dst = (uint8_t *)dst + 32;
        }
        if (n > 16) {
            rte_mov16((uint8_t *)dst, (const uint8_t *)src);
            rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
            return ret;
        }
        if (n > 0) {
            rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        }
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes,
     * and make sure the first 15 bytes are copied, because
     * unaligned copy functions require up to 15 bytes
     * backwards access.
     */
    dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16;
    n -= dstofss;
    rte_mov32((uint8_t *)dst, (const uint8_t *)src);
    src = (const uint8_t *)src + dstofss;
    dst = (uint8_t *)dst + dstofss;
    srcofs = ((uintptr_t)src & 0x0F);
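    /*
     * Worked example (illustrative numbers): if dst == 0x100009, then
     * dst & 0x0F == 9 and dstofss == 16 - 9 + 16 == 23.  rte_mov32()
     * copies the first 32 bytes, the pointers advance by 23 bytes, dst is
     * now 16-byte aligned, and at least 17 bytes have been consumed, so
     * reading up to 15 bytes behind the advanced src (as the unaligned
     * block copy does) stays inside the source buffer.  srcofs is the
     * remaining misalignment of src and selects the alignr shift count.
     */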
    /**
     * For aligned copy
     */
    if (srcofs == 0) {
        /**
         * Copy 256-byte blocks
         */
        for (; n >= 256; n -= 256) {
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            dst = (uint8_t *)dst + 256;
            src = (const uint8_t *)src + 256;
        }

        /**
         * Copy whatever left
         */
        goto COPY_BLOCK_255_BACK15;
    }

    /**
     * For copy with unaligned load
     */
    MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_64_BACK15;
}
#endif /* RTE_MACHINE_CPUFLAG */

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */