lib/librte_eal/common/include/arch/x86/rte_memcpy_internal.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifndef _RTE_MEMCPY_INTERNAL_X86_64_H_
  35 #define _RTE_MEMCPY_INTERNAL_X86_64_H_
  36
  37 /**
  38  * @file
  39  *
  40  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  41  */
  42
  43 #include <stdio.h>
  44 #include <stdint.h>
  45 #include <string.h>
  46 #include <rte_vect.h>
  47 #include <rte_common.h>
  48
  49 #ifdef __cplusplus
  50 extern "C" {
  51 #endif
  52
  53 /**
  54  * Copy bytes from one location to another. The locations must not overlap.
  55  *
   56  * @note This is implemented as a macro, so its address should not be taken
  57  * and care is needed as parameter expressions may be evaluated multiple times.
  58  *
  59  * @param dst
  60  *   Pointer to the destination of the data.
  61  * @param src
  62  *   Pointer to the source data.
  63  * @param n
  64  *   Number of bytes to copy.
  65  * @return
  66  *   Pointer to the destination data.
  67  */
  68
  69 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
  70
/*
 * Mask selecting the low six address bits (0x3F == 64 - 1), i.e. the offset of
 * an address within a 64-byte unit -- the width of the 512-bit moves used in
 * this branch.
 * NOTE(review): the macro is not referenced in this part of the file;
 * presumably the public rte_memcpy() wrapper uses it to test pointer
 * alignment -- confirm against its user.
 */
#define ALIGNMENT_MASK 0x3F
  72
  73 /**
  74  * AVX512 implementation below
  75  */
  76
  77 /**
  78  * Copy 16 bytes from one location to another,
  79  * locations should not overlap.
  80  */
  81 static inline void
  82 rte_mov16(uint8_t *dst, const uint8_t *src)
  83 {
  84         __m128i xmm0;
  85
  86         xmm0 = _mm_loadu_si128((const __m128i *)src);
  87         _mm_storeu_si128((__m128i *)dst, xmm0);
  88 }
  89
  90 /**
  91  * Copy 32 bytes from one location to another,
  92  * locations should not overlap.
  93  */
  94 static inline void
  95 rte_mov32(uint8_t *dst, const uint8_t *src)
  96 {
  97         __m256i ymm0;
  98
  99         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 100         _mm256_storeu_si256((__m256i *)dst, ymm0);
 101 }
 102
 103 /**
 104  * Copy 64 bytes from one location to another,
 105  * locations should not overlap.
 106  */
 107 static inline void
 108 rte_mov64(uint8_t *dst, const uint8_t *src)
 109 {
 110         __m512i zmm0;
 111
 112         zmm0 = _mm512_loadu_si512((const void *)src);
 113         _mm512_storeu_si512((void *)dst, zmm0);
 114 }
 115
 116 /**
 117  * Copy 128 bytes from one location to another,
 118  * locations should not overlap.
 119  */
 120 static inline void
 121 rte_mov128(uint8_t *dst, const uint8_t *src)
 122 {
 123         rte_mov64(dst + 0 * 64, src + 0 * 64);
 124         rte_mov64(dst + 1 * 64, src + 1 * 64);
 125 }
 126
 127 /**
 128  * Copy 256 bytes from one location to another,
 129  * locations should not overlap.
 130  */
 131 static inline void
 132 rte_mov256(uint8_t *dst, const uint8_t *src)
 133 {
 134         rte_mov64(dst + 0 * 64, src + 0 * 64);
 135         rte_mov64(dst + 1 * 64, src + 1 * 64);
 136         rte_mov64(dst + 2 * 64, src + 2 * 64);
 137         rte_mov64(dst + 3 * 64, src + 3 * 64);
 138 }
 139
 140 /**
 141  * Copy 128-byte blocks from one location to another,
 142  * locations should not overlap.
 143  */
 144 static inline void
 145 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 146 {
 147         __m512i zmm0, zmm1;
 148
 149         while (n >= 128) {
 150                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 151                 n -= 128;
 152                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 153                 src = src + 128;
 154                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 155                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 156                 dst = dst + 128;
 157         }
 158 }
 159
 160 /**
 161  * Copy 512-byte blocks from one location to another,
 162  * locations should not overlap.
 163  */
 164 static inline void
 165 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 166 {
 167         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 168
 169         while (n >= 512) {
 170                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 171                 n -= 512;
 172                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 173                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 174                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 175                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 176                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 177                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 178                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 179                 src = src + 512;
 180                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 181                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 182                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 183                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 184                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 185                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 186                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 187                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 188                 dst = dst + 512;
 189         }
 190 }
 191
 192 static inline void *
 193 rte_memcpy_generic(void *dst, const void *src, size_t n)
 194 {
 195         uintptr_t dstu = (uintptr_t)dst;
 196         uintptr_t srcu = (uintptr_t)src;
 197         void *ret = dst;
 198         size_t dstofss;
 199         size_t bits;
 200
 201         /**
 202          * Copy less than 16 bytes
 203          */
 204         if (n < 16) {
 205                 if (n & 0x01) {
 206                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 207                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 208                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 209                 }
 210                 if (n & 0x02) {
 211                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 212                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 213                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 214                 }
 215                 if (n & 0x04) {
 216                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 217                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 218                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 219                 }
 220                 if (n & 0x08)
 221                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 222                 return ret;
 223         }
 224
 225         /**
 226          * Fast way when copy size doesn't exceed 512 bytes
 227          */
 228         if (n <= 32) {
 229                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 230                 rte_mov16((uint8_t *)dst - 16 + n,
 231                                   (const uint8_t *)src - 16 + n);
 232                 return ret;
 233         }
 234         if (n <= 64) {
 235                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 236                 rte_mov32((uint8_t *)dst - 32 + n,
 237                                   (const uint8_t *)src - 32 + n);
 238                 return ret;
 239         }
 240         if (n <= 512) {
 241                 if (n >= 256) {
 242                         n -= 256;
 243                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 244                         src = (const uint8_t *)src + 256;
 245                         dst = (uint8_t *)dst + 256;
 246                 }
 247                 if (n >= 128) {
 248                         n -= 128;
 249                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 250                         src = (const uint8_t *)src + 128;
 251                         dst = (uint8_t *)dst + 128;
 252                 }
 253 COPY_BLOCK_128_BACK63:
 254                 if (n > 64) {
 255                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 256                         rte_mov64((uint8_t *)dst - 64 + n,
 257                                           (const uint8_t *)src - 64 + n);
 258                         return ret;
 259                 }
 260                 if (n > 0)
 261                         rte_mov64((uint8_t *)dst - 64 + n,
 262                                           (const uint8_t *)src - 64 + n);
 263                 return ret;
 264         }
 265
 266         /**
 267          * Make store aligned when copy size exceeds 512 bytes
 268          */
 269         dstofss = ((uintptr_t)dst & 0x3F);
 270         if (dstofss > 0) {
 271                 dstofss = 64 - dstofss;
 272                 n -= dstofss;
 273                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 274                 src = (const uint8_t *)src + dstofss;
 275                 dst = (uint8_t *)dst + dstofss;
 276         }
 277
 278         /**
 279          * Copy 512-byte blocks.
 280          * Use copy block function for better instruction order control,
 281          * which is important when load is unaligned.
 282          */
 283         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 284         bits = n;
 285         n = n & 511;
 286         bits -= n;
 287         src = (const uint8_t *)src + bits;
 288         dst = (uint8_t *)dst + bits;
 289
 290         /**
 291          * Copy 128-byte blocks.
 292          * Use copy block function for better instruction order control,
 293          * which is important when load is unaligned.
 294          */
 295         if (n >= 128) {
 296                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 297                 bits = n;
 298                 n = n & 127;
 299                 bits -= n;
 300                 src = (const uint8_t *)src + bits;
 301                 dst = (uint8_t *)dst + bits;
 302         }
 303
 304         /**
 305          * Copy whatever left
 306          */
 307         goto COPY_BLOCK_128_BACK63;
 308 }
 309
 310 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 311
/*
 * Mask selecting the low five address bits (0x1F == 32 - 1), i.e. the offset
 * of an address within a 32-byte unit -- the width of the 256-bit moves used
 * in this branch.
 * NOTE(review): the macro is not referenced in this part of the file;
 * presumably the public rte_memcpy() wrapper uses it to test pointer
 * alignment -- confirm against its user.
 */
#define ALIGNMENT_MASK 0x1F
 313
 314 /**
 315  * AVX2 implementation below
 316  */
 317
 318 /**
 319  * Copy 16 bytes from one location to another,
 320  * locations should not overlap.
 321  */
 322 static inline void
 323 rte_mov16(uint8_t *dst, const uint8_t *src)
 324 {
 325         __m128i xmm0;
 326
 327         xmm0 = _mm_loadu_si128((const __m128i *)src);
 328         _mm_storeu_si128((__m128i *)dst, xmm0);
 329 }
 330
 331 /**
 332  * Copy 32 bytes from one location to another,
 333  * locations should not overlap.
 334  */
 335 static inline void
 336 rte_mov32(uint8_t *dst, const uint8_t *src)
 337 {
 338         __m256i ymm0;
 339
 340         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 341         _mm256_storeu_si256((__m256i *)dst, ymm0);
 342 }
 343
 344 /**
 345  * Copy 64 bytes from one location to another,
 346  * locations should not overlap.
 347  */
 348 static inline void
 349 rte_mov64(uint8_t *dst, const uint8_t *src)
 350 {
 351         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 352         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 353 }
 354
 355 /**
 356  * Copy 128 bytes from one location to another,
 357  * locations should not overlap.
 358  */
 359 static inline void
 360 rte_mov128(uint8_t *dst, const uint8_t *src)
 361 {
 362         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 363         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 364         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 365         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 366 }
 367
 368 /**
 369  * Copy 128-byte blocks from one location to another,
 370  * locations should not overlap.
 371  */
 372 static inline void
 373 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 374 {
 375         __m256i ymm0, ymm1, ymm2, ymm3;
 376
 377         while (n >= 128) {
 378                 ymm0 = _mm256_loadu_si256((const __m256i *)
 379                                 ((const uint8_t *)src + 0 * 32));
 380                 n -= 128;
 381                 ymm1 = _mm256_loadu_si256((const __m256i *)
 382                                 ((const uint8_t *)src + 1 * 32));
 383                 ymm2 = _mm256_loadu_si256((const __m256i *)
 384                                 ((const uint8_t *)src + 2 * 32));
 385                 ymm3 = _mm256_loadu_si256((const __m256i *)
 386                                 ((const uint8_t *)src + 3 * 32));
 387                 src = (const uint8_t *)src + 128;
 388                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 389                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 390                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 391                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 392                 dst = (uint8_t *)dst + 128;
 393         }
 394 }
 395
 396 static inline void *
 397 rte_memcpy_generic(void *dst, const void *src, size_t n)
 398 {
 399         uintptr_t dstu = (uintptr_t)dst;
 400         uintptr_t srcu = (uintptr_t)src;
 401         void *ret = dst;
 402         size_t dstofss;
 403         size_t bits;
 404
 405         /**
 406          * Copy less than 16 bytes
 407          */
 408         if (n < 16) {
 409                 if (n & 0x01) {
 410                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 411                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 412                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 413                 }
 414                 if (n & 0x02) {
 415                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 416                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 417                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 418                 }
 419                 if (n & 0x04) {
 420                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 421                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 422                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 423                 }
 424                 if (n & 0x08)
 425                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 426                 return ret;
 427         }
 428
 429         /**
 430          * Fast way when copy size doesn't exceed 256 bytes
 431          */
 432         if (n <= 32) {
 433                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 434                 rte_mov16((uint8_t *)dst - 16 + n,
 435                                 (const uint8_t *)src - 16 + n);
 436                 return ret;
 437         }
 438         if (n <= 48) {
 439                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 440                 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 441                 rte_mov16((uint8_t *)dst - 16 + n,
 442                                 (const uint8_t *)src - 16 + n);
 443                 return ret;
 444         }
 445         if (n <= 64) {
 446                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 447                 rte_mov32((uint8_t *)dst - 32 + n,
 448                                 (const uint8_t *)src - 32 + n);
 449                 return ret;
 450         }
 451         if (n <= 256) {
 452                 if (n >= 128) {
 453                         n -= 128;
 454                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 455                         src = (const uint8_t *)src + 128;
 456                         dst = (uint8_t *)dst + 128;
 457                 }
 458 COPY_BLOCK_128_BACK31:
 459                 if (n >= 64) {
 460                         n -= 64;
 461                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 462                         src = (const uint8_t *)src + 64;
 463                         dst = (uint8_t *)dst + 64;
 464                 }
 465                 if (n > 32) {
 466                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 467                         rte_mov32((uint8_t *)dst - 32 + n,
 468                                         (const uint8_t *)src - 32 + n);
 469                         return ret;
 470                 }
 471                 if (n > 0) {
 472                         rte_mov32((uint8_t *)dst - 32 + n,
 473                                         (const uint8_t *)src - 32 + n);
 474                 }
 475                 return ret;
 476         }
 477
 478         /**
 479          * Make store aligned when copy size exceeds 256 bytes
 480          */
 481         dstofss = (uintptr_t)dst & 0x1F;
 482         if (dstofss > 0) {
 483                 dstofss = 32 - dstofss;
 484                 n -= dstofss;
 485                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 486                 src = (const uint8_t *)src + dstofss;
 487                 dst = (uint8_t *)dst + dstofss;
 488         }
 489
 490         /**
 491          * Copy 128-byte blocks
 492          */
 493         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 494         bits = n;
 495         n = n & 127;
 496         bits -= n;
 497         src = (const uint8_t *)src + bits;
 498         dst = (uint8_t *)dst + bits;
 499
 500         /**
 501          * Copy whatever left
 502          */
 503         goto COPY_BLOCK_128_BACK31;
 504 }
 505
 506 #else /* RTE_MACHINE_CPUFLAG */
 507
/*
 * Mask selecting the low four address bits (0x0F == 16 - 1), i.e. the offset
 * of an address within a 16-byte unit -- the width of the 128-bit moves used
 * in this branch.
 * NOTE(review): the macro is not referenced in this part of the file;
 * presumably the public rte_memcpy() wrapper uses it to test pointer
 * alignment -- confirm against its user.
 */
#define ALIGNMENT_MASK 0x0F
 509
 510 /**
 511  * SSE & AVX implementation below
 512  */
 513
 514 /**
 515  * Copy 16 bytes from one location to another,
 516  * locations should not overlap.
 517  */
 518 static inline void
 519 rte_mov16(uint8_t *dst, const uint8_t *src)
 520 {
 521         __m128i xmm0;
 522
 523         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 524         _mm_storeu_si128((__m128i *)dst, xmm0);
 525 }
 526
 527 /**
 528  * Copy 32 bytes from one location to another,
 529  * locations should not overlap.
 530  */
 531 static inline void
 532 rte_mov32(uint8_t *dst, const uint8_t *src)
 533 {
 534         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 535         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 536 }
 537
 538 /**
 539  * Copy 64 bytes from one location to another,
 540  * locations should not overlap.
 541  */
 542 static inline void
 543 rte_mov64(uint8_t *dst, const uint8_t *src)
 544 {
 545         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 546         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 547         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 548         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 549 }
 550
 551 /**
 552  * Copy 128 bytes from one location to another,
 553  * locations should not overlap.
 554  */
 555 static inline void
 556 rte_mov128(uint8_t *dst, const uint8_t *src)
 557 {
 558         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 559         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 560         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 561         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 562         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 563         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 564         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 565         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 566 }
 567
 568 /**
 569  * Copy 256 bytes from one location to another,
 570  * locations should not overlap.
 571  */
 572 static inline void
 573 rte_mov256(uint8_t *dst, const uint8_t *src)
 574 {
 575         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 576         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 577         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 578         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 579         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 580         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 581         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 582         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 583         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 584         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 585         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 586         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 587         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 588         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 589         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 590         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 591 }
 592
 593 /**
 594  * Macro for copying unaligned block from one location to another with constant
 595  * load offset, 47 bytes leftover maximum,
 596  * locations should not overlap.
 597  * Requirements:
 598  * - Store is aligned
 599  * - Load offset is <offset>, which must be immediate value within [1, 15]
 600  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards
 601  *   are available for loading
 602  * - <dst>, <src>, <len> must be variables
 603  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 604  */
 605 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)(                      \
 606 __extension__ ({                                                              \
 607         int tmp;                                                              \
 608         while (len >= 128 + 16 - offset) {                                    \
 609                 xmm0 = _mm_loadu_si128((const __m128i *)                      \
 610                         ((const uint8_t *)src - offset + 0 * 16));            \
 611                 len -= 128;                                                   \
 612                 xmm1 = _mm_loadu_si128((const __m128i *)                      \
 613                         ((const uint8_t *)src - offset + 1 * 16));            \
 614                 xmm2 = _mm_loadu_si128((const __m128i *)                      \
 615                         ((const uint8_t *)src - offset + 2 * 16));            \
 616                 xmm3 = _mm_loadu_si128((const __m128i *)                      \
 617                         ((const uint8_t *)src - offset + 3 * 16));            \
 618                 xmm4 = _mm_loadu_si128((const __m128i *)                      \
 619                         ((const uint8_t *)src - offset + 4 * 16));            \
 620                 xmm5 = _mm_loadu_si128((const __m128i *)                      \
 621                         ((const uint8_t *)src - offset + 5 * 16));            \
 622                 xmm6 = _mm_loadu_si128((const __m128i *)                      \
 623                         ((const uint8_t *)src - offset + 6 * 16));            \
 624                 xmm7 = _mm_loadu_si128((const __m128i *)                      \
 625                         ((const uint8_t *)src - offset + 7 * 16));            \
 626                 xmm8 = _mm_loadu_si128((const __m128i *)                      \
 627                         ((const uint8_t *)src - offset + 8 * 16));            \
 628                 src = (const uint8_t *)src + 128;                             \
 629                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16),        \
 630                         _mm_alignr_epi8(xmm1, xmm0, offset));                 \
 631                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16),        \
 632                         _mm_alignr_epi8(xmm2, xmm1, offset));                 \
 633                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16),        \
 634                         _mm_alignr_epi8(xmm3, xmm2, offset));                 \
 635                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16),        \
 636                         _mm_alignr_epi8(xmm4, xmm3, offset));                 \
 637                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16),        \
 638                         _mm_alignr_epi8(xmm5, xmm4, offset));                 \
 639                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16),        \
 640                         _mm_alignr_epi8(xmm6, xmm5, offset));                 \
 641                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16),        \
 642                         _mm_alignr_epi8(xmm7, xmm6, offset));                 \
 643                 _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16),        \
 644                         _mm_alignr_epi8(xmm8, xmm7, offset));                 \
 645                 dst = (uint8_t *)dst + 128;                                   \
 646         }                                                                     \
 647         tmp = len;                                                            \
 648         len = ((len - 16 + offset) & 127) + 16 - offset;                      \
 649         tmp -= len;                                                           \
 650         src = (const uint8_t *)src + tmp;                                     \
 651         dst = (uint8_t *)dst + tmp;                                           \
 652         if (len >= 32 + 16 - offset) {                                        \
 653                 while (len >= 32 + 16 - offset) {                             \
 654                         xmm0 = _mm_loadu_si128((const __m128i *)              \
 655                                 ((const uint8_t *)src - offset + 0 * 16));    \
 656                         len -= 32;                                            \
 657                         xmm1 = _mm_loadu_si128((const __m128i *)              \
 658                                 ((const uint8_t *)src - offset + 1 * 16));    \
 659                         xmm2 = _mm_loadu_si128((const __m128i *)              \
 660                                 ((const uint8_t *)src - offset + 2 * 16));    \
 661                         src = (const uint8_t *)src + 32;                      \
 662                         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16),\
 663                                 _mm_alignr_epi8(xmm1, xmm0, offset));         \
 664                         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16),\
 665                                 _mm_alignr_epi8(xmm2, xmm1, offset));         \
 666                         dst = (uint8_t *)dst + 32;                            \
 667                 }                                                             \
 668                 tmp = len;                                                    \
 669                 len = ((len - 16 + offset) & 31) + 16 - offset;               \
 670                 tmp -= len;                                                   \
 671                 src = (const uint8_t *)src + tmp;                             \
 672                 dst = (uint8_t *)dst + tmp;                                   \
 673         }                                                                     \
 674 }))
 675
 676 /**
 677  * Macro for copying unaligned block from one location to another,
 678  * 47 bytes leftover maximum,
 679  * locations should not overlap.
 680  * Use switch here because the aligning instruction requires immediate value
 681  * for shift count.
 682  * Requirements:
 683  * - Store is aligned
 684  * - Load offset is <offset>, which must be within [1, 15]
 685  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards
 686  *   are available for loading
 687  * - <dst>, <src>, <len> must be variables
 688  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be
 689  *   pre-defined
 690  */
 691 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)(                        \
 692 __extension__ ({                                                            \
 693         switch (offset) {                                                   \
 694         case 0x01:                                                          \
 695                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01);                \
 696                 break;                                                      \
 697         case 0x02:                                                          \
 698                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02);                \
 699                 break;                                                      \
 700         case 0x03:                                                          \
 701                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03);                \
 702                 break;                                                      \
 703         case 0x04:                                                          \
 704                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04);                \
 705                 break;                                                      \
 706         case 0x05:                                                          \
 707                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05);                \
 708                 break;                                                      \
 709         case 0x06:                                                          \
 710                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06);                \
 711                 break;                                                      \
 712         case 0x07:                                                          \
 713                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07);                \
 714                 break;                                                      \
 715         case 0x08:                                                          \
 716                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08);                \
 717                 break;                                                      \
 718         case 0x09:                                                          \
 719                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09);                \
 720                 break;                                                      \
 721         case 0x0A:                                                          \
 722                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A);                \
 723                 break;                                                      \
 724         case 0x0B:                                                          \
 725                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B);                \
 726                 break;                                                      \
 727         case 0x0C:                                                          \
 728                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C);                \
 729                 break;                                                      \
 730         case 0x0D:                                                          \
 731                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D);                \
 732                 break;                                                      \
 733         case 0x0E:                                                          \
 734                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E);                \
 735                 break;                                                      \
 736         case 0x0F:                                                          \
 737                 MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F);                \
 738                 break;                                                      \
 739         default:                                                            \
 740                 break;                                                      \
 741         }                                                                   \
 742 }))
 743
 744 static inline void *
 745 rte_memcpy_generic(void *dst, const void *src, size_t n)
 746 {
 747         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 748         uintptr_t dstu = (uintptr_t)dst;
 749         uintptr_t srcu = (uintptr_t)src;
 750         void *ret = dst;
 751         size_t dstofss;
 752         size_t srcofs;
 753
 754         /**
 755          * Copy less than 16 bytes
 756          */
 757         if (n < 16) {
 758                 if (n & 0x01) {
 759                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 760                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 761                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 762                 }
 763                 if (n & 0x02) {
 764                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 765                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 766                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 767                 }
 768                 if (n & 0x04) {
 769                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 770                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 771                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 772                 }
 773                 if (n & 0x08)
 774                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 775                 return ret;
 776         }
 777
 778         /**
 779          * Fast way when copy size doesn't exceed 512 bytes
 780          */
 781         if (n <= 32) {
 782                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 783                 rte_mov16((uint8_t *)dst - 16 + n,
 784                                 (const uint8_t *)src - 16 + n);
 785                 return ret;
 786         }
 787         if (n <= 48) {
 788                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 789                 rte_mov16((uint8_t *)dst - 16 + n,
 790                                 (const uint8_t *)src - 16 + n);
 791                 return ret;
 792         }
 793         if (n <= 64) {
 794                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 795                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 796                 rte_mov16((uint8_t *)dst - 16 + n,
 797                                 (const uint8_t *)src - 16 + n);
 798                 return ret;
 799         }
 800         if (n <= 128)
 801                 goto COPY_BLOCK_128_BACK15;
 802         if (n <= 512) {
 803                 if (n >= 256) {
 804                         n -= 256;
 805                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 806                         rte_mov128((uint8_t *)dst + 128,
 807                                         (const uint8_t *)src + 128);
 808                         src = (const uint8_t *)src + 256;
 809                         dst = (uint8_t *)dst + 256;
 810                 }
 811 COPY_BLOCK_255_BACK15:
 812                 if (n >= 128) {
 813                         n -= 128;
 814                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 815                         src = (const uint8_t *)src + 128;
 816                         dst = (uint8_t *)dst + 128;
 817                 }
 818 COPY_BLOCK_128_BACK15:
 819                 if (n >= 64) {
 820                         n -= 64;
 821                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 822                         src = (const uint8_t *)src + 64;
 823                         dst = (uint8_t *)dst + 64;
 824                 }
 825 COPY_BLOCK_64_BACK15:
 826                 if (n >= 32) {
 827                         n -= 32;
 828                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 829                         src = (const uint8_t *)src + 32;
 830                         dst = (uint8_t *)dst + 32;
 831                 }
 832                 if (n > 16) {
 833                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 834                         rte_mov16((uint8_t *)dst - 16 + n,
 835                                         (const uint8_t *)src - 16 + n);
 836                         return ret;
 837                 }
 838                 if (n > 0) {
 839                         rte_mov16((uint8_t *)dst - 16 + n,
 840                                         (const uint8_t *)src - 16 + n);
 841                 }
 842                 return ret;
 843         }
 844
 845         /**
 846          * Make store aligned when copy size exceeds 512 bytes,
 847          * and make sure the first 15 bytes are copied, because
 848          * unaligned copy functions require up to 15 bytes
 849          * backwards access.
 850          */
 851         dstofss = (uintptr_t)dst & 0x0F;
 852         if (dstofss > 0) {
 853                 dstofss = 16 - dstofss + 16;
 854                 n -= dstofss;
 855                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 856                 src = (const uint8_t *)src + dstofss;
 857                 dst = (uint8_t *)dst + dstofss;
 858         }
 859         srcofs = ((uintptr_t)src & 0x0F);
 860
 861         /**
 862          * For aligned copy
 863          */
 864         if (srcofs == 0) {
 865                 /**
 866                  * Copy 256-byte blocks
 867                  */
 868                 for (; n >= 256; n -= 256) {
 869                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 870                         dst = (uint8_t *)dst + 256;
 871                         src = (const uint8_t *)src + 256;
 872                 }
 873
 874                 /**
 875                  * Copy whatever left
 876                  */
 877                 goto COPY_BLOCK_255_BACK15;
 878         }
 879
 880         /**
 881          * For copy with unaligned load
 882          */
 883         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 884
 885         /**
 886          * Copy whatever left
 887          */
 888         goto COPY_BLOCK_64_BACK15;
 889 }
 890
 891 #endif /* RTE_MACHINE_CPUFLAG */
 892
 893 static inline void *
 894 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 895 {
 896         void *ret = dst;
 897
 898         /* Copy size <= 16 bytes */
 899         if (n < 16) {
 900                 if (n & 0x01) {
 901                         *(uint8_t *)dst = *(const uint8_t *)src;
 902                         src = (const uint8_t *)src + 1;
 903                         dst = (uint8_t *)dst + 1;
 904                 }
 905                 if (n & 0x02) {
 906                         *(uint16_t *)dst = *(const uint16_t *)src;
 907                         src = (const uint16_t *)src + 1;
 908                         dst = (uint16_t *)dst + 1;
 909                 }
 910                 if (n & 0x04) {
 911                         *(uint32_t *)dst = *(const uint32_t *)src;
 912                         src = (const uint32_t *)src + 1;
 913                         dst = (uint32_t *)dst + 1;
 914                 }
 915                 if (n & 0x08)
 916                         *(uint64_t *)dst = *(const uint64_t *)src;
 917
 918                 return ret;
 919         }
 920
 921         /* Copy 16 <= size <= 32 bytes */
 922         if (n <= 32) {
 923                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 924                 rte_mov16((uint8_t *)dst - 16 + n,
 925                                 (const uint8_t *)src - 16 + n);
 926
 927                 return ret;
 928         }
 929
 930         /* Copy 32 < size <= 64 bytes */
 931         if (n <= 64) {
 932                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 933                 rte_mov32((uint8_t *)dst - 32 + n,
 934                                 (const uint8_t *)src - 32 + n);
 935
 936                 return ret;
 937         }
 938
 939         /* Copy 64 bytes blocks */
 940         for (; n >= 64; n -= 64) {
 941                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 942                 dst = (uint8_t *)dst + 64;
 943                 src = (const uint8_t *)src + 64;
 944         }
 945
 946         /* Copy whatever left */
 947         rte_mov64((uint8_t *)dst - 64 + n,
 948                         (const uint8_t *)src - 64 + n);
 949
 950         return ret;
 951 }
 952
 953 static inline void *
 954 rte_memcpy_internal(void *dst, const void *src, size_t n)
 955 {
 956         if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
 957                 return rte_memcpy_aligned(dst, src, n);
 958         else
 959                 return rte_memcpy_generic(dst, src, n);
 960 }
 961
 962 #ifdef __cplusplus
 963 }
 964 #endif
 965
 966 #endif /* _RTE_MEMCPY_INTERNAL_X86_64_H_ */