lib/eal/x86/include/rte_memcpy.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2010-2014 Intel Corporation
   3  */
   4
   5 #ifndef _RTE_MEMCPY_X86_64_H_
   6 #define _RTE_MEMCPY_X86_64_H_
   7
   8 /**
   9  * @file
  10  *
  11  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  12  */
  13
  14 #include <stdio.h>
  15 #include <stdint.h>
  16 #include <string.h>
  17 #include <rte_vect.h>
  18 #include <rte_common.h>
  19 #include <rte_config.h>
  20
  21 #ifdef __cplusplus
  22 extern "C" {
  23 #endif
  24
  25 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
  26 #pragma GCC diagnostic push
  27 #pragma GCC diagnostic ignored "-Wstringop-overflow"
  28 #endif
  29
  30 /**
  31  * Copy bytes from one location to another. The locations must not overlap.
  32  *
   33  * @note This is implemented as a macro, so its address should not be taken
  34  * and care is needed as parameter expressions may be evaluated multiple times.
  35  *
  36  * @param dst
  37  *   Pointer to the destination of the data.
  38  * @param src
  39  *   Pointer to the source data.
  40  * @param n
  41  *   Number of bytes to copy.
  42  * @return
  43  *   Pointer to the destination data.
  44  */
  45 static __rte_always_inline void *
  46 rte_memcpy(void *dst, const void *src, size_t n);
  47
  48 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
  49
  50 #define ALIGNMENT_MASK 0x3F
  51
  52 /**
  53  * AVX512 implementation below
  54  */
  55
  56 /**
  57  * Copy 16 bytes from one location to another,
  58  * locations should not overlap.
  59  */
  60 static __rte_always_inline void
  61 rte_mov16(uint8_t *dst, const uint8_t *src)
  62 {
  63         __m128i xmm0;
  64
  65         xmm0 = _mm_loadu_si128((const __m128i *)src);
  66         _mm_storeu_si128((__m128i *)dst, xmm0);
  67 }
  68
  69 /**
  70  * Copy 32 bytes from one location to another,
  71  * locations should not overlap.
  72  */
  73 static __rte_always_inline void
  74 rte_mov32(uint8_t *dst, const uint8_t *src)
  75 {
  76         __m256i ymm0;
  77
  78         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  79         _mm256_storeu_si256((__m256i *)dst, ymm0);
  80 }
  81
  82 /**
  83  * Copy 64 bytes from one location to another,
  84  * locations should not overlap.
  85  */
  86 static __rte_always_inline void
  87 rte_mov64(uint8_t *dst, const uint8_t *src)
  88 {
  89         __m512i zmm0;
  90
  91         zmm0 = _mm512_loadu_si512((const void *)src);
  92         _mm512_storeu_si512((void *)dst, zmm0);
  93 }
  94
  95 /**
  96  * Copy 128 bytes from one location to another,
  97  * locations should not overlap.
  98  */
  99 static __rte_always_inline void
 100 rte_mov128(uint8_t *dst, const uint8_t *src)
 101 {
 102         rte_mov64(dst + 0 * 64, src + 0 * 64);
 103         rte_mov64(dst + 1 * 64, src + 1 * 64);
 104 }
 105
 106 /**
 107  * Copy 256 bytes from one location to another,
 108  * locations should not overlap.
 109  */
 110 static __rte_always_inline void
 111 rte_mov256(uint8_t *dst, const uint8_t *src)
 112 {
 113         rte_mov64(dst + 0 * 64, src + 0 * 64);
 114         rte_mov64(dst + 1 * 64, src + 1 * 64);
 115         rte_mov64(dst + 2 * 64, src + 2 * 64);
 116         rte_mov64(dst + 3 * 64, src + 3 * 64);
 117 }
 118
 119 /**
 120  * Copy 128-byte blocks from one location to another,
 121  * locations should not overlap.
 122  */
 123 static __rte_always_inline void
 124 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 125 {
 126         __m512i zmm0, zmm1;
 127
 128         while (n >= 128) {
 129                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 130                 n -= 128;
 131                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 132                 src = src + 128;
 133                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 134                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 135                 dst = dst + 128;
 136         }
 137 }
 138
 139 /**
 140  * Copy 512-byte blocks from one location to another,
 141  * locations should not overlap.
 142  */
 143 static inline void
 144 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 145 {
 146         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 147
 148         while (n >= 512) {
 149                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 150                 n -= 512;
 151                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 152                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 153                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 154                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 155                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 156                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 157                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 158                 src = src + 512;
 159                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 160                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 161                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 162                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 163                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 164                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 165                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 166                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 167                 dst = dst + 512;
 168         }
 169 }
 170
 171 static __rte_always_inline void *
 172 rte_memcpy_generic(void *dst, const void *src, size_t n)
 173 {
 174         uintptr_t dstu = (uintptr_t)dst;
 175         uintptr_t srcu = (uintptr_t)src;
 176         void *ret = dst;
 177         size_t dstofss;
 178         size_t bits;
 179
 180         /**
 181          * Copy less than 16 bytes
 182          */
 183         if (n < 16) {
 184                 if (n & 0x01) {
 185                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 186                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 187                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 188                 }
 189                 if (n & 0x02) {
 190                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 191                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 192                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 193                 }
 194                 if (n & 0x04) {
 195                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 196                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 197                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 198                 }
 199                 if (n & 0x08)
 200                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 201                 return ret;
 202         }
 203
 204         /**
 205          * Fast way when copy size doesn't exceed 512 bytes
 206          */
 207         if (n <= 32) {
 208                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 209                 rte_mov16((uint8_t *)dst - 16 + n,
 210                                   (const uint8_t *)src - 16 + n);
 211                 return ret;
 212         }
 213         if (n <= 64) {
 214                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 215                 rte_mov32((uint8_t *)dst - 32 + n,
 216                                   (const uint8_t *)src - 32 + n);
 217                 return ret;
 218         }
 219         if (n <= 512) {
 220                 if (n >= 256) {
 221                         n -= 256;
 222                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 223                         src = (const uint8_t *)src + 256;
 224                         dst = (uint8_t *)dst + 256;
 225                 }
 226                 if (n >= 128) {
 227                         n -= 128;
 228                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 229                         src = (const uint8_t *)src + 128;
 230                         dst = (uint8_t *)dst + 128;
 231                 }
 232 COPY_BLOCK_128_BACK63:
 233                 if (n > 64) {
 234                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 235                         rte_mov64((uint8_t *)dst - 64 + n,
 236                                           (const uint8_t *)src - 64 + n);
 237                         return ret;
 238                 }
 239                 if (n > 0)
 240                         rte_mov64((uint8_t *)dst - 64 + n,
 241                                           (const uint8_t *)src - 64 + n);
 242                 return ret;
 243         }
 244
 245         /**
 246          * Make store aligned when copy size exceeds 512 bytes
 247          */
 248         dstofss = ((uintptr_t)dst & 0x3F);
 249         if (dstofss > 0) {
 250                 dstofss = 64 - dstofss;
 251                 n -= dstofss;
 252                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 253                 src = (const uint8_t *)src + dstofss;
 254                 dst = (uint8_t *)dst + dstofss;
 255         }
 256
 257         /**
 258          * Copy 512-byte blocks.
 259          * Use copy block function for better instruction order control,
 260          * which is important when load is unaligned.
 261          */
 262         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 263         bits = n;
 264         n = n & 511;
 265         bits -= n;
 266         src = (const uint8_t *)src + bits;
 267         dst = (uint8_t *)dst + bits;
 268
 269         /**
 270          * Copy 128-byte blocks.
 271          * Use copy block function for better instruction order control,
 272          * which is important when load is unaligned.
 273          */
 274         if (n >= 128) {
 275                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 276                 bits = n;
 277                 n = n & 127;
 278                 bits -= n;
 279                 src = (const uint8_t *)src + bits;
 280                 dst = (uint8_t *)dst + bits;
 281         }
 282
 283         /**
 284          * Copy whatever left
 285          */
 286         goto COPY_BLOCK_128_BACK63;
 287 }
 288
 289 #elif defined __AVX2__
 290
 291 #define ALIGNMENT_MASK 0x1F
 292
 293 /**
 294  * AVX2 implementation below
 295  */
 296
 297 /**
 298  * Copy 16 bytes from one location to another,
 299  * locations should not overlap.
 300  */
 301 static __rte_always_inline void
 302 rte_mov16(uint8_t *dst, const uint8_t *src)
 303 {
 304         __m128i xmm0;
 305
 306         xmm0 = _mm_loadu_si128((const __m128i *)src);
 307         _mm_storeu_si128((__m128i *)dst, xmm0);
 308 }
 309
 310 /**
 311  * Copy 32 bytes from one location to another,
 312  * locations should not overlap.
 313  */
 314 static __rte_always_inline void
 315 rte_mov32(uint8_t *dst, const uint8_t *src)
 316 {
 317         __m256i ymm0;
 318
 319         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 320         _mm256_storeu_si256((__m256i *)dst, ymm0);
 321 }
 322
 323 /**
 324  * Copy 64 bytes from one location to another,
 325  * locations should not overlap.
 326  */
 327 static __rte_always_inline void
 328 rte_mov64(uint8_t *dst, const uint8_t *src)
 329 {
 330         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 331         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 332 }
 333
 334 /**
 335  * Copy 128 bytes from one location to another,
 336  * locations should not overlap.
 337  */
 338 static __rte_always_inline void
 339 rte_mov128(uint8_t *dst, const uint8_t *src)
 340 {
 341         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 342         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 343         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 344         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 345 }
 346
 347 /**
 348  * Copy 128-byte blocks from one location to another,
 349  * locations should not overlap.
 350  */
 351 static __rte_always_inline void
 352 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 353 {
 354         __m256i ymm0, ymm1, ymm2, ymm3;
 355
 356         while (n >= 128) {
 357                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 358                 n -= 128;
 359                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 360                 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 361                 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 362                 src = (const uint8_t *)src + 128;
 363                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 364                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 365                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 366                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 367                 dst = (uint8_t *)dst + 128;
 368         }
 369 }
 370
 371 static __rte_always_inline void *
 372 rte_memcpy_generic(void *dst, const void *src, size_t n)
 373 {
 374         uintptr_t dstu = (uintptr_t)dst;
 375         uintptr_t srcu = (uintptr_t)src;
 376         void *ret = dst;
 377         size_t dstofss;
 378         size_t bits;
 379
 380         /**
 381          * Copy less than 16 bytes
 382          */
 383         if (n < 16) {
 384                 if (n & 0x01) {
 385                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 386                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 387                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 388                 }
 389                 if (n & 0x02) {
 390                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 391                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 392                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 393                 }
 394                 if (n & 0x04) {
 395                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 396                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 397                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 398                 }
 399                 if (n & 0x08) {
 400                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 401                 }
 402                 return ret;
 403         }
 404
 405         /**
 406          * Fast way when copy size doesn't exceed 256 bytes
 407          */
 408         if (n <= 32) {
 409                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 410                 rte_mov16((uint8_t *)dst - 16 + n,
 411                                 (const uint8_t *)src - 16 + n);
 412                 return ret;
 413         }
 414         if (n <= 48) {
 415                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 416                 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 417                 rte_mov16((uint8_t *)dst - 16 + n,
 418                                 (const uint8_t *)src - 16 + n);
 419                 return ret;
 420         }
 421         if (n <= 64) {
 422                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 423                 rte_mov32((uint8_t *)dst - 32 + n,
 424                                 (const uint8_t *)src - 32 + n);
 425                 return ret;
 426         }
 427         if (n <= 256) {
 428                 if (n >= 128) {
 429                         n -= 128;
 430                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 431                         src = (const uint8_t *)src + 128;
 432                         dst = (uint8_t *)dst + 128;
 433                 }
 434 COPY_BLOCK_128_BACK31:
 435                 if (n >= 64) {
 436                         n -= 64;
 437                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 438                         src = (const uint8_t *)src + 64;
 439                         dst = (uint8_t *)dst + 64;
 440                 }
 441                 if (n > 32) {
 442                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 443                         rte_mov32((uint8_t *)dst - 32 + n,
 444                                         (const uint8_t *)src - 32 + n);
 445                         return ret;
 446                 }
 447                 if (n > 0) {
 448                         rte_mov32((uint8_t *)dst - 32 + n,
 449                                         (const uint8_t *)src - 32 + n);
 450                 }
 451                 return ret;
 452         }
 453
 454         /**
 455          * Make store aligned when copy size exceeds 256 bytes
 456          */
 457         dstofss = (uintptr_t)dst & 0x1F;
 458         if (dstofss > 0) {
 459                 dstofss = 32 - dstofss;
 460                 n -= dstofss;
 461                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 462                 src = (const uint8_t *)src + dstofss;
 463                 dst = (uint8_t *)dst + dstofss;
 464         }
 465
 466         /**
 467          * Copy 128-byte blocks
 468          */
 469         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 470         bits = n;
 471         n = n & 127;
 472         bits -= n;
 473         src = (const uint8_t *)src + bits;
 474         dst = (uint8_t *)dst + bits;
 475
 476         /**
 477          * Copy whatever left
 478          */
 479         goto COPY_BLOCK_128_BACK31;
 480 }
 481
 482 #else /* __AVX512F__ */
 483
 484 #define ALIGNMENT_MASK 0x0F
 485
 486 /**
 487  * SSE & AVX implementation below
 488  */
 489
 490 /**
 491  * Copy 16 bytes from one location to another,
 492  * locations should not overlap.
 493  */
 494 static __rte_always_inline void
 495 rte_mov16(uint8_t *dst, const uint8_t *src)
 496 {
 497         __m128i xmm0;
 498
 499         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 500         _mm_storeu_si128((__m128i *)dst, xmm0);
 501 }
 502
 503 /**
 504  * Copy 32 bytes from one location to another,
 505  * locations should not overlap.
 506  */
 507 static __rte_always_inline void
 508 rte_mov32(uint8_t *dst, const uint8_t *src)
 509 {
 510         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 511         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 512 }
 513
 514 /**
 515  * Copy 64 bytes from one location to another,
 516  * locations should not overlap.
 517  */
 518 static __rte_always_inline void
 519 rte_mov64(uint8_t *dst, const uint8_t *src)
 520 {
 521         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 522         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 523         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 524         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 525 }
 526
 527 /**
 528  * Copy 128 bytes from one location to another,
 529  * locations should not overlap.
 530  */
 531 static __rte_always_inline void
 532 rte_mov128(uint8_t *dst, const uint8_t *src)
 533 {
 534         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 535         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 536         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 537         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 538         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 539         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 540         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 541         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 542 }
 543
 544 /**
 545  * Copy 256 bytes from one location to another,
 546  * locations should not overlap.
 547  */
 548 static inline void
 549 rte_mov256(uint8_t *dst, const uint8_t *src)
 550 {
 551         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 552         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 553         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 554         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 555         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 556         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 557         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 558         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 559         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 560         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 561         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 562         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 563         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 564         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 565         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 566         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 567 }
 568
 569 /**
 570  * Macro for copying unaligned block from one location to another with constant load offset,
 571  * 47 bytes leftover maximum,
 572  * locations should not overlap.
 573  * Requirements:
 574  * - Store is aligned
 575  * - Load offset is <offset>, which must be immediate value within [1, 15]
 576  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 577  * - <dst>, <src>, <len> must be variables
 578  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 579  */
 580 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
 581 __extension__ ({                                                                                            \
 582     size_t tmp;                                                                                                \
 583     while (len >= 128 + 16 - offset) {                                                                      \
 584         xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
 585         len -= 128;                                                                                         \
 586         xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
 587         xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
 588         xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
 589         xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
 590         xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
 591         xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
 592         xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
 593         xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
 594         src = (const uint8_t *)src + 128;                                                                   \
 595         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
 596         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
 597         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
 598         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
 599         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
 600         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
 601         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
 602         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
 603         dst = (uint8_t *)dst + 128;                                                                         \
 604     }                                                                                                       \
 605     tmp = len;                                                                                              \
 606     len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
 607     tmp -= len;                                                                                             \
 608     src = (const uint8_t *)src + tmp;                                                                       \
 609     dst = (uint8_t *)dst + tmp;                                                                             \
 610     if (len >= 32 + 16 - offset) {                                                                          \
 611         while (len >= 32 + 16 - offset) {                                                                   \
 612             xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
 613             len -= 32;                                                                                      \
 614             xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
 615             xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
 616             src = (const uint8_t *)src + 32;                                                                \
 617             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
 618             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
 619             dst = (uint8_t *)dst + 32;                                                                      \
 620         }                                                                                                   \
 621         tmp = len;                                                                                          \
 622         len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
 623         tmp -= len;                                                                                         \
 624         src = (const uint8_t *)src + tmp;                                                                   \
 625         dst = (uint8_t *)dst + tmp;                                                                         \
 626     }                                                                                                       \
 627 })
 628
 629 /**
 630  * Macro for copying unaligned block from one location to another,
 631  * 47 bytes leftover maximum,
 632  * locations should not overlap.
 633  * Use switch here because the aligning instruction requires immediate value for shift count.
 634  * Requirements:
 635  * - Store is aligned
 636  * - Load offset is <offset>, which must be within [1, 15]
 637  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 638  * - <dst>, <src>, <len> must be variables
 639  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 640  */
 641 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 642 __extension__ ({                                                      \
 643     switch (offset) {                                                 \
 644     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 645     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 646     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 647     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 648     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 649     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 650     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 651     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 652     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 653     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 654     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 655     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 656     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 657     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 658     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 659     default:;                                                         \
 660     }                                                                 \
 661 })
 662
 663 static __rte_always_inline void *
 664 rte_memcpy_generic(void *dst, const void *src, size_t n)
 665 {
 666         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 667         uintptr_t dstu = (uintptr_t)dst;
 668         uintptr_t srcu = (uintptr_t)src;
 669         void *ret = dst;
 670         size_t dstofss;
 671         size_t srcofs;
 672
 673         /**
 674          * Copy less than 16 bytes
 675          */
 676         if (n < 16) {
 677                 if (n & 0x01) {
 678                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 679                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 680                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 681                 }
 682                 if (n & 0x02) {
 683                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 684                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 685                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 686                 }
 687                 if (n & 0x04) {
 688                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 689                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 690                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 691                 }
 692                 if (n & 0x08) {
 693                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 694                 }
 695                 return ret;
 696         }
 697
 698         /**
 699          * Fast way when copy size doesn't exceed 512 bytes
 700          */
 701         if (n <= 32) {
 702                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 703                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 704                 return ret;
 705         }
 706         if (n <= 48) {
 707                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 708                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 709                 return ret;
 710         }
 711         if (n <= 64) {
 712                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 713                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 714                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 715                 return ret;
 716         }
 717         if (n <= 128) {
 718                 goto COPY_BLOCK_128_BACK15;
 719         }
 720         if (n <= 512) {
 721                 if (n >= 256) {
 722                         n -= 256;
 723                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 724                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 725                         src = (const uint8_t *)src + 256;
 726                         dst = (uint8_t *)dst + 256;
 727                 }
 728 COPY_BLOCK_255_BACK15:
 729                 if (n >= 128) {
 730                         n -= 128;
 731                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 732                         src = (const uint8_t *)src + 128;
 733                         dst = (uint8_t *)dst + 128;
 734                 }
 735 COPY_BLOCK_128_BACK15:
 736                 if (n >= 64) {
 737                         n -= 64;
 738                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 739                         src = (const uint8_t *)src + 64;
 740                         dst = (uint8_t *)dst + 64;
 741                 }
 742 COPY_BLOCK_64_BACK15:
 743                 if (n >= 32) {
 744                         n -= 32;
 745                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 746                         src = (const uint8_t *)src + 32;
 747                         dst = (uint8_t *)dst + 32;
 748                 }
 749                 if (n > 16) {
 750                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 751                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 752                         return ret;
 753                 }
 754                 if (n > 0) {
 755                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 756                 }
 757                 return ret;
 758         }
 759
 760         /**
 761          * Make store aligned when copy size exceeds 512 bytes,
 762          * and make sure the first 15 bytes are copied, because
 763          * unaligned copy functions require up to 15 bytes
 764          * backwards access.
 765          */
 766         dstofss = (uintptr_t)dst & 0x0F;
 767         if (dstofss > 0) {
 768                 dstofss = 16 - dstofss + 16;
 769                 n -= dstofss;
 770                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 771                 src = (const uint8_t *)src + dstofss;
 772                 dst = (uint8_t *)dst + dstofss;
 773         }
 774         srcofs = ((uintptr_t)src & 0x0F);
 775
 776         /**
 777          * For aligned copy
 778          */
 779         if (srcofs == 0) {
 780                 /**
 781                  * Copy 256-byte blocks
 782                  */
 783                 for (; n >= 256; n -= 256) {
 784                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 785                         dst = (uint8_t *)dst + 256;
 786                         src = (const uint8_t *)src + 256;
 787                 }
 788
 789                 /**
 790                  * Copy whatever left
 791                  */
 792                 goto COPY_BLOCK_255_BACK15;
 793         }
 794
 795         /**
 796          * For copy with unaligned load
 797          */
 798         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 799
 800         /**
 801          * Copy whatever left
 802          */
 803         goto COPY_BLOCK_64_BACK15;
 804 }
 805
 806 #endif /* __AVX512F__ */
 807
 808 static __rte_always_inline void *
 809 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 810 {
 811         void *ret = dst;
 812
 813         /* Copy size <= 16 bytes */
 814         if (n < 16) {
 815                 if (n & 0x01) {
 816                         *(uint8_t *)dst = *(const uint8_t *)src;
 817                         src = (const uint8_t *)src + 1;
 818                         dst = (uint8_t *)dst + 1;
 819                 }
 820                 if (n & 0x02) {
 821                         *(uint16_t *)dst = *(const uint16_t *)src;
 822                         src = (const uint16_t *)src + 1;
 823                         dst = (uint16_t *)dst + 1;
 824                 }
 825                 if (n & 0x04) {
 826                         *(uint32_t *)dst = *(const uint32_t *)src;
 827                         src = (const uint32_t *)src + 1;
 828                         dst = (uint32_t *)dst + 1;
 829                 }
 830                 if (n & 0x08)
 831                         *(uint64_t *)dst = *(const uint64_t *)src;
 832
 833                 return ret;
 834         }
 835
 836         /* Copy 16 <= size <= 32 bytes */
 837         if (n <= 32) {
 838                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 839                 rte_mov16((uint8_t *)dst - 16 + n,
 840                                 (const uint8_t *)src - 16 + n);
 841
 842                 return ret;
 843         }
 844
 845         /* Copy 32 < size <= 64 bytes */
 846         if (n <= 64) {
 847                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 848                 rte_mov32((uint8_t *)dst - 32 + n,
 849                                 (const uint8_t *)src - 32 + n);
 850
 851                 return ret;
 852         }
 853
 854         /* Copy 64 bytes blocks */
 855         for (; n >= 64; n -= 64) {
 856                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 857                 dst = (uint8_t *)dst + 64;
 858                 src = (const uint8_t *)src + 64;
 859         }
 860
 861         /* Copy whatever left */
 862         rte_mov64((uint8_t *)dst - 64 + n,
 863                         (const uint8_t *)src - 64 + n);
 864
 865         return ret;
 866 }
 867
 868 static __rte_always_inline void *
 869 rte_memcpy(void *dst, const void *src, size_t n)
 870 {
 871         if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
 872                 return rte_memcpy_aligned(dst, src, n);
 873         else
 874                 return rte_memcpy_generic(dst, src, n);
 875 }
 876
 877 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 878 #pragma GCC diagnostic pop
 879 #endif
 880
 881 #ifdef __cplusplus
 882 }
 883 #endif
 884
 885 #endif /* _RTE_MEMCPY_X86_64_H_ */