lib/eal/x86/include/rte_memcpy.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2010-2014 Intel Corporation
   3  */
   4
   5 #ifndef _RTE_MEMCPY_X86_64_H_
   6 #define _RTE_MEMCPY_X86_64_H_
   7
   8 /**
   9  * @file
  10  *
  11  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  12  */
  13
  14 #include <stdio.h>
  15 #include <stdint.h>
  16 #include <string.h>
  17 #include <rte_vect.h>
  18 #include <rte_common.h>
  19 #include <rte_config.h>
  20
  21 #ifdef __cplusplus
  22 extern "C" {
  23 #endif
  24
  25 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
  26 #pragma GCC diagnostic push
  27 #pragma GCC diagnostic ignored "-Wstringop-overflow"
  28 #endif
  29
  30 /**
  31  * Copy bytes from one location to another. The locations must not overlap.
  32  *
 * @note This is implemented as a macro, so its address should not be taken
  34  * and care is needed as parameter expressions may be evaluated multiple times.
  35  *
  36  * @param dst
  37  *   Pointer to the destination of the data.
  38  * @param src
  39  *   Pointer to the source data.
  40  * @param n
  41  *   Number of bytes to copy.
  42  * @return
  43  *   Pointer to the destination data.
  44  */
  45 static __rte_always_inline void *
  46 rte_memcpy(void *dst, const void *src, size_t n);
  47
  48 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
  49
  50 #define ALIGNMENT_MASK 0x3F
  51
  52 /**
  53  * AVX512 implementation below
  54  */
  55
  56 /**
  57  * Copy 16 bytes from one location to another,
  58  * locations should not overlap.
  59  */
  60 static __rte_always_inline void
  61 rte_mov16(uint8_t *dst, const uint8_t *src)
  62 {
  63         __m128i xmm0;
  64
  65         xmm0 = _mm_loadu_si128((const __m128i *)src);
  66         _mm_storeu_si128((__m128i *)dst, xmm0);
  67 }
  68
  69 /**
  70  * Copy 32 bytes from one location to another,
  71  * locations should not overlap.
  72  */
  73 static __rte_always_inline void
  74 rte_mov32(uint8_t *dst, const uint8_t *src)
  75 {
  76         __m256i ymm0;
  77
  78         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  79         _mm256_storeu_si256((__m256i *)dst, ymm0);
  80 }
  81
  82 /**
  83  * Copy 64 bytes from one location to another,
  84  * locations should not overlap.
  85  */
  86 static __rte_always_inline void
  87 rte_mov64(uint8_t *dst, const uint8_t *src)
  88 {
  89         __m512i zmm0;
  90
  91         zmm0 = _mm512_loadu_si512((const void *)src);
  92         _mm512_storeu_si512((void *)dst, zmm0);
  93 }
  94
  95 /**
  96  * Copy 128 bytes from one location to another,
  97  * locations should not overlap.
  98  */
  99 static __rte_always_inline void
 100 rte_mov128(uint8_t *dst, const uint8_t *src)
 101 {
 102         rte_mov64(dst + 0 * 64, src + 0 * 64);
 103         rte_mov64(dst + 1 * 64, src + 1 * 64);
 104 }
 105
 106 /**
 107  * Copy 256 bytes from one location to another,
 108  * locations should not overlap.
 109  */
 110 static __rte_always_inline void
 111 rte_mov256(uint8_t *dst, const uint8_t *src)
 112 {
 113         rte_mov64(dst + 0 * 64, src + 0 * 64);
 114         rte_mov64(dst + 1 * 64, src + 1 * 64);
 115         rte_mov64(dst + 2 * 64, src + 2 * 64);
 116         rte_mov64(dst + 3 * 64, src + 3 * 64);
 117 }
 118
 119 /**
 120  * Copy 128-byte blocks from one location to another,
 121  * locations should not overlap.
 122  */
 123 static __rte_always_inline void
 124 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 125 {
 126         __m512i zmm0, zmm1;
 127
 128         while (n >= 128) {
 129                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 130                 n -= 128;
 131                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 132                 src = src + 128;
 133                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 134                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 135                 dst = dst + 128;
 136         }
 137 }
 138
 139 /**
 140  * Copy 512-byte blocks from one location to another,
 141  * locations should not overlap.
 142  */
 143 static inline void
 144 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 145 {
 146         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 147
 148         while (n >= 512) {
 149                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 150                 n -= 512;
 151                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 152                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 153                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 154                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 155                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 156                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 157                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 158                 src = src + 512;
 159                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 160                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 161                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 162                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 163                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 164                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 165                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 166                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 167                 dst = dst + 512;
 168         }
 169 }
 170
 171 static __rte_always_inline void *
 172 rte_memcpy_generic(void *dst, const void *src, size_t n)
 173 {
 174         uintptr_t dstu = (uintptr_t)dst;
 175         uintptr_t srcu = (uintptr_t)src;
 176         void *ret = dst;
 177         size_t dstofss;
 178         size_t bits;
 179
 180         /**
 181          * Copy less than 16 bytes
 182          */
 183         if (n < 16) {
 184                 if (n & 0x01) {
 185                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 186                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 187                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 188                 }
 189                 if (n & 0x02) {
 190                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 191                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 192                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 193                 }
 194                 if (n & 0x04) {
 195                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 196                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 197                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 198                 }
 199                 if (n & 0x08)
 200                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 201                 return ret;
 202         }
 203
 204         /**
 205          * Fast way when copy size doesn't exceed 512 bytes
 206          */
 207         if (n <= 32) {
 208                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 209                 rte_mov16((uint8_t *)dst - 16 + n,
 210                                   (const uint8_t *)src - 16 + n);
 211                 return ret;
 212         }
 213         if (n <= 64) {
 214                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 215                 rte_mov32((uint8_t *)dst - 32 + n,
 216                                   (const uint8_t *)src - 32 + n);
 217                 return ret;
 218         }
 219         if (n <= 512) {
 220                 if (n >= 256) {
 221                         n -= 256;
 222                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 223                         src = (const uint8_t *)src + 256;
 224                         dst = (uint8_t *)dst + 256;
 225                 }
 226                 if (n >= 128) {
 227                         n -= 128;
 228                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 229                         src = (const uint8_t *)src + 128;
 230                         dst = (uint8_t *)dst + 128;
 231                 }
 232 COPY_BLOCK_128_BACK63:
 233                 if (n > 64) {
 234                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 235                         rte_mov64((uint8_t *)dst - 64 + n,
 236                                           (const uint8_t *)src - 64 + n);
 237                         return ret;
 238                 }
 239                 if (n > 0)
 240                         rte_mov64((uint8_t *)dst - 64 + n,
 241                                           (const uint8_t *)src - 64 + n);
 242                 return ret;
 243         }
 244
 245         /**
 246          * Make store aligned when copy size exceeds 512 bytes
 247          */
 248         dstofss = ((uintptr_t)dst & 0x3F);
 249         if (dstofss > 0) {
 250                 dstofss = 64 - dstofss;
 251                 n -= dstofss;
 252                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 253                 src = (const uint8_t *)src + dstofss;
 254                 dst = (uint8_t *)dst + dstofss;
 255         }
 256
 257         /**
 258          * Copy 512-byte blocks.
 259          * Use copy block function for better instruction order control,
 260          * which is important when load is unaligned.
 261          */
 262         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 263         bits = n;
 264         n = n & 511;
 265         bits -= n;
 266         src = (const uint8_t *)src + bits;
 267         dst = (uint8_t *)dst + bits;
 268
 269         /**
 270          * Copy 128-byte blocks.
 271          * Use copy block function for better instruction order control,
 272          * which is important when load is unaligned.
 273          */
 274         if (n >= 128) {
 275                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 276                 bits = n;
 277                 n = n & 127;
 278                 bits -= n;
 279                 src = (const uint8_t *)src + bits;
 280                 dst = (uint8_t *)dst + bits;
 281         }
 282
 283         /**
 284          * Copy whatever left
 285          */
 286         goto COPY_BLOCK_128_BACK63;
 287 }
 288
 289 #elif defined __AVX2__
 290
 291 #define ALIGNMENT_MASK 0x1F
 292
 293 /**
 294  * AVX2 implementation below
 295  */
 296
 297 /**
 298  * Copy 16 bytes from one location to another,
 299  * locations should not overlap.
 300  */
 301 static __rte_always_inline void
 302 rte_mov16(uint8_t *dst, const uint8_t *src)
 303 {
 304         __m128i xmm0;
 305
 306         xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
 307         _mm_storeu_si128((__m128i *)(void *)dst, xmm0);
 308 }
 309
 310 /**
 311  * Copy 32 bytes from one location to another,
 312  * locations should not overlap.
 313  */
 314 static __rte_always_inline void
 315 rte_mov32(uint8_t *dst, const uint8_t *src)
 316 {
 317         __m256i ymm0;
 318
 319         ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
 320         _mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
 321 }
 322
 323 /**
 324  * Copy 64 bytes from one location to another,
 325  * locations should not overlap.
 326  */
 327 static __rte_always_inline void
 328 rte_mov64(uint8_t *dst, const uint8_t *src)
 329 {
 330         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 331         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 332 }
 333
 334 /**
 335  * Copy 128 bytes from one location to another,
 336  * locations should not overlap.
 337  */
 338 static __rte_always_inline void
 339 rte_mov128(uint8_t *dst, const uint8_t *src)
 340 {
 341         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 342         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 343         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 344         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 345 }
 346
 347 /**
 348  * Copy 128-byte blocks from one location to another,
 349  * locations should not overlap.
 350  */
 351 static __rte_always_inline void
 352 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 353 {
 354         __m256i ymm0, ymm1, ymm2, ymm3;
 355
 356         while (n >= 128) {
 357                 ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
 358                                           ((const uint8_t *)src + 0 * 32));
 359                 n -= 128;
 360                 ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
 361                                           ((const uint8_t *)src + 1 * 32));
 362                 ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
 363                                           ((const uint8_t *)src + 2 * 32));
 364                 ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
 365                                           ((const uint8_t *)src + 3 * 32));
 366                 src = (const uint8_t *)src + 128;
 367                 _mm256_storeu_si256((__m256i *)(void *)
 368                                     ((uint8_t *)dst + 0 * 32), ymm0);
 369                 _mm256_storeu_si256((__m256i *)(void *)
 370                                     ((uint8_t *)dst + 1 * 32), ymm1);
 371                 _mm256_storeu_si256((__m256i *)(void *)
 372                                     ((uint8_t *)dst + 2 * 32), ymm2);
 373                 _mm256_storeu_si256((__m256i *)(void *)
 374                                     ((uint8_t *)dst + 3 * 32), ymm3);
 375                 dst = (uint8_t *)dst + 128;
 376         }
 377 }
 378
 379 static __rte_always_inline void *
 380 rte_memcpy_generic(void *dst, const void *src, size_t n)
 381 {
 382         uintptr_t dstu = (uintptr_t)dst;
 383         uintptr_t srcu = (uintptr_t)src;
 384         void *ret = dst;
 385         size_t dstofss;
 386         size_t bits;
 387
 388         /**
 389          * Copy less than 16 bytes
 390          */
 391         if (n < 16) {
 392                 if (n & 0x01) {
 393                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 394                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 395                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 396                 }
 397                 if (n & 0x02) {
 398                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 399                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 400                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 401                 }
 402                 if (n & 0x04) {
 403                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 404                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 405                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 406                 }
 407                 if (n & 0x08) {
 408                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 409                 }
 410                 return ret;
 411         }
 412
 413         /**
 414          * Fast way when copy size doesn't exceed 256 bytes
 415          */
 416         if (n <= 32) {
 417                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 418                 rte_mov16((uint8_t *)dst - 16 + n,
 419                                 (const uint8_t *)src - 16 + n);
 420                 return ret;
 421         }
 422         if (n <= 48) {
 423                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 424                 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 425                 rte_mov16((uint8_t *)dst - 16 + n,
 426                                 (const uint8_t *)src - 16 + n);
 427                 return ret;
 428         }
 429         if (n <= 64) {
 430                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 431                 rte_mov32((uint8_t *)dst - 32 + n,
 432                                 (const uint8_t *)src - 32 + n);
 433                 return ret;
 434         }
 435         if (n <= 256) {
 436                 if (n >= 128) {
 437                         n -= 128;
 438                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 439                         src = (const uint8_t *)src + 128;
 440                         dst = (uint8_t *)dst + 128;
 441                 }
 442 COPY_BLOCK_128_BACK31:
 443                 if (n >= 64) {
 444                         n -= 64;
 445                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 446                         src = (const uint8_t *)src + 64;
 447                         dst = (uint8_t *)dst + 64;
 448                 }
 449                 if (n > 32) {
 450                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 451                         rte_mov32((uint8_t *)dst - 32 + n,
 452                                         (const uint8_t *)src - 32 + n);
 453                         return ret;
 454                 }
 455                 if (n > 0) {
 456                         rte_mov32((uint8_t *)dst - 32 + n,
 457                                         (const uint8_t *)src - 32 + n);
 458                 }
 459                 return ret;
 460         }
 461
 462         /**
 463          * Make store aligned when copy size exceeds 256 bytes
 464          */
 465         dstofss = (uintptr_t)dst & 0x1F;
 466         if (dstofss > 0) {
 467                 dstofss = 32 - dstofss;
 468                 n -= dstofss;
 469                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 470                 src = (const uint8_t *)src + dstofss;
 471                 dst = (uint8_t *)dst + dstofss;
 472         }
 473
 474         /**
 475          * Copy 128-byte blocks
 476          */
 477         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 478         bits = n;
 479         n = n & 127;
 480         bits -= n;
 481         src = (const uint8_t *)src + bits;
 482         dst = (uint8_t *)dst + bits;
 483
 484         /**
 485          * Copy whatever left
 486          */
 487         goto COPY_BLOCK_128_BACK31;
 488 }
 489
 490 #else /* __AVX512F__ */
 491
 492 #define ALIGNMENT_MASK 0x0F
 493
 494 /**
 495  * SSE & AVX implementation below
 496  */
 497
 498 /**
 499  * Copy 16 bytes from one location to another,
 500  * locations should not overlap.
 501  */
 502 static __rte_always_inline void
 503 rte_mov16(uint8_t *dst, const uint8_t *src)
 504 {
 505         __m128i xmm0;
 506
 507         xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
 508         _mm_storeu_si128((__m128i *)(void *)dst, xmm0);
 509 }
 510
 511 /**
 512  * Copy 32 bytes from one location to another,
 513  * locations should not overlap.
 514  */
 515 static __rte_always_inline void
 516 rte_mov32(uint8_t *dst, const uint8_t *src)
 517 {
 518         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 519         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 520 }
 521
 522 /**
 523  * Copy 64 bytes from one location to another,
 524  * locations should not overlap.
 525  */
 526 static __rte_always_inline void
 527 rte_mov64(uint8_t *dst, const uint8_t *src)
 528 {
 529         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 530         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 531         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 532         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 533 }
 534
 535 /**
 536  * Copy 128 bytes from one location to another,
 537  * locations should not overlap.
 538  */
 539 static __rte_always_inline void
 540 rte_mov128(uint8_t *dst, const uint8_t *src)
 541 {
 542         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 543         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 544         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 545         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 546         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 547         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 548         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 549         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 550 }
 551
 552 /**
 553  * Copy 256 bytes from one location to another,
 554  * locations should not overlap.
 555  */
 556 static inline void
 557 rte_mov256(uint8_t *dst, const uint8_t *src)
 558 {
 559         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 560         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 561         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 562         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 563         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 564         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 565         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 566         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 567         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 568         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 569         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 570         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 571         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 572         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 573         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 574         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 575 }
 576
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 *
 * Every 16-byte store is assembled from two neighbouring 16-byte loads taken
 * at <src> - <offset>, combined with _mm_alignr_epi8(next, current, <offset>).
 * The first loop moves 128 bytes per iteration (nine loads, eight stores);
 * the second loop moves 32 bytes per iteration (three loads, two stores).
 * <dst>, <src> and <len> are modified in place: the pointers are advanced
 * past the copied data and <len> is left holding the bytes still to copy,
 * which is below 32 + 16 - <offset> (hence at most 47).
 *
 * NOTE(review): after each loop <len> is recomputed with a mask and the
 * pointers are adjusted by the difference; when <len> is at least
 * 16 - <offset> at that point this yields the value the loop already left
 * behind - confirm the intent before simplifying.
 *
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables (they are assigned to and expanded many times)
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
__extension__ ({                                                                                            \
    size_t tmp;                                                                                                \
    while (len >= 128 + 16 - offset) {                                                                      \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));                  \
        len -= 128;                                                                                         \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));                  \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));                  \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16));                  \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16));                  \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16));                  \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16));                  \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16));                  \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16));                  \
        src = (const uint8_t *)src + 128;                                                                   \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
        dst = (uint8_t *)dst + 128;                                                                         \
    }                                                                                                       \
    tmp = len;                                                                                              \
    len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
    tmp -= len;                                                                                             \
    src = (const uint8_t *)src + tmp;                                                                       \
    dst = (uint8_t *)dst + tmp;                                                                             \
    if (len >= 32 + 16 - offset) {                                                                          \
        while (len >= 32 + 16 - offset) {                                                                   \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));              \
            len -= 32;                                                                                      \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));              \
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));              \
            src = (const uint8_t *)src + 32;                                                                \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
            dst = (uint8_t *)dst + 32;                                                                      \
        }                                                                                                   \
        tmp = len;                                                                                          \
        len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
        tmp -= len;                                                                                         \
        src = (const uint8_t *)src + tmp;                                                                   \
        dst = (uint8_t *)dst + tmp;                                                                         \
    }                                                                                                       \
})
 636
 637 /**
 638  * Macro for copying unaligned block from one location to another,
 639  * 47 bytes leftover maximum,
 640  * locations should not overlap.
 641  * Use switch here because the aligning instruction requires immediate value for shift count.
 642  * Requirements:
 643  * - Store is aligned
 644  * - Load offset is <offset>, which must be within [1, 15]
 645  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 646  * - <dst>, <src>, <len> must be variables
 647  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 648  */
 649 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 650 __extension__ ({                                                      \
 651     switch (offset) {                                                 \
 652     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 653     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 654     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 655     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 656     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 657     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 658     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 659     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 660     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 661     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 662     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 663     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 664     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 665     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 666     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 667     default:;                                                         \
 668     }                                                                 \
 669 })
 670
 671 static __rte_always_inline void *
 672 rte_memcpy_generic(void *dst, const void *src, size_t n)
 673 {
 674         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 675         uintptr_t dstu = (uintptr_t)dst;
 676         uintptr_t srcu = (uintptr_t)src;
 677         void *ret = dst;
 678         size_t dstofss;
 679         size_t srcofs;
 680
 681         /**
 682          * Copy less than 16 bytes
 683          */
 684         if (n < 16) {
 685                 if (n & 0x01) {
 686                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 687                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 688                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 689                 }
 690                 if (n & 0x02) {
 691                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 692                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 693                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 694                 }
 695                 if (n & 0x04) {
 696                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 697                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 698                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 699                 }
 700                 if (n & 0x08) {
 701                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 702                 }
 703                 return ret;
 704         }
 705
 706         /**
 707          * Fast way when copy size doesn't exceed 512 bytes
 708          */
 709         if (n <= 32) {
 710                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 711                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 712                 return ret;
 713         }
 714         if (n <= 48) {
 715                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 716                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 717                 return ret;
 718         }
 719         if (n <= 64) {
 720                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 721                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 722                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 723                 return ret;
 724         }
 725         if (n <= 128) {
 726                 goto COPY_BLOCK_128_BACK15;
 727         }
 728         if (n <= 512) {
 729                 if (n >= 256) {
 730                         n -= 256;
 731                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 732                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 733                         src = (const uint8_t *)src + 256;
 734                         dst = (uint8_t *)dst + 256;
 735                 }
 736 COPY_BLOCK_255_BACK15:
 737                 if (n >= 128) {
 738                         n -= 128;
 739                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 740                         src = (const uint8_t *)src + 128;
 741                         dst = (uint8_t *)dst + 128;
 742                 }
 743 COPY_BLOCK_128_BACK15:
 744                 if (n >= 64) {
 745                         n -= 64;
 746                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 747                         src = (const uint8_t *)src + 64;
 748                         dst = (uint8_t *)dst + 64;
 749                 }
 750 COPY_BLOCK_64_BACK15:
 751                 if (n >= 32) {
 752                         n -= 32;
 753                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 754                         src = (const uint8_t *)src + 32;
 755                         dst = (uint8_t *)dst + 32;
 756                 }
 757                 if (n > 16) {
 758                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 759                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 760                         return ret;
 761                 }
 762                 if (n > 0) {
 763                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 764                 }
 765                 return ret;
 766         }
 767
 768         /**
 769          * Make store aligned when copy size exceeds 512 bytes,
 770          * and make sure the first 15 bytes are copied, because
 771          * unaligned copy functions require up to 15 bytes
 772          * backwards access.
 773          */
 774         dstofss = (uintptr_t)dst & 0x0F;
 775         if (dstofss > 0) {
 776                 dstofss = 16 - dstofss + 16;
 777                 n -= dstofss;
 778                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 779                 src = (const uint8_t *)src + dstofss;
 780                 dst = (uint8_t *)dst + dstofss;
 781         }
 782         srcofs = ((uintptr_t)src & 0x0F);
 783
 784         /**
 785          * For aligned copy
 786          */
 787         if (srcofs == 0) {
 788                 /**
 789                  * Copy 256-byte blocks
 790                  */
 791                 for (; n >= 256; n -= 256) {
 792                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 793                         dst = (uint8_t *)dst + 256;
 794                         src = (const uint8_t *)src + 256;
 795                 }
 796
 797                 /**
 798                  * Copy whatever left
 799                  */
 800                 goto COPY_BLOCK_255_BACK15;
 801         }
 802
 803         /**
 804          * For copy with unaligned load
 805          */
 806         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 807
 808         /**
 809          * Copy whatever left
 810          */
 811         goto COPY_BLOCK_64_BACK15;
 812 }
 813
 814 #endif /* __AVX512F__ */
 815
 816 static __rte_always_inline void *
 817 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 818 {
 819         void *ret = dst;
 820
 821         /* Copy size <= 16 bytes */
 822         if (n < 16) {
 823                 if (n & 0x01) {
 824                         *(uint8_t *)dst = *(const uint8_t *)src;
 825                         src = (const uint8_t *)src + 1;
 826                         dst = (uint8_t *)dst + 1;
 827                 }
 828                 if (n & 0x02) {
 829                         *(uint16_t *)dst = *(const uint16_t *)src;
 830                         src = (const uint16_t *)src + 1;
 831                         dst = (uint16_t *)dst + 1;
 832                 }
 833                 if (n & 0x04) {
 834                         *(uint32_t *)dst = *(const uint32_t *)src;
 835                         src = (const uint32_t *)src + 1;
 836                         dst = (uint32_t *)dst + 1;
 837                 }
 838                 if (n & 0x08)
 839                         *(uint64_t *)dst = *(const uint64_t *)src;
 840
 841                 return ret;
 842         }
 843
 844         /* Copy 16 <= size <= 32 bytes */
 845         if (n <= 32) {
 846                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 847                 rte_mov16((uint8_t *)dst - 16 + n,
 848                                 (const uint8_t *)src - 16 + n);
 849
 850                 return ret;
 851         }
 852
 853         /* Copy 32 < size <= 64 bytes */
 854         if (n <= 64) {
 855                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 856                 rte_mov32((uint8_t *)dst - 32 + n,
 857                                 (const uint8_t *)src - 32 + n);
 858
 859                 return ret;
 860         }
 861
 862         /* Copy 64 bytes blocks */
 863         for (; n >= 64; n -= 64) {
 864                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 865                 dst = (uint8_t *)dst + 64;
 866                 src = (const uint8_t *)src + 64;
 867         }
 868
 869         /* Copy whatever left */
 870         rte_mov64((uint8_t *)dst - 64 + n,
 871                         (const uint8_t *)src - 64 + n);
 872
 873         return ret;
 874 }
 875
 876 static __rte_always_inline void *
 877 rte_memcpy(void *dst, const void *src, size_t n)
 878 {
 879         if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
 880                 return rte_memcpy_aligned(dst, src, n);
 881         else
 882                 return rte_memcpy_generic(dst, src, n);
 883 }
 884
 885 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
 886 #pragma GCC diagnostic pop
 887 #endif
 888
 889 #ifdef __cplusplus
 890 }
 891 #endif
 892
 893 #endif /* _RTE_MEMCPY_X86_64_H_ */