lib/librte_eal/common/include/arch/x86/rte_memcpy.h

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2010-2014 Intel Corporation
   3  */
   4
   5 #ifndef _RTE_MEMCPY_X86_64_H_
   6 #define _RTE_MEMCPY_X86_64_H_
   7
   8 /**
   9  * @file
  10  *
  11  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  12  */
  13
  14 #include <stdio.h>
  15 #include <stdint.h>
  16 #include <string.h>
  17 #include <rte_vect.h>
  18 #include <rte_common.h>
  19
  20 #ifdef __cplusplus
  21 extern "C" {
  22 #endif
  23
/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 * NOTE(review): the declaration below is a static always-inline function,
 * not a macro, so this note may be stale -- confirm against the definition.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
  41
  42 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
  43
/*
 * Low six address bits (0x3F == 64 - 1), i.e. the byte offset inside a
 * 64-byte unit. Not referenced in the visible part of this branch;
 * presumably consumed by rte_memcpy() further down -- confirm.
 */
#define ALIGNMENT_MASK 0x3F
  45
  46 /**
  47  * AVX512 implementation below
  48  */
  49
  50 /**
  51  * Copy 16 bytes from one location to another,
  52  * locations should not overlap.
  53  */
  54 static inline void
  55 rte_mov16(uint8_t *dst, const uint8_t *src)
  56 {
  57         __m128i xmm0;
  58
  59         xmm0 = _mm_loadu_si128((const __m128i *)src);
  60         _mm_storeu_si128((__m128i *)dst, xmm0);
  61 }
  62
  63 /**
  64  * Copy 32 bytes from one location to another,
  65  * locations should not overlap.
  66  */
  67 static inline void
  68 rte_mov32(uint8_t *dst, const uint8_t *src)
  69 {
  70         __m256i ymm0;
  71
  72         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  73         _mm256_storeu_si256((__m256i *)dst, ymm0);
  74 }
  75
  76 /**
  77  * Copy 64 bytes from one location to another,
  78  * locations should not overlap.
  79  */
  80 static inline void
  81 rte_mov64(uint8_t *dst, const uint8_t *src)
  82 {
  83         __m512i zmm0;
  84
  85         zmm0 = _mm512_loadu_si512((const void *)src);
  86         _mm512_storeu_si512((void *)dst, zmm0);
  87 }
  88
  89 /**
  90  * Copy 128 bytes from one location to another,
  91  * locations should not overlap.
  92  */
  93 static inline void
  94 rte_mov128(uint8_t *dst, const uint8_t *src)
  95 {
  96         rte_mov64(dst + 0 * 64, src + 0 * 64);
  97         rte_mov64(dst + 1 * 64, src + 1 * 64);
  98 }
  99
 100 /**
 101  * Copy 256 bytes from one location to another,
 102  * locations should not overlap.
 103  */
 104 static inline void
 105 rte_mov256(uint8_t *dst, const uint8_t *src)
 106 {
 107         rte_mov64(dst + 0 * 64, src + 0 * 64);
 108         rte_mov64(dst + 1 * 64, src + 1 * 64);
 109         rte_mov64(dst + 2 * 64, src + 2 * 64);
 110         rte_mov64(dst + 3 * 64, src + 3 * 64);
 111 }
 112
 113 /**
 114  * Copy 128-byte blocks from one location to another,
 115  * locations should not overlap.
 116  */
 117 static inline void
 118 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 119 {
 120         __m512i zmm0, zmm1;
 121
 122         while (n >= 128) {
 123                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 124                 n -= 128;
 125                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 126                 src = src + 128;
 127                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 128                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 129                 dst = dst + 128;
 130         }
 131 }
 132
 133 /**
 134  * Copy 512-byte blocks from one location to another,
 135  * locations should not overlap.
 136  */
 137 static inline void
 138 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 139 {
 140         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 141
 142         while (n >= 512) {
 143                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 144                 n -= 512;
 145                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 146                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 147                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 148                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 149                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 150                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 151                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 152                 src = src + 512;
 153                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 154                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 155                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 156                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 157                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 158                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 159                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 160                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 161                 dst = dst + 512;
 162         }
 163 }
 164
 165 static inline void *
 166 rte_memcpy_generic(void *dst, const void *src, size_t n)
 167 {
 168         uintptr_t dstu = (uintptr_t)dst;
 169         uintptr_t srcu = (uintptr_t)src;
 170         void *ret = dst;
 171         size_t dstofss;
 172         size_t bits;
 173
 174         /**
 175          * Copy less than 16 bytes
 176          */
 177         if (n < 16) {
 178                 if (n & 0x01) {
 179                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 180                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 181                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 182                 }
 183                 if (n & 0x02) {
 184                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 185                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 186                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 187                 }
 188                 if (n & 0x04) {
 189                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 190                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 191                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 192                 }
 193                 if (n & 0x08)
 194                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 195                 return ret;
 196         }
 197
 198         /**
 199          * Fast way when copy size doesn't exceed 512 bytes
 200          */
 201         if (n <= 32) {
 202                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 203                 rte_mov16((uint8_t *)dst - 16 + n,
 204                                   (const uint8_t *)src - 16 + n);
 205                 return ret;
 206         }
 207         if (n <= 64) {
 208                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 209                 rte_mov32((uint8_t *)dst - 32 + n,
 210                                   (const uint8_t *)src - 32 + n);
 211                 return ret;
 212         }
 213         if (n <= 512) {
 214                 if (n >= 256) {
 215                         n -= 256;
 216                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 217                         src = (const uint8_t *)src + 256;
 218                         dst = (uint8_t *)dst + 256;
 219                 }
 220                 if (n >= 128) {
 221                         n -= 128;
 222                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 223                         src = (const uint8_t *)src + 128;
 224                         dst = (uint8_t *)dst + 128;
 225                 }
 226 COPY_BLOCK_128_BACK63:
 227                 if (n > 64) {
 228                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 229                         rte_mov64((uint8_t *)dst - 64 + n,
 230                                           (const uint8_t *)src - 64 + n);
 231                         return ret;
 232                 }
 233                 if (n > 0)
 234                         rte_mov64((uint8_t *)dst - 64 + n,
 235                                           (const uint8_t *)src - 64 + n);
 236                 return ret;
 237         }
 238
 239         /**
 240          * Make store aligned when copy size exceeds 512 bytes
 241          */
 242         dstofss = ((uintptr_t)dst & 0x3F);
 243         if (dstofss > 0) {
 244                 dstofss = 64 - dstofss;
 245                 n -= dstofss;
 246                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 247                 src = (const uint8_t *)src + dstofss;
 248                 dst = (uint8_t *)dst + dstofss;
 249         }
 250
 251         /**
 252          * Copy 512-byte blocks.
 253          * Use copy block function for better instruction order control,
 254          * which is important when load is unaligned.
 255          */
 256         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 257         bits = n;
 258         n = n & 511;
 259         bits -= n;
 260         src = (const uint8_t *)src + bits;
 261         dst = (uint8_t *)dst + bits;
 262
 263         /**
 264          * Copy 128-byte blocks.
 265          * Use copy block function for better instruction order control,
 266          * which is important when load is unaligned.
 267          */
 268         if (n >= 128) {
 269                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 270                 bits = n;
 271                 n = n & 127;
 272                 bits -= n;
 273                 src = (const uint8_t *)src + bits;
 274                 dst = (uint8_t *)dst + bits;
 275         }
 276
 277         /**
 278          * Copy whatever left
 279          */
 280         goto COPY_BLOCK_128_BACK63;
 281 }
 282
 283 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 284
/*
 * Low five address bits (0x1F == 32 - 1), i.e. the byte offset inside a
 * 32-byte unit. Not referenced in the visible part of this branch;
 * presumably consumed by rte_memcpy() further down -- confirm.
 */
#define ALIGNMENT_MASK 0x1F
 286
 287 /**
 288  * AVX2 implementation below
 289  */
 290
 291 /**
 292  * Copy 16 bytes from one location to another,
 293  * locations should not overlap.
 294  */
 295 static inline void
 296 rte_mov16(uint8_t *dst, const uint8_t *src)
 297 {
 298         __m128i xmm0;
 299
 300         xmm0 = _mm_loadu_si128((const __m128i *)src);
 301         _mm_storeu_si128((__m128i *)dst, xmm0);
 302 }
 303
 304 /**
 305  * Copy 32 bytes from one location to another,
 306  * locations should not overlap.
 307  */
 308 static inline void
 309 rte_mov32(uint8_t *dst, const uint8_t *src)
 310 {
 311         __m256i ymm0;
 312
 313         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 314         _mm256_storeu_si256((__m256i *)dst, ymm0);
 315 }
 316
 317 /**
 318  * Copy 64 bytes from one location to another,
 319  * locations should not overlap.
 320  */
 321 static inline void
 322 rte_mov64(uint8_t *dst, const uint8_t *src)
 323 {
 324         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 325         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 326 }
 327
 328 /**
 329  * Copy 128 bytes from one location to another,
 330  * locations should not overlap.
 331  */
 332 static inline void
 333 rte_mov128(uint8_t *dst, const uint8_t *src)
 334 {
 335         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 336         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 337         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 338         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 339 }
 340
 341 /**
 342  * Copy 128-byte blocks from one location to another,
 343  * locations should not overlap.
 344  */
 345 static inline void
 346 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 347 {
 348         __m256i ymm0, ymm1, ymm2, ymm3;
 349
 350         while (n >= 128) {
 351                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 352                 n -= 128;
 353                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 354                 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 355                 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 356                 src = (const uint8_t *)src + 128;
 357                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 358                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 359                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 360                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 361                 dst = (uint8_t *)dst + 128;
 362         }
 363 }
 364
 365 static inline void *
 366 rte_memcpy_generic(void *dst, const void *src, size_t n)
 367 {
 368         uintptr_t dstu = (uintptr_t)dst;
 369         uintptr_t srcu = (uintptr_t)src;
 370         void *ret = dst;
 371         size_t dstofss;
 372         size_t bits;
 373
 374         /**
 375          * Copy less than 16 bytes
 376          */
 377         if (n < 16) {
 378                 if (n & 0x01) {
 379                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 380                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 381                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 382                 }
 383                 if (n & 0x02) {
 384                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 385                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 386                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 387                 }
 388                 if (n & 0x04) {
 389                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 390                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 391                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 392                 }
 393                 if (n & 0x08) {
 394                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 395                 }
 396                 return ret;
 397         }
 398
 399         /**
 400          * Fast way when copy size doesn't exceed 256 bytes
 401          */
 402         if (n <= 32) {
 403                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 404                 rte_mov16((uint8_t *)dst - 16 + n,
 405                                 (const uint8_t *)src - 16 + n);
 406                 return ret;
 407         }
 408         if (n <= 48) {
 409                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 410                 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
 411                 rte_mov16((uint8_t *)dst - 16 + n,
 412                                 (const uint8_t *)src - 16 + n);
 413                 return ret;
 414         }
 415         if (n <= 64) {
 416                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 417                 rte_mov32((uint8_t *)dst - 32 + n,
 418                                 (const uint8_t *)src - 32 + n);
 419                 return ret;
 420         }
 421         if (n <= 256) {
 422                 if (n >= 128) {
 423                         n -= 128;
 424                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 425                         src = (const uint8_t *)src + 128;
 426                         dst = (uint8_t *)dst + 128;
 427                 }
 428 COPY_BLOCK_128_BACK31:
 429                 if (n >= 64) {
 430                         n -= 64;
 431                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 432                         src = (const uint8_t *)src + 64;
 433                         dst = (uint8_t *)dst + 64;
 434                 }
 435                 if (n > 32) {
 436                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 437                         rte_mov32((uint8_t *)dst - 32 + n,
 438                                         (const uint8_t *)src - 32 + n);
 439                         return ret;
 440                 }
 441                 if (n > 0) {
 442                         rte_mov32((uint8_t *)dst - 32 + n,
 443                                         (const uint8_t *)src - 32 + n);
 444                 }
 445                 return ret;
 446         }
 447
 448         /**
 449          * Make store aligned when copy size exceeds 256 bytes
 450          */
 451         dstofss = (uintptr_t)dst & 0x1F;
 452         if (dstofss > 0) {
 453                 dstofss = 32 - dstofss;
 454                 n -= dstofss;
 455                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 456                 src = (const uint8_t *)src + dstofss;
 457                 dst = (uint8_t *)dst + dstofss;
 458         }
 459
 460         /**
 461          * Copy 128-byte blocks
 462          */
 463         rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 464         bits = n;
 465         n = n & 127;
 466         bits -= n;
 467         src = (const uint8_t *)src + bits;
 468         dst = (uint8_t *)dst + bits;
 469
 470         /**
 471          * Copy whatever left
 472          */
 473         goto COPY_BLOCK_128_BACK31;
 474 }
 475
 476 #else /* RTE_MACHINE_CPUFLAG */
 477
/*
 * Low four address bits (0x0F == 16 - 1), i.e. the byte offset inside a
 * 16-byte unit. Not referenced in the visible part of this branch;
 * presumably consumed by rte_memcpy() further down -- confirm.
 */
#define ALIGNMENT_MASK 0x0F
 479
 480 /**
 481  * SSE & AVX implementation below
 482  */
 483
 484 /**
 485  * Copy 16 bytes from one location to another,
 486  * locations should not overlap.
 487  */
 488 static inline void
 489 rte_mov16(uint8_t *dst, const uint8_t *src)
 490 {
 491         __m128i xmm0;
 492
 493         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 494         _mm_storeu_si128((__m128i *)dst, xmm0);
 495 }
 496
 497 /**
 498  * Copy 32 bytes from one location to another,
 499  * locations should not overlap.
 500  */
 501 static inline void
 502 rte_mov32(uint8_t *dst, const uint8_t *src)
 503 {
 504         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 505         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 506 }
 507
 508 /**
 509  * Copy 64 bytes from one location to another,
 510  * locations should not overlap.
 511  */
 512 static inline void
 513 rte_mov64(uint8_t *dst, const uint8_t *src)
 514 {
 515         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 516         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 517         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 518         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 519 }
 520
 521 /**
 522  * Copy 128 bytes from one location to another,
 523  * locations should not overlap.
 524  */
 525 static inline void
 526 rte_mov128(uint8_t *dst, const uint8_t *src)
 527 {
 528         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 529         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 530         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 531         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 532         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 533         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 534         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 535         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 536 }
 537
 538 /**
 539  * Copy 256 bytes from one location to another,
 540  * locations should not overlap.
 541  */
 542 static inline void
 543 rte_mov256(uint8_t *dst, const uint8_t *src)
 544 {
 545         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 546         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 547         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 548         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 549         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 550         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 551         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 552         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 553         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 554         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 555         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 556         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 557         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 558         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 559         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 560         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 561 }
 562
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 *
 * Every load starts <offset> bytes in front of <src>; each stored 16-byte
 * value is assembled from two neighbouring loads with _mm_alignr_epi8().
 * On exit <dst>, <src> and <len> have been advanced past the copied bytes;
 * the leftover described above remains for the caller.
 *
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 *
 * NOTE(review): the macro arguments are expanded without parentheses and
 * several times each, so only plain variable names / literals are safe.
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
__extension__ ({                                                                                            \
    /* NOTE(review): tmp is int while <len> is used with size_t variables -- confirm the narrowing is intended. */ \
    int tmp;                                                                                                \
    /* 128 bytes per iteration: nine overlapping 16-byte loads, then eight 16-byte stores. */               \
    while (len >= 128 + 16 - offset) {                                                                      \
        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
        len -= 128;                                                                                         \
        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
        xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
        xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
        xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
        xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
        xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
        xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
        src = (const uint8_t *)src + 128;                                                                   \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
        dst = (uint8_t *)dst + 128;                                                                         \
    }                                                                                                       \
    /* Reduce len modulo 128 (biased by 16 - offset) and advance src/dst by the difference. */              \
    /* When 16 - offset <= len here, the loop exit condition already makes this difference 0. */            \
    tmp = len;                                                                                              \
    len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
    tmp -= len;                                                                                             \
    src = (const uint8_t *)src + tmp;                                                                       \
    dst = (uint8_t *)dst + tmp;                                                                             \
    /* Same scheme at 32-byte granularity: three loads, two stores per iteration. */                        \
    if (len >= 32 + 16 - offset) {                                                                          \
        while (len >= 32 + 16 - offset) {                                                                   \
            xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
            len -= 32;                                                                                      \
            xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
            xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
            src = (const uint8_t *)src + 32;                                                                \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
            dst = (uint8_t *)dst + 32;                                                                      \
        }                                                                                                   \
        tmp = len;                                                                                          \
        len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
        tmp -= len;                                                                                         \
        src = (const uint8_t *)src + tmp;                                                                   \
        dst = (uint8_t *)dst + tmp;                                                                         \
    }                                                                                                       \
})
 622
 623 /**
 624  * Macro for copying unaligned block from one location to another,
 625  * 47 bytes leftover maximum,
 626  * locations should not overlap.
 627  * Use switch here because the aligning instruction requires immediate value for shift count.
 628  * Requirements:
 629  * - Store is aligned
 630  * - Load offset is <offset>, which must be within [1, 15]
 631  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 632  * - <dst>, <src>, <len> must be variables
 633  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 634  */
 635 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 636 __extension__ ({                                                      \
 637     switch (offset) {                                                 \
 638     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 639     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 640     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 641     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 642     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 643     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 644     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 645     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 646     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 647     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 648     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 649     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 650     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 651     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 652     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 653     default:;                                                         \
 654     }                                                                 \
 655 })
 656
 657 static inline void *
 658 rte_memcpy_generic(void *dst, const void *src, size_t n)
 659 {
 660         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 661         uintptr_t dstu = (uintptr_t)dst;
 662         uintptr_t srcu = (uintptr_t)src;
 663         void *ret = dst;
 664         size_t dstofss;
 665         size_t srcofs;
 666
 667         /**
 668          * Copy less than 16 bytes
 669          */
 670         if (n < 16) {
 671                 if (n & 0x01) {
 672                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 673                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 674                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 675                 }
 676                 if (n & 0x02) {
 677                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 678                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 679                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 680                 }
 681                 if (n & 0x04) {
 682                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 683                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 684                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 685                 }
 686                 if (n & 0x08) {
 687                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 688                 }
 689                 return ret;
 690         }
 691
 692         /**
 693          * Fast way when copy size doesn't exceed 512 bytes
 694          */
 695         if (n <= 32) {
 696                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 697                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 698                 return ret;
 699         }
 700         if (n <= 48) {
 701                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 702                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 703                 return ret;
 704         }
 705         if (n <= 64) {
 706                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 707                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 708                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 709                 return ret;
 710         }
 711         if (n <= 128) {
 712                 goto COPY_BLOCK_128_BACK15;
 713         }
 714         if (n <= 512) {
 715                 if (n >= 256) {
 716                         n -= 256;
 717                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 718                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 719                         src = (const uint8_t *)src + 256;
 720                         dst = (uint8_t *)dst + 256;
 721                 }
 722 COPY_BLOCK_255_BACK15:
 723                 if (n >= 128) {
 724                         n -= 128;
 725                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 726                         src = (const uint8_t *)src + 128;
 727                         dst = (uint8_t *)dst + 128;
 728                 }
 729 COPY_BLOCK_128_BACK15:
 730                 if (n >= 64) {
 731                         n -= 64;
 732                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 733                         src = (const uint8_t *)src + 64;
 734                         dst = (uint8_t *)dst + 64;
 735                 }
 736 COPY_BLOCK_64_BACK15:
 737                 if (n >= 32) {
 738                         n -= 32;
 739                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 740                         src = (const uint8_t *)src + 32;
 741                         dst = (uint8_t *)dst + 32;
 742                 }
 743                 if (n > 16) {
 744                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 745                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 746                         return ret;
 747                 }
 748                 if (n > 0) {
 749                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 750                 }
 751                 return ret;
 752         }
 753
 754         /**
 755          * Make store aligned when copy size exceeds 512 bytes,
 756          * and make sure the first 15 bytes are copied, because
 757          * unaligned copy functions require up to 15 bytes
 758          * backwards access.
 759          */
 760         dstofss = (uintptr_t)dst & 0x0F;
 761         if (dstofss > 0) {
 762                 dstofss = 16 - dstofss + 16;
 763                 n -= dstofss;
 764                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 765                 src = (const uint8_t *)src + dstofss;
 766                 dst = (uint8_t *)dst + dstofss;
 767         }
 768         srcofs = ((uintptr_t)src & 0x0F);
 769
 770         /**
 771          * For aligned copy
 772          */
 773         if (srcofs == 0) {
 774                 /**
 775                  * Copy 256-byte blocks
 776                  */
 777                 for (; n >= 256; n -= 256) {
 778                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 779                         dst = (uint8_t *)dst + 256;
 780                         src = (const uint8_t *)src + 256;
 781                 }
 782
 783                 /**
 784                  * Copy whatever left
 785                  */
 786                 goto COPY_BLOCK_255_BACK15;
 787         }
 788
 789         /**
 790          * For copy with unaligned load
 791          */
 792         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 793
 794         /**
 795          * Copy whatever left
 796          */
 797         goto COPY_BLOCK_64_BACK15;
 798 }
 799
 800 #endif /* RTE_MACHINE_CPUFLAG */
 801
 802 static inline void *
 803 rte_memcpy_aligned(void *dst, const void *src, size_t n)
 804 {
 805         void *ret = dst;
 806
 807         /* Copy size <= 16 bytes */
 808         if (n < 16) {
 809                 if (n & 0x01) {
 810                         *(uint8_t *)dst = *(const uint8_t *)src;
 811                         src = (const uint8_t *)src + 1;
 812                         dst = (uint8_t *)dst + 1;
 813                 }
 814                 if (n & 0x02) {
 815                         *(uint16_t *)dst = *(const uint16_t *)src;
 816                         src = (const uint16_t *)src + 1;
 817                         dst = (uint16_t *)dst + 1;
 818                 }
 819                 if (n & 0x04) {
 820                         *(uint32_t *)dst = *(const uint32_t *)src;
 821                         src = (const uint32_t *)src + 1;
 822                         dst = (uint32_t *)dst + 1;
 823                 }
 824                 if (n & 0x08)
 825                         *(uint64_t *)dst = *(const uint64_t *)src;
 826
 827                 return ret;
 828         }
 829
 830         /* Copy 16 <= size <= 32 bytes */
 831         if (n <= 32) {
 832                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 833                 rte_mov16((uint8_t *)dst - 16 + n,
 834                                 (const uint8_t *)src - 16 + n);
 835
 836                 return ret;
 837         }
 838
 839         /* Copy 32 < size <= 64 bytes */
 840         if (n <= 64) {
 841                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 842                 rte_mov32((uint8_t *)dst - 32 + n,
 843                                 (const uint8_t *)src - 32 + n);
 844
 845                 return ret;
 846         }
 847
 848         /* Copy 64 bytes blocks */
 849         for (; n >= 64; n -= 64) {
 850                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 851                 dst = (uint8_t *)dst + 64;
 852                 src = (const uint8_t *)src + 64;
 853         }
 854
 855         /* Copy whatever left */
 856         rte_mov64((uint8_t *)dst - 64 + n,
 857                         (const uint8_t *)src - 64 + n);
 858
 859         return ret;
 860 }
 861
 862 static inline void *
 863 rte_memcpy(void *dst, const void *src, size_t n)
 864 {
 865         if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
 866                 return rte_memcpy_aligned(dst, src, n);
 867         else
 868                 return rte_memcpy_generic(dst, src, n);
 869 }
 870
 871 #ifdef __cplusplus
 872 }
 873 #endif
 874
 875 #endif /* _RTE_MEMCPY_X86_64_H_ */