lib/librte_eal/common/include/arch/x86/rte_memcpy.h

   1 /*-
   2  *   BSD LICENSE
   3  *
   4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
   5  *   All rights reserved.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following conditions
   9  *   are met:
  10  *
  11  *     * Redistributions of source code must retain the above copyright
  12  *       notice, this list of conditions and the following disclaimer.
  13  *     * Redistributions in binary form must reproduce the above copyright
  14  *       notice, this list of conditions and the following disclaimer in
  15  *       the documentation and/or other materials provided with the
  16  *       distribution.
  17  *     * Neither the name of Intel Corporation nor the names of its
  18  *       contributors may be used to endorse or promote products derived
  19  *       from this software without specific prior written permission.
  20  *
  21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #ifndef _RTE_MEMCPY_X86_64_H_
  35 #define _RTE_MEMCPY_X86_64_H_
  36
  37 /**
  38  * @file
  39  *
  40  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  41  */
  42
  43 #include <stdio.h>
  44 #include <stdint.h>
  45 #include <string.h>
  46 #include <rte_vect.h>
  47
  48 #ifdef __cplusplus
  49 extern "C" {
  50 #endif
  51
/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 * NOTE(review): the prototype below declares a static inline function with
 * the always_inline attribute rather than a macro -- confirm whether the
 * macro remark still applies to this header.
 *
 * The prototype only attaches the always_inline attribute; the definition
 * is provided by one of the instruction-set specific sections that follow.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
  69
  70 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
  71
  72 /**
  73  * AVX512 implementation below
  74  */
  75
  76 /**
  77  * Copy 16 bytes from one location to another,
  78  * locations should not overlap.
  79  */
  80 static inline void
  81 rte_mov16(uint8_t *dst, const uint8_t *src)
  82 {
  83         __m128i xmm0;
  84
  85         xmm0 = _mm_loadu_si128((const __m128i *)src);
  86         _mm_storeu_si128((__m128i *)dst, xmm0);
  87 }
  88
  89 /**
  90  * Copy 32 bytes from one location to another,
  91  * locations should not overlap.
  92  */
  93 static inline void
  94 rte_mov32(uint8_t *dst, const uint8_t *src)
  95 {
  96         __m256i ymm0;
  97
  98         ymm0 = _mm256_loadu_si256((const __m256i *)src);
  99         _mm256_storeu_si256((__m256i *)dst, ymm0);
 100 }
 101
 102 /**
 103  * Copy 64 bytes from one location to another,
 104  * locations should not overlap.
 105  */
 106 static inline void
 107 rte_mov64(uint8_t *dst, const uint8_t *src)
 108 {
 109         __m512i zmm0;
 110
 111         zmm0 = _mm512_loadu_si512((const void *)src);
 112         _mm512_storeu_si512((void *)dst, zmm0);
 113 }
 114
 115 /**
 116  * Copy 128 bytes from one location to another,
 117  * locations should not overlap.
 118  */
 119 static inline void
 120 rte_mov128(uint8_t *dst, const uint8_t *src)
 121 {
 122         rte_mov64(dst + 0 * 64, src + 0 * 64);
 123         rte_mov64(dst + 1 * 64, src + 1 * 64);
 124 }
 125
 126 /**
 127  * Copy 256 bytes from one location to another,
 128  * locations should not overlap.
 129  */
 130 static inline void
 131 rte_mov256(uint8_t *dst, const uint8_t *src)
 132 {
 133         rte_mov64(dst + 0 * 64, src + 0 * 64);
 134         rte_mov64(dst + 1 * 64, src + 1 * 64);
 135         rte_mov64(dst + 2 * 64, src + 2 * 64);
 136         rte_mov64(dst + 3 * 64, src + 3 * 64);
 137 }
 138
 139 /**
 140  * Copy 128-byte blocks from one location to another,
 141  * locations should not overlap.
 142  */
 143 static inline void
 144 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 145 {
 146         __m512i zmm0, zmm1;
 147
 148         while (n >= 128) {
 149                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 150                 n -= 128;
 151                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 152                 src = src + 128;
 153                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 154                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 155                 dst = dst + 128;
 156         }
 157 }
 158
 159 /**
 160  * Copy 512-byte blocks from one location to another,
 161  * locations should not overlap.
 162  */
 163 static inline void
 164 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 165 {
 166         __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
 167
 168         while (n >= 512) {
 169                 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
 170                 n -= 512;
 171                 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
 172                 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
 173                 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
 174                 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
 175                 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
 176                 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
 177                 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
 178                 src = src + 512;
 179                 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
 180                 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
 181                 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
 182                 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
 183                 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
 184                 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
 185                 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
 186                 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
 187                 dst = dst + 512;
 188         }
 189 }
 190
 191 static inline void *
 192 rte_memcpy(void *dst, const void *src, size_t n)
 193 {
 194         uintptr_t dstu = (uintptr_t)dst;
 195         uintptr_t srcu = (uintptr_t)src;
 196         void *ret = dst;
 197         size_t dstofss;
 198         size_t bits;
 199
 200         /**
 201          * Copy less than 16 bytes
 202          */
 203         if (n < 16) {
 204                 if (n & 0x01) {
 205                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 206                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 207                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 208                 }
 209                 if (n & 0x02) {
 210                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 211                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 212                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 213                 }
 214                 if (n & 0x04) {
 215                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 216                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 217                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 218                 }
 219                 if (n & 0x08)
 220                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 221                 return ret;
 222         }
 223
 224         /**
 225          * Fast way when copy size doesn't exceed 512 bytes
 226          */
 227         if (n <= 32) {
 228                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 229                 rte_mov16((uint8_t *)dst - 16 + n,
 230                                   (const uint8_t *)src - 16 + n);
 231                 return ret;
 232         }
 233         if (n <= 64) {
 234                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 235                 rte_mov32((uint8_t *)dst - 32 + n,
 236                                   (const uint8_t *)src - 32 + n);
 237                 return ret;
 238         }
 239         if (n <= 512) {
 240                 if (n >= 256) {
 241                         n -= 256;
 242                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 243                         src = (const uint8_t *)src + 256;
 244                         dst = (uint8_t *)dst + 256;
 245                 }
 246                 if (n >= 128) {
 247                         n -= 128;
 248                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 249                         src = (const uint8_t *)src + 128;
 250                         dst = (uint8_t *)dst + 128;
 251                 }
 252 COPY_BLOCK_128_BACK63:
 253                 if (n > 64) {
 254                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 255                         rte_mov64((uint8_t *)dst - 64 + n,
 256                                           (const uint8_t *)src - 64 + n);
 257                         return ret;
 258                 }
 259                 if (n > 0)
 260                         rte_mov64((uint8_t *)dst - 64 + n,
 261                                           (const uint8_t *)src - 64 + n);
 262                 return ret;
 263         }
 264
 265         /**
 266          * Make store aligned when copy size exceeds 512 bytes
 267          */
 268         dstofss = ((uintptr_t)dst & 0x3F);
 269         if (dstofss > 0) {
 270                 dstofss = 64 - dstofss;
 271                 n -= dstofss;
 272                 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 273                 src = (const uint8_t *)src + dstofss;
 274                 dst = (uint8_t *)dst + dstofss;
 275         }
 276
 277         /**
 278          * Copy 512-byte blocks.
 279          * Use copy block function for better instruction order control,
 280          * which is important when load is unaligned.
 281          */
 282         rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
 283         bits = n;
 284         n = n & 511;
 285         bits -= n;
 286         src = (const uint8_t *)src + bits;
 287         dst = (uint8_t *)dst + bits;
 288
 289         /**
 290          * Copy 128-byte blocks.
 291          * Use copy block function for better instruction order control,
 292          * which is important when load is unaligned.
 293          */
 294         if (n >= 128) {
 295                 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
 296                 bits = n;
 297                 n = n & 127;
 298                 bits -= n;
 299                 src = (const uint8_t *)src + bits;
 300                 dst = (uint8_t *)dst + bits;
 301         }
 302
 303         /**
 304          * Copy whatever left
 305          */
 306         goto COPY_BLOCK_128_BACK63;
 307 }
 308
 309 #elif RTE_MACHINE_CPUFLAG_AVX2
 310
 311 /**
 312  * AVX2 implementation below
 313  */
 314
 315 /**
 316  * Copy 16 bytes from one location to another,
 317  * locations should not overlap.
 318  */
 319 static inline void
 320 rte_mov16(uint8_t *dst, const uint8_t *src)
 321 {
 322         __m128i xmm0;
 323
 324         xmm0 = _mm_loadu_si128((const __m128i *)src);
 325         _mm_storeu_si128((__m128i *)dst, xmm0);
 326 }
 327
 328 /**
 329  * Copy 32 bytes from one location to another,
 330  * locations should not overlap.
 331  */
 332 static inline void
 333 rte_mov32(uint8_t *dst, const uint8_t *src)
 334 {
 335         __m256i ymm0;
 336
 337         ymm0 = _mm256_loadu_si256((const __m256i *)src);
 338         _mm256_storeu_si256((__m256i *)dst, ymm0);
 339 }
 340
 341 /**
 342  * Copy 64 bytes from one location to another,
 343  * locations should not overlap.
 344  */
 345 static inline void
 346 rte_mov64(uint8_t *dst, const uint8_t *src)
 347 {
 348         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 349         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 350 }
 351
 352 /**
 353  * Copy 128 bytes from one location to another,
 354  * locations should not overlap.
 355  */
 356 static inline void
 357 rte_mov128(uint8_t *dst, const uint8_t *src)
 358 {
 359         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 360         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 361         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 362         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 363 }
 364
 365 /**
 366  * Copy 256 bytes from one location to another,
 367  * locations should not overlap.
 368  */
 369 static inline void
 370 rte_mov256(uint8_t *dst, const uint8_t *src)
 371 {
 372         rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 373         rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 374         rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 375         rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
 376         rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
 377         rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
 378         rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
 379         rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
 380 }
 381
 382 /**
 383  * Copy 64-byte blocks from one location to another,
 384  * locations should not overlap.
 385  */
 386 static inline void
 387 rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
 388 {
 389         __m256i ymm0, ymm1;
 390
 391         while (n >= 64) {
 392                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 393                 n -= 64;
 394                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 395                 src = (const uint8_t *)src + 64;
 396                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 397                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 398                 dst = (uint8_t *)dst + 64;
 399         }
 400 }
 401
 402 /**
 403  * Copy 256-byte blocks from one location to another,
 404  * locations should not overlap.
 405  */
 406 static inline void
 407 rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
 408 {
 409         __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
 410
 411         while (n >= 256) {
 412                 ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
 413                 n -= 256;
 414                 ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
 415                 ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
 416                 ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
 417                 ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
 418                 ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
 419                 ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
 420                 ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
 421                 src = (const uint8_t *)src + 256;
 422                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
 423                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
 424                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
 425                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
 426                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
 427                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
 428                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
 429                 _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
 430                 dst = (uint8_t *)dst + 256;
 431         }
 432 }
 433
 434 static inline void *
 435 rte_memcpy(void *dst, const void *src, size_t n)
 436 {
 437         uintptr_t dstu = (uintptr_t)dst;
 438         uintptr_t srcu = (uintptr_t)src;
 439         void *ret = dst;
 440         size_t dstofss;
 441         size_t bits;
 442
 443         /**
 444          * Copy less than 16 bytes
 445          */
 446         if (n < 16) {
 447                 if (n & 0x01) {
 448                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 449                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 450                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 451                 }
 452                 if (n & 0x02) {
 453                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 454                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 455                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 456                 }
 457                 if (n & 0x04) {
 458                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 459                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 460                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 461                 }
 462                 if (n & 0x08) {
 463                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 464                 }
 465                 return ret;
 466         }
 467
 468         /**
 469          * Fast way when copy size doesn't exceed 512 bytes
 470          */
 471         if (n <= 32) {
 472                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 473                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 474                 return ret;
 475         }
 476         if (n <= 64) {
 477                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 478                 rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 479                 return ret;
 480         }
 481         if (n <= 512) {
 482                 if (n >= 256) {
 483                         n -= 256;
 484                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 485                         src = (const uint8_t *)src + 256;
 486                         dst = (uint8_t *)dst + 256;
 487                 }
 488                 if (n >= 128) {
 489                         n -= 128;
 490                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 491                         src = (const uint8_t *)src + 128;
 492                         dst = (uint8_t *)dst + 128;
 493                 }
 494                 if (n >= 64) {
 495                         n -= 64;
 496                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 497                         src = (const uint8_t *)src + 64;
 498                         dst = (uint8_t *)dst + 64;
 499                 }
 500 COPY_BLOCK_64_BACK31:
 501                 if (n > 32) {
 502                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 503                         rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 504                         return ret;
 505                 }
 506                 if (n > 0) {
 507                         rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
 508                 }
 509                 return ret;
 510         }
 511
 512         /**
 513          * Make store aligned when copy size exceeds 512 bytes
 514          */
 515         dstofss = 32 - ((uintptr_t)dst & 0x1F);
 516         n -= dstofss;
 517         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 518         src = (const uint8_t *)src + dstofss;
 519         dst = (uint8_t *)dst + dstofss;
 520
 521         /**
 522          * Copy 256-byte blocks.
 523          * Use copy block function for better instruction order control,
 524          * which is important when load is unaligned.
 525          */
 526         rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
 527         bits = n;
 528         n = n & 255;
 529         bits -= n;
 530         src = (const uint8_t *)src + bits;
 531         dst = (uint8_t *)dst + bits;
 532
 533         /**
 534          * Copy 64-byte blocks.
 535          * Use copy block function for better instruction order control,
 536          * which is important when load is unaligned.
 537          */
 538         if (n >= 64) {
 539                 rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
 540                 bits = n;
 541                 n = n & 63;
 542                 bits -= n;
 543                 src = (const uint8_t *)src + bits;
 544                 dst = (uint8_t *)dst + bits;
 545         }
 546
 547         /**
 548          * Copy whatever left
 549          */
 550         goto COPY_BLOCK_64_BACK31;
 551 }
 552
 553 #else /* RTE_MACHINE_CPUFLAG */
 554
 555 /**
 556  * SSE & AVX implementation below
 557  */
 558
 559 /**
 560  * Copy 16 bytes from one location to another,
 561  * locations should not overlap.
 562  */
 563 static inline void
 564 rte_mov16(uint8_t *dst, const uint8_t *src)
 565 {
 566         __m128i xmm0;
 567
 568         xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src);
 569         _mm_storeu_si128((__m128i *)dst, xmm0);
 570 }
 571
 572 /**
 573  * Copy 32 bytes from one location to another,
 574  * locations should not overlap.
 575  */
 576 static inline void
 577 rte_mov32(uint8_t *dst, const uint8_t *src)
 578 {
 579         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 580         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 581 }
 582
 583 /**
 584  * Copy 64 bytes from one location to another,
 585  * locations should not overlap.
 586  */
 587 static inline void
 588 rte_mov64(uint8_t *dst, const uint8_t *src)
 589 {
 590         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 591         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 592         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 593         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 594 }
 595
 596 /**
 597  * Copy 128 bytes from one location to another,
 598  * locations should not overlap.
 599  */
 600 static inline void
 601 rte_mov128(uint8_t *dst, const uint8_t *src)
 602 {
 603         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 604         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 605         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 606         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 607         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 608         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 609         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 610         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 611 }
 612
 613 /**
 614  * Copy 256 bytes from one location to another,
 615  * locations should not overlap.
 616  */
 617 static inline void
 618 rte_mov256(uint8_t *dst, const uint8_t *src)
 619 {
 620         rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
 621         rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
 622         rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
 623         rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
 624         rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
 625         rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
 626         rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
 627         rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
 628         rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
 629         rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
 630         rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
 631         rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
 632         rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
 633         rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
 634         rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
 635         rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
 636 }
 637
 638 /**
 639  * Macro for copying unaligned block from one location to another with constant load offset,
 640  * 47 bytes leftover maximum,
 641  * locations should not overlap.
 642  * Requirements:
 643  * - Store is aligned
 644  * - Load offset is <offset>, which must be immediate value within [1, 15]
 645  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 646  * - <dst>, <src>, <len> must be variables
 647  * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 648  */
 649 #define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
 650 ({                                                                                                          \
 651     int tmp;                                                                                                \
 652     while (len >= 128 + 16 - offset) {                                                                      \
 653         xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));                  \
 654         len -= 128;                                                                                         \
 655         xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));                  \
 656         xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));                  \
 657         xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16));                  \
 658         xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16));                  \
 659         xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16));                  \
 660         xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16));                  \
 661         xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16));                  \
 662         xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16));                  \
 663         src = (const uint8_t *)src + 128;                                                                   \
 664         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
 665         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
 666         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
 667         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
 668         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
 669         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
 670         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
 671         _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
 672         dst = (uint8_t *)dst + 128;                                                                         \
 673     }                                                                                                       \
 674     tmp = len;                                                                                              \
 675     len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
 676     tmp -= len;                                                                                             \
 677     src = (const uint8_t *)src + tmp;                                                                       \
 678     dst = (uint8_t *)dst + tmp;                                                                             \
 679     if (len >= 32 + 16 - offset) {                                                                          \
 680         while (len >= 32 + 16 - offset) {                                                                   \
 681             xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16));              \
 682             len -= 32;                                                                                      \
 683             xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16));              \
 684             xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16));              \
 685             src = (const uint8_t *)src + 32;                                                                \
 686             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
 687             _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
 688             dst = (uint8_t *)dst + 32;                                                                      \
 689         }                                                                                                   \
 690         tmp = len;                                                                                          \
 691         len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
 692         tmp -= len;                                                                                         \
 693         src = (const uint8_t *)src + tmp;                                                                   \
 694         dst = (uint8_t *)dst + tmp;                                                                         \
 695     }                                                                                                       \
 696 })
 697
 698 /**
 699  * Macro for copying unaligned block from one location to another,
 700  * 47 bytes leftover maximum,
 701  * locations should not overlap.
 702  * Use switch here because the aligning instruction requires immediate value for shift count.
 703  * Requirements:
 704  * - Store is aligned
 705  * - Load offset is <offset>, which must be within [1, 15]
 706  * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 707  * - <dst>, <src>, <len> must be variables
 708  * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 709  */
 710 #define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
 711 ({                                                                    \
 712     switch (offset) {                                                 \
 713     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
 714     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
 715     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
 716     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
 717     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
 718     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
 719     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
 720     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
 721     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
 722     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
 723     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
 724     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
 725     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
 726     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
 727     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
 728     default:;                                                         \
 729     }                                                                 \
 730 })
 731
 732 static inline void *
 733 rte_memcpy(void *dst, const void *src, size_t n)
 734 {
 735         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
 736         uintptr_t dstu = (uintptr_t)dst;
 737         uintptr_t srcu = (uintptr_t)src;
 738         void *ret = dst;
 739         size_t dstofss;
 740         size_t srcofs;
 741
 742         /**
 743          * Copy less than 16 bytes
 744          */
 745         if (n < 16) {
 746                 if (n & 0x01) {
 747                         *(uint8_t *)dstu = *(const uint8_t *)srcu;
 748                         srcu = (uintptr_t)((const uint8_t *)srcu + 1);
 749                         dstu = (uintptr_t)((uint8_t *)dstu + 1);
 750                 }
 751                 if (n & 0x02) {
 752                         *(uint16_t *)dstu = *(const uint16_t *)srcu;
 753                         srcu = (uintptr_t)((const uint16_t *)srcu + 1);
 754                         dstu = (uintptr_t)((uint16_t *)dstu + 1);
 755                 }
 756                 if (n & 0x04) {
 757                         *(uint32_t *)dstu = *(const uint32_t *)srcu;
 758                         srcu = (uintptr_t)((const uint32_t *)srcu + 1);
 759                         dstu = (uintptr_t)((uint32_t *)dstu + 1);
 760                 }
 761                 if (n & 0x08) {
 762                         *(uint64_t *)dstu = *(const uint64_t *)srcu;
 763                 }
 764                 return ret;
 765         }
 766
 767         /**
 768          * Fast way when copy size doesn't exceed 512 bytes
 769          */
 770         if (n <= 32) {
 771                 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 772                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 773                 return ret;
 774         }
 775         if (n <= 48) {
 776                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 777                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 778                 return ret;
 779         }
 780         if (n <= 64) {
 781                 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 782                 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 783                 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 784                 return ret;
 785         }
 786         if (n <= 128) {
 787                 goto COPY_BLOCK_128_BACK15;
 788         }
 789         if (n <= 512) {
 790                 if (n >= 256) {
 791                         n -= 256;
 792                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 793                         rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
 794                         src = (const uint8_t *)src + 256;
 795                         dst = (uint8_t *)dst + 256;
 796                 }
 797 COPY_BLOCK_255_BACK15:
 798                 if (n >= 128) {
 799                         n -= 128;
 800                         rte_mov128((uint8_t *)dst, (const uint8_t *)src);
 801                         src = (const uint8_t *)src + 128;
 802                         dst = (uint8_t *)dst + 128;
 803                 }
 804 COPY_BLOCK_128_BACK15:
 805                 if (n >= 64) {
 806                         n -= 64;
 807                         rte_mov64((uint8_t *)dst, (const uint8_t *)src);
 808                         src = (const uint8_t *)src + 64;
 809                         dst = (uint8_t *)dst + 64;
 810                 }
 811 COPY_BLOCK_64_BACK15:
 812                 if (n >= 32) {
 813                         n -= 32;
 814                         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 815                         src = (const uint8_t *)src + 32;
 816                         dst = (uint8_t *)dst + 32;
 817                 }
 818                 if (n > 16) {
 819                         rte_mov16((uint8_t *)dst, (const uint8_t *)src);
 820                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 821                         return ret;
 822                 }
 823                 if (n > 0) {
 824                         rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 825                 }
 826                 return ret;
 827         }
 828
 829         /**
 830          * Make store aligned when copy size exceeds 512 bytes,
 831          * and make sure the first 15 bytes are copied, because
 832          * unaligned copy functions require up to 15 bytes
 833          * backwards access.
 834          */
 835         dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16;
 836         n -= dstofss;
 837         rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 838         src = (const uint8_t *)src + dstofss;
 839         dst = (uint8_t *)dst + dstofss;
 840         srcofs = ((uintptr_t)src & 0x0F);
 841
 842         /**
 843          * For aligned copy
 844          */
 845         if (srcofs == 0) {
 846                 /**
 847                  * Copy 256-byte blocks
 848                  */
 849                 for (; n >= 256; n -= 256) {
 850                         rte_mov256((uint8_t *)dst, (const uint8_t *)src);
 851                         dst = (uint8_t *)dst + 256;
 852                         src = (const uint8_t *)src + 256;
 853                 }
 854
 855                 /**
 856                  * Copy whatever left
 857                  */
 858                 goto COPY_BLOCK_255_BACK15;
 859         }
 860
 861         /**
 862          * For copy with unaligned load
 863          */
 864         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
 865
 866         /**
 867          * Copy whatever left
 868          */
 869         goto COPY_BLOCK_64_BACK15;
 870 }
 871
 872 #endif /* RTE_MACHINE_CPUFLAG */
 873
 874 #ifdef __cplusplus
 875 }
 876 #endif
 877
 878 #endif /* _RTE_MEMCPY_X86_64_H_ */