/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
5 #ifndef _RTE_MEMCPY_X86_64_H_
6 #define _RTE_MEMCPY_X86_64_H_
11 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
18 #include <rte_common.h>
19 #include <rte_config.h>
25 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
26 #pragma GCC diagnostic push
27 #pragma GCC diagnostic ignored "-Wstringop-overflow"
31 * Copy bytes from one location to another. The locations must not overlap.
33 * @note This is implemented as a macro, so it's address should not be taken
34 * and care is needed as parameter expressions may be evaluated multiple times.
37 * Pointer to the destination of the data.
39 * Pointer to the source data.
41 * Number of bytes to copy.
43 * Pointer to the destination data.
45 static __rte_always_inline void *
46 rte_memcpy(void *dst, const void *src, size_t n);
48 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
50 #define ALIGNMENT_MASK 0x3F
53 * AVX512 implementation below
57 * Copy 16 bytes from one location to another,
58 * locations should not overlap.
60 static __rte_always_inline void
61 rte_mov16(uint8_t *dst, const uint8_t *src)
65 xmm0 = _mm_loadu_si128((const __m128i *)src);
66 _mm_storeu_si128((__m128i *)dst, xmm0);
70 * Copy 32 bytes from one location to another,
71 * locations should not overlap.
73 static __rte_always_inline void
74 rte_mov32(uint8_t *dst, const uint8_t *src)
78 ymm0 = _mm256_loadu_si256((const __m256i *)src);
79 _mm256_storeu_si256((__m256i *)dst, ymm0);
83 * Copy 64 bytes from one location to another,
84 * locations should not overlap.
86 static __rte_always_inline void
87 rte_mov64(uint8_t *dst, const uint8_t *src)
91 zmm0 = _mm512_loadu_si512((const void *)src);
92 _mm512_storeu_si512((void *)dst, zmm0);
96 * Copy 128 bytes from one location to another,
97 * locations should not overlap.
99 static __rte_always_inline void
100 rte_mov128(uint8_t *dst, const uint8_t *src)
102 rte_mov64(dst + 0 * 64, src + 0 * 64);
103 rte_mov64(dst + 1 * 64, src + 1 * 64);
107 * Copy 256 bytes from one location to another,
108 * locations should not overlap.
110 static __rte_always_inline void
111 rte_mov256(uint8_t *dst, const uint8_t *src)
113 rte_mov64(dst + 0 * 64, src + 0 * 64);
114 rte_mov64(dst + 1 * 64, src + 1 * 64);
115 rte_mov64(dst + 2 * 64, src + 2 * 64);
116 rte_mov64(dst + 3 * 64, src + 3 * 64);
120 * Copy 128-byte blocks from one location to another,
121 * locations should not overlap.
123 static __rte_always_inline void
124 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
129 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
131 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
133 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
134 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
140 * Copy 512-byte blocks from one location to another,
141 * locations should not overlap.
144 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
146 __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
149 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
151 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
152 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
153 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
154 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
155 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
156 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
157 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
159 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
160 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
161 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
162 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
163 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
164 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
165 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
166 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
171 static __rte_always_inline void *
172 rte_memcpy_generic(void *dst, const void *src, size_t n)
174 uintptr_t dstu = (uintptr_t)dst;
175 uintptr_t srcu = (uintptr_t)src;
181 * Copy less than 16 bytes
185 *(uint8_t *)dstu = *(const uint8_t *)srcu;
186 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
187 dstu = (uintptr_t)((uint8_t *)dstu + 1);
190 *(uint16_t *)dstu = *(const uint16_t *)srcu;
191 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
192 dstu = (uintptr_t)((uint16_t *)dstu + 1);
195 *(uint32_t *)dstu = *(const uint32_t *)srcu;
196 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
197 dstu = (uintptr_t)((uint32_t *)dstu + 1);
200 *(uint64_t *)dstu = *(const uint64_t *)srcu;
205 * Fast way when copy size doesn't exceed 512 bytes
208 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
209 rte_mov16((uint8_t *)dst - 16 + n,
210 (const uint8_t *)src - 16 + n);
214 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
215 rte_mov32((uint8_t *)dst - 32 + n,
216 (const uint8_t *)src - 32 + n);
222 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
223 src = (const uint8_t *)src + 256;
224 dst = (uint8_t *)dst + 256;
228 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
229 src = (const uint8_t *)src + 128;
230 dst = (uint8_t *)dst + 128;
232 COPY_BLOCK_128_BACK63:
234 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
235 rte_mov64((uint8_t *)dst - 64 + n,
236 (const uint8_t *)src - 64 + n);
240 rte_mov64((uint8_t *)dst - 64 + n,
241 (const uint8_t *)src - 64 + n);
246 * Make store aligned when copy size exceeds 512 bytes
248 dstofss = ((uintptr_t)dst & 0x3F);
250 dstofss = 64 - dstofss;
252 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
253 src = (const uint8_t *)src + dstofss;
254 dst = (uint8_t *)dst + dstofss;
258 * Copy 512-byte blocks.
259 * Use copy block function for better instruction order control,
260 * which is important when load is unaligned.
262 rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
266 src = (const uint8_t *)src + bits;
267 dst = (uint8_t *)dst + bits;
270 * Copy 128-byte blocks.
271 * Use copy block function for better instruction order control,
272 * which is important when load is unaligned.
275 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
279 src = (const uint8_t *)src + bits;
280 dst = (uint8_t *)dst + bits;
286 goto COPY_BLOCK_128_BACK63;
289 #elif defined __AVX2__
291 #define ALIGNMENT_MASK 0x1F
294 * AVX2 implementation below
298 * Copy 16 bytes from one location to another,
299 * locations should not overlap.
301 static __rte_always_inline void
302 rte_mov16(uint8_t *dst, const uint8_t *src)
306 xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
307 _mm_storeu_si128((__m128i *)(void *)dst, xmm0);
311 * Copy 32 bytes from one location to another,
312 * locations should not overlap.
314 static __rte_always_inline void
315 rte_mov32(uint8_t *dst, const uint8_t *src)
319 ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
320 _mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
324 * Copy 64 bytes from one location to another,
325 * locations should not overlap.
327 static __rte_always_inline void
328 rte_mov64(uint8_t *dst, const uint8_t *src)
330 rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
331 rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
335 * Copy 128 bytes from one location to another,
336 * locations should not overlap.
338 static __rte_always_inline void
339 rte_mov128(uint8_t *dst, const uint8_t *src)
341 rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
342 rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
343 rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
344 rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
348 * Copy 128-byte blocks from one location to another,
349 * locations should not overlap.
351 static __rte_always_inline void
352 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
354 __m256i ymm0, ymm1, ymm2, ymm3;
357 ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)
358 ((const uint8_t *)src + 0 * 32));
360 ymm1 = _mm256_loadu_si256((const __m256i *)(const void *)
361 ((const uint8_t *)src + 1 * 32));
362 ymm2 = _mm256_loadu_si256((const __m256i *)(const void *)
363 ((const uint8_t *)src + 2 * 32));
364 ymm3 = _mm256_loadu_si256((const __m256i *)(const void *)
365 ((const uint8_t *)src + 3 * 32));
366 src = (const uint8_t *)src + 128;
367 _mm256_storeu_si256((__m256i *)(void *)
368 ((uint8_t *)dst + 0 * 32), ymm0);
369 _mm256_storeu_si256((__m256i *)(void *)
370 ((uint8_t *)dst + 1 * 32), ymm1);
371 _mm256_storeu_si256((__m256i *)(void *)
372 ((uint8_t *)dst + 2 * 32), ymm2);
373 _mm256_storeu_si256((__m256i *)(void *)
374 ((uint8_t *)dst + 3 * 32), ymm3);
375 dst = (uint8_t *)dst + 128;
379 static __rte_always_inline void *
380 rte_memcpy_generic(void *dst, const void *src, size_t n)
382 uintptr_t dstu = (uintptr_t)dst;
383 uintptr_t srcu = (uintptr_t)src;
389 * Copy less than 16 bytes
393 *(uint8_t *)dstu = *(const uint8_t *)srcu;
394 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
395 dstu = (uintptr_t)((uint8_t *)dstu + 1);
398 *(uint16_t *)dstu = *(const uint16_t *)srcu;
399 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
400 dstu = (uintptr_t)((uint16_t *)dstu + 1);
403 *(uint32_t *)dstu = *(const uint32_t *)srcu;
404 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
405 dstu = (uintptr_t)((uint32_t *)dstu + 1);
408 *(uint64_t *)dstu = *(const uint64_t *)srcu;
414 * Fast way when copy size doesn't exceed 256 bytes
417 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
418 rte_mov16((uint8_t *)dst - 16 + n,
419 (const uint8_t *)src - 16 + n);
423 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
424 rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
425 rte_mov16((uint8_t *)dst - 16 + n,
426 (const uint8_t *)src - 16 + n);
430 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
431 rte_mov32((uint8_t *)dst - 32 + n,
432 (const uint8_t *)src - 32 + n);
438 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
439 src = (const uint8_t *)src + 128;
440 dst = (uint8_t *)dst + 128;
442 COPY_BLOCK_128_BACK31:
445 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
446 src = (const uint8_t *)src + 64;
447 dst = (uint8_t *)dst + 64;
450 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
451 rte_mov32((uint8_t *)dst - 32 + n,
452 (const uint8_t *)src - 32 + n);
456 rte_mov32((uint8_t *)dst - 32 + n,
457 (const uint8_t *)src - 32 + n);
463 * Make store aligned when copy size exceeds 256 bytes
465 dstofss = (uintptr_t)dst & 0x1F;
467 dstofss = 32 - dstofss;
469 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
470 src = (const uint8_t *)src + dstofss;
471 dst = (uint8_t *)dst + dstofss;
475 * Copy 128-byte blocks
477 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
481 src = (const uint8_t *)src + bits;
482 dst = (uint8_t *)dst + bits;
487 goto COPY_BLOCK_128_BACK31;
490 #else /* __AVX512F__ */
492 #define ALIGNMENT_MASK 0x0F
495 * SSE & AVX implementation below
499 * Copy 16 bytes from one location to another,
500 * locations should not overlap.
502 static __rte_always_inline void
503 rte_mov16(uint8_t *dst, const uint8_t *src)
507 xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
508 _mm_storeu_si128((__m128i *)(void *)dst, xmm0);
512 * Copy 32 bytes from one location to another,
513 * locations should not overlap.
515 static __rte_always_inline void
516 rte_mov32(uint8_t *dst, const uint8_t *src)
518 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
519 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
523 * Copy 64 bytes from one location to another,
524 * locations should not overlap.
526 static __rte_always_inline void
527 rte_mov64(uint8_t *dst, const uint8_t *src)
529 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
530 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
531 rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
532 rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
536 * Copy 128 bytes from one location to another,
537 * locations should not overlap.
539 static __rte_always_inline void
540 rte_mov128(uint8_t *dst, const uint8_t *src)
542 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
543 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
544 rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
545 rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
546 rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
547 rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
548 rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
549 rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
553 * Copy 256 bytes from one location to another,
554 * locations should not overlap.
557 rte_mov256(uint8_t *dst, const uint8_t *src)
559 rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
560 rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
561 rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
562 rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
563 rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
564 rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
565 rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
566 rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
567 rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
568 rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
569 rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
570 rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
571 rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
572 rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
573 rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
574 rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Load offset is <offset>, which must be immediate value within [1, 15]
 * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset)                                                     \
__extension__ ({                                                                                            \
    size_t tmp;                                                                                             \
    while (len >= 128 + 16 - offset) {                                                                      \
        xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16));    \
        len -= 128;                                                                                         \
        xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16));    \
        xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16));    \
        xmm3 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 3 * 16));    \
        xmm4 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 4 * 16));    \
        xmm5 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 5 * 16));    \
        xmm6 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 6 * 16));    \
        xmm7 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 7 * 16));    \
        xmm8 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 8 * 16));    \
        src = (const uint8_t *)src + 128;                                                                   \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128;                                                                         \
    }                                                                                                       \
    tmp = len;                                                                                              \
    len = ((len - 16 + offset) & 127) + 16 - offset;                                                        \
    tmp -= len;                                                                                             \
    src = (const uint8_t *)src + tmp;                                                                       \
    dst = (uint8_t *)dst + tmp;                                                                             \
    if (len >= 32 + 16 - offset) {                                                                          \
        while (len >= 32 + 16 - offset) {                                                                   \
            xmm0 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32;                                                                                      \
            xmm1 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)(const void *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32;                                                                \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32;                                                                      \
        }                                                                                                   \
        tmp = len;                                                                                          \
        len = ((len - 16 + offset) & 31) + 16 - offset;                                                     \
        tmp -= len;                                                                                         \
        src = (const uint8_t *)src + tmp;                                                                   \
        dst = (uint8_t *)dst + tmp;                                                                         \
    }                                                                                                       \
})
/**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires immediate value for shift count.
 * Requirements:
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset)                   \
__extension__ ({                                                      \
    switch (offset) {                                                 \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
    default:;                                                         \
    }                                                                 \
})
671 static __rte_always_inline void *
672 rte_memcpy_generic(void *dst, const void *src, size_t n)
674 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
675 uintptr_t dstu = (uintptr_t)dst;
676 uintptr_t srcu = (uintptr_t)src;
682 * Copy less than 16 bytes
686 *(uint8_t *)dstu = *(const uint8_t *)srcu;
687 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
688 dstu = (uintptr_t)((uint8_t *)dstu + 1);
691 *(uint16_t *)dstu = *(const uint16_t *)srcu;
692 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
693 dstu = (uintptr_t)((uint16_t *)dstu + 1);
696 *(uint32_t *)dstu = *(const uint32_t *)srcu;
697 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
698 dstu = (uintptr_t)((uint32_t *)dstu + 1);
701 *(uint64_t *)dstu = *(const uint64_t *)srcu;
707 * Fast way when copy size doesn't exceed 512 bytes
710 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
711 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
715 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
716 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
720 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
721 rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
722 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
726 goto COPY_BLOCK_128_BACK15;
731 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
732 rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
733 src = (const uint8_t *)src + 256;
734 dst = (uint8_t *)dst + 256;
736 COPY_BLOCK_255_BACK15:
739 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
740 src = (const uint8_t *)src + 128;
741 dst = (uint8_t *)dst + 128;
743 COPY_BLOCK_128_BACK15:
746 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
747 src = (const uint8_t *)src + 64;
748 dst = (uint8_t *)dst + 64;
750 COPY_BLOCK_64_BACK15:
753 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
754 src = (const uint8_t *)src + 32;
755 dst = (uint8_t *)dst + 32;
758 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
759 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
763 rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
769 * Make store aligned when copy size exceeds 512 bytes,
770 * and make sure the first 15 bytes are copied, because
771 * unaligned copy functions require up to 15 bytes
774 dstofss = (uintptr_t)dst & 0x0F;
776 dstofss = 16 - dstofss + 16;
778 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
779 src = (const uint8_t *)src + dstofss;
780 dst = (uint8_t *)dst + dstofss;
782 srcofs = ((uintptr_t)src & 0x0F);
789 * Copy 256-byte blocks
791 for (; n >= 256; n -= 256) {
792 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
793 dst = (uint8_t *)dst + 256;
794 src = (const uint8_t *)src + 256;
800 goto COPY_BLOCK_255_BACK15;
804 * For copy with unaligned load
806 MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
811 goto COPY_BLOCK_64_BACK15;
814 #endif /* __AVX512F__ */
816 static __rte_always_inline void *
817 rte_memcpy_aligned(void *dst, const void *src, size_t n)
821 /* Copy size <= 16 bytes */
824 *(uint8_t *)dst = *(const uint8_t *)src;
825 src = (const uint8_t *)src + 1;
826 dst = (uint8_t *)dst + 1;
829 *(uint16_t *)dst = *(const uint16_t *)src;
830 src = (const uint16_t *)src + 1;
831 dst = (uint16_t *)dst + 1;
834 *(uint32_t *)dst = *(const uint32_t *)src;
835 src = (const uint32_t *)src + 1;
836 dst = (uint32_t *)dst + 1;
839 *(uint64_t *)dst = *(const uint64_t *)src;
844 /* Copy 16 <= size <= 32 bytes */
846 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
847 rte_mov16((uint8_t *)dst - 16 + n,
848 (const uint8_t *)src - 16 + n);
853 /* Copy 32 < size <= 64 bytes */
855 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
856 rte_mov32((uint8_t *)dst - 32 + n,
857 (const uint8_t *)src - 32 + n);
862 /* Copy 64 bytes blocks */
863 for (; n >= 64; n -= 64) {
864 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
865 dst = (uint8_t *)dst + 64;
866 src = (const uint8_t *)src + 64;
869 /* Copy whatever left */
870 rte_mov64((uint8_t *)dst - 64 + n,
871 (const uint8_t *)src - 64 + n);
876 static __rte_always_inline void *
877 rte_memcpy(void *dst, const void *src, size_t n)
879 if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
880 return rte_memcpy_aligned(dst, src, n);
882 return rte_memcpy_generic(dst, src, n);
885 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
886 #pragma GCC diagnostic pop
893 #endif /* _RTE_MEMCPY_X86_64_H_ */