/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif
/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
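/*
 * Illustrative usage sketch (the buffer names are hypothetical, not part
 * of this header); rte_memcpy() behaves like memcpy() for non-overlapping
 * buffers and returns the destination pointer:
 *
 *	uint8_t in[256], out[256];
 *	...
 *	rte_memcpy(out, in, sizeof(in));
 */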
#ifdef RTE_MACHINE_CPUFLAG_AVX512F

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */
/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}
/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}
/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}
/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}
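/*
 * A note on the loop shape above (a reading of the code, not a documented
 * guarantee): the length and pointer updates are interleaved between the
 * loads and stores so that both 64-byte loads can be issued before any
 * store, matching the "instruction order control" remarks further down.
 */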
/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}
static inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08)
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		return ret;
	}
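	/*
	 * The ladder above copies n < 16 bytes without a loop by decomposing
	 * n into its set bits: the 1-, 2-, 4- and 8-byte moves together copy
	 * exactly n bytes. For example, n = 13 (0b1101) copies 1 + 4 + 8 bytes.
	 */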
	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			  (const uint8_t *)src - 16 + n);
		return ret;
	}
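	/*
	 * The pattern above (and in the paths below) copies one vector from
	 * the start and one vector ending exactly at dst + n. The two may
	 * overlap: for n = 24, the 16-byte moves cover bytes 0-15 and 8-23,
	 * so bytes 8-15 are written twice, which is harmless because src and
	 * dst themselves must not overlap.
	 */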
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
				  (const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
				  (const uint8_t *)src - 64 + n);
		return ret;
	}
	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
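	/*
	 * Example: for dst ending in ...0x07, dstofss becomes 64 - 7 = 57;
	 * one unaligned 64-byte copy handles those bytes and both pointers
	 * advance by 57, leaving dst 64-byte aligned for the block loops.
	 */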
	/**
	 * Copy 512-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 511;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;
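	/*
	 * bits now holds the number of bytes the 512-byte block loop actually
	 * copied. For example, n = 1300 copies two blocks (1024 bytes), then
	 * n & 511 = 276 and bits = 1024, so src and dst skip the copied region.
	 */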
	/**
	 * Copy 128-byte blocks.
	 * Use copy block function for better instruction order control,
	 * which is important when load is unaligned.
	 */
	if (n >= 128) {
		rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
		bits = n;
		n = n & 127;
		bits -= n;
		src = (const uint8_t *)src + bits;
		dst = (uint8_t *)dst + bits;
	}

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK63;
}
#elif defined RTE_MACHINE_CPUFLAG_AVX2

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */
/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}
/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}
/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}
/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1, ymm2, ymm3;

	while (n >= 128) {
		ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
		n -= 128;
		ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
		ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
		ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
		src = (const uint8_t *)src + 128;
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
		_mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
		dst = (uint8_t *)dst + 128;
	}
}
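/*
 * Note: rte_mov128blocks(dst, src, n) copies only the whole 128-byte
 * blocks, i.e. (n / 128) * 128 bytes; the caller (rte_memcpy_generic()
 * below) handles the remaining n & 127 bytes.
 */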
static inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t bits;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08)
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		return ret;
	}
	/**
	 * Fast way when copy size doesn't exceed 256 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			  (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
		rte_mov16((uint8_t *)dst - 16 + n,
			  (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			  (const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 256) {
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK31:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
		if (n > 32) {
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov32((uint8_t *)dst - 32 + n,
				  (const uint8_t *)src - 32 + n);
		}
		return ret;
	}
	/**
	 * Make store aligned when copy size exceeds 256 bytes
	 */
	dstofss = (uintptr_t)dst & 0x1F;
	if (dstofss > 0) {
		dstofss = 32 - dstofss;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	/**
	 * Copy 128-byte blocks
	 */
	rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
	bits = n;
	n = n & 127;
	bits -= n;
	src = (const uint8_t *)src + bits;
	dst = (uint8_t *)dst + bits;

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_128_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */
/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}
/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}
/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}
/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}
/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * leaves at most 47 bytes uncopied (handled by the caller),
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
})
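/*
 * How the macro above works (a reading of the code): with src aligned
 * down by <offset>, every load is 16-byte aligned, and
 * _mm_alignr_epi8(hi, lo, offset) shifts the 32-byte pair hi:lo right by
 * <offset> bytes, reassembling the original unaligned byte stream from
 * two adjacent aligned loads. E.g. with offset = 3, the first store
 * receives source bytes src[0..15] out of the pair xmm1:xmm0, which
 * spans src[-3..28].
 */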
/**
 * Macro for copying unaligned block from one location to another,
 * leaves at most 47 bytes uncopied (handled by the caller),
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an
 * immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break; \
	default:; \
	} \
})
static inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;
	size_t dstofss;
	size_t srcofs;

	/**
	 * Copy less than 16 bytes
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dstu = *(const uint8_t *)srcu;
			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
			dstu = (uintptr_t)((uint8_t *)dstu + 1);
		}
		if (n & 0x02) {
			*(uint16_t *)dstu = *(const uint16_t *)srcu;
			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
			dstu = (uintptr_t)((uint16_t *)dstu + 1);
		}
		if (n & 0x04) {
			*(uint32_t *)dstu = *(const uint32_t *)srcu;
			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
			dstu = (uintptr_t)((uint32_t *)dstu + 1);
		}
		if (n & 0x08)
			*(uint64_t *)dstu = *(const uint64_t *)srcu;
		return ret;
	}
	/**
	 * Fast way when copy size doesn't exceed 512 bytes
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 48) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		return ret;
	}
	if (n <= 128) {
		goto COPY_BLOCK_128_BACK15;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
COPY_BLOCK_255_BACK15:
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK15:
		if (n >= 64) {
			n -= 64;
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 64;
			dst = (uint8_t *)dst + 64;
		}
COPY_BLOCK_64_BACK15:
		if (n >= 32) {
			n -= 32;
			rte_mov32((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 32;
			dst = (uint8_t *)dst + 32;
		}
		if (n > 16) {
			rte_mov16((uint8_t *)dst, (const uint8_t *)src);
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
			return ret;
		}
		if (n > 0) {
			rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
		}
		return ret;
	}
	/**
	 * Make store aligned when copy size exceeds 512 bytes,
	 * and make sure the first 15 bytes are copied, because
	 * unaligned copy functions require up to 15 bytes
	 * backwards access.
	 */
	dstofss = (uintptr_t)dst & 0x0F;
	if (dstofss > 0) {
		dstofss = 16 - dstofss + 16;
		n -= dstofss;
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
	srcofs = ((uintptr_t)src & 0x0F);
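	/*
	 * Why "16 - dstofss + 16" rather than just "16 - dstofss": the extra
	 * 16 bytes guarantee that at least the first 15 bytes are copied
	 * before the unaligned-load loop runs, since that loop reads up to
	 * <srcofs> (at most 15) bytes before its first aligned load address.
	 * Example: dst ending in ...0x0B gives dstofss = 16 - 11 + 16 = 21.
	 */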
	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}

	/**
	 * For copy with unaligned load
	 */
	MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

	/**
	 * Copy whatever is left
	 */
	goto COPY_BLOCK_64_BACK15;
}
#endif /* RTE_MACHINE_CPUFLAG */
static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size <= 16 bytes */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			src = (const uint8_t *)src + 1;
			dst = (uint8_t *)dst + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			src = (const uint16_t *)src + 1;
			dst = (uint16_t *)dst + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			src = (const uint32_t *)src + 1;
			dst = (uint32_t *)dst + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;

		return ret;
	}
	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}
	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}
	/* Copy 64-byte blocks */
	for (; n >= 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever is left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}
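/*
 * Dispatch note: rte_memcpy() below takes the aligned fast path only when
 * dst and src are both aligned to ALIGNMENT_MASK + 1 bytes (16, 32 or 64,
 * depending on the implementation selected above); OR-ing the two
 * addresses tests both alignments with a single branch.
 */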
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */