/*-
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdint.h>
#include <string.h>
#include <rte_vect.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This may be implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
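/*
 * Usage sketch (illustrative, not part of the original header): rte_memcpy()
 * is a drop-in replacement for memcpy() for non-overlapping buffers, e.g.:
 *
 *     uint8_t pkt[2048], scratch[2048];
 *     rte_memcpy(scratch, pkt, 64);        // copy a 64-byte header
 *     rte_memcpy(scratch + 64, pkt, 128);  // returns scratch + 64
 *
 * The buffer names above are hypothetical; the only requirements are that
 * dst and src do not overlap and that both provide at least n valid bytes.
 */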
#ifdef RTE_MACHINE_CPUFLAG_AVX512F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    __m256i ymm0;

    ymm0 = _mm256_loadu_si256((const __m256i *)src);
    _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    __m512i zmm0;

    zmm0 = _mm512_loadu_si512((const void *)src);
    _mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov64(dst + 0 * 64, src + 0 * 64);
    rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov64(dst + 0 * 64, src + 0 * 64);
    rte_mov64(dst + 1 * 64, src + 1 * 64);
    rte_mov64(dst + 2 * 64, src + 2 * 64);
    rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m512i zmm0, zmm1;

    while (n >= 128) {
        zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
        n -= 128;
        zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
        src = src + 128;
        _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
        _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
        dst = dst + 128;
    }
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

    while (n >= 512) {
        zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
        n -= 512;
        zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
        zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
        zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
        zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
        zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
        zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
        zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
        src = src + 512;
        _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
        _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
        _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
        _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
        _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
        _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
        _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
        _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
        dst = dst + 512;
    }
}
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst;
    size_t dstofss;
    size_t bits;

    /**
     * Copy less than 16 bytes
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }
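    /*
     * Worked example (illustrative): for n == 13 (binary 1101) the branches
     * above copy 1 byte, then 4 bytes, then 8 bytes, advancing the cursors
     * between steps, so every length in [0, 15] is handled without a loop.
     */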
    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n,
                  (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov32((uint8_t *)dst - 32 + n,
                  (const uint8_t *)src - 32 + n);
        return ret;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
COPY_BLOCK_128_BACK63:
        if (n > 64) {
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            rte_mov64((uint8_t *)dst - 64 + n,
                      (const uint8_t *)src - 64 + n);
            return ret;
        }
        if (n > 0)
            rte_mov64((uint8_t *)dst - 64 + n,
                      (const uint8_t *)src - 64 + n);
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes
     */
    dstofss = ((uintptr_t)dst & 0x3F);
    if (dstofss > 0) {
        dstofss = 64 - dstofss;
        n -= dstofss;
        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
        src = (const uint8_t *)src + dstofss;
        dst = (uint8_t *)dst + dstofss;
    }
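    /*
     * Worked example (illustrative numbers, not from the original source):
     * if dst == 0x100027, then dst & 0x3F == 0x27 and dstofss == 64 - 0x27
     * == 25.  rte_mov64() copies the first 64 bytes unconditionally, the
     * pointers advance by 25 bytes, and every store issued by the block
     * loops below starts on a 64-byte boundary.
     */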
    /**
     * Copy 512-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
    bits = n;
    n = n & 511;
    bits -= n;
    src = (const uint8_t *)src + bits;
    dst = (uint8_t *)dst + bits;

    /**
     * Copy 128-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    if (n >= 128) {
        rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
        bits = n;
        n = n & 127;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;
    }

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_128_BACK63;
}
#elif defined RTE_MACHINE_CPUFLAG_AVX2

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    __m256i ymm0;

    ymm0 = _mm256_loadu_si256((const __m256i *)src);
    _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
    rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
    rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
    rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
    rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
    rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
    rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
    rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
    rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
    rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
    rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
    rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
}

/**
 * Copy 64-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m256i ymm0, ymm1;

    while (n >= 64) {
        ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
        n -= 64;
        ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
        src = (const uint8_t *)src + 64;
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
        dst = (uint8_t *)dst + 64;
    }
}

/**
 * Copy 256-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

    while (n >= 256) {
        ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
        n -= 256;
        ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
        ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
        ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
        ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
        ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
        ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
        ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
        src = (const uint8_t *)src + 256;
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
        _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
        dst = (uint8_t *)dst + 256;
    }
}
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst;
    size_t dstofss;
    size_t bits;

    /**
     * Copy less than 16 bytes
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }

    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
        return ret;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
        if (n >= 64) {
            n -= 64;
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 64;
            dst = (uint8_t *)dst + 64;
        }
COPY_BLOCK_64_BACK31:
        if (n > 32) {
            rte_mov32((uint8_t *)dst, (const uint8_t *)src);
            rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
            return ret;
        }
        if (n > 0) {
            rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
        }
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes
     */
    dstofss = 32 - ((uintptr_t)dst & 0x1F);
    n -= dstofss;
    rte_mov32((uint8_t *)dst, (const uint8_t *)src);
    src = (const uint8_t *)src + dstofss;
    dst = (uint8_t *)dst + dstofss;

    /**
     * Copy 256-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
    bits = n;
    n = n & 255;
    bits -= n;
    src = (const uint8_t *)src + bits;
    dst = (uint8_t *)dst + bits;
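    /*
     * Worked example (illustrative numbers): if n == 1000 on entry to the
     * block copy, rte_mov256blocks() copies 768 bytes (three 256-byte
     * blocks).  Afterwards n & 255 == 232 and bits == 768, so the pointers
     * skip past the bytes already copied, leaving the 232-byte tail for the
     * 64-byte block loop and the trailing COPY_BLOCK_64_BACK31 code.
     */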
    /**
     * Copy 64-byte blocks.
     * Use copy block function for better instruction order control,
     * which is important when load is unaligned.
     */
    if (n >= 64) {
        rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
        bits = n;
        n = n & 63;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;
    }

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_64_BACK31;
}
#else /* RTE_MACHINE_CPUFLAG */

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
    __m128i xmm0;

    xmm0 = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
    rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
    rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
    rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
    rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
    rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
    rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
    rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
    rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
    rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
    rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
    rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
    rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
    rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
    rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
    rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
    rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
    rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
    rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
    rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
    rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
    rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
    rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
    rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
    rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}
/**
 * Macro for copying unaligned block from one location to another with constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
    int tmp; \
    while (len >= 128 + 16 - offset) { \
        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
        len -= 128; \
        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
        src = (const uint8_t *)src + 128; \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (uint8_t *)dst + 128; \
    } \
    tmp = len; \
    len = ((len - 16 + offset) & 127) + 16 - offset; \
    tmp -= len; \
    src = (const uint8_t *)src + tmp; \
    dst = (uint8_t *)dst + tmp; \
    if (len >= 32 + 16 - offset) { \
        while (len >= 32 + 16 - offset) { \
            xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
            len -= 32; \
            xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
            src = (const uint8_t *)src + 32; \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (uint8_t *)dst + 32; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 31) + 16 - offset; \
        tmp -= len; \
        src = (const uint8_t *)src + tmp; \
        dst = (uint8_t *)dst + tmp; \
    } \
})
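/*
 * Worked example (illustrative, offset = 5): when invoked from rte_memcpy()
 * below, src & 0x0F == offset, so src - 5 is 16-byte aligned.  xmm0 then
 * holds bytes src[-5..10] and xmm1 holds bytes src[11..26], and
 * _mm_alignr_epi8(xmm1, xmm0, 5) shifts the concatenated 32-byte pair right
 * by 5 bytes, yielding exactly src[0..15]: each unaligned 16-byte store is
 * rebuilt from two loads taken at aligned addresses.
 */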
/**
 * Macro for copying unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use switch here because the aligning instruction requires immediate value for shift count.
 * Requirements:
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
    switch (offset) { \
    case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break; \
    case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break; \
    case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break; \
    case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break; \
    case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break; \
    case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break; \
    case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break; \
    case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break; \
    case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break; \
    case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break; \
    case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break; \
    case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break; \
    case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break; \
    case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break; \
    case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break; \
    default:; \
    } \
})
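/*
 * Usage sketch (illustrative): rte_memcpy() below first aligns dst, derives
 * srcofs = (uintptr_t)src & 0x0F, and then calls
 * MOVEUNALIGNED_LEFT47(dst, src, n, srcofs), so that each expansion of
 * MOVEUNALIGNED_LEFT47_IMM receives the offset as a compile-time constant,
 * which _mm_alignr_epi8() requires for its shift count.
 */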
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
    uintptr_t dstu = (uintptr_t)dst;
    uintptr_t srcu = (uintptr_t)src;
    void *ret = dst;
    size_t dstofss;
    size_t srcofs;

    /**
     * Copy less than 16 bytes
     */
    if (n < 16) {
        if (n & 0x01) {
            *(uint8_t *)dstu = *(const uint8_t *)srcu;
            srcu = (uintptr_t)((const uint8_t *)srcu + 1);
            dstu = (uintptr_t)((uint8_t *)dstu + 1);
        }
        if (n & 0x02) {
            *(uint16_t *)dstu = *(const uint16_t *)srcu;
            srcu = (uintptr_t)((const uint16_t *)srcu + 1);
            dstu = (uintptr_t)((uint16_t *)dstu + 1);
        }
        if (n & 0x04) {
            *(uint32_t *)dstu = *(const uint32_t *)srcu;
            srcu = (uintptr_t)((const uint32_t *)srcu + 1);
            dstu = (uintptr_t)((uint32_t *)dstu + 1);
        }
        if (n & 0x08)
            *(uint64_t *)dstu = *(const uint64_t *)srcu;
        return ret;
    }

    /**
     * Fast way when copy size doesn't exceed 512 bytes
     */
    if (n <= 32) {
        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 48) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 64) {
        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
        rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        return ret;
    }
    if (n <= 128) {
        goto COPY_BLOCK_128_BACK15;
    }
    if (n <= 512) {
        if (n >= 256) {
            n -= 256;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
            src = (const uint8_t *)src + 256;
            dst = (uint8_t *)dst + 256;
        }
COPY_BLOCK_255_BACK15:
        if (n >= 128) {
            n -= 128;
            rte_mov128((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 128;
            dst = (uint8_t *)dst + 128;
        }
COPY_BLOCK_128_BACK15:
        if (n >= 64) {
            n -= 64;
            rte_mov64((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 64;
            dst = (uint8_t *)dst + 64;
        }
COPY_BLOCK_64_BACK15:
        if (n >= 32) {
            n -= 32;
            rte_mov32((uint8_t *)dst, (const uint8_t *)src);
            src = (const uint8_t *)src + 32;
            dst = (uint8_t *)dst + 32;
        }
        if (n > 16) {
            rte_mov16((uint8_t *)dst, (const uint8_t *)src);
            rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
            return ret;
        }
        if (n > 0) {
            rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
        }
        return ret;
    }

    /**
     * Make store aligned when copy size exceeds 512 bytes,
     * and make sure the first 15 bytes are copied, because
     * unaligned copy functions require up to 15 bytes
     * backwards access.
     */
    dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16;
    n -= dstofss;
    rte_mov32((uint8_t *)dst, (const uint8_t *)src);
    src = (const uint8_t *)src + dstofss;
    dst = (uint8_t *)dst + dstofss;
    srcofs = ((uintptr_t)src & 0x0F);
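    /*
     * Worked example (illustrative numbers): if dst == 0x100009, then
     * dst & 0x0F == 9 and dstofss == 16 - 9 + 16 == 23.  rte_mov32()
     * copies the first 32 bytes, the pointers advance by 23 bytes, dst is
     * now 16-byte aligned, and at least 17 bytes have been consumed, so
     * reading up to 15 bytes behind the advanced src (as the unaligned
     * block copy does) stays inside the source buffer.  srcofs is the
     * remaining misalignment of src and selects the alignr shift count.
     */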
    /**
     * For aligned copy
     */
    if (srcofs == 0) {
        /**
         * Copy 256-byte blocks
         */
        for (; n >= 256; n -= 256) {
            rte_mov256((uint8_t *)dst, (const uint8_t *)src);
            dst = (uint8_t *)dst + 256;
            src = (const uint8_t *)src + 256;
        }

        /**
         * Copy whatever left
         */
        goto COPY_BLOCK_255_BACK15;
    }

    /**
     * For copy with unaligned load
     */
    MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

    /**
     * Copy whatever left
     */
    goto COPY_BLOCK_64_BACK15;
}
#endif /* RTE_MACHINE_CPUFLAG */

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */