2 * SPDX-License-Identifier: BSD-3-Clause
3 * Copyright (C) IBM Corporation 2014.
6 #ifndef _RTE_MEMCPY_PPC_64_H_
7 #define _RTE_MEMCPY_PPC_64_H_
11 /* To include altivec.h, the GCC version must be >= 4.8. */
18 #include "generic/rte_memcpy.h"
/*
 * Copy one 16-byte vector from src to dst: a single vec_vsx_ld at byte
 * offset 0 feeds a single vec_vsx_st at byte offset 0.
 * NOTE(review): the return-type line and the braces are not visible in
 * this view (embedded numbering skips 20, 22 and 24).
 */
21 rte_mov16(uint8_t *dst, const uint8_t *src)
23 vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
/*
 * Copy 32 bytes from src to dst as two vector load/store pairs, at byte
 * offsets 0 and 16.
 */
27 rte_mov32(uint8_t *dst, const uint8_t *src)
29 vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
30 vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
/*
 * Copy 48 bytes from src to dst as three vector load/store pairs, at byte
 * offsets 0, 16 and 32.
 */
34 rte_mov48(uint8_t *dst, const uint8_t *src)
36 vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
37 vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
38 vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
/*
 * Copy 64 bytes from src to dst as four vector load/store pairs, at byte
 * offsets 0, 16, 32 and 48.
 */
42 rte_mov64(uint8_t *dst, const uint8_t *src)
44 vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
45 vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
46 vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
47 vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
/*
 * Copy 128 bytes from src to dst as eight vector load/store pairs, at byte
 * offsets 0, 16, ..., 112. Each load is immediately followed by the store
 * to the same offset in dst.
 */
51 rte_mov128(uint8_t *dst, const uint8_t *src)
53 vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
54 vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
55 vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
56 vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
57 vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
58 vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
59 vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
60 vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
/*
 * Copy 256 bytes from src to dst in terms of rte_mov128.
 * NOTE(review): only the copy of the second 128-byte half is visible in
 * this view; the embedded numbering skips from 64 to 67, so the statement
 * handling the first half (presumably rte_mov128(dst, src)) is elided --
 * confirm against the full file.
 */
64 rte_mov256(uint8_t *dst, const uint8_t *src)
/* Second half: bytes 128..255. */
67 rte_mov128(dst + 128, src + 128);
/*
 * rte_memcpy(dst, src, n): copy dispatcher.
 * When n is a compile-time constant (as reported by __builtin_constant_p)
 * the copy is delegated to memcpy(); otherwise rte_memcpy_func() is called.
 * NOTE(review): the line between 70 and 72 is not visible in this view; the
 * trailing "})" suggests a statement-expression wrapper -- confirm.
 */
70 #define rte_memcpy(dst, src, n) \
72 (__builtin_constant_p(n)) ? \
73 memcpy((dst), (src), (n)) : \
74 rte_memcpy_func((dst), (src), (n)); })
/*
 * Copy n bytes from src to dst using a size-dependent mix of scalar moves
 * and the rte_mov16/32/64/256 vector helpers.
 *
 * NOTE(review): this view omits lines (the embedded numbering skips, e.g.
 * 77 -> 81, 86 -> 89, 99 -> 103), so the return type, the branch
 * conditions, the case labels and the return statements are not visible
 * here. Comments below describe only the visible statements; guesses about
 * elided guards are marked as such.
 */
77 rte_memcpy_func(void *dst, const void *src, size_t n)
81 /* Fewer than 16 bytes cannot be copied with the 16-byte vector moves, so do it manually. */
/*
 * Scalar 1-byte copy, then advance both pointers by 1.
 * (Presumably guarded by a test on a bit of n -- guard line not visible.)
 */
84 *(uint8_t *)dst = *(const uint8_t *)src;
85 dst = (uint8_t *)dst + 1;
86 src = (const uint8_t *)src + 1;
/*
 * Scalar 2-byte copy, then advance both pointers by 2.
 * NOTE(review): this and the wider copies below access memory through
 * casted pointers at whatever alignment dst/src have -- confirm that is
 * acceptable for the target.
 */
89 *(uint16_t *)dst = *(const uint16_t *)src;
90 dst = (uint16_t *)dst + 1;
91 src = (const uint16_t *)src + 1;
/* Scalar 4-byte copy, then advance both pointers by 4. */
94 *(uint32_t *)dst = *(const uint32_t *)src;
95 dst = (uint32_t *)dst + 1;
96 src = (const uint32_t *)src + 1;
/* Scalar 8-byte copy; no pointer advance follows in the visible lines. */
99 *(uint64_t *)dst = *(const uint64_t *)src;
103 /* Special fast cases for <= 128 bytes */
/*
 * Two 16-byte moves: one at the start of the buffers and one ending exactly
 * at dst + n / src + n. The two ranges overlap whenever n < 32, so no
 * per-byte tail handling is needed. (Size guard not visible.)
 */
105 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
106 rte_mov16((uint8_t *)dst - 16 + n,
107 (const uint8_t *)src - 16 + n);
/* Same head + tail pattern with 32-byte moves. */
112 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
113 rte_mov32((uint8_t *)dst - 32 + n,
114 (const uint8_t *)src - 32 + n);
/* Same head + tail pattern with 64-byte moves. */
119 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
120 rte_mov64((uint8_t *)dst - 64 + n,
121 (const uint8_t *)src - 64 + n);
126 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
127 * copies was found to be faster than doing 128 and 32 byte copies as
130 for ( ; n >= 256; n -= 256) {
131 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
132 dst = (uint8_t *)dst + 256;
133 src = (const uint8_t *)src + 256;
137 * We split the remaining bytes (which will be less than 256) into
138 * 64byte (2^6) chunks.
139 * Using incrementing integers in the case labels of a switch statement
140 * encourages the compiler to use a jump table. To get incrementing
141 * integers, we shift the 2 relevant bits to the LSB position to first
142 * get decrementing integers, and then subtract.
144 switch (3 - (n >> 6)) {
/*
 * Up to three 64-byte moves, each followed by a 64-byte pointer advance and
 * falling through to the next. (Case labels and any adjustment of n between
 * the move and the pointer advance are not visible in this view.)
 */
146 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
148 dst = (uint8_t *)dst + 64;
149 src = (const uint8_t *)src + 64; /* fallthrough */
151 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
153 dst = (uint8_t *)dst + 64;
154 src = (const uint8_t *)src + 64; /* fallthrough */
156 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
158 dst = (uint8_t *)dst + 64;
159 src = (const uint8_t *)src + 64; /* fallthrough */
165 * We split the remaining bytes (which will be less than 64) into
166 * 16byte (2^4) chunks, using the same switch structure as above.
168 switch (3 - (n >> 4)) {
/*
 * Up to three 16-byte moves, each followed by a 16-byte pointer advance and
 * falling through to the next. (Case labels not visible in this view.)
 */
170 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
172 dst = (uint8_t *)dst + 16;
173 src = (const uint8_t *)src + 16; /* fallthrough */
175 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
177 dst = (uint8_t *)dst + 16;
178 src = (const uint8_t *)src + 16; /* fallthrough */
180 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
182 dst = (uint8_t *)dst + 16;
183 src = (const uint8_t *)src + 16; /* fallthrough */
188 /* Copy any remaining bytes, without going beyond end of buffers */
/*
 * One 16-byte move positioned to end exactly at dst + n / src + n; it may
 * re-copy bytes already written by the preceding moves.
 */
190 rte_mov16((uint8_t *)dst - 16 + n,
191 (const uint8_t *)src - 16 + n);
199 #endif /* _RTE_MEMCPY_PPC_64_H_ */