91de8568e2ec7c4b9a96fd2da8a78beb1c805509
[dpdk.git] / test / test / test_memcpy_perf.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stdint.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38
39 #include <rte_common.h>
40 #include <rte_cycles.h>
41 #include <rte_random.h>
42 #include <rte_malloc.h>
43
44 #include <rte_memcpy.h>
45 #include <rte_cpuflags.h>
46
47 #include "test.h"
48
/*
 * Set this to the maximum buffer size you want to test. If it is 0, then the
 * values in the buf_sizes[] array below will be used.
 *
 * When it is non-zero, buf_sizes[] is instead declared with TEST_VALUE_RANGE
 * entries and perf_test() fills it with 0 .. TEST_VALUE_RANGE - 1 before any
 * test is run.
 */
#define TEST_VALUE_RANGE        0

/* List of buffer sizes (copy lengths in bytes) to test */
#if TEST_VALUE_RANGE == 0
static size_t buf_sizes[] = {
        1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
        129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
        449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
        2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
};
/*
 * Size of the small ("cached") buffers.
 * MUST be as large as largest packet size above.
 */
#define SMALL_BUFFER_SIZE       8192
#else /* TEST_VALUE_RANGE != 0 */
/* Contents are generated at run-time by perf_test(). */
static size_t buf_sizes[TEST_VALUE_RANGE];
#define SMALL_BUFFER_SIZE       TEST_VALUE_RANGE
#endif /* TEST_VALUE_RANGE == 0 */
69
70
/*
 * Arrays of this size are used for measuring uncached memory accesses by
 * picking a random location within the buffer. Make this smaller if there are
 * memory allocation errors.
 */
#define LARGE_BUFFER_SIZE       (100 * 1024 * 1024)

/*
 * How many times to run timing loop for performance tests.
 *
 * TEST_ITERATIONS is the total number of copies timed per measurement; they
 * are issued in batches of TEST_BATCH_SIZE copies, with the addresses for a
 * batch generated before the timer is started for that batch.
 */
#define TEST_ITERATIONS         1000000
#define TEST_BATCH_SIZE         100
81
/*
 * Data is aligned on this many bytes (power of 2).
 * 16 is only the initial value; init_alignment_unit() overwrites it at
 * run-time before any buffer is allocated.
 */
static uint8_t alignment_unit = 16;

/*
 * Pointers used in performance tests. The two large buffers are for uncached
 * access where random addresses within the buffer are used for each
 * memcpy. The two small buffers are for cached access.
 *
 * All four are allocated by init_buffers() and released by free_buffers().
 */
static uint8_t *large_buf_read, *large_buf_write;
static uint8_t *small_buf_read, *small_buf_write;
92
93 /* Initialise alignment_unit based on machine at run-time. */
94 static void
95 init_alignment_unit(void)
96 {
97 #ifdef CC_SUPPORT_AVX512
98         if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F)) {
99                 alignment_unit = 64;
100                 return;
101         }
102 #endif
103 #ifdef CC_SUPPORT_AVX2
104         if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2)) {
105                 alignment_unit = 32;
106                 return;
107         }
108 #endif
109         alignment_unit = 16;
110 }
111
112 /* Initialise data buffers. */
113 static int
114 init_buffers(void)
115 {
116         unsigned i;
117
118         init_alignment_unit();
119
120         large_buf_read = rte_malloc("memcpy",
121                                     LARGE_BUFFER_SIZE + alignment_unit,
122                                     alignment_unit);
123         if (large_buf_read == NULL)
124                 goto error_large_buf_read;
125
126         large_buf_write = rte_malloc("memcpy",
127                                      LARGE_BUFFER_SIZE + alignment_unit,
128                                      alignment_unit);
129         if (large_buf_write == NULL)
130                 goto error_large_buf_write;
131
132         small_buf_read = rte_malloc("memcpy",
133                                     SMALL_BUFFER_SIZE + alignment_unit,
134                                     alignment_unit);
135         if (small_buf_read == NULL)
136                 goto error_small_buf_read;
137
138         small_buf_write = rte_malloc("memcpy",
139                                      SMALL_BUFFER_SIZE + alignment_unit,
140                                      alignment_unit);
141         if (small_buf_write == NULL)
142                 goto error_small_buf_write;
143
144         for (i = 0; i < LARGE_BUFFER_SIZE; i++)
145                 large_buf_read[i] = rte_rand();
146         for (i = 0; i < SMALL_BUFFER_SIZE; i++)
147                 small_buf_read[i] = rte_rand();
148
149         return 0;
150
151 error_small_buf_write:
152         rte_free(small_buf_read);
153 error_small_buf_read:
154         rte_free(large_buf_write);
155 error_large_buf_write:
156         rte_free(large_buf_read);
157 error_large_buf_read:
158         printf("ERROR: not enough memory\n");
159         return -1;
160 }
161
162 /* Cleanup data buffers */
163 static void
164 free_buffers(void)
165 {
166         rte_free(large_buf_read);
167         rte_free(large_buf_write);
168         rte_free(small_buf_read);
169         rte_free(small_buf_write);
170 }
171
172 /*
173  * Get a random offset into large array, with enough space needed to perform
174  * max copy size. Offset is aligned, uoffset is used for unalignment setting.
175  */
176 static inline size_t
177 get_rand_offset(size_t uoffset)
178 {
179         return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
180                         ~(alignment_unit - 1)) + uoffset;
181 }
182
183 /* Fill in source and destination addresses. */
184 static inline void
185 fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
186                                  size_t *src_addr, int is_src_cached, size_t src_uoffset)
187 {
188         unsigned int i;
189
190         for (i = 0; i < TEST_BATCH_SIZE; i++) {
191                 dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
192                 src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
193         }
194 }
195
196 /*
197  * WORKAROUND: For some reason the first test doing an uncached write
198  * takes a very long time (~25 times longer than is expected). So we do
199  * it once without timing.
200  */
201 static void
202 do_uncached_write(uint8_t *dst, int is_dst_cached,
203                                   const uint8_t *src, int is_src_cached, size_t size)
204 {
205         unsigned i, j;
206         size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];
207
208         for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
209                 fill_addr_arrays(dst_addrs, is_dst_cached, 0,
210                                                  src_addrs, is_src_cached, 0);
211                 for (j = 0; j < TEST_BATCH_SIZE; j++) {
212                         rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
213                 }
214         }
215 }
216
/*
 * Run a single memcpy performance test. This is a macro to ensure that if
 * the "size" parameter is a constant it won't be converted to a variable.
 *
 * The same workload is timed twice: first with rte_memcpy(), then with
 * memcpy(). For each batch the offsets are generated before the timer is
 * started, so only the copies themselves are measured. The average number of
 * ticks per copy is printed for both, as "<rte_memcpy> -<memcpy>".
 *
 * Parameters:
 *   dst, src                      base pointers of the buffers to copy
 *                                 to / from
 *   is_dst_cached, is_src_cached  non-zero: always use the same offset;
 *                                 zero: use a random offset for every copy
 *   dst_uoffset, src_uoffset      unalignment added to each offset
 *   size                          number of bytes per copy; deliberately not
 *                                 stored in a local (see above)
 *
 * Macro arguments are parenthesised where they are used in expressions, and
 * the locals carry an spt_ prefix so that they cannot capture identifiers
 * appearing in the caller's argument expressions.
 */
#define SINGLE_PERF_TEST(dst, is_dst_cached, dst_uoffset,                   \
                         src, is_src_cached, src_uoffset, size)             \
do {                                                                        \
    unsigned int spt_iter, spt_t;                                           \
    size_t spt_dst_addrs[TEST_BATCH_SIZE];                                  \
    size_t spt_src_addrs[TEST_BATCH_SIZE];                                  \
    uint64_t spt_start;                                                     \
    uint64_t spt_ticks_rte = 0;                                             \
    uint64_t spt_ticks_memcpy = 0;                                          \
    /* Pass 1: rte_memcpy() */                                              \
    for (spt_iter = 0; spt_iter < (TEST_ITERATIONS / TEST_BATCH_SIZE);      \
         spt_iter++) {                                                      \
        fill_addr_arrays(spt_dst_addrs, (is_dst_cached), (dst_uoffset),     \
                         spt_src_addrs, (is_src_cached), (src_uoffset));    \
        spt_start = rte_rdtsc();                                            \
        for (spt_t = 0; spt_t < TEST_BATCH_SIZE; spt_t++)                   \
            rte_memcpy((dst) + spt_dst_addrs[spt_t],                        \
                       (src) + spt_src_addrs[spt_t], (size));               \
        spt_ticks_rte += rte_rdtsc() - spt_start;                           \
    }                                                                       \
    /* Pass 2: memcpy(), same workload shape */                             \
    for (spt_iter = 0; spt_iter < (TEST_ITERATIONS / TEST_BATCH_SIZE);      \
         spt_iter++) {                                                      \
        fill_addr_arrays(spt_dst_addrs, (is_dst_cached), (dst_uoffset),     \
                         spt_src_addrs, (is_src_cached), (src_uoffset));    \
        spt_start = rte_rdtsc();                                            \
        for (spt_t = 0; spt_t < TEST_BATCH_SIZE; spt_t++)                   \
            memcpy((dst) + spt_dst_addrs[spt_t],                            \
                   (src) + spt_src_addrs[spt_t], (size));                   \
        spt_ticks_memcpy += rte_rdtsc() - spt_start;                        \
    }                                                                       \
    printf("%8.0f -",  (double)spt_ticks_rte / TEST_ITERATIONS);            \
    printf("%5.0f",  (double)spt_ticks_memcpy / TEST_ITERATIONS);           \
} while (0)
247
/*
 * Run aligned memcpy tests for each cached/uncached permutation.
 *
 * Prints one result row for copy length n: the length (prefixed with 'C'
 * when n is a compile-time constant), followed by the four measurements in
 * the order cache to cache, cache to mem, mem to cache, mem to mem. Both
 * unalignment offsets are 0.
 *
 * n is parenthesised wherever it is used so that a compound expression is
 * cast and forwarded as a whole.
 */
#define ALL_PERF_TESTS_FOR_SIZE(n)                                       \
do {                                                                     \
    if (__builtin_constant_p(n))                                         \
        printf("\nC%6u", (unsigned)(n));                                 \
    else                                                                 \
        printf("\n%7u", (unsigned)(n));                                  \
    SINGLE_PERF_TEST(small_buf_write, 1, 0, small_buf_read, 1, 0, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 0, small_buf_read, 1, 0, (n));  \
    SINGLE_PERF_TEST(small_buf_write, 1, 0, large_buf_read, 0, 0, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 0, large_buf_read, 0, 0, (n));  \
} while (0)
260
/*
 * Run unaligned memcpy tests for each cached/uncached permutation.
 *
 * Prints one result row for copy length n: the length (prefixed with 'C'
 * when n is a compile-time constant), followed by the four measurements in
 * the order cache to cache, cache to mem, mem to cache, mem to mem. The
 * destination is offset by 1 byte and the source by 5 bytes.
 *
 * n is parenthesised wherever it is used so that a compound expression is
 * cast and forwarded as a whole.
 */
#define ALL_PERF_TESTS_FOR_SIZE_UNALIGNED(n)                             \
do {                                                                     \
    if (__builtin_constant_p(n))                                         \
        printf("\nC%6u", (unsigned)(n));                                 \
    else                                                                 \
        printf("\n%7u", (unsigned)(n));                                  \
    SINGLE_PERF_TEST(small_buf_write, 1, 1, small_buf_read, 1, 5, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 1, small_buf_read, 1, 5, (n));  \
    SINGLE_PERF_TEST(small_buf_write, 1, 1, large_buf_read, 0, 5, (n));  \
    SINGLE_PERF_TEST(large_buf_write, 0, 1, large_buf_read, 0, 5, (n));  \
} while (0)
273
/*
 * Run memcpy tests for a fixed list of constant lengths.
 *
 * TEST_CONSTANT must be defined by the user of this macro; it is invoked
 * once per literal length, in the order listed.
 */
#define ALL_PERF_TEST_FOR_CONSTANT                                      \
do {                                                                    \
    TEST_CONSTANT(6U);                                                  \
    TEST_CONSTANT(64U);                                                 \
    TEST_CONSTANT(128U);                                                \
    TEST_CONSTANT(192U);                                                \
    TEST_CONSTANT(256U);                                                \
    TEST_CONSTANT(512U);                                                \
    TEST_CONSTANT(768U);                                                \
    TEST_CONSTANT(1024U);                                               \
    TEST_CONSTANT(1536U);                                               \
} while (0)
281
282 /* Run all memcpy tests for aligned constant cases */
283 static inline void
284 perf_test_constant_aligned(void)
285 {
286 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE
287         ALL_PERF_TEST_FOR_CONSTANT;
288 #undef TEST_CONSTANT
289 }
290
291 /* Run all memcpy tests for unaligned constant cases */
292 static inline void
293 perf_test_constant_unaligned(void)
294 {
295 #define TEST_CONSTANT ALL_PERF_TESTS_FOR_SIZE_UNALIGNED
296         ALL_PERF_TEST_FOR_CONSTANT;
297 #undef TEST_CONSTANT
298 }
299
300 /* Run all memcpy tests for aligned variable cases */
301 static inline void
302 perf_test_variable_aligned(void)
303 {
304         unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
305         unsigned i;
306         for (i = 0; i < n; i++) {
307                 ALL_PERF_TESTS_FOR_SIZE((size_t)buf_sizes[i]);
308         }
309 }
310
311 /* Run all memcpy tests for unaligned variable cases */
312 static inline void
313 perf_test_variable_unaligned(void)
314 {
315         unsigned n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);
316         unsigned i;
317         for (i = 0; i < n; i++) {
318                 ALL_PERF_TESTS_FOR_SIZE_UNALIGNED((size_t)buf_sizes[i]);
319         }
320 }
321
322 /* Run all memcpy tests */
323 static int
324 perf_test(void)
325 {
326         int ret;
327
328         ret = init_buffers();
329         if (ret != 0)
330                 return ret;
331
332 #if TEST_VALUE_RANGE != 0
333         /* Set up buf_sizes array, if required */
334         unsigned i;
335         for (i = 0; i < TEST_VALUE_RANGE; i++)
336                 buf_sizes[i] = i;
337 #endif
338
339         /* See function comment */
340         do_uncached_write(large_buf_write, 0, small_buf_read, 1, SMALL_BUFFER_SIZE);
341
342         printf("\n** rte_memcpy() - memcpy perf. tests (C = compile-time constant) **\n"
343                    "======= ============== ============== ============== ==============\n"
344                    "   Size Cache to cache   Cache to mem   Mem to cache     Mem to mem\n"
345                    "(bytes)        (ticks)        (ticks)        (ticks)        (ticks)\n"
346                    "------- -------------- -------------- -------------- --------------");
347
348         printf("\n========================= %2dB aligned ============================",
349                 alignment_unit);
350         /* Do aligned tests where size is a variable */
351         perf_test_variable_aligned();
352         printf("\n------- -------------- -------------- -------------- --------------");
353         /* Do aligned tests where size is a compile-time constant */
354         perf_test_constant_aligned();
355         printf("\n=========================== Unaligned =============================");
356         /* Do unaligned tests where size is a variable */
357         perf_test_variable_unaligned();
358         printf("\n------- -------------- -------------- -------------- --------------");
359         /* Do unaligned tests where size is a compile-time constant */
360         perf_test_constant_unaligned();
361         printf("\n======= ============== ============== ============== ==============\n\n");
362
363         free_buffers();
364
365         return 0;
366 }
367
/*
 * Entry point handed to REGISTER_TEST_COMMAND below.
 *
 * Runs perf_test() and normalises its result: 0 on success, -1 on any
 * failure.
 */
static int
test_memcpy_perf(void)
{
        return (perf_test() != 0) ? -1 : 0;
}

REGISTER_TEST_COMMAND(memcpy_perf_autotest, test_memcpy_perf);