app/test/test_stack_perf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */


#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

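/* Barrier the worker lcores spin on so all threads start timing together. */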
static rte_atomic32_t lcore_barrier;

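/* A pair of lcore ids used by the dual-core performance tests. */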
struct lcore_pair {
        unsigned int c1;
        unsigned int c2;
};

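/* Find two lcores that are hyperthread siblings: same physical core and same socket. */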
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] == core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

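/* Find two lcores on distinct physical cores of the same socket. */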
static int
get_two_cores(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] != core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

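/* Find two lcores located on different NUMA sockets. */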
static int
get_two_sockets(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if (socket[0] != socket[1]) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
        unsigned int iterations = 100000000;
        void *objs[MAX_BURST];
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++)
                rte_stack_pop(s, objs, bulk_sizes[0]);

        uint64_t end = rte_rdtsc();

        printf("Stack empty pop: %.2F\n",
               (double)(end - start) / iterations);
}

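/* Per-lcore arguments and result storage for bulk_push_pop(). */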
struct thread_args {
        struct rte_stack *s;
        unsigned int sz;
        double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
        unsigned int iterations = 1000000;
        struct thread_args *args = p;
        void *objs[MAX_BURST] = {0};
        unsigned int size, i;
        struct rte_stack *s;

        s = args->s;
        size = args->sz;

        rte_atomic32_sub(&lcore_barrier, 1);
        while (rte_atomic32_read(&lcore_barrier) != 0)
                rte_pause();

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, objs, size);
                rte_stack_pop(s, objs, size);
        }

        uint64_t end = rte_rdtsc();

        args->avg = ((double)(end - start))/(iterations * size);

        return 0;
}

/*
 * Run bulk_push_pop() simultaneously on a pair of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
                 lcore_function_t fn)
{
        struct thread_args args[2];
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
                rte_atomic32_set(&lcore_barrier, 2);

                args[0].sz = args[1].sz = bulk_sizes[i];
                args[0].s = args[1].s = s;

                if (cores->c1 == rte_get_master_lcore()) {
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        fn(&args[0]);
                        rte_eal_wait_lcore(cores->c2);
                } else {
                        rte_eal_remote_launch(fn, &args[0], cores->c1);
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        rte_eal_wait_lcore(cores->c1);
                        rte_eal_wait_lcore(cores->c2);
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
        }
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
        struct thread_args args[RTE_MAX_LCORE];
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
                unsigned int lcore_id;
                int cnt = 0;
                double avg;

                rte_atomic32_set(&lcore_barrier, n);

                RTE_LCORE_FOREACH_SLAVE(lcore_id) {
                        if (++cnt >= n)
                                break;

                        args[lcore_id].s = s;
                        args[lcore_id].sz = bulk_sizes[i];

                        if (rte_eal_remote_launch(fn, &args[lcore_id],
                                                  lcore_id))
                                rte_panic("Failed to launch lcore %d\n",
                                          lcore_id);
                }

                lcore_id = rte_lcore_id();

                args[lcore_id].s = s;
                args[lcore_id].sz = bulk_sizes[i];

                fn(&args[lcore_id]);

                rte_eal_mp_wait_lcore();

                avg = args[rte_lcore_id()].avg;

                cnt = 0;
                RTE_LCORE_FOREACH_SLAVE(lcore_id) {
                        if (++cnt >= n)
                                break;
                        avg += args[lcore_id].avg;
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], avg / n);
        }
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 16000000;
        void *obj = NULL;
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, &obj, 1);
                rte_stack_pop(s, &obj, 1);
        }

        uint64_t end = rte_rdtsc();

        printf("Average cycles per single object push/pop: %.2F\n",
               ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 8000000;
        void *objs[MAX_BURST];
        unsigned int sz, i;

        for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
                uint64_t start = rte_rdtsc();

                for (i = 0; i < iterations; i++) {
                        rte_stack_push(s, objs, bulk_sizes[sz]);
                        rte_stack_pop(s, objs, bulk_sizes[sz]);
                }

                uint64_t end = rte_rdtsc();

                double avg = ((double)(end - start) /
                              (iterations * bulk_sizes[sz]));

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[sz], avg);
        }
}

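/* Create a stack with the given flags and run the full set of perf tests on it. */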
static int
__test_stack_perf(uint32_t flags)
{
        struct lcore_pair cores;
        struct rte_stack *s;

        rte_atomic32_init(&lcore_barrier);

        s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
        if (s == NULL) {
                printf("[%s():%u] failed to create a stack\n",
                       __func__, __LINE__);
                return -1;
        }

        printf("### Testing single element push/pop ###\n");
        test_single_push_pop(s);

        printf("\n### Testing empty pop ###\n");
        test_empty_pop(s);

        printf("\n### Testing using a single lcore ###\n");
        test_bulk_push_pop(s);

        if (get_two_hyperthreads(&cores) == 0) {
                printf("\n### Testing using two hyperthreads ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_cores(&cores) == 0) {
                printf("\n### Testing using two physical cores ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_sockets(&cores) == 0) {
                printf("\n### Testing using two NUMA nodes ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }

        printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
        run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

        rte_stack_free(s);
        return 0;
}

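/* Measure the performance of the standard (lock-based) stack. */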
static int
test_stack_perf(void)
{
        return __test_stack_perf(0);
}

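/* Measure the performance of the lock-free stack. */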
static int
test_lf_stack_perf(void)
{
        return __test_stack_perf(RTE_STACK_F_LF);
}

REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);