app/test/test_stack_perf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

static rte_atomic32_t lcore_barrier;

struct lcore_pair {
        unsigned int c1;
        unsigned int c2;
};

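/*
 * Find two lcores that are hyperthread siblings: the same physical core on
 * the same socket. Returns 0 on success, 1 if no such pair exists.
 */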
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] == core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

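/*
 * Find two lcores on different physical cores of the same socket.
 * Returns 0 on success, 1 if no such pair exists.
 */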
static int
get_two_cores(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] != core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

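/*
 * Find two lcores on different sockets (NUMA nodes). Returns 0 on success,
 * 1 if no such pair exists.
 */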
static int
get_two_sockets(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if (socket[0] != socket[1]) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
        unsigned int iterations = 100000000;
        void *objs[MAX_BURST];
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++)
                rte_stack_pop(s, objs, bulk_sizes[0]);

        uint64_t end = rte_rdtsc();

        printf("Stack empty pop: %.2F\n",
               (double)(end - start) / iterations);
}

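/* Per-lcore arguments for bulk_push_pop(), including its measured result. */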
struct thread_args {
        struct rte_stack *s;
        unsigned int sz;
        double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
        unsigned int iterations = 1000000;
        struct thread_args *args = p;
        void *objs[MAX_BURST] = {0};
        unsigned int size, i;
        struct rte_stack *s;

        s = args->s;
        size = args->sz;

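        /*
         * Wait for all participating lcores to reach this barrier so the
         * timed loops start at roughly the same time on every lcore.
         */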
        rte_atomic32_sub(&lcore_barrier, 1);
        while (rte_atomic32_read(&lcore_barrier) != 0)
                rte_pause();

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, objs, size);
                rte_stack_pop(s, objs, size);
        }

        uint64_t end = rte_rdtsc();

        args->avg = ((double)(end - start)) / (iterations * size);

        return 0;
}

/*
 * Run bulk_push_pop() simultaneously on a pair of cores to measure stack
 * performance between hyperthread siblings, between physical cores on the
 * same socket, and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
                 lcore_function_t fn)
{
        struct thread_args args[2];
        unsigned int i;

        for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
                rte_atomic32_set(&lcore_barrier, 2);

                args[0].sz = args[1].sz = bulk_sizes[i];
                args[0].s = args[1].s = s;

                if (cores->c1 == rte_get_master_lcore()) {
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        fn(&args[0]);
                        rte_eal_wait_lcore(cores->c2);
                } else {
                        rte_eal_remote_launch(fn, &args[0], cores->c1);
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        rte_eal_wait_lcore(cores->c1);
                        rte_eal_wait_lcore(cores->c2);
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
        }
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
        struct thread_args args[RTE_MAX_LCORE];
        unsigned int i;

        for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
                unsigned int lcore_id;
                int cnt = 0;
                double avg;

                rte_atomic32_set(&lcore_barrier, n);

                RTE_LCORE_FOREACH_SLAVE(lcore_id) {
                        if (++cnt >= n)
                                break;

                        args[lcore_id].s = s;
                        args[lcore_id].sz = bulk_sizes[i];

                        if (rte_eal_remote_launch(fn, &args[lcore_id],
                                                  lcore_id))
                                rte_panic("Failed to launch lcore %d\n",
                                          lcore_id);
                }

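                /* The calling lcore runs the measurement too, for n total. */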
                lcore_id = rte_lcore_id();

                args[lcore_id].s = s;
                args[lcore_id].sz = bulk_sizes[i];

                fn(&args[lcore_id]);

                rte_eal_mp_wait_lcore();

                avg = args[rte_lcore_id()].avg;

                cnt = 0;
                RTE_LCORE_FOREACH_SLAVE(lcore_id) {
                        if (++cnt >= n)
                                break;
                        avg += args[lcore_id].avg;
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], avg / n);
        }
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 16000000;
        void *obj = NULL;
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, &obj, 1);
                rte_stack_pop(s, &obj, 1);
        }

        uint64_t end = rte_rdtsc();

        printf("Average cycles per single object push/pop: %.2F\n",
               ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 8000000;
        void *objs[MAX_BURST];
        unsigned int sz, i;

        for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
                uint64_t start = rte_rdtsc();

                for (i = 0; i < iterations; i++) {
                        rte_stack_push(s, objs, bulk_sizes[sz]);
                        rte_stack_pop(s, objs, bulk_sizes[sz]);
                }

                uint64_t end = rte_rdtsc();

                double avg = ((double)(end - start) /
                              (iterations * bulk_sizes[sz]));

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[sz], avg);
        }
}

static int
__test_stack_perf(uint32_t flags)
{
        struct lcore_pair cores;
        struct rte_stack *s;

        rte_atomic32_init(&lcore_barrier);

        s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
        if (s == NULL) {
                printf("[%s():%u] failed to create a stack\n",
                       __func__, __LINE__);
                return -1;
        }

        printf("### Testing single element push/pop ###\n");
        test_single_push_pop(s);

        printf("\n### Testing empty pop ###\n");
        test_empty_pop(s);

        printf("\n### Testing using a single lcore ###\n");
        test_bulk_push_pop(s);

        if (get_two_hyperthreads(&cores) == 0) {
                printf("\n### Testing using two hyperthreads ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_cores(&cores) == 0) {
                printf("\n### Testing using two physical cores ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_sockets(&cores) == 0) {
                printf("\n### Testing using two NUMA nodes ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }

        printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
        run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

        rte_stack_free(s);
        return 0;
}

static int
test_stack_perf(void)
{
        return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
        return __test_stack_perf(RTE_STACK_F_LF);
}

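/*
 * These commands are typically run from the dpdk-test binary, e.g. by
 * entering "stack_perf_autotest" or "stack_lf_perf_autotest" at the
 * RTE>> prompt.
 */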
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);