/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

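/*
 * Barrier: set to the number of participating lcores, then decremented by
 * each lcore, so that all timed loops begin at roughly the same time.
 */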
static uint32_t lcore_barrier;

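/* A pair of lcore IDs with a particular topological relationship. */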
struct lcore_pair {
        unsigned int c1;
        unsigned int c2;
};

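/* Find two lcores that are hyperthread siblings on the same physical core. */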
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] == core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

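/* Find two lcores on different physical cores within the same socket. */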
static int
get_two_cores(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] != core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

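/* Find two lcores on different sockets (NUMA nodes). */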
static int
get_two_sockets(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if (socket[0] != socket[1]) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
        unsigned int iterations = 100000000;
        void *objs[MAX_BURST];
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++)
                rte_stack_pop(s, objs, bulk_sizes[0]);

        uint64_t end = rte_rdtsc();

        printf("Stack empty pop: %.2F\n",
               (double)(end - start) / iterations);
}

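/* Per-lcore arguments for bulk_push_pop(); avg returns the measured result. */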
struct thread_args {
        struct rte_stack *s;
        unsigned int sz;
        double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
        unsigned int iterations = 1000000;
        struct thread_args *args = p;
        void *objs[MAX_BURST] = {0};
        unsigned int size, i;
        struct rte_stack *s;

        s = args->s;
        size = args->sz;

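        /* Signal readiness, then spin until every participating lcore arrives. */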
        __atomic_fetch_sub(&lcore_barrier, 1, __ATOMIC_RELAXED);
        rte_wait_until_equal_32(&lcore_barrier, 0, __ATOMIC_RELAXED);

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, objs, size);
                rte_stack_pop(s, objs, size);
        }

        uint64_t end = rte_rdtsc();

        args->avg = ((double)(end - start)) / (iterations * size);

        return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
                 lcore_function_t fn)
{
        struct thread_args args[2];
        unsigned int i;

        for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
                __atomic_store_n(&lcore_barrier, 2, __ATOMIC_RELAXED);

                args[0].sz = args[1].sz = bulk_sizes[i];
                args[0].s = args[1].s = s;

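                /* The main lcore cannot be remote-launched, so run fn() on it directly. */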
                if (cores->c1 == rte_get_main_lcore()) {
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        fn(&args[0]);
                        rte_eal_wait_lcore(cores->c2);
                } else {
                        rte_eal_remote_launch(fn, &args[0], cores->c1);
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        rte_eal_wait_lcore(cores->c1);
                        rte_eal_wait_lcore(cores->c2);
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
        }
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
        struct thread_args args[RTE_MAX_LCORE];
        unsigned int i;

        for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
                unsigned int lcore_id;
                int cnt = 0;
                double avg;

                __atomic_store_n(&lcore_barrier, n, __ATOMIC_RELAXED);

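                /* Launch on up to n - 1 workers; the main lcore runs the nth instance. */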
                RTE_LCORE_FOREACH_WORKER(lcore_id) {
                        if (++cnt >= n)
                                break;

                        args[lcore_id].s = s;
                        args[lcore_id].sz = bulk_sizes[i];

                        if (rte_eal_remote_launch(fn, &args[lcore_id],
                                                  lcore_id))
                                rte_panic("Failed to launch lcore %d\n",
                                          lcore_id);
                }

                lcore_id = rte_lcore_id();

                args[lcore_id].s = s;
                args[lcore_id].sz = bulk_sizes[i];

                fn(&args[lcore_id]);

                rte_eal_mp_wait_lcore();

                avg = args[rte_lcore_id()].avg;

                cnt = 0;
                RTE_LCORE_FOREACH_WORKER(lcore_id) {
                        if (++cnt >= n)
                                break;
                        avg += args[lcore_id].avg;
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], avg / n);
        }
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 16000000;
        void *obj = NULL;
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, &obj, 1);
                rte_stack_pop(s, &obj, 1);
        }

        uint64_t end = rte_rdtsc();

        printf("Average cycles per single object push/pop: %.2F\n",
               ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 8000000;
        void *objs[MAX_BURST];
        unsigned int sz, i;

        for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
                uint64_t start = rte_rdtsc();

                for (i = 0; i < iterations; i++) {
                        rte_stack_push(s, objs, bulk_sizes[sz]);
                        rte_stack_pop(s, objs, bulk_sizes[sz]);
                }

                uint64_t end = rte_rdtsc();

                double avg = ((double)(end - start) /
                              (iterations * bulk_sizes[sz]));

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[sz], avg);
        }
}

static int
__test_stack_perf(uint32_t flags)
{
        struct lcore_pair cores;
        struct rte_stack *s;

        __atomic_store_n(&lcore_barrier, 0, __ATOMIC_RELAXED);

        s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
        if (s == NULL) {
                printf("[%s():%u] failed to create a stack\n",
                       __func__, __LINE__);
                return -1;
        }

        printf("### Testing single element push/pop ###\n");
        test_single_push_pop(s);

        printf("\n### Testing empty pop ###\n");
        test_empty_pop(s);

        printf("\n### Testing using a single lcore ###\n");
        test_bulk_push_pop(s);

        if (get_two_hyperthreads(&cores) == 0) {
                printf("\n### Testing using two hyperthreads ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_cores(&cores) == 0) {
                printf("\n### Testing using two physical cores ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_sockets(&cores) == 0) {
                printf("\n### Testing using two NUMA nodes ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }

        printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
        run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

        rte_stack_free(s);
        return 0;
}

static int
test_stack_perf(void)
{
        return __test_stack_perf(0);
}

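/* The lock-free stack variant is only available on supported platforms. */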
static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
        return __test_stack_perf(RTE_STACK_F_LF);
#else
        return TEST_SKIPPED;
#endif
}

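/*
 * These tests can be run from the dpdk-test binary's interactive prompt, e.g.:
 *   RTE>> stack_perf_autotest
 *   RTE>> stack_lf_perf_autotest
 */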
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);