/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */
#include <stdio.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>
17 #define STACK_NAME "STACK_PERF"
19 #define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
21 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
24 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
27 static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};
29 static rte_atomic32_t lcore_barrier;
37 get_two_hyperthreads(struct lcore_pair *lcp)
39 unsigned int socket[2];
43 RTE_LCORE_FOREACH(id[0]) {
44 RTE_LCORE_FOREACH(id[1]) {
47 core[0] = rte_lcore_to_cpu_id(id[0]);
48 core[1] = rte_lcore_to_cpu_id(id[1]);
49 socket[0] = rte_lcore_to_socket_id(id[0]);
50 socket[1] = rte_lcore_to_socket_id(id[1]);
51 if ((core[0] == core[1]) && (socket[0] == socket[1])) {
63 get_two_cores(struct lcore_pair *lcp)
65 unsigned int socket[2];
69 RTE_LCORE_FOREACH(id[0]) {
70 RTE_LCORE_FOREACH(id[1]) {
73 core[0] = rte_lcore_to_cpu_id(id[0]);
74 core[1] = rte_lcore_to_cpu_id(id[1]);
75 socket[0] = rte_lcore_to_socket_id(id[0]);
76 socket[1] = rte_lcore_to_socket_id(id[1]);
77 if ((core[0] != core[1]) && (socket[0] == socket[1])) {
89 get_two_sockets(struct lcore_pair *lcp)
91 unsigned int socket[2];
94 RTE_LCORE_FOREACH(id[0]) {
95 RTE_LCORE_FOREACH(id[1]) {
98 socket[0] = rte_lcore_to_socket_id(id[0]);
99 socket[1] = rte_lcore_to_socket_id(id[1]);
100 if (socket[0] != socket[1]) {
111 /* Measure the cycle cost of popping an empty stack. */
113 test_empty_pop(struct rte_stack *s)
115 unsigned int iterations = 100000000;
116 void *objs[MAX_BURST];
119 uint64_t start = rte_rdtsc();
121 for (i = 0; i < iterations; i++)
122 rte_stack_pop(s, objs, bulk_sizes[0]);
124 uint64_t end = rte_rdtsc();
126 printf("Stack empty pop: %.2F\n",
127 (double)(end - start) / iterations);
136 /* Measure the average per-pointer cycle cost of stack push and pop */
138 bulk_push_pop(void *p)
140 unsigned int iterations = 1000000;
141 struct thread_args *args = p;
142 void *objs[MAX_BURST] = {0};
143 unsigned int size, i;
149 rte_atomic32_sub(&lcore_barrier, 1);
150 while (rte_atomic32_read(&lcore_barrier) != 0)
153 uint64_t start = rte_rdtsc();
155 for (i = 0; i < iterations; i++) {
156 rte_stack_push(s, objs, size);
157 rte_stack_pop(s, objs, size);
160 uint64_t end = rte_rdtsc();
162 args->avg = ((double)(end - start))/(iterations * size);
168 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
169 * perf when between hyperthread siblings, cores on the same socket, and cores
170 * on different sockets.
173 run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
176 struct thread_args args[2];
179 for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
180 rte_atomic32_set(&lcore_barrier, 2);
182 args[0].sz = args[1].sz = bulk_sizes[i];
183 args[0].s = args[1].s = s;
185 if (cores->c1 == rte_get_master_lcore()) {
186 rte_eal_remote_launch(fn, &args[1], cores->c2);
188 rte_eal_wait_lcore(cores->c2);
190 rte_eal_remote_launch(fn, &args[0], cores->c1);
191 rte_eal_remote_launch(fn, &args[1], cores->c2);
192 rte_eal_wait_lcore(cores->c1);
193 rte_eal_wait_lcore(cores->c2);
196 printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
197 bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
201 /* Run bulk_push_pop() simultaneously on 1+ cores. */
203 run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
205 struct thread_args args[RTE_MAX_LCORE];
208 for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
209 unsigned int lcore_id;
213 rte_atomic32_set(&lcore_barrier, n);
215 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
219 args[lcore_id].s = s;
220 args[lcore_id].sz = bulk_sizes[i];
222 if (rte_eal_remote_launch(fn, &args[lcore_id],
224 rte_panic("Failed to launch lcore %d\n",
228 lcore_id = rte_lcore_id();
230 args[lcore_id].s = s;
231 args[lcore_id].sz = bulk_sizes[i];
235 rte_eal_mp_wait_lcore();
237 avg = args[rte_lcore_id()].avg;
240 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
243 avg += args[lcore_id].avg;
246 printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
247 bulk_sizes[i], avg / n);
/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}
275 /* Measure the cycle cost of bulk pushing and popping on a single lcore. */
277 test_bulk_push_pop(struct rte_stack *s)
279 unsigned int iterations = 8000000;
280 void *objs[MAX_BURST];
283 for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
284 uint64_t start = rte_rdtsc();
286 for (i = 0; i < iterations; i++) {
287 rte_stack_push(s, objs, bulk_sizes[sz]);
288 rte_stack_pop(s, objs, bulk_sizes[sz]);
291 uint64_t end = rte_rdtsc();
293 double avg = ((double)(end - start) /
294 (iterations * bulk_sizes[sz]));
296 printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
297 bulk_sizes[sz], avg);
302 __test_stack_perf(uint32_t flags)
304 struct lcore_pair cores;
307 rte_atomic32_init(&lcore_barrier);
309 s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
311 printf("[%s():%u] failed to create a stack\n",
316 printf("### Testing single element push/pop ###\n");
317 test_single_push_pop(s);
319 printf("\n### Testing empty pop ###\n");
322 printf("\n### Testing using a single lcore ###\n");
323 test_bulk_push_pop(s);
325 if (get_two_hyperthreads(&cores) == 0) {
326 printf("\n### Testing using two hyperthreads ###\n");
327 run_on_core_pair(&cores, s, bulk_push_pop);
329 if (get_two_cores(&cores) == 0) {
330 printf("\n### Testing using two physical cores ###\n");
331 run_on_core_pair(&cores, s, bulk_push_pop);
333 if (get_two_sockets(&cores) == 0) {
334 printf("\n### Testing using two NUMA nodes ###\n");
335 run_on_core_pair(&cores, s, bulk_push_pop);
338 printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
339 run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
/* Entry point: measure the standard (lock-based) stack implementation. */
static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}
352 test_lf_stack_perf(void)
354 return __test_stack_perf(RTE_STACK_F_LF);
/* Register both variants with the DPDK test harness. */
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);