/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */
#include <stdio.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"
#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

/* Counted down by each measurement lcore so the timed loops start together. */
static rte_atomic32_t lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};
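/*
 * The get_two_*() helpers below scan the lcore layout for a pair of lcores
 * with a given topological relationship (hyperthread siblings, distinct
 * physical cores on one socket, or cores on different sockets). Each returns
 * 0 and fills *lcp on success, or 1 if no such pair exists.
 */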
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2], core[2], id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2], core[2], id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2], id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}
/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}
struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s = args->s;

	size = args->sz;

	/* Wait for all measurement lcores to reach this point before timing. */
	rte_atomic32_sub(&lcore_barrier, 1);
	while (rte_atomic32_read(&lcore_barrier) != 0)
		rte_pause();

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}
/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * perf between hyperthread siblings, between physical cores on the same
 * socket, and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		rte_atomic32_set(&lcore_barrier, 2);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

		if (cores->c1 == rte_get_main_lcore()) {
			/* The main lcore runs its half of the test inline. */
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}
/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic32_set(&lcore_barrier, n);

		/* Launch the test on up to n - 1 worker lcores. */
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];
			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		/* The main lcore participates as the n-th measurement thread. */
		lcore_id = rte_lcore_id();
		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];
		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}
/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}
/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}
static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic32_init(&lcore_barrier);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);
	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);

	return 0;
}
static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
	return __test_stack_perf(RTE_STACK_F_LF);
#else
	return TEST_SKIPPED;
#endif
}
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);
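/*
 * Usage sketch (assumes the standard dpdk-test runner): once built into the
 * dpdk-test binary, these benchmarks can be invoked from its interactive
 * prompt, e.g.:
 *
 *   RTE>>stack_perf_autotest
 *   RTE>>stack_lf_perf_autotest
 */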