/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#include <stdio.h>
#include <inttypes.h>
#include <rte_ring.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_lcore.h>

#include "test.h"
/*
 * Ring performance test
 *
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts on one thread
 *  * Enqueue/dequeue of bursts on two threads
 */
#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
#define MAX_BURST 32 /* must cover the largest entry in bulk_sizes */
/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };
struct lcore_pair {
	unsigned c1, c2;
};

static volatile unsigned lcore_count = 0;
/**** Functions to analyse our core mask to get cores for different tests ***/
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		/* inner loop just re-reads all ids. We could skip the first few
		 * elements, but since the number of cores is small there is
		 * little point in doing so */
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			c1 = rte_lcore_to_cpu_id(id1);
			c2 = rte_lcore_to_cpu_id(id2);
			s1 = rte_lcore_to_socket_id(id1);
			s2 = rte_lcore_to_socket_id(id2);
			/* same physical core on the same socket */
			if ((c1 == c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}
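/* Find a pair of lcores on distinct physical cores of the same socket. */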
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			c1 = rte_lcore_to_cpu_id(id1);
			c2 = rte_lcore_to_cpu_id(id2);
			s1 = rte_lcore_to_socket_id(id1);
			s2 = rte_lcore_to_socket_id(id2);
			if ((c1 != c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}
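/* Find a pair of lcores on different sockets (NUMA nodes). */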
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			s1 = rte_lcore_to_socket_id(id1);
			s2 = rte_lcore_to_socket_id(id2);
			if (s1 != s2) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}
/* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
static void
test_empty_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 26;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst[MAX_BURST];

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t mc_end = rte_rdtsc();

	printf("SC empty dequeue: %.2F\n",
			(double)(sc_end-sc_start) / iterations);
	printf("MC empty dequeue: %.2F\n",
			(double)(mc_end-mc_start) / iterations);
}
/*
 * The separate enqueue and dequeue threads each take one parameter in
 * and return two. Input = burst size, output = cycle average for sp/sc & mp/mc
 */
struct thread_params {
	struct rte_ring *r;
	unsigned size;		/* input value, the burst size */
	double spsc, mpmc;	/* output value, the single or multi timings */
};
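/*
 * Each worker records its own per-element cycle averages in spsc/mpmc;
 * run_on_core_pair() sums the enqueue and dequeue sides when reporting.
 */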
/*
 * Function that uses rdtsc to measure timing for ring enqueue. Needs a paired
 * thread running the dequeue_bulk function.
 */
static int
enqueue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	struct rte_ring *r = params->r;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};
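	/*
	 * Rendezvous: both workers atomically increment lcore_count, and
	 * whichever arrives first spins until the count reaches two, so the
	 * timed loops on the two cores start together.
	 */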
#ifdef RTE_USE_C11_MEM_MODEL
	if (__atomic_add_fetch(&lcore_count, 1, __ATOMIC_RELAXED) != 2)
#else
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
#endif
		while (lcore_count != 2)
			rte_pause();
	const uint64_t sp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sp_end = rte_rdtsc();

	const uint64_t mp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mp_end = rte_rdtsc();
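	/* report cycles per element: total cycles over (iterations * size) */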
	params->spsc = ((double)(sp_end - sp_start))/(iterations * size);
	params->mpmc = ((double)(mp_end - mp_start))/(iterations * size);
	return 0;
}
/*
 * Function that uses rdtsc to measure timing for ring dequeue. Needs a paired
 * thread running the enqueue_bulk function.
 */
static int
dequeue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	struct rte_ring *r = params->r;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};
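	/* same rendezvous as in enqueue_bulk: wait until both workers are ready */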
#ifdef RTE_USE_C11_MEM_MODEL
	if (__atomic_add_fetch(&lcore_count, 1, __ATOMIC_RELAXED) != 2)
#else
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
#endif
		while (lcore_count != 2)
			rte_pause();
	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mc_end = rte_rdtsc();
	params->spsc = ((double)(sc_end - sc_start))/(iterations * size);
	params->mpmc = ((double)(mc_end - mc_start))/(iterations * size);
	return 0;
}
/*
 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
 * Used to measure ring performance between hyperthreads, cores and sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
		lcore_function_t f1, lcore_function_t f2)
{
	struct thread_params param1 = {0}, param2 = {0};
	unsigned i;

	for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
		lcore_count = 0;
		param1.size = param2.size = bulk_sizes[i];
		param1.r = param2.r = r;
		/* the master lcore cannot be remote-launched onto, so run its
		 * half of the work inline when it is one of the pair */
		if (cores->c1 == rte_get_master_lcore()) {
			rte_eal_remote_launch(f2, &param2, cores->c2);
			f1(&param1);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(f1, &param1, cores->c1);
			rte_eal_remote_launch(f2, &param2, cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}
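		/* each side reports cycles per element; summing the enqueue and
		 * dequeue halves gives the full transfer cost per element */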
		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
				param1.spsc + param2.spsc);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
				param1.mpmc + param2.mpmc);
	}
}
/*
 * Test function that determines how long an enqueue + dequeue of a single item
 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
 */
static void
test_single_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 24;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst = NULL;
	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_sp_enqueue(r, burst);
		rte_ring_sc_dequeue(r, &burst);
	}
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_mp_enqueue(r, burst);
		rte_ring_mc_dequeue(r, &burst);
	}
	const uint64_t mc_end = rte_rdtsc();
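	/* iterations == 1 << iter_shift, so the shift below is a cheap division
	 * giving the average cycles per enqueue+dequeue pair */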
	printf("SP/SC single enq/dequeue: %"PRIu64"\n",
			(sc_end-sc_start) >> iter_shift);
	printf("MP/MC single enq/dequeue: %"PRIu64"\n",
			(mc_end-mc_start) >> iter_shift);
}
/*
 * Test that does both enqueue and dequeue on a core using the burst() API calls
 * instead of the bulk() calls used in other tests. Results should be the same
 * as for the bulk function called on a single lcore.
 */
static void
test_burst_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};
	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_burst(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_burst(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_burst(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_burst(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();
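		/* shift converts the totals to per-iteration cycles; dividing by
		 * the burst size then gives cycles per element */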
		uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
		uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];

		printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n",
				bulk_sizes[sz], sc_avg);
		printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n",
				bulk_sizes[sz], mc_avg);
	}
}
/* Times enqueue and dequeue on a single lcore */
static void
test_bulk_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};
	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();
		double sc_avg = ((double)(sc_end-sc_start) /
				(iterations * bulk_sizes[sz]));
		double mc_avg = ((double)(mc_end-mc_start) /
				(iterations * bulk_sizes[sz]));

		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n",
				bulk_sizes[sz], sc_avg);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n",
				bulk_sizes[sz], mc_avg);
	}
}
static int
test_ring_perf(void)
{
	struct lcore_pair cores;
	struct rte_ring *r = NULL;

	r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
	if (r == NULL)
		return -1;

	printf("### Testing single element and burst enq/deq ###\n");
	test_single_enqueue_dequeue(r);
	test_burst_enqueue_dequeue(r);

	printf("\n### Testing empty dequeue ###\n");
	test_empty_dequeue(r);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_enqueue_dequeue(r);
	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}

	rte_ring_free(r);
	return 0;
}
REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);
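/*
 * Usage sketch (binary name and core list are illustrative and depend on the
 * build): launch the DPDK test app with lcores spanning the topologies of
 * interest, then run the registered command at the prompt:
 *
 *   ./dpdk-test -l 0-3
 *   RTE>>ring_perf_autotest
 */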