1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2014 Intel Corporation
9 #include <rte_cycles.h>
10 #include <rte_launch.h>
11 #include <rte_pause.h>
19 * Measures performance of various operations using rdtsc
20 * * Empty ring dequeue
21 * * Enqueue/dequeue of bursts in 1 threads
22 * * Enqueue/dequeue of bursts in 2 threads
25 #define RING_NAME "RING_PERF"
26 #define RING_SIZE 4096
30 * the sizes to enqueue and dequeue in testing
31 * (marked volatile so they won't be seen as compile-time constants)
33 static const volatile unsigned bulk_sizes[] = { 8, 32 };
35 /* The ring structure used for tests */
36 static struct rte_ring *r;
42 static volatile unsigned lcore_count = 0;
44 /**** Functions to analyse our core mask to get cores for different tests ***/
47 get_two_hyperthreads(struct lcore_pair *lcp)
50 unsigned c1, c2, s1, s2;
51 RTE_LCORE_FOREACH(id1) {
52 /* inner loop just re-reads all id's. We could skip the first few
53 * elements, but since number of cores is small there is little point
55 RTE_LCORE_FOREACH(id2) {
58 c1 = lcore_config[id1].core_id;
59 c2 = lcore_config[id2].core_id;
60 s1 = lcore_config[id1].socket_id;
61 s2 = lcore_config[id2].socket_id;
62 if ((c1 == c2) && (s1 == s2)){
73 get_two_cores(struct lcore_pair *lcp)
76 unsigned c1, c2, s1, s2;
77 RTE_LCORE_FOREACH(id1) {
78 RTE_LCORE_FOREACH(id2) {
81 c1 = lcore_config[id1].core_id;
82 c2 = lcore_config[id2].core_id;
83 s1 = lcore_config[id1].socket_id;
84 s2 = lcore_config[id2].socket_id;
85 if ((c1 != c2) && (s1 == s2)){
96 get_two_sockets(struct lcore_pair *lcp)
100 RTE_LCORE_FOREACH(id1) {
101 RTE_LCORE_FOREACH(id2) {
104 s1 = lcore_config[id1].socket_id;
105 s2 = lcore_config[id2].socket_id;
116 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
118 test_empty_dequeue(void)
120 const unsigned iter_shift = 26;
121 const unsigned iterations = 1<<iter_shift;
123 void *burst[MAX_BURST];
125 const uint64_t sc_start = rte_rdtsc();
126 for (i = 0; i < iterations; i++)
127 rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
128 const uint64_t sc_end = rte_rdtsc();
130 const uint64_t mc_start = rte_rdtsc();
131 for (i = 0; i < iterations; i++)
132 rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
133 const uint64_t mc_end = rte_rdtsc();
135 printf("SC empty dequeue: %.2F\n",
136 (double)(sc_end-sc_start) / iterations);
137 printf("MC empty dequeue: %.2F\n",
138 (double)(mc_end-mc_start) / iterations);
/*
 * for the separate enqueue and dequeue threads they take in one param
 * and return two. Input = burst size, output = cycle average for sp/sc & mp/mc
 */
struct thread_params {
	unsigned size;     /* input value, the burst size */
	double spsc, mpmc; /* output value, the single or multi timings */
};
151 * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
152 * thread running dequeue_bulk function
155 enqueue_bulk(void *p)
157 const unsigned iter_shift = 23;
158 const unsigned iterations = 1<<iter_shift;
159 struct thread_params *params = p;
160 const unsigned size = params->size;
162 void *burst[MAX_BURST] = {0};
164 if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
165 while(lcore_count != 2)
168 const uint64_t sp_start = rte_rdtsc();
169 for (i = 0; i < iterations; i++)
170 while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
172 const uint64_t sp_end = rte_rdtsc();
174 const uint64_t mp_start = rte_rdtsc();
175 for (i = 0; i < iterations; i++)
176 while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
178 const uint64_t mp_end = rte_rdtsc();
180 params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
181 params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
186 * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
187 * thread running enqueue_bulk function
190 dequeue_bulk(void *p)
192 const unsigned iter_shift = 23;
193 const unsigned iterations = 1<<iter_shift;
194 struct thread_params *params = p;
195 const unsigned size = params->size;
197 void *burst[MAX_BURST] = {0};
199 if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
200 while(lcore_count != 2)
203 const uint64_t sc_start = rte_rdtsc();
204 for (i = 0; i < iterations; i++)
205 while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
207 const uint64_t sc_end = rte_rdtsc();
209 const uint64_t mc_start = rte_rdtsc();
210 for (i = 0; i < iterations; i++)
211 while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
213 const uint64_t mc_end = rte_rdtsc();
215 params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
216 params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
221 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
222 * used to measure ring perf between hyperthreads, cores and sockets.
225 run_on_core_pair(struct lcore_pair *cores,
226 lcore_function_t f1, lcore_function_t f2)
228 struct thread_params param1 = {0}, param2 = {0};
230 for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
232 param1.size = param2.size = bulk_sizes[i];
233 if (cores->c1 == rte_get_master_lcore()) {
234 rte_eal_remote_launch(f2, ¶m2, cores->c2);
236 rte_eal_wait_lcore(cores->c2);
238 rte_eal_remote_launch(f1, ¶m1, cores->c1);
239 rte_eal_remote_launch(f2, ¶m2, cores->c2);
240 rte_eal_wait_lcore(cores->c1);
241 rte_eal_wait_lcore(cores->c2);
243 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
244 param1.spsc + param2.spsc);
245 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
246 param1.mpmc + param2.mpmc);
251 * Test function that determines how long an enqueue + dequeue of a single item
252 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
255 test_single_enqueue_dequeue(void)
257 const unsigned iter_shift = 24;
258 const unsigned iterations = 1<<iter_shift;
262 const uint64_t sc_start = rte_rdtsc();
263 for (i = 0; i < iterations; i++) {
264 rte_ring_sp_enqueue(r, burst);
265 rte_ring_sc_dequeue(r, &burst);
267 const uint64_t sc_end = rte_rdtsc();
269 const uint64_t mc_start = rte_rdtsc();
270 for (i = 0; i < iterations; i++) {
271 rte_ring_mp_enqueue(r, burst);
272 rte_ring_mc_dequeue(r, &burst);
274 const uint64_t mc_end = rte_rdtsc();
276 printf("SP/SC single enq/dequeue: %"PRIu64"\n",
277 (sc_end-sc_start) >> iter_shift);
278 printf("MP/MC single enq/dequeue: %"PRIu64"\n",
279 (mc_end-mc_start) >> iter_shift);
283 * Test that does both enqueue and dequeue on a core using the burst() API calls
284 * instead of the bulk() calls used in other tests. Results should be the same
285 * as for the bulk function called on a single lcore.
288 test_burst_enqueue_dequeue(void)
290 const unsigned iter_shift = 23;
291 const unsigned iterations = 1<<iter_shift;
293 void *burst[MAX_BURST] = {0};
295 for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
296 const uint64_t sc_start = rte_rdtsc();
297 for (i = 0; i < iterations; i++) {
298 rte_ring_sp_enqueue_burst(r, burst,
299 bulk_sizes[sz], NULL);
300 rte_ring_sc_dequeue_burst(r, burst,
301 bulk_sizes[sz], NULL);
303 const uint64_t sc_end = rte_rdtsc();
305 const uint64_t mc_start = rte_rdtsc();
306 for (i = 0; i < iterations; i++) {
307 rte_ring_mp_enqueue_burst(r, burst,
308 bulk_sizes[sz], NULL);
309 rte_ring_mc_dequeue_burst(r, burst,
310 bulk_sizes[sz], NULL);
312 const uint64_t mc_end = rte_rdtsc();
314 uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
315 uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];
317 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
319 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
324 /* Times enqueue and dequeue on a single lcore */
326 test_bulk_enqueue_dequeue(void)
328 const unsigned iter_shift = 23;
329 const unsigned iterations = 1<<iter_shift;
331 void *burst[MAX_BURST] = {0};
333 for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
334 const uint64_t sc_start = rte_rdtsc();
335 for (i = 0; i < iterations; i++) {
336 rte_ring_sp_enqueue_bulk(r, burst,
337 bulk_sizes[sz], NULL);
338 rte_ring_sc_dequeue_bulk(r, burst,
339 bulk_sizes[sz], NULL);
341 const uint64_t sc_end = rte_rdtsc();
343 const uint64_t mc_start = rte_rdtsc();
344 for (i = 0; i < iterations; i++) {
345 rte_ring_mp_enqueue_bulk(r, burst,
346 bulk_sizes[sz], NULL);
347 rte_ring_mc_dequeue_bulk(r, burst,
348 bulk_sizes[sz], NULL);
350 const uint64_t mc_end = rte_rdtsc();
352 double sc_avg = ((double)(sc_end-sc_start) /
353 (iterations * bulk_sizes[sz]));
354 double mc_avg = ((double)(mc_end-mc_start) /
355 (iterations * bulk_sizes[sz]));
357 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
359 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
367 struct lcore_pair cores;
368 r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
369 if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
372 printf("### Testing single element and burst enq/deq ###\n");
373 test_single_enqueue_dequeue();
374 test_burst_enqueue_dequeue();
376 printf("\n### Testing empty dequeue ###\n");
377 test_empty_dequeue();
379 printf("\n### Testing using a single lcore ###\n");
380 test_bulk_enqueue_dequeue();
382 if (get_two_hyperthreads(&cores) == 0) {
383 printf("\n### Testing using two hyperthreads ###\n");
384 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
386 if (get_two_cores(&cores) == 0) {
387 printf("\n### Testing using two physical cores ###\n");
388 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
390 if (get_two_sockets(&cores) == 0) {
391 printf("\n### Testing using two NUMA nodes ###\n");
392 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
397 REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);