/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#include <stdio.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_pause.h>
#include <rte_ring.h>

#include "test.h"
/*
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts in 1 threads
 *  * Enqueue/dequeue of bursts in 2 threads
 */
#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
/* Must cover the largest entry of bulk_sizes[] below, since the burst
 * scratch arrays in the test functions are sized by it. */
#define MAX_BURST 32

/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };

/* Pair of lcore ids returned by the get_two_* helpers and consumed by
 * run_on_core_pair(). */
struct lcore_pair {
	unsigned c1, c2;
};

/* Two-party barrier counter: each measurement thread increments it once,
 * then spins until it reaches 2, so both lcores start timing together. */
static volatile unsigned lcore_count = 0;
/**** Functions to analyse our core mask to get cores for different tests ***/
44 get_two_hyperthreads(struct lcore_pair *lcp)
47 unsigned c1, c2, s1, s2;
48 RTE_LCORE_FOREACH(id1) {
49 /* inner loop just re-reads all id's. We could skip the first few
50 * elements, but since number of cores is small there is little point
52 RTE_LCORE_FOREACH(id2) {
56 c1 = rte_lcore_to_cpu_id(id1);
57 c2 = rte_lcore_to_cpu_id(id2);
58 s1 = rte_lcore_to_socket_id(id1);
59 s2 = rte_lcore_to_socket_id(id2);
60 if ((c1 == c2) && (s1 == s2)){
71 get_two_cores(struct lcore_pair *lcp)
74 unsigned c1, c2, s1, s2;
75 RTE_LCORE_FOREACH(id1) {
76 RTE_LCORE_FOREACH(id2) {
80 c1 = rte_lcore_to_cpu_id(id1);
81 c2 = rte_lcore_to_cpu_id(id2);
82 s1 = rte_lcore_to_socket_id(id1);
83 s2 = rte_lcore_to_socket_id(id2);
84 if ((c1 != c2) && (s1 == s2)){
95 get_two_sockets(struct lcore_pair *lcp)
99 RTE_LCORE_FOREACH(id1) {
100 RTE_LCORE_FOREACH(id2) {
103 s1 = rte_lcore_to_socket_id(id1);
104 s2 = rte_lcore_to_socket_id(id2);
115 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
117 test_empty_dequeue(struct rte_ring *r)
119 const unsigned iter_shift = 26;
120 const unsigned iterations = 1<<iter_shift;
122 void *burst[MAX_BURST];
124 const uint64_t sc_start = rte_rdtsc();
125 for (i = 0; i < iterations; i++)
126 rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
127 const uint64_t sc_end = rte_rdtsc();
129 const uint64_t mc_start = rte_rdtsc();
130 for (i = 0; i < iterations; i++)
131 rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
132 const uint64_t mc_end = rte_rdtsc();
134 printf("SC empty dequeue: %.2F\n",
135 (double)(sc_end-sc_start) / iterations);
136 printf("MC empty dequeue: %.2F\n",
137 (double)(mc_end-mc_start) / iterations);
/*
 * for the separate enqueue and dequeue threads they take in one param
 * and return two. Input = burst size, output = cycle average for sp/sc & mp/mc
 */
struct thread_params {
	struct rte_ring *r;	/* ring under test, set by run_on_core_pair() */
	unsigned size;		/* input value, the burst size */
	double spsc, mpmc;	/* output value, the single or multi timings */
};
151 * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
152 * thread running dequeue_bulk function
155 enqueue_bulk(void *p)
157 const unsigned iter_shift = 23;
158 const unsigned iterations = 1<<iter_shift;
159 struct thread_params *params = p;
160 struct rte_ring *r = params->r;
161 const unsigned size = params->size;
163 void *burst[MAX_BURST] = {0};
165 if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
166 while(lcore_count != 2)
169 const uint64_t sp_start = rte_rdtsc();
170 for (i = 0; i < iterations; i++)
171 while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
173 const uint64_t sp_end = rte_rdtsc();
175 const uint64_t mp_start = rte_rdtsc();
176 for (i = 0; i < iterations; i++)
177 while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
179 const uint64_t mp_end = rte_rdtsc();
181 params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
182 params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
187 * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
188 * thread running enqueue_bulk function
191 dequeue_bulk(void *p)
193 const unsigned iter_shift = 23;
194 const unsigned iterations = 1<<iter_shift;
195 struct thread_params *params = p;
196 struct rte_ring *r = params->r;
197 const unsigned size = params->size;
199 void *burst[MAX_BURST] = {0};
201 if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
202 while(lcore_count != 2)
205 const uint64_t sc_start = rte_rdtsc();
206 for (i = 0; i < iterations; i++)
207 while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
209 const uint64_t sc_end = rte_rdtsc();
211 const uint64_t mc_start = rte_rdtsc();
212 for (i = 0; i < iterations; i++)
213 while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
215 const uint64_t mc_end = rte_rdtsc();
217 params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
218 params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
223 * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
224 * used to measure ring perf between hyperthreads, cores and sockets.
227 run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
228 lcore_function_t f1, lcore_function_t f2)
230 struct thread_params param1 = {0}, param2 = {0};
232 for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
234 param1.size = param2.size = bulk_sizes[i];
235 param1.r = param2.r = r;
236 if (cores->c1 == rte_get_master_lcore()) {
237 rte_eal_remote_launch(f2, ¶m2, cores->c2);
239 rte_eal_wait_lcore(cores->c2);
241 rte_eal_remote_launch(f1, ¶m1, cores->c1);
242 rte_eal_remote_launch(f2, ¶m2, cores->c2);
243 rte_eal_wait_lcore(cores->c1);
244 rte_eal_wait_lcore(cores->c2);
246 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
247 param1.spsc + param2.spsc);
248 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
249 param1.mpmc + param2.mpmc);
/*
 * Test function that determines how long an enqueue + dequeue of a single item
 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
 */
static void
test_single_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 24;
	const unsigned iterations = 1 << iter_shift;
	unsigned i = 0;
	void *burst = NULL;	/* single dummy element shuttled in and out */

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_sp_enqueue(r, burst);
		rte_ring_sc_dequeue(r, &burst);
	}
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_mp_enqueue(r, burst);
		rte_ring_mc_dequeue(r, &burst);
	}
	const uint64_t mc_end = rte_rdtsc();

	/* >> iter_shift is a cheap divide by iterations (a power of two) */
	printf("SP/SC single enq/dequeue: %"PRIu64"\n",
			(sc_end - sc_start) >> iter_shift);
	printf("MP/MC single enq/dequeue: %"PRIu64"\n",
			(mc_end - mc_start) >> iter_shift);
}
286 * Test that does both enqueue and dequeue on a core using the burst() API calls
287 * instead of the bulk() calls used in other tests. Results should be the same
288 * as for the bulk function called on a single lcore.
291 test_burst_enqueue_dequeue(struct rte_ring *r)
293 const unsigned iter_shift = 23;
294 const unsigned iterations = 1<<iter_shift;
296 void *burst[MAX_BURST] = {0};
298 for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
299 const uint64_t sc_start = rte_rdtsc();
300 for (i = 0; i < iterations; i++) {
301 rte_ring_sp_enqueue_burst(r, burst,
302 bulk_sizes[sz], NULL);
303 rte_ring_sc_dequeue_burst(r, burst,
304 bulk_sizes[sz], NULL);
306 const uint64_t sc_end = rte_rdtsc();
308 const uint64_t mc_start = rte_rdtsc();
309 for (i = 0; i < iterations; i++) {
310 rte_ring_mp_enqueue_burst(r, burst,
311 bulk_sizes[sz], NULL);
312 rte_ring_mc_dequeue_burst(r, burst,
313 bulk_sizes[sz], NULL);
315 const uint64_t mc_end = rte_rdtsc();
317 uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
318 uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];
320 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
322 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
327 /* Times enqueue and dequeue on a single lcore */
329 test_bulk_enqueue_dequeue(struct rte_ring *r)
331 const unsigned iter_shift = 23;
332 const unsigned iterations = 1<<iter_shift;
334 void *burst[MAX_BURST] = {0};
336 for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
337 const uint64_t sc_start = rte_rdtsc();
338 for (i = 0; i < iterations; i++) {
339 rte_ring_sp_enqueue_bulk(r, burst,
340 bulk_sizes[sz], NULL);
341 rte_ring_sc_dequeue_bulk(r, burst,
342 bulk_sizes[sz], NULL);
344 const uint64_t sc_end = rte_rdtsc();
346 const uint64_t mc_start = rte_rdtsc();
347 for (i = 0; i < iterations; i++) {
348 rte_ring_mp_enqueue_bulk(r, burst,
349 bulk_sizes[sz], NULL);
350 rte_ring_mc_dequeue_bulk(r, burst,
351 bulk_sizes[sz], NULL);
353 const uint64_t mc_end = rte_rdtsc();
355 double sc_avg = ((double)(sc_end-sc_start) /
356 (iterations * bulk_sizes[sz]));
357 double mc_avg = ((double)(mc_end-mc_start) /
358 (iterations * bulk_sizes[sz]));
360 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
362 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
370 struct lcore_pair cores;
371 struct rte_ring *r = NULL;
373 r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
377 printf("### Testing single element and burst enq/deq ###\n");
378 test_single_enqueue_dequeue(r);
379 test_burst_enqueue_dequeue(r);
381 printf("\n### Testing empty dequeue ###\n");
382 test_empty_dequeue(r);
384 printf("\n### Testing using a single lcore ###\n");
385 test_bulk_enqueue_dequeue(r);
387 if (get_two_hyperthreads(&cores) == 0) {
388 printf("\n### Testing using two hyperthreads ###\n");
389 run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
391 if (get_two_cores(&cores) == 0) {
392 printf("\n### Testing using two physical cores ###\n");
393 run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
395 if (get_two_sockets(&cores) == 0) {
396 printf("\n### Testing using two NUMA nodes ###\n");
397 run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
403 REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);