test: skip tests when missing requirements
[dpdk.git] / app / test / test_ring_perf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>
#include <rte_ring.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_lcore.h>

#include "test.h"

/*
 * Ring
 * ====
 *
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts in 1 thread
 *  * Enqueue/dequeue of bursts in 2 threads
 */
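
/*
 * The test below is registered as "ring_perf_autotest" (see the
 * REGISTER_TEST_COMMAND() call at the end of this file) and is typically
 * run by entering that command at the test application's interactive
 * prompt.
 */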

#define RING_NAME "RING_PERF"
#define RING_SIZE 4096
#define MAX_BURST 32

/*
 * the sizes to enqueue and dequeue in testing
 * (marked volatile so they won't be seen as compile-time constants)
 */
static const volatile unsigned bulk_sizes[] = { 8, 32 };

struct lcore_pair {
	unsigned c1, c2;
};

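/*
 * Shared counter used as a simple start barrier: each lcore of a test pair
 * increments it and spins until both have arrived, so the timed
 * enqueue/dequeue loops begin together.
 */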
static volatile unsigned lcore_count = 0;

/**** Functions to analyse our core mask to get cores for different tests ***/

static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		/* The inner loop just re-reads all lcore IDs. We could skip
		 * the first few elements, but since the number of cores is
		 * small there is little point.
		 */
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;

			c1 = rte_lcore_to_cpu_id(id1);
			c2 = rte_lcore_to_cpu_id(id2);
			s1 = rte_lcore_to_socket_id(id1);
			s2 = rte_lcore_to_socket_id(id2);
			if ((c1 == c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}

static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned c1, c2, s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;

			c1 = rte_lcore_to_cpu_id(id1);
			c2 = rte_lcore_to_cpu_id(id2);
			s1 = rte_lcore_to_socket_id(id1);
			s2 = rte_lcore_to_socket_id(id2);
			if ((c1 != c2) && (s1 == s2)) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}

static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned id1, id2;
	unsigned s1, s2;
	RTE_LCORE_FOREACH(id1) {
		RTE_LCORE_FOREACH(id2) {
			if (id1 == id2)
				continue;
			s1 = rte_lcore_to_socket_id(id1);
			s2 = rte_lcore_to_socket_id(id2);
			if (s1 != s2) {
				lcp->c1 = id1;
				lcp->c2 = id2;
				return 0;
			}
		}
	}
	return 1;
}
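
/*
 * Each of the helpers above returns 0 on success and 1 when no suitable
 * lcore pair exists; in that case test_ring_perf() simply skips the
 * corresponding pair test.
 */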

/* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
static void
test_empty_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 26;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst[MAX_BURST];

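	/*
	 * On an empty ring the bulk dequeue calls find nothing to dequeue and
	 * return 0 immediately, so this measures the bare cost of the call
	 * for both the single-consumer and multi-consumer paths.
	 */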
	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
	const uint64_t mc_end = rte_rdtsc();

	printf("SC empty dequeue: %.2F\n",
			(double)(sc_end-sc_start) / iterations);
	printf("MC empty dequeue: %.2F\n",
			(double)(mc_end-mc_start) / iterations);
}

/*
 * The separate enqueue and dequeue threads each take one input parameter
 * and return two results. Input = burst size, output = cycle averages for
 * the sp/sc and mp/mc code paths.
 */
struct thread_params {
	struct rte_ring *r;
	unsigned size;        /* input value, the burst size */
	double spsc, mpmc;    /* output value, the single or multi timings */
};

/*
 * Function that uses rdtsc to measure timing for ring enqueue. Needs a
 * paired thread running the dequeue_bulk function.
 */
static int
enqueue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	struct rte_ring *r = params->r;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};

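	/* Start barrier: wait until both lcores of the pair have arrived
	 * before beginning the timed loops.
	 */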
	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
		while (lcore_count != 2)
			rte_pause();

	const uint64_t sp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sp_end = rte_rdtsc();

	const uint64_t mp_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mp_end = rte_rdtsc();

	params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
	params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
	return 0;
}

/*
 * Function that uses rdtsc to measure timing for ring dequeue. Needs a
 * paired thread running the enqueue_bulk function.
 */
static int
dequeue_bulk(void *p)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	struct thread_params *params = p;
	struct rte_ring *r = params->r;
	const unsigned size = params->size;
	unsigned i;
	void *burst[MAX_BURST] = {0};

	if (__sync_add_and_fetch(&lcore_count, 1) != 2)
		while (lcore_count != 2)
			rte_pause();

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++)
		while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
			rte_pause();
	const uint64_t mc_end = rte_rdtsc();

	params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
	params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
	return 0;
}

/*
 * Function that calls the enqueue and dequeue bulk functions on pairs of
 * cores. Used to measure ring perf between hyperthreads, cores and sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
		lcore_function_t f1, lcore_function_t f2)
{
	struct thread_params param1 = {0}, param2 = {0};
	unsigned i;
	for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
		lcore_count = 0;
		param1.size = param2.size = bulk_sizes[i];
		param1.r = param2.r = r;
		if (cores->c1 == rte_get_master_lcore()) {
			rte_eal_remote_launch(f2, &param2, cores->c2);
			f1(&param1);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(f1, &param1, cores->c1);
			rte_eal_remote_launch(f2, &param2, cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}
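		/* Report the sum of the enqueue-side and dequeue-side
		 * per-element costs for this burst size.
		 */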
		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
				param1.spsc + param2.spsc);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
				param1.mpmc + param2.mpmc);
	}
}

/*
 * Test function that determines how long an enqueue + dequeue of a single item
 * takes on a single lcore. Result is for comparison with the bulk enq+deq.
 */
static void
test_single_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 24;
	const unsigned iterations = 1<<iter_shift;
	unsigned i = 0;
	void *burst = NULL;

	const uint64_t sc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_sp_enqueue(r, burst);
		rte_ring_sc_dequeue(r, &burst);
	}
	const uint64_t sc_end = rte_rdtsc();

	const uint64_t mc_start = rte_rdtsc();
	for (i = 0; i < iterations; i++) {
		rte_ring_mp_enqueue(r, burst);
		rte_ring_mc_dequeue(r, &burst);
	}
	const uint64_t mc_end = rte_rdtsc();

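	/* Shifting by iter_shift divides by the iteration count, giving the
	 * average cycle cost of one enqueue + dequeue pair.
	 */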
	printf("SP/SC single enq/dequeue: %"PRIu64"\n",
			(sc_end-sc_start) >> iter_shift);
	printf("MP/MC single enq/dequeue: %"PRIu64"\n",
			(mc_end-mc_start) >> iter_shift);
}

/*
 * Test that does both enqueue and dequeue on a core using the burst() API calls
 * instead of the bulk() calls used in other tests. Results should be the same
 * as for the bulk function called on a single lcore.
 */
static void
test_burst_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};

	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_burst(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_burst(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_burst(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_burst(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();

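		/* Average cycles per element: the shift divides by the number
		 * of iterations, the division by the burst size.
		 */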
		uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
		uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];

		printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
				sc_avg);
		printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
				mc_avg);
	}
}

/* Times enqueue and dequeue on a single lcore */
static void
test_bulk_enqueue_dequeue(struct rte_ring *r)
{
	const unsigned iter_shift = 23;
	const unsigned iterations = 1<<iter_shift;
	unsigned sz, i = 0;
	void *burst[MAX_BURST] = {0};

	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
		const uint64_t sc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_sp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_sc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t sc_end = rte_rdtsc();

		const uint64_t mc_start = rte_rdtsc();
		for (i = 0; i < iterations; i++) {
			rte_ring_mp_enqueue_bulk(r, burst,
					bulk_sizes[sz], NULL);
			rte_ring_mc_dequeue_bulk(r, burst,
					bulk_sizes[sz], NULL);
		}
		const uint64_t mc_end = rte_rdtsc();

		double sc_avg = ((double)(sc_end-sc_start) /
				(iterations * bulk_sizes[sz]));
		double mc_avg = ((double)(mc_end-mc_start) /
				(iterations * bulk_sizes[sz]));

		printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
				sc_avg);
		printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
				mc_avg);
	}
}

static int
test_ring_perf(void)
{
	struct lcore_pair cores;
	struct rte_ring *r = NULL;

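	/* flags == 0 creates a default MP/MC ring; the sp/sc calls used in
	 * the tests above select the single-producer/consumer paths
	 * explicitly, regardless of these flags.
	 */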
	r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
	if (r == NULL)
		return -1;

	printf("### Testing single element and burst enq/deq ###\n");
	test_single_enqueue_dequeue(r);
	test_burst_enqueue_dequeue(r);

	printf("\n### Testing empty dequeue ###\n");
	test_empty_dequeue(r);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_enqueue_dequeue(r);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
	}
	rte_ring_free(r);
	return 0;
}

REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);