kvargs: use SPDX tags
[dpdk.git] / test / test / test_ring_perf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5
6 #include <stdio.h>
7 #include <inttypes.h>
8 #include <rte_ring.h>
9 #include <rte_cycles.h>
10 #include <rte_launch.h>
11 #include <rte_pause.h>
12
13 #include "test.h"
14
/*
 * Ring
 * ====
 *
 * Measures performance of various operations using rdtsc
 *  * Empty ring dequeue
 *  * Enqueue/dequeue of bursts in 1 thread
 *  * Enqueue/dequeue of bursts in 2 threads
 */
24
/* Name given to the ring when it is created (and looked up) by the test */
#define RING_NAME "RING_PERF"
/* Element count passed to rte_ring_create() */
#define RING_SIZE 4096
/* Length of the local object-pointer arrays handed to the ring calls */
#define MAX_BURST 32

/*
 * Burst sizes exercised by the enqueue/dequeue tests.
 * Qualified volatile so the compiler cannot fold them into
 * compile-time constants.
 */
static const volatile unsigned int bulk_sizes[] = { 8, 32 };

/* The single ring instance every test in this file operates on */
static struct rte_ring *r;

/* Two lcore ids chosen to run a paired enqueue/dequeue test */
struct lcore_pair {
        unsigned int c1;
        unsigned int c2;
};

/*
 * Start barrier for the paired tests: each worker increments it and then
 * spins until it reads 2. Static storage, hence zero-initialised.
 */
static volatile unsigned int lcore_count;
43
44 /**** Functions to analyse our core mask to get cores for different tests ***/
45
46 static int
47 get_two_hyperthreads(struct lcore_pair *lcp)
48 {
49         unsigned id1, id2;
50         unsigned c1, c2, s1, s2;
51         RTE_LCORE_FOREACH(id1) {
52                 /* inner loop just re-reads all id's. We could skip the first few
53                  * elements, but since number of cores is small there is little point
54                  */
55                 RTE_LCORE_FOREACH(id2) {
56                         if (id1 == id2)
57                                 continue;
58                         c1 = lcore_config[id1].core_id;
59                         c2 = lcore_config[id2].core_id;
60                         s1 = lcore_config[id1].socket_id;
61                         s2 = lcore_config[id2].socket_id;
62                         if ((c1 == c2) && (s1 == s2)){
63                                 lcp->c1 = id1;
64                                 lcp->c2 = id2;
65                                 return 0;
66                         }
67                 }
68         }
69         return 1;
70 }
71
72 static int
73 get_two_cores(struct lcore_pair *lcp)
74 {
75         unsigned id1, id2;
76         unsigned c1, c2, s1, s2;
77         RTE_LCORE_FOREACH(id1) {
78                 RTE_LCORE_FOREACH(id2) {
79                         if (id1 == id2)
80                                 continue;
81                         c1 = lcore_config[id1].core_id;
82                         c2 = lcore_config[id2].core_id;
83                         s1 = lcore_config[id1].socket_id;
84                         s2 = lcore_config[id2].socket_id;
85                         if ((c1 != c2) && (s1 == s2)){
86                                 lcp->c1 = id1;
87                                 lcp->c2 = id2;
88                                 return 0;
89                         }
90                 }
91         }
92         return 1;
93 }
94
95 static int
96 get_two_sockets(struct lcore_pair *lcp)
97 {
98         unsigned id1, id2;
99         unsigned s1, s2;
100         RTE_LCORE_FOREACH(id1) {
101                 RTE_LCORE_FOREACH(id2) {
102                         if (id1 == id2)
103                                 continue;
104                         s1 = lcore_config[id1].socket_id;
105                         s2 = lcore_config[id2].socket_id;
106                         if (s1 != s2){
107                                 lcp->c1 = id1;
108                                 lcp->c2 = id2;
109                                 return 0;
110                         }
111                 }
112         }
113         return 1;
114 }
115
116 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
117 static void
118 test_empty_dequeue(void)
119 {
120         const unsigned iter_shift = 26;
121         const unsigned iterations = 1<<iter_shift;
122         unsigned i = 0;
123         void *burst[MAX_BURST];
124
125         const uint64_t sc_start = rte_rdtsc();
126         for (i = 0; i < iterations; i++)
127                 rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
128         const uint64_t sc_end = rte_rdtsc();
129
130         const uint64_t mc_start = rte_rdtsc();
131         for (i = 0; i < iterations; i++)
132                 rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0], NULL);
133         const uint64_t mc_end = rte_rdtsc();
134
135         printf("SC empty dequeue: %.2F\n",
136                         (double)(sc_end-sc_start) / iterations);
137         printf("MC empty dequeue: %.2F\n",
138                         (double)(mc_end-mc_start) / iterations);
139 }
140
/*
 * Parameter block handed to the separate enqueue and dequeue threads.
 * It carries one input (the burst size) and two outputs (the per-element
 * cycle averages measured for the sp/sc and the mp/mc calls).
 */
struct thread_params {
        unsigned int size; /* input: the burst size */
        double spsc;       /* output: timing of the single (sp/sc) variant */
        double mpmc;       /* output: timing of the multi (mp/mc) variant */
};
149
150 /*
151  * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
152  * thread running dequeue_bulk function
153  */
154 static int
155 enqueue_bulk(void *p)
156 {
157         const unsigned iter_shift = 23;
158         const unsigned iterations = 1<<iter_shift;
159         struct thread_params *params = p;
160         const unsigned size = params->size;
161         unsigned i;
162         void *burst[MAX_BURST] = {0};
163
164         if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
165                 while(lcore_count != 2)
166                         rte_pause();
167
168         const uint64_t sp_start = rte_rdtsc();
169         for (i = 0; i < iterations; i++)
170                 while (rte_ring_sp_enqueue_bulk(r, burst, size, NULL) == 0)
171                         rte_pause();
172         const uint64_t sp_end = rte_rdtsc();
173
174         const uint64_t mp_start = rte_rdtsc();
175         for (i = 0; i < iterations; i++)
176                 while (rte_ring_mp_enqueue_bulk(r, burst, size, NULL) == 0)
177                         rte_pause();
178         const uint64_t mp_end = rte_rdtsc();
179
180         params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
181         params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
182         return 0;
183 }
184
185 /*
186  * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
187  * thread running enqueue_bulk function
188  */
189 static int
190 dequeue_bulk(void *p)
191 {
192         const unsigned iter_shift = 23;
193         const unsigned iterations = 1<<iter_shift;
194         struct thread_params *params = p;
195         const unsigned size = params->size;
196         unsigned i;
197         void *burst[MAX_BURST] = {0};
198
199         if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
200                 while(lcore_count != 2)
201                         rte_pause();
202
203         const uint64_t sc_start = rte_rdtsc();
204         for (i = 0; i < iterations; i++)
205                 while (rte_ring_sc_dequeue_bulk(r, burst, size, NULL) == 0)
206                         rte_pause();
207         const uint64_t sc_end = rte_rdtsc();
208
209         const uint64_t mc_start = rte_rdtsc();
210         for (i = 0; i < iterations; i++)
211                 while (rte_ring_mc_dequeue_bulk(r, burst, size, NULL) == 0)
212                         rte_pause();
213         const uint64_t mc_end = rte_rdtsc();
214
215         params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
216         params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
217         return 0;
218 }
219
220 /*
221  * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
222  * used to measure ring perf between hyperthreads, cores and sockets.
223  */
224 static void
225 run_on_core_pair(struct lcore_pair *cores,
226                 lcore_function_t f1, lcore_function_t f2)
227 {
228         struct thread_params param1 = {0}, param2 = {0};
229         unsigned i;
230         for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
231                 lcore_count = 0;
232                 param1.size = param2.size = bulk_sizes[i];
233                 if (cores->c1 == rte_get_master_lcore()) {
234                         rte_eal_remote_launch(f2, &param2, cores->c2);
235                         f1(&param1);
236                         rte_eal_wait_lcore(cores->c2);
237                 } else {
238                         rte_eal_remote_launch(f1, &param1, cores->c1);
239                         rte_eal_remote_launch(f2, &param2, cores->c2);
240                         rte_eal_wait_lcore(cores->c1);
241                         rte_eal_wait_lcore(cores->c2);
242                 }
243                 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
244                                 param1.spsc + param2.spsc);
245                 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
246                                 param1.mpmc + param2.mpmc);
247         }
248 }
249
250 /*
251  * Test function that determines how long an enqueue + dequeue of a single item
252  * takes on a single lcore. Result is for comparison with the bulk enq+deq.
253  */
254 static void
255 test_single_enqueue_dequeue(void)
256 {
257         const unsigned iter_shift = 24;
258         const unsigned iterations = 1<<iter_shift;
259         unsigned i = 0;
260         void *burst = NULL;
261
262         const uint64_t sc_start = rte_rdtsc();
263         for (i = 0; i < iterations; i++) {
264                 rte_ring_sp_enqueue(r, burst);
265                 rte_ring_sc_dequeue(r, &burst);
266         }
267         const uint64_t sc_end = rte_rdtsc();
268
269         const uint64_t mc_start = rte_rdtsc();
270         for (i = 0; i < iterations; i++) {
271                 rte_ring_mp_enqueue(r, burst);
272                 rte_ring_mc_dequeue(r, &burst);
273         }
274         const uint64_t mc_end = rte_rdtsc();
275
276         printf("SP/SC single enq/dequeue: %"PRIu64"\n",
277                         (sc_end-sc_start) >> iter_shift);
278         printf("MP/MC single enq/dequeue: %"PRIu64"\n",
279                         (mc_end-mc_start) >> iter_shift);
280 }
281
282 /*
283  * Test that does both enqueue and dequeue on a core using the burst() API calls
284  * instead of the bulk() calls used in other tests. Results should be the same
285  * as for the bulk function called on a single lcore.
286  */
287 static void
288 test_burst_enqueue_dequeue(void)
289 {
290         const unsigned iter_shift = 23;
291         const unsigned iterations = 1<<iter_shift;
292         unsigned sz, i = 0;
293         void *burst[MAX_BURST] = {0};
294
295         for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
296                 const uint64_t sc_start = rte_rdtsc();
297                 for (i = 0; i < iterations; i++) {
298                         rte_ring_sp_enqueue_burst(r, burst,
299                                         bulk_sizes[sz], NULL);
300                         rte_ring_sc_dequeue_burst(r, burst,
301                                         bulk_sizes[sz], NULL);
302                 }
303                 const uint64_t sc_end = rte_rdtsc();
304
305                 const uint64_t mc_start = rte_rdtsc();
306                 for (i = 0; i < iterations; i++) {
307                         rte_ring_mp_enqueue_burst(r, burst,
308                                         bulk_sizes[sz], NULL);
309                         rte_ring_mc_dequeue_burst(r, burst,
310                                         bulk_sizes[sz], NULL);
311                 }
312                 const uint64_t mc_end = rte_rdtsc();
313
314                 uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
315                 uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];
316
317                 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
318                                 sc_avg);
319                 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
320                                 mc_avg);
321         }
322 }
323
324 /* Times enqueue and dequeue on a single lcore */
325 static void
326 test_bulk_enqueue_dequeue(void)
327 {
328         const unsigned iter_shift = 23;
329         const unsigned iterations = 1<<iter_shift;
330         unsigned sz, i = 0;
331         void *burst[MAX_BURST] = {0};
332
333         for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
334                 const uint64_t sc_start = rte_rdtsc();
335                 for (i = 0; i < iterations; i++) {
336                         rte_ring_sp_enqueue_bulk(r, burst,
337                                         bulk_sizes[sz], NULL);
338                         rte_ring_sc_dequeue_bulk(r, burst,
339                                         bulk_sizes[sz], NULL);
340                 }
341                 const uint64_t sc_end = rte_rdtsc();
342
343                 const uint64_t mc_start = rte_rdtsc();
344                 for (i = 0; i < iterations; i++) {
345                         rte_ring_mp_enqueue_bulk(r, burst,
346                                         bulk_sizes[sz], NULL);
347                         rte_ring_mc_dequeue_bulk(r, burst,
348                                         bulk_sizes[sz], NULL);
349                 }
350                 const uint64_t mc_end = rte_rdtsc();
351
352                 double sc_avg = ((double)(sc_end-sc_start) /
353                                 (iterations * bulk_sizes[sz]));
354                 double mc_avg = ((double)(mc_end-mc_start) /
355                                 (iterations * bulk_sizes[sz]));
356
357                 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
358                                 sc_avg);
359                 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
360                                 mc_avg);
361         }
362 }
363
364 static int
365 test_ring_perf(void)
366 {
367         struct lcore_pair cores;
368         r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
369         if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
370                 return -1;
371
372         printf("### Testing single element and burst enq/deq ###\n");
373         test_single_enqueue_dequeue();
374         test_burst_enqueue_dequeue();
375
376         printf("\n### Testing empty dequeue ###\n");
377         test_empty_dequeue();
378
379         printf("\n### Testing using a single lcore ###\n");
380         test_bulk_enqueue_dequeue();
381
382         if (get_two_hyperthreads(&cores) == 0) {
383                 printf("\n### Testing using two hyperthreads ###\n");
384                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
385         }
386         if (get_two_cores(&cores) == 0) {
387                 printf("\n### Testing using two physical cores ###\n");
388                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
389         }
390         if (get_two_sockets(&cores) == 0) {
391                 printf("\n### Testing using two NUMA nodes ###\n");
392                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
393         }
394         return 0;
395 }
396
397 REGISTER_TEST_COMMAND(ring_perf_autotest, test_ring_perf);