doc: whitespace changes in licenses
[dpdk.git] / app / test / test_ring_perf.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34
35 #include <stdio.h>
36 #include <inttypes.h>
37 #include <rte_ring.h>
38 #include <rte_cycles.h>
39 #include <rte_launch.h>
40
41 #include <cmdline_parse.h>
42
43 #include "test.h"
44
/*
 * Ring
 * ====
 *
 * Measures the performance of various ring operations using rdtsc:
 *  * Dequeue from an empty ring
 *  * Enqueue/dequeue of bursts in 1 thread
 *  * Enqueue/dequeue of bursts in 2 threads
 */
54
/* Name under which the test ring is created / looked up */
#define RING_NAME   "RING_PERF"
/* Size passed to rte_ring_create() for the test ring */
#define RING_SIZE   4096
/* Capacity of the local object arrays handed to the bulk/burst calls */
#define MAX_BURST   32
58
/*
 * Bulk/burst sizes exercised by the tests.
 * Declared volatile so the compiler cannot treat them as compile-time
 * constants.
 */
static const volatile unsigned bulk_sizes[] = {
	8,
	32,
};
64
/* Ring shared by every test in this file; set up in test_ring_perf() */
static struct rte_ring *r;
67
/* A pair of lcore ids selected to run the two-thread tests */
struct lcore_pair {
	unsigned c1;    /* lcore id that runs the first function */
	unsigned c2;    /* lcore id that runs the second function */
};
71
/*
 * Rendezvous counter: each worker increments it on entry and spins until it
 * reads 2. Static storage, so it starts out as zero.
 */
static volatile unsigned lcore_count;
73
74 /**** Functions to analyse our core mask to get cores for different tests ***/
75
76 static int
77 get_two_hyperthreads(struct lcore_pair *lcp)
78 {
79         unsigned id1, id2;
80         unsigned c1, c2, s1, s2;
81         RTE_LCORE_FOREACH(id1) {
82                 /* inner loop just re-reads all id's. We could skip the first few
83                  * elements, but since number of cores is small there is little point
84                  */
85                 RTE_LCORE_FOREACH(id2) {
86                         if (id1 == id2)
87                                 continue;
88                         c1 = lcore_config[id1].core_id;
89                         c2 = lcore_config[id2].core_id;
90                         s1 = lcore_config[id1].socket_id;
91                         s2 = lcore_config[id2].socket_id;
92                         if ((c1 == c2) && (s1 == s2)){
93                                 lcp->c1 = id1;
94                                 lcp->c2 = id2;
95                                 return 0;
96                         }
97                 }
98         }
99         return 1;
100 }
101
102 static int
103 get_two_cores(struct lcore_pair *lcp)
104 {
105         unsigned id1, id2;
106         unsigned c1, c2, s1, s2;
107         RTE_LCORE_FOREACH(id1) {
108                 RTE_LCORE_FOREACH(id2) {
109                         if (id1 == id2)
110                                 continue;
111                         c1 = lcore_config[id1].core_id;
112                         c2 = lcore_config[id2].core_id;
113                         s1 = lcore_config[id1].socket_id;
114                         s2 = lcore_config[id2].socket_id;
115                         if ((c1 != c2) && (s1 == s2)){
116                                 lcp->c1 = id1;
117                                 lcp->c2 = id2;
118                                 return 0;
119                         }
120                 }
121         }
122         return 1;
123 }
124
125 static int
126 get_two_sockets(struct lcore_pair *lcp)
127 {
128         unsigned id1, id2;
129         unsigned s1, s2;
130         RTE_LCORE_FOREACH(id1) {
131                 RTE_LCORE_FOREACH(id2) {
132                         if (id1 == id2)
133                                 continue;
134                         s1 = lcore_config[id1].socket_id;
135                         s2 = lcore_config[id2].socket_id;
136                         if (s1 != s2){
137                                 lcp->c1 = id1;
138                                 lcp->c2 = id2;
139                                 return 0;
140                         }
141                 }
142         }
143         return 1;
144 }
145
146 /* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
147 static void
148 test_empty_dequeue(void)
149 {
150         const unsigned iter_shift = 26;
151         const unsigned iterations = 1<<iter_shift;
152         unsigned i = 0;
153         void *burst[MAX_BURST];
154
155         const uint64_t sc_start = rte_rdtsc();
156         for (i = 0; i < iterations; i++)
157                 rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0]);
158         const uint64_t sc_end = rte_rdtsc();
159
160         const uint64_t mc_start = rte_rdtsc();
161         for (i = 0; i < iterations; i++)
162                 rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0]);
163         const uint64_t mc_end = rte_rdtsc();
164
165         printf("SC empty dequeue: %.2F\n",
166                         (double)(sc_end-sc_start) / iterations);
167         printf("MC empty dequeue: %.2F\n",
168                         (double)(mc_end-mc_start) / iterations);
169 }
170
/*
 * Argument block shared with the enqueue and dequeue worker functions.
 * The launcher fills in the burst size; the worker writes back two results:
 * its average cycle count for the sp/sc calls and for the mp/mc calls.
 */
struct thread_params {
	unsigned size;  /* in:  burst size to use */
	double spsc;    /* out: timing of the single producer/consumer calls */
	double mpmc;    /* out: timing of the multi producer/consumer calls */
};
179
180 /* 
181  * Function that uses rdtsc to measure timing for ring enqueue. Needs pair
182  * thread running dequeue_bulk function 
183  */
184 static int
185 enqueue_bulk(void *p)
186 {
187         const unsigned iter_shift = 23;
188         const unsigned iterations = 1<<iter_shift;
189         struct thread_params *params = p;
190         const unsigned size = params->size;
191         unsigned i;
192         void *burst[MAX_BURST] = {0};
193
194         if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
195                 while(lcore_count != 2)
196                         rte_pause();
197
198         const uint64_t sp_start = rte_rdtsc();
199         for (i = 0; i < iterations; i++)
200                 while (rte_ring_sp_enqueue_bulk(r, burst, size) != 0)
201                         rte_pause();
202         const uint64_t sp_end = rte_rdtsc();
203
204         const uint64_t mp_start = rte_rdtsc();
205         for (i = 0; i < iterations; i++)
206                 while (rte_ring_mp_enqueue_bulk(r, burst, size) != 0)
207                         rte_pause();
208         const uint64_t mp_end = rte_rdtsc();
209
210         params->spsc = ((double)(sp_end - sp_start))/(iterations*size);
211         params->mpmc = ((double)(mp_end - mp_start))/(iterations*size);
212         return 0;
213 }
214
215 /* 
216  * Function that uses rdtsc to measure timing for ring dequeue. Needs pair
217  * thread running enqueue_bulk function 
218  */
219 static int
220 dequeue_bulk(void *p)
221 {
222         const unsigned iter_shift = 23;
223         const unsigned iterations = 1<<iter_shift;
224         struct thread_params *params = p;
225         const unsigned size = params->size;
226         unsigned i;
227         void *burst[MAX_BURST] = {0};
228
229         if ( __sync_add_and_fetch(&lcore_count, 1) != 2 )
230                 while(lcore_count != 2)
231                         rte_pause();
232
233         const uint64_t sc_start = rte_rdtsc();
234         for (i = 0; i < iterations; i++)
235                 while (rte_ring_sc_dequeue_bulk(r, burst, size) != 0)
236                         rte_pause();
237         const uint64_t sc_end = rte_rdtsc();
238
239         const uint64_t mc_start = rte_rdtsc();
240         for (i = 0; i < iterations; i++)
241                 while (rte_ring_mc_dequeue_bulk(r, burst, size) != 0)
242                         rte_pause();
243         const uint64_t mc_end = rte_rdtsc();
244
245         params->spsc = ((double)(sc_end - sc_start))/(iterations*size);
246         params->mpmc = ((double)(mc_end - mc_start))/(iterations*size);
247         return 0;
248 }
249
250 /* 
251  * Function that calls the enqueue and dequeue bulk functions on pairs of cores.
252  * used to measure ring perf between hyperthreads, cores and sockets.
253  */
254 static void
255 run_on_core_pair(struct lcore_pair *cores,
256                 lcore_function_t f1, lcore_function_t f2)
257 {
258         struct thread_params param1 = {.size = 0}, param2 = {.size = 0};
259         unsigned i;
260         for (i = 0; i < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); i++) {
261                 lcore_count = 0;
262                 param1.size = param2.size = bulk_sizes[i];
263                 if (cores->c1 == rte_get_master_lcore()) {
264                         rte_eal_remote_launch(f2, &param2, cores->c2);
265                         f1(&param1);
266                         rte_eal_wait_lcore(cores->c2);
267                 } else {
268                         rte_eal_remote_launch(f1, &param1, cores->c1);
269                         rte_eal_remote_launch(f2, &param2, cores->c2);
270                         rte_eal_wait_lcore(cores->c1);
271                         rte_eal_wait_lcore(cores->c2);
272                 }
273                 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
274                                 param1.spsc + param2.spsc);
275                 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[i],
276                                 param1.mpmc + param2.mpmc);
277         }
278 }
279
280 /* 
281  * Test function that determines how long an enqueue + dequeue of a single item
282  * takes on a single lcore. Result is for comparison with the bulk enq+deq.
283  */
284 static void
285 test_single_enqueue_dequeue(void)
286 {
287         const unsigned iter_shift = 24;
288         const unsigned iterations = 1<<iter_shift;
289         unsigned i = 0;
290         void *burst = NULL;
291
292         const uint64_t sc_start = rte_rdtsc();
293         for (i = 0; i < iterations; i++) {
294                 rte_ring_sp_enqueue(r, burst);
295                 rte_ring_sc_dequeue(r, &burst);
296         }
297         const uint64_t sc_end = rte_rdtsc();
298
299         const uint64_t mc_start = rte_rdtsc();
300         for (i = 0; i < iterations; i++) {
301                 rte_ring_mp_enqueue(r, burst);
302                 rte_ring_mc_dequeue(r, &burst);
303         }
304         const uint64_t mc_end = rte_rdtsc();
305
306         printf("SP/SC single enq/dequeue: %"PRIu64"\n",
307                         (sc_end-sc_start) >> iter_shift);
308         printf("MP/MC single enq/dequeue: %"PRIu64"\n",
309                         (mc_end-mc_start) >> iter_shift);
310 }
311
312 /* 
313  * Test that does both enqueue and dequeue on a core using the burst() API calls
314  * instead of the bulk() calls used in other tests. Results should be the same
315  * as for the bulk function called on a single lcore.
316  */
317 static void
318 test_burst_enqueue_dequeue(void)
319 {
320         const unsigned iter_shift = 23;
321         const unsigned iterations = 1<<iter_shift;
322         unsigned sz, i = 0;
323         void *burst[MAX_BURST] = {0};
324
325         for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
326                 const uint64_t sc_start = rte_rdtsc();
327                 for (i = 0; i < iterations; i++) {
328                         rte_ring_sp_enqueue_burst(r, burst, bulk_sizes[sz]);
329                         rte_ring_sc_dequeue_burst(r, burst, bulk_sizes[sz]);
330                 }
331                 const uint64_t sc_end = rte_rdtsc();
332
333                 const uint64_t mc_start = rte_rdtsc();
334                 for (i = 0; i < iterations; i++) {
335                         rte_ring_mp_enqueue_burst(r, burst, bulk_sizes[sz]);
336                         rte_ring_mc_dequeue_burst(r, burst, bulk_sizes[sz]);
337                 }
338                 const uint64_t mc_end = rte_rdtsc();
339
340                 uint64_t mc_avg = ((mc_end-mc_start) >> iter_shift) / bulk_sizes[sz];
341                 uint64_t sc_avg = ((sc_end-sc_start) >> iter_shift) / bulk_sizes[sz];
342
343                 printf("SP/SC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
344                                 sc_avg);
345                 printf("MP/MC burst enq/dequeue (size: %u): %"PRIu64"\n", bulk_sizes[sz],
346                                 mc_avg);
347         }
348 }
349
350 /* Times enqueue and dequeue on a single lcore */
351 static void
352 test_bulk_enqueue_dequeue(void)
353 {
354         const unsigned iter_shift = 23;
355         const unsigned iterations = 1<<iter_shift;
356         unsigned sz, i = 0;
357         void *burst[MAX_BURST] = {0};
358
359         for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
360                 const uint64_t sc_start = rte_rdtsc();
361                 for (i = 0; i < iterations; i++) {
362                         rte_ring_sp_enqueue_bulk(r, burst, bulk_sizes[sz]);
363                         rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[sz]);
364                 }
365                 const uint64_t sc_end = rte_rdtsc();
366
367                 const uint64_t mc_start = rte_rdtsc();
368                 for (i = 0; i < iterations; i++) {
369                         rte_ring_mp_enqueue_bulk(r, burst, bulk_sizes[sz]);
370                         rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[sz]);
371                 }
372                 const uint64_t mc_end = rte_rdtsc();
373
374                 double sc_avg = ((double)(sc_end-sc_start) /
375                                 (iterations * bulk_sizes[sz]));
376                 double mc_avg = ((double)(mc_end-mc_start) /
377                                 (iterations * bulk_sizes[sz]));
378
379                 printf("SP/SC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
380                                 sc_avg);
381                 printf("MP/MC bulk enq/dequeue (size: %u): %.2F\n", bulk_sizes[sz],
382                                 mc_avg);
383         }
384 }
385
386 int
387 test_ring_perf(void)
388 {
389         struct lcore_pair cores;
390         r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(), 0);
391         if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
392                 return -1;
393
394         printf("### Testing single element and burst enq/deq ###\n");
395         test_single_enqueue_dequeue();
396         test_burst_enqueue_dequeue();
397
398         printf("\n### Testing empty dequeue ###\n");
399         test_empty_dequeue();
400
401         printf("\n### Testing using a single lcore ###\n");
402         test_bulk_enqueue_dequeue();
403
404         if (get_two_hyperthreads(&cores) == 0) {
405                 printf("\n### Testing using two hyperthreads ###\n");
406                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
407         }
408         if (get_two_cores(&cores) == 0) {
409                 printf("\n### Testing using two physical cores ###\n");
410                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
411         }
412         if (get_two_sockets(&cores) == 0) {
413                 printf("\n### Testing using two NUMA nodes ###\n");
414                 run_on_core_pair(&cores, enqueue_bulk, dequeue_bulk);
415         }
416         return 0;
417 }