lpm: hide defer queue handle
app/test/test_lpm_perf.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 * Copyright(c) 2020 Arm Limited
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <math.h>
#include <pthread.h>

#include <rte_cycles.h>
#include <rte_random.h>
#include <rte_branch_prediction.h>
#include <rte_malloc.h>
#include <rte_ip.h>
#include <rte_lpm.h>

#include "test.h"
#include "test_xmmt_ops.h"

static struct rte_lpm *lpm;
static struct rte_rcu_qsbr *rv;
static volatile uint8_t writer_done;
static volatile uint32_t thr_id;
static uint64_t gwrite_cycles;
static uint64_t gwrites;
/* LPM APIs are not thread-safe; use a mutex to serialize the writers */
static pthread_mutex_t lpm_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Report quiescent state every 1024 lookups. Larger critical sections
 * in the reader will result in the writer polling multiple times.
 */
#define QSBR_REPORTING_INTERVAL 1024

#define TEST_LPM_ASSERT(cond) do {                                            \
        if (!(cond)) {                                                        \
                printf("Error at line %d:\n", __LINE__);                      \
                return -1;                                                    \
        }                                                                     \
} while (0)

#define ITERATIONS (1 << 10)
#define RCU_ITERATIONS 10
#define BATCH_SIZE (1 << 12)
#define BULK_SIZE 32
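/* Each lookup test generates BATCH_SIZE random addresses per iteration;
 * the bulk test consumes them BULK_SIZE at a time.
 */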

#define MAX_RULE_NUM (1200000)

struct route_rule {
        uint32_t ip;
        uint8_t depth;
};

static struct route_rule large_route_table[MAX_RULE_NUM];
/* Route table for routes with depth > 24 */
static struct route_rule large_ldepth_route_table[MAX_RULE_NUM];

static uint32_t num_route_entries;
static uint32_t num_ldepth_route_entries;
#define NUM_ROUTE_ENTRIES num_route_entries
#define NUM_LDEPTH_ROUTE_ENTRIES num_ldepth_route_entries
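/* Routes deeper than 24 bits allocate tbl8 groups in the LPM table; the
 * RCU perf tests add/delete only these so that tbl8 reclamation is
 * exercised.
 */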

enum {
        IP_CLASS_A,
        IP_CLASS_B,
        IP_CLASS_C
};

/* struct route_rule_count defines the total number of rules for the
 * following a/b/c arrays. Each item in a[]/b[]/c[] is the number of rules
 * at that depth for common IP address class A/B/C, not including the ones
 * for private local networks.
 */
struct route_rule_count {
        uint32_t a[RTE_LPM_MAX_DEPTH];
        uint32_t b[RTE_LPM_MAX_DEPTH];
        uint32_t c[RTE_LPM_MAX_DEPTH];
};

/* The per-depth counts below for each common IP class were taken from the
 * previous large constant table in app/test/test_lpm_routes.h. To deliver
 * comparable performance, they keep the same depth and IP address coverage
 * as that table. The numbers don't include any private local IP addresses.
 * As the previous large constant rule table was dumped from a real router,
 * there are no IP addresses in class D or E.
 */
static struct route_rule_count rule_count = {
        .a = { /* IP class A in which the most significant bit is 0 */
                    0, /* depth =  1 */
                    0, /* depth =  2 */
                    1, /* depth =  3 */
                    0, /* depth =  4 */
                    2, /* depth =  5 */
                    1, /* depth =  6 */
                    3, /* depth =  7 */
                  185, /* depth =  8 */
                   26, /* depth =  9 */
                   16, /* depth = 10 */
                   39, /* depth = 11 */
                  144, /* depth = 12 */
                  233, /* depth = 13 */
                  528, /* depth = 14 */
                  866, /* depth = 15 */
                 3856, /* depth = 16 */
                 3268, /* depth = 17 */
                 5662, /* depth = 18 */
                17301, /* depth = 19 */
                22226, /* depth = 20 */
                11147, /* depth = 21 */
                16746, /* depth = 22 */
                17120, /* depth = 23 */
                77578, /* depth = 24 */
                  401, /* depth = 25 */
                  656, /* depth = 26 */
                 1107, /* depth = 27 */
                 1121, /* depth = 28 */
                 2316, /* depth = 29 */
                  717, /* depth = 30 */
                   10, /* depth = 31 */
                   66  /* depth = 32 */
        },
        .b = { /* IP class B in which the most significant 2 bits are 10 */
                    0, /* depth =  1 */
                    0, /* depth =  2 */
                    0, /* depth =  3 */
                    0, /* depth =  4 */
                    1, /* depth =  5 */
                    1, /* depth =  6 */
                    1, /* depth =  7 */
                    3, /* depth =  8 */
                    3, /* depth =  9 */
                   30, /* depth = 10 */
                   25, /* depth = 11 */
                  168, /* depth = 12 */
                  305, /* depth = 13 */
                  569, /* depth = 14 */
                 1129, /* depth = 15 */
                50800, /* depth = 16 */
                 1645, /* depth = 17 */
                 1820, /* depth = 18 */
                 3506, /* depth = 19 */
                 3258, /* depth = 20 */
                 3424, /* depth = 21 */
                 4971, /* depth = 22 */
                 6885, /* depth = 23 */
                39771, /* depth = 24 */
                  424, /* depth = 25 */
                  170, /* depth = 26 */
                  433, /* depth = 27 */
                   92, /* depth = 28 */
                  366, /* depth = 29 */
                  377, /* depth = 30 */
                    2, /* depth = 31 */
                  200  /* depth = 32 */
        },
        .c = { /* IP class C in which the most significant 3 bits are 110 */
                     0, /* depth =  1 */
                     0, /* depth =  2 */
                     0, /* depth =  3 */
                     0, /* depth =  4 */
                     0, /* depth =  5 */
                     0, /* depth =  6 */
                     0, /* depth =  7 */
                    12, /* depth =  8 */
                     8, /* depth =  9 */
                     9, /* depth = 10 */
                    33, /* depth = 11 */
                    69, /* depth = 12 */
                   237, /* depth = 13 */
                  1007, /* depth = 14 */
                  1717, /* depth = 15 */
                 14663, /* depth = 16 */
                  8070, /* depth = 17 */
                 16185, /* depth = 18 */
                 48261, /* depth = 19 */
                 36870, /* depth = 20 */
                 33960, /* depth = 21 */
                 50638, /* depth = 22 */
                 61422, /* depth = 23 */
                466549, /* depth = 24 */
                  1829, /* depth = 25 */
                  4824, /* depth = 26 */
                  4927, /* depth = 27 */
                  5914, /* depth = 28 */
                 10254, /* depth = 29 */
                  4905, /* depth = 30 */
                     1, /* depth = 31 */
                   716  /* depth = 32 */
        }
};

static void generate_random_rule_prefix(uint32_t ip_class, uint8_t depth)
{
/* IP address class A, the most significant bit is 0 */
#define IP_HEAD_MASK_A                  0x00000000
#define IP_HEAD_BIT_NUM_A               1

/* IP address class B, the most significant 2 bits are 10 */
#define IP_HEAD_MASK_B                  0x80000000
#define IP_HEAD_BIT_NUM_B               2

/* IP address class C, the most significant 3 bits are 110 */
#define IP_HEAD_MASK_C                  0xC0000000
#define IP_HEAD_BIT_NUM_C               3

        uint32_t class_depth;
        uint32_t range;
        uint32_t mask;
        uint32_t step;
        uint32_t start;
        uint32_t fixed_bit_num;
        uint32_t ip_head_mask;
        uint32_t rule_num;
        uint32_t k;
        struct route_rule *ptr_rule, *ptr_ldepth_rule;

        if (ip_class == IP_CLASS_A) {        /* IP Address class A */
                fixed_bit_num = IP_HEAD_BIT_NUM_A;
                ip_head_mask = IP_HEAD_MASK_A;
                rule_num = rule_count.a[depth - 1];
        } else if (ip_class == IP_CLASS_B) { /* IP Address class B */
                fixed_bit_num = IP_HEAD_BIT_NUM_B;
                ip_head_mask = IP_HEAD_MASK_B;
                rule_num = rule_count.b[depth - 1];
        } else {                             /* IP Address class C */
                fixed_bit_num = IP_HEAD_BIT_NUM_C;
                ip_head_mask = IP_HEAD_MASK_C;
                rule_num = rule_count.c[depth - 1];
        }

        if (rule_num == 0)
                return;

        /* the number of remaining bits, not including the most significant
         * fixed bits for this IP address class
         */
        class_depth = depth - fixed_bit_num;

        /* range is the maximum number of rules for this depth and
         * this IP address class
         */
        range = 1 << class_depth;

        /* mask only the depth most significant generated bits,
         * excluding the fixed bits for the IP address class
         */
        mask = range - 1;

        /* Widen coverage of IP addresses in generated rules */
        if (range <= rule_num)
                step = 1;
        else
                step = round((double)range / rule_num);
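        /*
         * Worked example from the table above: class A at depth 21 has
         * class_depth = 20, so range = 1 << 20 = 1048576 and rule_num =
         * 11147, giving step = round(1048576.0 / 11147) = 94. Generated
         * prefixes are thus spaced ~94 apart across the class range.
         */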

        /* Only generate the remaining bits; the most significant
         * fixed bits of the IP address class are ORed in below
         */
        start = lrand48() & mask;
        ptr_rule = &large_route_table[num_route_entries];
        ptr_ldepth_rule = &large_ldepth_route_table[num_ldepth_route_entries];
        for (k = 0; k < rule_num; k++) {
                ptr_rule->ip = (start << (RTE_LPM_MAX_DEPTH - depth))
                        | ip_head_mask;
                ptr_rule->depth = depth;
                /* If the depth of the route is more than 24, store it
                 * in another table as well.
                 */
                if (depth > 24) {
                        ptr_ldepth_rule->ip = ptr_rule->ip;
                        ptr_ldepth_rule->depth = ptr_rule->depth;
                        ptr_ldepth_rule++;
                        num_ldepth_route_entries++;
                }
                ptr_rule++;
                start = (start + step) & mask;
        }
        num_route_entries += rule_num;
}

static void insert_rule_in_random_pos(uint32_t ip, uint8_t depth)
{
        uint32_t pos;
        int try_count = 0;
        struct route_rule tmp;

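        /* Pick a random position within the current table, giving up
         * after 10 tries and falling back to the middle of the table.
         */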
        do {
                pos = lrand48();
                try_count++;
        } while ((try_count < 10) && (pos > num_route_entries));

        if ((pos > num_route_entries) || (pos >= MAX_RULE_NUM))
                pos = num_route_entries >> 1;

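        /* Overwrite the slot at pos with the new rule and append the
         * displaced rule at the tail, so no existing rule is lost.
         */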
        tmp = large_route_table[pos];
        large_route_table[pos].ip = ip;
        large_route_table[pos].depth = depth;
        if (num_route_entries < MAX_RULE_NUM)
                large_route_table[num_route_entries++] = tmp;
}

static void generate_large_route_rule_table(void)
{
        uint32_t ip_class;
        uint8_t  depth;

        num_route_entries = 0;
        num_ldepth_route_entries = 0;
        memset(large_route_table, 0, sizeof(large_route_table));

        for (ip_class = IP_CLASS_A; ip_class <= IP_CLASS_C; ip_class++) {
                for (depth = 1; depth <= RTE_LPM_MAX_DEPTH; depth++) {
                        generate_random_rule_prefix(ip_class, depth);
                }
        }

        /* Add the following rules to stay consistent with the previous large
         * constant table: four rules with private local IP addresses and one
         * all-zeros prefix with depth = 8.
         */
        insert_rule_in_random_pos(RTE_IPV4(0, 0, 0, 0), 8);
        insert_rule_in_random_pos(RTE_IPV4(10, 2, 23, 147), 32);
        insert_rule_in_random_pos(RTE_IPV4(192, 168, 100, 10), 24);
        insert_rule_in_random_pos(RTE_IPV4(192, 168, 25, 100), 24);
        insert_rule_in_random_pos(RTE_IPV4(192, 168, 129, 124), 32);
}

static void
print_route_distribution(const struct route_rule *table, uint32_t n)
{
        unsigned i, j;

        printf("Route distribution per prefix width:\n");
        printf("DEPTH    QUANTITY (PERCENT)\n");
        printf("---------------------------\n");

        /* Count depths. */
        for (i = 1; i <= 32; i++) {
                unsigned depth_counter = 0;
                double percent_hits;

                for (j = 0; j < n; j++)
                        if (table[j].depth == (uint8_t) i)
                                depth_counter++;

                percent_hits = ((double)depth_counter)/((double)n) * 100;
                printf("%.2u%15u (%.2f)\n", i, depth_counter, percent_hits);
        }
        printf("\n");
}

/* Worker cores enabled for the test and their count */
static uint16_t enabled_core_ids[RTE_MAX_LCORE];
static unsigned int num_cores;

/* Simple way to allocate thread ids in 0 to RTE_MAX_LCORE space */
static inline uint32_t
alloc_thread_id(void)
{
        uint32_t tmp_thr_id;

        tmp_thr_id = __atomic_fetch_add(&thr_id, 1, __ATOMIC_RELAXED);
        if (tmp_thr_id >= RTE_MAX_LCORE)
                printf("Invalid thread id %u\n", tmp_thr_id);

        return tmp_thr_id;
}

/*
 * Reader thread using rte_lpm data structure without RCU.
 */
static int
test_lpm_reader(void *arg)
{
        int i;
        uint32_t ip_batch[QSBR_REPORTING_INTERVAL];
        uint32_t next_hop_return = 0;

        RTE_SET_USED(arg);
        do {
                for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
                        ip_batch[i] = rte_rand();

                for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
                        rte_lpm_lookup(lpm, ip_batch[i], &next_hop_return);

        } while (!writer_done);

        return 0;
}

/*
 * Reader thread using rte_lpm data structure with RCU.
 */
static int
test_lpm_rcu_qsbr_reader(void *arg)
{
        int i;
        uint32_t thread_id = alloc_thread_id();
        uint32_t ip_batch[QSBR_REPORTING_INTERVAL];
        uint32_t next_hop_return = 0;

        RTE_SET_USED(arg);
        /* Register this thread to report quiescent state */
        rte_rcu_qsbr_thread_register(rv, thread_id);
        rte_rcu_qsbr_thread_online(rv, thread_id);

        do {
                for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
                        ip_batch[i] = rte_rand();

                for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
                        rte_lpm_lookup(lpm, ip_batch[i], &next_hop_return);

                /* Update quiescent state */
                rte_rcu_qsbr_quiescent(rv, thread_id);
        } while (!writer_done);

        rte_rcu_qsbr_thread_offline(rv, thread_id);
        rte_rcu_qsbr_thread_unregister(rv, thread_id);

        return 0;
}

/*
 * Writer thread using rte_lpm data structure with RCU.
 */
static int
test_lpm_rcu_qsbr_writer(void *arg)
{
        unsigned int i, j, si, ei;
        uint64_t begin, total_cycles;
        uint8_t core_id = (uint8_t)((uintptr_t)arg);
        uint32_t next_hop_add = 0xAA;

        /* 2 writer threads are used */
        if (core_id % 2 == 0) {
                si = 0;
                ei = NUM_LDEPTH_ROUTE_ENTRIES / 2;
        } else {
                si = NUM_LDEPTH_ROUTE_ENTRIES / 2;
                ei = NUM_LDEPTH_ROUTE_ENTRIES;
        }
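        /* Each writer owns one contiguous half of the ldepth route
         * table, so the two writers never add/delete the same prefix.
         */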

        /* Measure add/delete. */
        begin = rte_rdtsc_precise();
        for (i = 0; i < RCU_ITERATIONS; i++) {
                /* Add all the entries */
                for (j = si; j < ei; j++) {
                        pthread_mutex_lock(&lpm_mutex);
                        if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
                                        large_ldepth_route_table[j].depth,
                                        next_hop_add) != 0) {
                                printf("Failed to add iteration %d, route# %d\n",
                                        i, j);
                        }
                        pthread_mutex_unlock(&lpm_mutex);
                }

                /* Delete all the entries */
                for (j = si; j < ei; j++) {
                        pthread_mutex_lock(&lpm_mutex);
                        if (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
                                large_ldepth_route_table[j].depth) != 0) {
                                printf("Failed to delete iteration %d, route# %d\n",
                                        i, j);
                        }
                        pthread_mutex_unlock(&lpm_mutex);
                }
        }

        total_cycles = rte_rdtsc_precise() - begin;

        __atomic_fetch_add(&gwrite_cycles, total_cycles, __ATOMIC_RELAXED);
        /* This writer performed (ei - si) adds plus (ei - si) deletes
         * per iteration.
         */
        __atomic_fetch_add(&gwrites,
                        2 * (ei - si) * RCU_ITERATIONS,
                        __ATOMIC_RELAXED);

        return 0;
}

/*
 * Perf test:
 * 2 writers, rest are readers
 */
static int
test_lpm_rcu_perf_multi_writer(void)
{
        struct rte_lpm_config config;
        size_t sz;
        unsigned int i;
        uint16_t core_id;
        struct rte_lpm_rcu_config rcu_cfg = {0};

        if (rte_lcore_count() < 3) {
                printf("Not enough cores for lpm_rcu_perf_autotest, expecting at least 3\n");
                return TEST_SKIPPED;
        }

        num_cores = 0;
        RTE_LCORE_FOREACH_SLAVE(core_id) {
                enabled_core_ids[num_cores] = core_id;
                num_cores++;
        }

        printf("\nPerf test: 2 writers, %d readers, RCU integration enabled\n",
                num_cores - 2);

        /* Create LPM table */
        config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
        config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
        config.flags = 0;
        lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
        TEST_LPM_ASSERT(lpm != NULL);

        /* Init RCU variable */
        sz = rte_rcu_qsbr_get_memsize(num_cores);
        rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
                                                RTE_CACHE_LINE_SIZE);
        if (rv == NULL) {
                printf("RCU QSBR variable allocation failed\n");
                goto error;
        }
        rte_rcu_qsbr_init(rv, num_cores);

        rcu_cfg.v = rv;
        /* Assign the RCU variable to LPM */
        if (rte_lpm_rcu_qsbr_add(lpm, &rcu_cfg) != 0) {
                printf("RCU variable assignment failed\n");
                goto error;
        }
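        /* With no mode set in rcu_cfg, rte_lpm_rcu_qsbr_add() uses its
         * default defer-queue mode, so freed tbl8 groups are reclaimed
         * only after the registered readers report quiescence.
         */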

        writer_done = 0;
        __atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
        __atomic_store_n(&gwrites, 0, __ATOMIC_RELAXED);

        __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);

        /* Launch reader threads */
        for (i = 2; i < num_cores; i++)
                rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
                                        enabled_core_ids[i]);

        /* Launch writer threads */
        for (i = 0; i < 2; i++)
                rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
                                        (void *)(uintptr_t)i,
                                        enabled_core_ids[i]);

        /* Wait for writer threads */
        for (i = 0; i < 2; i++)
                if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
                        goto error;

        printf("Total LPM Adds: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Total LPM Deletes: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Average LPM Add/Del: %"PRIu64" cycles\n",
                __atomic_load_n(&gwrite_cycles, __ATOMIC_RELAXED) /
                        __atomic_load_n(&gwrites, __ATOMIC_RELAXED)
                );

        /* Wait and check return value from reader threads */
        writer_done = 1;
        for (i = 2; i < num_cores; i++)
                if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
                        goto error;

        rte_lpm_free(lpm);
        rte_free(rv);
        lpm = NULL;
        rv = NULL;

        /* Test without RCU integration */
        printf("\nPerf test: 2 writers, %d readers, RCU integration disabled\n",
                num_cores - 2);

        /* Create LPM table */
        config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
        config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
        config.flags = 0;
        lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
        TEST_LPM_ASSERT(lpm != NULL);

        writer_done = 0;
        __atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
        __atomic_store_n(&gwrites, 0, __ATOMIC_RELAXED);
        __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);

        /* Launch reader threads */
        for (i = 2; i < num_cores; i++)
                rte_eal_remote_launch(test_lpm_reader, NULL,
                                        enabled_core_ids[i]);

        /* Launch writer threads */
        for (i = 0; i < 2; i++)
                rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
                                        (void *)(uintptr_t)i,
                                        enabled_core_ids[i]);

        /* Wait for writer threads */
        for (i = 0; i < 2; i++)
                if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
                        goto error;

        printf("Total LPM Adds: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Total LPM Deletes: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Average LPM Add/Del: %"PRIu64" cycles\n",
                __atomic_load_n(&gwrite_cycles, __ATOMIC_RELAXED) /
                        __atomic_load_n(&gwrites, __ATOMIC_RELAXED)
                );

        writer_done = 1;
        /* Wait and check return value from reader threads */
        for (i = 2; i < num_cores; i++)
                if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
                        goto error;

        rte_lpm_free(lpm);

        return 0;

error:
        writer_done = 1;
        /* Wait until all readers have exited */
        rte_eal_mp_wait_lcore();

        rte_lpm_free(lpm);
        rte_free(rv);

        return -1;
}

/*
 * Perf test:
 * Single writer, rest are readers
 */
static int
test_lpm_rcu_perf(void)
{
        struct rte_lpm_config config;
        uint64_t begin, total_cycles;
        size_t sz;
        unsigned int i, j;
        uint16_t core_id;
        uint32_t next_hop_add = 0xAA;
        struct rte_lpm_rcu_config rcu_cfg = {0};

        if (rte_lcore_count() < 2) {
                printf("Not enough cores for lpm_rcu_perf_autotest, expecting at least 2\n");
                return TEST_SKIPPED;
        }

        num_cores = 0;
        RTE_LCORE_FOREACH_SLAVE(core_id) {
                enabled_core_ids[num_cores] = core_id;
                num_cores++;
        }

        printf("\nPerf test: 1 writer, %d readers, RCU integration enabled\n",
                num_cores);

        /* Create LPM table */
        config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
        config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
        config.flags = 0;
        lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
        TEST_LPM_ASSERT(lpm != NULL);

        /* Init RCU variable */
        sz = rte_rcu_qsbr_get_memsize(num_cores);
        rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
                                                RTE_CACHE_LINE_SIZE);
        if (rv == NULL) {
                printf("RCU QSBR variable allocation failed\n");
                goto error;
        }
        rte_rcu_qsbr_init(rv, num_cores);

        rcu_cfg.v = rv;
        /* Assign the RCU variable to LPM */
        if (rte_lpm_rcu_qsbr_add(lpm, &rcu_cfg) != 0) {
                printf("RCU variable assignment failed\n");
                goto error;
        }

        writer_done = 0;
        __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);

        /* Launch reader threads */
        for (i = 0; i < num_cores; i++)
                rte_eal_remote_launch(test_lpm_rcu_qsbr_reader, NULL,
                                        enabled_core_ids[i]);

        /* Measure add/delete. */
        begin = rte_rdtsc_precise();
        for (i = 0; i < RCU_ITERATIONS; i++) {
                /* Add all the entries */
                for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
                        if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
                                        large_ldepth_route_table[j].depth,
                                        next_hop_add) != 0) {
                                printf("Failed to add iteration %d, route# %d\n",
                                        i, j);
                                goto error;
                        }

                /* Delete all the entries */
                for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
                        if (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
                                large_ldepth_route_table[j].depth) != 0) {
                                printf("Failed to delete iteration %d, route# %d\n",
                                        i, j);
                                goto error;
                        }
        }
        total_cycles = rte_rdtsc_precise() - begin;

        printf("Total LPM Adds: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Total LPM Deletes: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Average LPM Add/Del: %g cycles\n",
                (double)total_cycles / (NUM_LDEPTH_ROUTE_ENTRIES * RCU_ITERATIONS));

        writer_done = 1;
        /* Wait and check return value from reader threads */
        for (i = 0; i < num_cores; i++)
                if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
                        goto error;

        rte_lpm_free(lpm);
        rte_free(rv);
        lpm = NULL;
        rv = NULL;

        /* Test without RCU integration */
        printf("\nPerf test: 1 writer, %d readers, RCU integration disabled\n",
                num_cores);

        /* Create LPM table */
        config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
        config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
        config.flags = 0;
        lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
        TEST_LPM_ASSERT(lpm != NULL);

        writer_done = 0;
        __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);

        /* Launch reader threads */
        for (i = 0; i < num_cores; i++)
                rte_eal_remote_launch(test_lpm_reader, NULL,
                                        enabled_core_ids[i]);

        /* Measure add/delete. */
        begin = rte_rdtsc_precise();
        for (i = 0; i < RCU_ITERATIONS; i++) {
                /* Add all the entries */
                for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
                        if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
                                        large_ldepth_route_table[j].depth,
                                        next_hop_add) != 0) {
                                printf("Failed to add iteration %d, route# %d\n",
                                        i, j);
                                goto error;
                        }

                /* Delete all the entries */
                for (j = 0; j < NUM_LDEPTH_ROUTE_ENTRIES; j++)
                        if (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
                                large_ldepth_route_table[j].depth) != 0) {
                                printf("Failed to delete iteration %d, route# %d\n",
                                        i, j);
                                goto error;
                        }
        }
        total_cycles = rte_rdtsc_precise() - begin;

        printf("Total LPM Adds: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Total LPM Deletes: %d\n",
                RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES);
        printf("Average LPM Add/Del: %g cycles\n",
                (double)total_cycles / (NUM_LDEPTH_ROUTE_ENTRIES * RCU_ITERATIONS));

        writer_done = 1;
        /* Wait and check return value from reader threads */
        for (i = 0; i < num_cores; i++)
                if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
                        printf("Warning: lcore %u not finished.\n",
                                enabled_core_ids[i]);

        rte_lpm_free(lpm);

        return 0;

error:
        writer_done = 1;
        /* Wait until all readers have exited */
        rte_eal_mp_wait_lcore();

        rte_lpm_free(lpm);
        rte_free(rv);

        return -1;
}

static int
test_lpm_perf(void)
{
        struct rte_lpm_config config;
        uint64_t begin, total_time, lpm_used_entries = 0;
        unsigned i, j;
        uint32_t next_hop_add = 0xAA, next_hop_return = 0;
        int status = 0;
        uint64_t cache_line_counter = 0;
        int64_t count = 0;

        config.max_rules = 2000000;
        config.number_tbl8s = 2048;
        config.flags = 0;

        rte_srand(rte_rdtsc());

        generate_large_route_rule_table();

        printf("No. routes = %u\n", (unsigned) NUM_ROUTE_ENTRIES);

        print_route_distribution(large_route_table, (uint32_t) NUM_ROUTE_ENTRIES);

        lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
        TEST_LPM_ASSERT(lpm != NULL);

        /* Measure add. */
        begin = rte_rdtsc();

        for (i = 0; i < NUM_ROUTE_ENTRIES; i++) {
                if (rte_lpm_add(lpm, large_route_table[i].ip,
                                large_route_table[i].depth, next_hop_add) == 0)
                        status++;
        }
        /* End Timer. */
        total_time = rte_rdtsc() - begin;

        printf("Unique added entries = %d\n", status);
        /* Obtain add statistics. */
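        /* Treat each run of 32 consecutive tbl24 entries as one 64-byte
         * cache line: at every 32-entry boundary, if the number of valid
         * entries seen so far has grown since the last checkpoint, count
         * that cache line as used.
         */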
        for (i = 0; i < RTE_LPM_TBL24_NUM_ENTRIES; i++) {
                if (lpm->tbl24[i].valid)
                        lpm_used_entries++;

                if (i % 32 == 0) {
                        if ((uint64_t)count < lpm_used_entries) {
                                cache_line_counter++;
                                count = lpm_used_entries;
                        }
                }
        }

        printf("Used table 24 entries = %u (%g%%)\n",
                        (unsigned) lpm_used_entries,
                        (lpm_used_entries * 100.0) / RTE_LPM_TBL24_NUM_ENTRIES);
        printf("64 byte Cache entries used = %u (%u bytes)\n",
                        (unsigned) cache_line_counter, (unsigned) cache_line_counter * 64);

        printf("Average LPM Add: %g cycles\n",
                        (double)total_time / NUM_ROUTE_ENTRIES);

        /* Measure single Lookup */
        total_time = 0;
        count = 0;

        for (i = 0; i < ITERATIONS; i++) {
                static uint32_t ip_batch[BATCH_SIZE];

                for (j = 0; j < BATCH_SIZE; j++)
                        ip_batch[j] = rte_rand();

                /* Lookup per batch */
                begin = rte_rdtsc();

                for (j = 0; j < BATCH_SIZE; j++) {
                        if (rte_lpm_lookup(lpm, ip_batch[j], &next_hop_return) != 0)
                                count++;
                }

                total_time += rte_rdtsc() - begin;
        }
        printf("Average LPM Lookup: %.1f cycles (fails = %.1f%%)\n",
                        (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
                        (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));

        /* Measure bulk Lookup */
        total_time = 0;
        count = 0;
        for (i = 0; i < ITERATIONS; i++) {
                static uint32_t ip_batch[BATCH_SIZE];
                uint32_t next_hops[BULK_SIZE];

                /* Create array of random IP addresses */
                for (j = 0; j < BATCH_SIZE; j++)
                        ip_batch[j] = rte_rand();

                /* Lookup per batch */
                begin = rte_rdtsc();
                for (j = 0; j < BATCH_SIZE; j += BULK_SIZE) {
                        unsigned k;
                        rte_lpm_lookup_bulk(lpm, &ip_batch[j], next_hops, BULK_SIZE);
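                        /* On a hit the bulk API sets RTE_LPM_LOOKUP_SUCCESS
                         * in the returned next-hop word; count the misses.
                         */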
                        for (k = 0; k < BULK_SIZE; k++)
                                if (unlikely(!(next_hops[k] & RTE_LPM_LOOKUP_SUCCESS)))
                                        count++;
                }

                total_time += rte_rdtsc() - begin;
        }
        printf("BULK LPM Lookup: %.1f cycles (fails = %.1f%%)\n",
                        (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
                        (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));

        /* Measure LookupX4 */
        total_time = 0;
        count = 0;
        for (i = 0; i < ITERATIONS; i++) {
                static uint32_t ip_batch[BATCH_SIZE];
                uint32_t next_hops[4];

                /* Create array of random IP addresses */
                for (j = 0; j < BATCH_SIZE; j++)
                        ip_batch[j] = rte_rand();

                /* Lookup per batch */
                begin = rte_rdtsc();
                for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
                        unsigned k;
                        xmm_t ipx4;

                        /* Use an unaligned vector load; ip_batch carries
                         * no 16-byte alignment guarantee.
                         */
                        ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
                        rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT32_MAX);
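                        /* Misses come back as the UINT32_MAX default value
                         * passed to rte_lpm_lookupx4() above.
                         */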
                        for (k = 0; k < RTE_DIM(next_hops); k++)
                                if (unlikely(next_hops[k] == UINT32_MAX))
                                        count++;
                }

                total_time += rte_rdtsc() - begin;
        }
        printf("LPM LookupX4: %.1f cycles (fails = %.1f%%)\n",
                        (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
                        (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));

        /* Measure Delete */
        status = 0;
        begin = rte_rdtsc();

        for (i = 0; i < NUM_ROUTE_ENTRIES; i++) {
                /* rte_lpm_delete(lpm, ip, depth) */
                status += rte_lpm_delete(lpm, large_route_table[i].ip,
                                large_route_table[i].depth);
        }

        total_time = rte_rdtsc() - begin;

        printf("Average LPM Delete: %g cycles\n",
                        (double)total_time / NUM_ROUTE_ENTRIES);

        rte_lpm_delete_all(lpm);
        rte_lpm_free(lpm);

        if (test_lpm_rcu_perf() < 0)
                return -1;

        if (test_lpm_rcu_perf_multi_writer() < 0)
                return -1;

        return 0;
}

REGISTER_TEST_COMMAND(lpm_perf_autotest, test_lpm_perf);