--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_stack.h>
+
+#include "test.h"
+
+/* Name passed to rte_stack_create() for the stack under test. */
+#define STACK_NAME "STACK_PERF"
+/* Largest bulk size exercised; also the length of the local object arrays. */
+#define MAX_BURST 32
+/*
+ * Capacity requested for the test stack: one maximum-size burst for every
+ * possible lcore.
+ */
+#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
+
+/* Element count of a true array; not valid when x is a pointer. */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
+ * constants.
+ */
+static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};
+
+/*
+ * Start barrier for the multi-lcore tests. The launcher sets it to the number
+ * of participating lcores; each worker decrements it and spins until it
+ * reaches zero before starting its timed loop.
+ */
+static rte_atomic32_t lcore_barrier;
+
+/* Two lcore IDs chosen by the get_two_*() helpers to run a paired test. */
+struct lcore_pair {
+ unsigned int c1; /* first lcore ID of the pair */
+ unsigned int c2; /* second lcore ID of the pair */
+};
+
+/*
+ * Search the lcores visited by RTE_LCORE_FOREACH for two distinct lcores
+ * whose lcore_config entries share both core_id and socket_id; the test
+ * treats such a pair as hyperthread siblings.
+ *
+ * Returns 0 and fills *lcp with the first pair found, or returns 1 (leaving
+ * *lcp untouched) when no such pair exists.
+ */
+static int
+get_two_hyperthreads(struct lcore_pair *lcp)
+{
+	unsigned int first, second;
+
+	RTE_LCORE_FOREACH(first) {
+		RTE_LCORE_FOREACH(second) {
+			if (first == second)
+				continue;
+			if (lcore_config[first].core_id !=
+			    lcore_config[second].core_id)
+				continue;
+			if (lcore_config[first].socket_id !=
+			    lcore_config[second].socket_id)
+				continue;
+
+			lcp->c1 = first;
+			lcp->c2 = second;
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Search the lcores visited by RTE_LCORE_FOREACH for two lcores whose
+ * lcore_config entries have different core_id values but the same socket_id,
+ * i.e. two separate cores on one socket.
+ *
+ * Returns 0 and fills *lcp with the first pair found, or returns 1 (leaving
+ * *lcp untouched) when no such pair exists.
+ */
+static int
+get_two_cores(struct lcore_pair *lcp)
+{
+	unsigned int first, second;
+
+	RTE_LCORE_FOREACH(first) {
+		RTE_LCORE_FOREACH(second) {
+			if (first == second)
+				continue;
+			/* Same core_id: not two different cores. */
+			if (lcore_config[first].core_id ==
+			    lcore_config[second].core_id)
+				continue;
+			/* Different socket_id: not on the same socket. */
+			if (lcore_config[first].socket_id !=
+			    lcore_config[second].socket_id)
+				continue;
+
+			lcp->c1 = first;
+			lcp->c2 = second;
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Search the lcores visited by RTE_LCORE_FOREACH for two lcores whose
+ * lcore_config entries report different socket_id values.
+ *
+ * Returns 0 and fills *lcp with the first pair found, or returns 1 (leaving
+ * *lcp untouched) when all lcores are on one socket.
+ */
+static int
+get_two_sockets(struct lcore_pair *lcp)
+{
+	unsigned int first, second;
+
+	RTE_LCORE_FOREACH(first) {
+		RTE_LCORE_FOREACH(second) {
+			if (first == second)
+				continue;
+			if (lcore_config[first].socket_id ==
+			    lcore_config[second].socket_id)
+				continue;
+
+			lcp->c1 = first;
+			lcp->c2 = second;
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Time a long run of pop calls, each asking for bulk_sizes[0] objects, and
+ * print the average number of TSC cycles per call. The caller is expected to
+ * pass a stack that holds no objects, so this measures the empty-pop cost.
+ */
+static void
+test_empty_pop(struct rte_stack *s)
+{
+	const unsigned int nb_pops = 100000000;
+	void *popped[MAX_BURST];
+	uint64_t t0, t1;
+	unsigned int n;
+
+	t0 = rte_rdtsc();
+	for (n = 0; n < nb_pops; n++)
+		rte_stack_pop(s, popped, bulk_sizes[0]);
+	t1 = rte_rdtsc();
+
+	printf("Stack empty pop: %.2F\n", (double)(t1 - t0) / nb_pops);
+}
+
+/* Per-lcore argument block passed to bulk_push_pop(). */
+struct thread_args {
+ struct rte_stack *s; /* in: stack to push to and pop from */
+ unsigned int sz; /* in: number of objects per push/pop call */
+ double avg; /* out: average cycles per object for one push+pop */
+};
+
+/*
+ * lcore worker measuring the average per-pointer cycle cost of a stack push
+ * followed by a pop.
+ *
+ * p points to a struct thread_args: 's' and 'sz' select the stack and the
+ * bulk size, and the result is stored in 'avg'. The worker first checks in at
+ * lcore_barrier and waits for it to reach zero so that all participants start
+ * timing together. Always returns 0.
+ */
+static int
+bulk_push_pop(void *p)
+{
+	const unsigned int nb_rounds = 1000000;
+	struct thread_args *targs = p;
+	struct rte_stack *stack = targs->s;
+	unsigned int burst = targs->sz;
+	void *objs[MAX_BURST] = {0};
+	uint64_t t0, t1;
+	unsigned int round;
+
+	/* Announce arrival, then spin until every lcore has arrived. */
+	rte_atomic32_sub(&lcore_barrier, 1);
+	while (rte_atomic32_read(&lcore_barrier) != 0)
+		rte_pause();
+
+	t0 = rte_rdtsc();
+	for (round = 0; round < nb_rounds; round++) {
+		rte_stack_push(stack, objs, burst);
+		rte_stack_pop(stack, objs, burst);
+	}
+	t1 = rte_rdtsc();
+
+	targs->avg = (double)(t1 - t0) / (nb_rounds * burst);
+
+	return 0;
+}
+
+/* Start fn(arg) on a remote lcore; abort the test if the launch fails. */
+static void
+launch_or_panic(lcore_function_t fn, void *arg, unsigned int lcore)
+{
+	if (rte_eal_remote_launch(fn, arg, lcore))
+		rte_panic("Failed to launch lcore %u\n", lcore);
+}
+
+/*
+ * Run fn simultaneously on the two lcores of a pair, once per entry in
+ * bulk_sizes, and print the mean of the two per-lcore averages. Used to
+ * measure stack performance between hyperthread siblings, between cores on
+ * the same socket, and between cores on different sockets.
+ *
+ * cores: the two lcores to use; either one may be the master lcore.
+ * s:     stack shared by both workers.
+ * fn:    worker taking a struct thread_args * and storing its result in avg.
+ */
+static void
+run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
+		 lcore_function_t fn)
+{
+	const unsigned int master = rte_get_master_lcore();
+	struct thread_args args[2];
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		/* Both workers wait on this barrier before timing. */
+		rte_atomic32_set(&lcore_barrier, 2);
+
+		args[0].sz = args[1].sz = bulk_sizes[i];
+		args[0].s = args[1].s = s;
+
+		/*
+		 * The master lcore is the one executing this function, so it
+		 * cannot be the target of a remote launch: whichever member
+		 * of the pair is the master runs fn inline. Without the
+		 * c2 == master case the peer would spin on the barrier
+		 * forever.
+		 */
+		if (cores->c1 == master) {
+			launch_or_panic(fn, &args[1], cores->c2);
+			fn(&args[0]);
+			rte_eal_wait_lcore(cores->c2);
+		} else if (cores->c2 == master) {
+			launch_or_panic(fn, &args[0], cores->c1);
+			fn(&args[1]);
+			rte_eal_wait_lcore(cores->c1);
+		} else {
+			launch_or_panic(fn, &args[0], cores->c1);
+			launch_or_panic(fn, &args[1], cores->c2);
+			rte_eal_wait_lcore(cores->c1);
+			rte_eal_wait_lcore(cores->c2);
+		}
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
+	}
+}
+
+/*
+ * Run fn simultaneously on n lcores (the calling lcore plus the first n - 1
+ * slave lcores), once per entry in bulk_sizes, and print the mean of the
+ * per-lcore averages that fn stores in its struct thread_args.
+ */
+static void
+run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
+{
+	struct thread_args args[RTE_MAX_LCORE];
+	unsigned int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(bulk_sizes); idx++) {
+		unsigned int self, lcore_id;
+		double total;
+		int used;
+
+		/* Every participant checks in here before timing. */
+		rte_atomic32_set(&lcore_barrier, n);
+
+		/* Start fn on slave lcores until n - 1 of them are busy. */
+		used = 0;
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+			if (++used >= n)
+				break;
+
+			args[lcore_id].s = s;
+			args[lcore_id].sz = bulk_sizes[idx];
+
+			if (rte_eal_remote_launch(fn, &args[lcore_id],
+						  lcore_id))
+				rte_panic("Failed to launch lcore %d\n",
+					  lcore_id);
+		}
+
+		/* The calling lcore is the remaining participant. */
+		self = rte_lcore_id();
+		args[self].s = s;
+		args[self].sz = bulk_sizes[idx];
+		fn(&args[self]);
+
+		rte_eal_mp_wait_lcore();
+
+		/* Sum the results of exactly the lcores that took part. */
+		total = args[self].avg;
+		used = 0;
+		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+			if (++used >= n)
+				break;
+			total += args[lcore_id].avg;
+		}
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[idx], total / n);
+	}
+}
+
+/*
+ * Time a loop that pushes one pointer and immediately pops it again on the
+ * calling lcore, then print the average TSC cycles per push/pop round.
+ */
+static void
+test_single_push_pop(struct rte_stack *s)
+{
+	const unsigned int nb_rounds = 16000000;
+	void *ptr = NULL;
+	uint64_t t0, t1;
+	unsigned int round;
+
+	t0 = rte_rdtsc();
+	for (round = 0; round < nb_rounds; round++) {
+		rte_stack_push(s, &ptr, 1);
+		rte_stack_pop(s, &ptr, 1);
+	}
+	t1 = rte_rdtsc();
+
+	printf("Average cycles per single object push/pop: %.2F\n",
+	       (double)(t1 - t0) / nb_rounds);
+}
+
+/*
+ * Measure the cycle cost of bulk pushing and popping on a single lcore.
+ *
+ * For each entry in bulk_sizes, time a loop of push-then-pop calls of that
+ * size on stack s and print the average TSC cycles per object.
+ */
+static void
+test_bulk_push_pop(struct rte_stack *s)
+{
+	unsigned int iterations = 8000000;
+	/*
+	 * Zero-initialised because the array contents are pushed onto the
+	 * stack; this avoids reading indeterminate pointer values and matches
+	 * bulk_push_pop().
+	 */
+	void *objs[MAX_BURST] = {0};
+	unsigned int sz, i;
+
+	for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
+		uint64_t start = rte_rdtsc();
+
+		/* bulk_sizes is volatile: it is re-read on every call. */
+		for (i = 0; i < iterations; i++) {
+			rte_stack_push(s, objs, bulk_sizes[sz]);
+			rte_stack_pop(s, objs, bulk_sizes[sz]);
+		}
+
+		uint64_t end = rte_rdtsc();
+
+		double avg = ((double)(end - start) /
+			      (iterations * bulk_sizes[sz]));
+
+		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
+		       bulk_sizes[sz], avg);
+	}
+}
+
+/*
+ * Entry point of the stack performance test.
+ *
+ * Creates the test stack, runs the single-lcore measurements, then the
+ * two-lcore measurements for each topology for which a suitable lcore pair
+ * is found, and finally the all-lcores measurement.
+ *
+ * Returns 0 on completion, or -1 if the stack cannot be created.
+ */
+static int
+test_stack_perf(void)
+{
+	/* Two-lcore scenarios, tried in this order. */
+	static const struct {
+		int (*find)(struct lcore_pair *lcp);
+		const char *banner;
+	} pair_tests[] = {
+		{ get_two_hyperthreads,
+		  "\n### Testing using two hyperthreads ###\n" },
+		{ get_two_cores,
+		  "\n### Testing using two physical cores ###\n" },
+		{ get_two_sockets,
+		  "\n### Testing using two NUMA nodes ###\n" },
+	};
+	struct lcore_pair pair;
+	struct rte_stack *stack;
+	unsigned int nb_lcores;
+	unsigned int t;
+
+	rte_atomic32_init(&lcore_barrier);
+
+	stack = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), 0);
+	if (stack == NULL) {
+		printf("[%s():%u] failed to create a stack\n",
+		       __func__, __LINE__);
+		return -1;
+	}
+
+	printf("### Testing single element push/pop ###\n");
+	test_single_push_pop(stack);
+
+	printf("\n### Testing empty pop ###\n");
+	test_empty_pop(stack);
+
+	printf("\n### Testing using a single lcore ###\n");
+	test_bulk_push_pop(stack);
+
+	/* Skip any scenario for which no matching lcore pair exists. */
+	for (t = 0; t < ARRAY_SIZE(pair_tests); t++) {
+		if (pair_tests[t].find(&pair) != 0)
+			continue;
+		printf("%s", pair_tests[t].banner);
+		run_on_core_pair(&pair, stack, bulk_push_pop);
+	}
+
+	nb_lcores = rte_lcore_count();
+	printf("\n### Testing on all %u lcores ###\n", nb_lcores);
+	run_on_n_cores(stack, bulk_push_pop, nb_lcores);
+
+	rte_stack_free(stack);
+	return 0;
+}
+
+/* Expose test_stack_perf() as the "stack_perf_autotest" test command. */
+REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);