net/ice: support auxiliary IP offset Rx descriptor
[dpdk.git] / app / test-flow-perf / main.c
index 1a4a50d..3589b31 100644 (file)
@@ -27,6 +27,7 @@
 #include <stdbool.h>
 #include <sys/time.h>
 #include <signal.h>
+#include <unistd.h>
 
 #include <rte_malloc.h>
 #include <rte_mempool.h>
@@ -47,13 +48,45 @@ static uint8_t flow_group;
 static uint64_t flow_items;
 static uint64_t flow_actions;
 static uint64_t flow_attrs;
+
 static volatile bool force_quit;
 static bool dump_iterations;
+static bool delete_flag;
+static bool dump_socket_mem_flag;
+static bool enable_fwd;
+
 static struct rte_mempool *mbuf_mp;
 static uint32_t nb_lcores;
 static uint32_t flows_count;
 static uint32_t iterations_number;
-static uint32_t hairpinq;
+static uint32_t hairpin_queues_num; /* total hairpin q number - default: 0 */
+static uint32_t nb_lcores;
+
+#define MAX_PKT_BURST    32	/* mbufs handled per rte_eth_rx_burst() call */
+#define LCORE_MODE_PKT    1	/* lcore forwards packets */
+#define LCORE_MODE_STATS  2	/* lcore prints per-second statistics */
+#define MAX_STREAMS      64	/* max (port, queue) streams per lcore */
+#define MAX_LCORES       64	/* size of the lcore_infos table */
+
+/* One forwarding stream: packets received on (rx_port, rx_queue) are
+ * retransmitted on (tx_port, tx_queue). A value of -1 marks the field
+ * (and hence the stream slot) as unused. */
+struct stream {
+       int tx_port;
+       int tx_queue;
+       int rx_port;
+       int rx_queue;
+};
+
+/* Per-lcore forwarding state, stream assignments and counters. */
+struct lcore_info {
+       int mode;	/* LCORE_MODE_* value, or 0 when the lcore is idle */
+       int streams_nb;	/* number of streams assigned to this lcore */
+       struct stream streams[MAX_STREAMS];
+       /* stats */
+       uint64_t tx_pkts;
+       uint64_t tx_drops;
+       uint64_t rx_pkts;
+       struct rte_mbuf *pkts[MAX_PKT_BURST];	/* rx/tx burst buffer */
+} __rte_cache_aligned;
+
+static struct lcore_info lcore_infos[MAX_LCORES];
 
 static void
 usage(char *progname)
@@ -64,6 +97,11 @@ usage(char *progname)
                " flows to insert, default is 4,000,000\n");
        printf("  --dump-iterations: To print rates for each"
                " iteration\n");
+       printf("  --deletion-rate: Enable deletion rate"
+               " calculations\n");
+       printf("  --dump-socket-mem: To dump all socket memory\n");
+       printf("  --enable-fwd: To enable packets forwarding"
+               " after insertion\n");
 
        printf("To set flow attributes:\n");
        printf("  --ingress: set ingress attribute in flows\n");
@@ -98,7 +136,7 @@ usage(char *progname)
        printf("  --set-tag: add set tag action in flow actions\n");
        printf("  --drop: add drop action in flow actions\n");
        printf("  --hairpin-queue=N: add hairpin-queue action in flow actions\n");
-       printf("  --hairpin-rss=N: add hairping-rss action in flow actions\n");
+       printf("  --hairpin-rss=N: add hairpin-rss action in flow actions\n");
 }
 
 static void
@@ -246,6 +284,9 @@ args_parse(int argc, char **argv)
                { "help",                       0, 0, 0 },
                { "flows-count",                1, 0, 0 },
                { "dump-iterations",            0, 0, 0 },
+               { "deletion-rate",              0, 0, 0 },
+               { "dump-socket-mem",            0, 0, 0 },
+               { "enable-fwd",                 0, 0, 0 },
                /* Attributes */
                { "ingress",                    0, 0, 0 },
                { "egress",                     0, 0, 0 },
@@ -282,7 +323,7 @@ args_parse(int argc, char **argv)
        flow_items = 0;
        flow_actions = 0;
        flow_attrs = 0;
-       hairpinq = 0;
+       hairpin_queues_num = 0;
        argvopt = argv;
 
        printf(":: Flow -> ");
@@ -317,7 +358,7 @@ args_parse(int argc, char **argv)
                                        "hairpin-rss") == 0) {
                                n = atoi(optarg);
                                if (n > 0)
-                                       hairpinq = n;
+                                       hairpin_queues_num = n;
                                else
                                        rte_exit(EXIT_SUCCESS,
                                                "Hairpin queues should be > 0\n");
@@ -329,7 +370,7 @@ args_parse(int argc, char **argv)
                                        "hairpin-queue") == 0) {
                                n = atoi(optarg);
                                if (n > 0)
-                                       hairpinq = n;
+                                       hairpin_queues_num = n;
                                else
                                        rte_exit(EXIT_SUCCESS,
                                                "Hairpin queues should be > 0\n");
@@ -353,6 +394,15 @@ args_parse(int argc, char **argv)
                        if (strcmp(lgopts[opt_idx].name,
                                        "dump-iterations") == 0)
                                dump_iterations = true;
+                       if (strcmp(lgopts[opt_idx].name,
+                                       "deletion-rate") == 0)
+                               delete_flag = true;
+                       if (strcmp(lgopts[opt_idx].name,
+                                       "dump-socket-mem") == 0)
+                               dump_socket_mem_flag = true;
+                       if (strcmp(lgopts[opt_idx].name,
+                                       "enable-fwd") == 0)
+                               enable_fwd = true;
                        break;
                default:
                        fprintf(stderr, "Invalid option: %s\n", argv[optind]);
@@ -364,6 +414,62 @@ args_parse(int argc, char **argv)
        printf("end_flow\n");
 }
 
+/* Dump the socket memory statistics on console */
+static size_t
+dump_socket_mem(FILE *f)
+{
+       struct rte_malloc_socket_stats socket_stats;
+       unsigned int i = 0;
+       size_t total = 0;
+       size_t alloc = 0;
+       size_t free = 0;
+       unsigned int n_alloc = 0;
+       unsigned int n_free = 0;
+       bool active_nodes = false;
+
+
+       for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+               if (rte_malloc_get_socket_stats(i, &socket_stats) ||
+                   !socket_stats.heap_totalsz_bytes)
+                       continue;
+               active_nodes = true;
+               total += socket_stats.heap_totalsz_bytes;
+               alloc += socket_stats.heap_allocsz_bytes;
+               free += socket_stats.heap_freesz_bytes;
+               n_alloc += socket_stats.alloc_count;
+               n_free += socket_stats.free_count;
+               if (dump_socket_mem_flag) {
+                       fprintf(f, "::::::::::::::::::::::::::::::::::::::::");
+                       fprintf(f,
+                               "\nSocket %u:\nsize(M) total: %.6lf\nalloc:"
+                               " %.6lf(%.3lf%%)\nfree: %.6lf"
+                               "\nmax: %.6lf"
+                               "\ncount alloc: %u\nfree: %u\n",
+                               i,
+                               socket_stats.heap_totalsz_bytes / 1.0e6,
+                               socket_stats.heap_allocsz_bytes / 1.0e6,
+                               (double)socket_stats.heap_allocsz_bytes * 100 /
+                               (double)socket_stats.heap_totalsz_bytes,
+                               socket_stats.heap_freesz_bytes / 1.0e6,
+                               socket_stats.greatest_free_size / 1.0e6,
+                               socket_stats.alloc_count,
+                               socket_stats.free_count);
+                               fprintf(f, "::::::::::::::::::::::::::::::::::::::::");
+               }
+       }
+       if (dump_socket_mem_flag && active_nodes) {
+               fprintf(f,
+                       "\nTotal: size(M)\ntotal: %.6lf"
+                       "\nalloc: %.6lf(%.3lf%%)\nfree: %.6lf"
+                       "\ncount alloc: %u\nfree: %u\n",
+                       total / 1.0e6, alloc / 1.0e6,
+                       (double)alloc * 100 / (double)total, free / 1.0e6,
+                       n_alloc, n_free);
+               fprintf(f, "::::::::::::::::::::::::::::::::::::::::\n");
+       }
+       return alloc;
+}
+
 static void
 print_flow_error(struct rte_flow_error error)
 {
@@ -372,9 +478,75 @@ print_flow_error(struct rte_flow_error error)
                error.message ? error.message : "(no stated reason)");
 }
 
+/*
+ * Destroy every flow previously stored in flow_list for port_id and
+ * report the deletion rate, mirroring the iteration bookkeeping used
+ * for insertion in flows_handler().
+ */
+static inline void
+destroy_flows(int port_id, struct rte_flow **flow_list)
+{
+       struct rte_flow_error error;
+       clock_t start_iter, end_iter;
+       double cpu_time_used = 0;
+       double flows_rate;
+       double cpu_time_per_iter[MAX_ITERATIONS];
+       double delta;
+       uint32_t i;
+       int iter_id;
+
+       /* -1 marks an unused slot; such slots are skipped below. */
+       for (i = 0; i < MAX_ITERATIONS; i++)
+               cpu_time_per_iter[i] = -1;
+
+       /* Each iteration batch must contain at least one flow. */
+       if (iterations_number > flows_count)
+               iterations_number = flows_count;
+
+       /* Deletion Rate */
+       printf("Flows Deletion on port = %d\n", port_id);
+       start_iter = clock();
+       for (i = 0; i < flows_count; i++) {
+               /* A zero entry means no more flows were inserted. */
+               if (flow_list[i] == 0)
+                       break;
+
+               /* Poison the error struct so unfilled fields are visible. */
+               memset(&error, 0x33, sizeof(error));
+               if (rte_flow_destroy(port_id, flow_list[i], &error)) {
+                       print_flow_error(error);
+                       rte_exit(EXIT_FAILURE, "Error in deleting flow");
+               }
+
+               if (i && !((i + 1) % iterations_number)) {
+                       /* Save the deletion rate of each iter */
+                       end_iter = clock();
+                       delta = (double) (end_iter - start_iter);
+                       iter_id = ((i + 1) / iterations_number) - 1;
+                       cpu_time_per_iter[iter_id] =
+                               delta / CLOCKS_PER_SEC;
+                       cpu_time_used += cpu_time_per_iter[iter_id];
+                       start_iter = clock();
+               }
+       }
+
+       /* Deletion rate per iteration */
+       if (dump_iterations)
+               for (i = 0; i < MAX_ITERATIONS; i++) {
+                       if (cpu_time_per_iter[i] == -1)
+                               continue;
+                       delta = (double)(iterations_number /
+                               cpu_time_per_iter[i]);
+                       flows_rate = delta / 1000;
+                       printf(":: Iteration #%d: %d flows "
+                               "in %f sec[ Rate = %f K/Sec ]\n",
+                               i, iterations_number,
+                               cpu_time_per_iter[i], flows_rate);
+               }
+
+       /* Deletion rate for all flows */
+       /* NOTE(review): cpu_time_used stays 0 when fewer than
+        * iterations_number flows were destroyed, making this division
+        * yield inf - confirm inputs always complete one batch. */
+       flows_rate = ((double) (flows_count / cpu_time_used) / 1000);
+       printf("\n:: Total flow deletion rate -> %f K/Sec\n",
+               flows_rate);
+       printf(":: The time for deleting %d in flows %f seconds\n",
+               flows_count, cpu_time_used);
+}
+
 static inline void
 flows_handler(void)
 {
+       struct rte_flow **flow_list;
        struct rte_flow_error error;
        clock_t start_iter, end_iter;
        double cpu_time_used;
@@ -385,6 +557,7 @@ flows_handler(void)
        uint32_t i;
        int port_id;
        int iter_id;
+       uint32_t flow_index;
 
        nr_ports = rte_eth_dev_count_avail();
 
@@ -396,8 +569,14 @@ flows_handler(void)
 
        printf(":: Flows Count per port: %d\n", flows_count);
 
+       /*
+        * Fix: the buffer must hold flows_count + 1 POINTERS, not
+        * flows_count pointers plus one byte - flows_handler() stores
+        * the jump-group flow in addition to the flows_count flows,
+        * so the old size overflowed the allocation.
+        */
+       flow_list = rte_zmalloc("flow_list",
+               sizeof(struct rte_flow *) * (flows_count + 1), 0);
+       if (flow_list == NULL)
+               rte_exit(EXIT_FAILURE, "No Memory available!");
+
        for (port_id = 0; port_id < nr_ports; port_id++) {
                cpu_time_used = 0;
+               flow_index = 0;
                if (flow_group > 0) {
                        /*
                         * Create global rule to jump into flow_group,
@@ -416,6 +595,7 @@ flows_handler(void)
                                print_flow_error(error);
                                rte_exit(EXIT_FAILURE, "error in creating flow");
                        }
+                       flow_list[flow_index++] = flow;
                }
 
                /* Insertion Rate */
@@ -424,7 +604,8 @@ flows_handler(void)
                for (i = 0; i < flows_count; i++) {
                        flow = generate_flow(port_id, flow_group,
                                flow_attrs, flow_items, flow_actions,
-                               JUMP_ACTION_TABLE, i, hairpinq, &error);
+                               JUMP_ACTION_TABLE, i,
+                               hairpin_queues_num, &error);
 
                        if (force_quit)
                                i = flows_count;
@@ -434,6 +615,8 @@ flows_handler(void)
                                rte_exit(EXIT_FAILURE, "error in creating flow");
                        }
 
+                       flow_list[flow_index++] = flow;
+
                        if (i && !((i + 1) % iterations_number)) {
                                /* Save the insertion rate of each iter */
                                end_iter = clock();
@@ -466,6 +649,9 @@ flows_handler(void)
                                                flows_rate);
                printf(":: The time for creating %d in flows %f seconds\n",
                                                flows_count, cpu_time_used);
+
+               if (delete_flag)
+                       destroy_flows(port_id, flow_list);
        }
 }
 
@@ -480,12 +666,271 @@ signal_handler(int signum)
        }
 }
 
+static inline uint16_t
+do_rx(struct lcore_info *li, uint16_t rx_port, uint16_t rx_queue)
+{
+       uint16_t cnt = 0;
+       cnt = rte_eth_rx_burst(rx_port, rx_queue, li->pkts, MAX_PKT_BURST);
+       li->rx_pkts += cnt;
+       return cnt;
+}
+
+static inline void
+do_tx(struct lcore_info *li, uint16_t cnt, uint16_t tx_port,
+                       uint16_t tx_queue)
+{
+       uint16_t nr_tx = 0;
+       uint16_t i;
+
+       nr_tx = rte_eth_tx_burst(tx_port, tx_queue, li->pkts, cnt);
+       li->tx_pkts  += nr_tx;
+       li->tx_drops += cnt - nr_tx;
+
+       for (i = nr_tx; i < cnt; i++)
+               rte_pktmbuf_free(li->pkts[i]);
+}
+
/*
 * Format n with thousands separators into buf and return buf.
 * For example n = 1799321 produces "1,799,321", which is easier
 * to read. buf must hold at least 27 bytes (UINT64_MAX formatted).
 *
 * Fixes: the loop condition was "n > 1000", so a final group of
 * exactly 1000 was kept whole - sprintf'ing "1000" overflowed the
 * 4-byte group buffer and printed wrong output ("1000,000" for
 * 1000000). The groups array is also sized for the full uint64_t
 * range (up to 7 three-digit groups).
 */
static char *
pretty_number(uint64_t n, char *buf)
{
	char p[8][4];	/* three digits + NUL per group */
	int i = 0;
	int off = 0;

	/* Peel off zero-padded low groups while a higher group remains. */
	while (n >= 1000) {
		sprintf(p[i], "%03d", (int)(n % 1000));
		n /= 1000;
		i += 1;
	}

	/* Leading group: no zero padding, n is now 0..999. */
	sprintf(p[i++], "%d", (int)n);

	/* Emit groups most-significant first, comma-separated. */
	while (i--)
		off += sprintf(buf + off, "%s,", p[i]);
	buf[strlen(buf) - 1] = '\0';	/* drop trailing comma */

	return buf;
}
+
+static void
+packet_per_second_stats(void)
+{
+       struct lcore_info *old;
+       struct lcore_info *li, *oli;
+       int nr_lines = 0;
+       int i;
+
+       old = rte_zmalloc("old",
+               sizeof(struct lcore_info) * MAX_LCORES, 0);
+       if (old == NULL)
+               rte_exit(EXIT_FAILURE, "No Memory available!");
+
+       memcpy(old, lcore_infos,
+               sizeof(struct lcore_info) * MAX_LCORES);
+
+       while (!force_quit) {
+               uint64_t total_tx_pkts = 0;
+               uint64_t total_rx_pkts = 0;
+               uint64_t total_tx_drops = 0;
+               uint64_t tx_delta, rx_delta, drops_delta;
+               char buf[3][32];
+               int nr_valid_core = 0;
+
+               sleep(1);
+
+               if (nr_lines) {
+                       char go_up_nr_lines[16];
+
+                       sprintf(go_up_nr_lines, "%c[%dA\r", 27, nr_lines);
+                       printf("%s\r", go_up_nr_lines);
+               }
+
+               printf("\n%6s %16s %16s %16s\n", "core", "tx", "tx drops", "rx");
+               printf("%6s %16s %16s %16s\n", "------", "----------------",
+                       "----------------", "----------------");
+               nr_lines = 3;
+               for (i = 0; i < MAX_LCORES; i++) {
+                       li  = &lcore_infos[i];
+                       oli = &old[i];
+                       if (li->mode != LCORE_MODE_PKT)
+                               continue;
+
+                       tx_delta    = li->tx_pkts  - oli->tx_pkts;
+                       rx_delta    = li->rx_pkts  - oli->rx_pkts;
+                       drops_delta = li->tx_drops - oli->tx_drops;
+                       printf("%6d %16s %16s %16s\n", i,
+                               pretty_number(tx_delta,    buf[0]),
+                               pretty_number(drops_delta, buf[1]),
+                               pretty_number(rx_delta,    buf[2]));
+
+                       total_tx_pkts  += tx_delta;
+                       total_rx_pkts  += rx_delta;
+                       total_tx_drops += drops_delta;
+
+                       nr_valid_core++;
+                       nr_lines += 1;
+               }
+
+               if (nr_valid_core > 1) {
+                       printf("%6s %16s %16s %16s\n", "total",
+                               pretty_number(total_tx_pkts,  buf[0]),
+                               pretty_number(total_tx_drops, buf[1]),
+                               pretty_number(total_rx_pkts,  buf[2]));
+                       nr_lines += 1;
+               }
+
+               memcpy(old, lcore_infos,
+                       sizeof(struct lcore_info) * MAX_LCORES);
+       }
+}
+
+static int
+start_forwarding(void *data __rte_unused)
+{
+       int lcore = rte_lcore_id();
+       int stream_id;
+       uint16_t cnt;
+       struct lcore_info *li = &lcore_infos[lcore];
+
+       if (!li->mode)
+               return 0;
+
+       if (li->mode == LCORE_MODE_STATS) {
+               printf(":: started stats on lcore %u\n", lcore);
+               packet_per_second_stats();
+               return 0;
+       }
+
+       while (!force_quit)
+               for (stream_id = 0; stream_id < MAX_STREAMS; stream_id++) {
+                       if (li->streams[stream_id].rx_port == -1)
+                               continue;
+
+                       cnt = do_rx(li,
+                                       li->streams[stream_id].rx_port,
+                                       li->streams[stream_id].rx_queue);
+                       if (cnt)
+                               do_tx(li, cnt,
+                                       li->streams[stream_id].tx_port,
+                                       li->streams[stream_id].tx_queue);
+               }
+       return 0;
+}
+
+/*
+ * Build the lcore_infos table: reserve the first lcore for stats
+ * printing, spread nr_port * RXQ_NUM forwarding streams over the
+ * remaining lcores, and map each stream so that packets are sent
+ * back on the same port/queue they were received on.
+ */
+static void
+init_lcore_info(void)
+{
+       int i, j;
+       unsigned int lcore;
+       uint16_t nr_port;
+       uint16_t queue;
+       int port;
+       int stream_id = 0;
+       int streams_per_core;
+       int unassigned_streams;
+       int nb_fwd_streams;
+       nr_port = rte_eth_dev_count_avail();
+
+       /* First logical core is reserved for stats printing */
+       lcore = rte_get_next_lcore(-1, 0, 0);
+       lcore_infos[lcore].mode = LCORE_MODE_STATS;
+
+       /*
+        * Initialize all cores
+        * All cores at first must have -1 value in all streams
+        * This means that this stream is not used, or not set
+        * yet.
+        */
+       for (i = 0; i < MAX_LCORES; i++)
+               for (j = 0; j < MAX_STREAMS; j++) {
+                       lcore_infos[i].streams[j].tx_port = -1;
+                       lcore_infos[i].streams[j].rx_port = -1;
+                       lcore_infos[i].streams[j].tx_queue = -1;
+                       lcore_infos[i].streams[j].rx_queue = -1;
+                       lcore_infos[i].streams_nb = 0;
+               }
+
+       /*
+        * Calculate the total streams count.
+        * Also distribute those streams count between the available
+        * logical cores except first core, since it's reserved for
+        * stats prints.
+        */
+       nb_fwd_streams = nr_port * RXQ_NUM;
+       /* NOTE(review): when workers outnumber streams, every worker
+        * is marked streams_nb = 1 even though only nb_fwd_streams of
+        * them will actually receive a stream - confirm intended. */
+       if ((int)(nb_lcores - 1) >= nb_fwd_streams)
+               for (i = 0; i < (int)(nb_lcores - 1); i++) {
+                       lcore = rte_get_next_lcore(lcore, 0, 0);
+                       lcore_infos[lcore].streams_nb = 1;
+               }
+       else {
+               /* Spread evenly; the remainder goes one-per-core to the
+                * first unassigned_streams workers. */
+               streams_per_core = nb_fwd_streams / (nb_lcores - 1);
+               unassigned_streams = nb_fwd_streams % (nb_lcores - 1);
+               for (i = 0; i < (int)(nb_lcores - 1); i++) {
+                       lcore = rte_get_next_lcore(lcore, 0, 0);
+                       lcore_infos[lcore].streams_nb = streams_per_core;
+                       if (unassigned_streams) {
+                               lcore_infos[lcore].streams_nb++;
+                               unassigned_streams--;
+                       }
+               }
+       }
+
+       /*
+        * Set the streams for the cores according to each logical
+        * core stream count.
+        * The streams is built on the design of what received should
+        * forward as well, this means that if you received packets on
+        * port 0 queue 0 then the same queue should forward the
+        * packets, using the same logical core.
+        */
+       lcore = rte_get_next_lcore(-1, 0, 0);
+       for (port = 0; port < nr_port; port++) {
+               /* Create FWD stream */
+               for (queue = 0; queue < RXQ_NUM; queue++) {
+                       /* Advance to the next worker once the current one
+                        * has been filled up to its streams_nb share. */
+                       if (!lcore_infos[lcore].streams_nb ||
+                               !(stream_id % lcore_infos[lcore].streams_nb)) {
+                               lcore = rte_get_next_lcore(lcore, 0, 0);
+                               lcore_infos[lcore].mode = LCORE_MODE_PKT;
+                               stream_id = 0;
+                       }
+                       lcore_infos[lcore].streams[stream_id].rx_queue = queue;
+                       lcore_infos[lcore].streams[stream_id].tx_queue = queue;
+                       lcore_infos[lcore].streams[stream_id].rx_port = port;
+                       lcore_infos[lcore].streams[stream_id].tx_port = port;
+                       stream_id++;
+               }
+       }
+
+       /* Print all streams */
+       printf(":: Stream -> core id[N]: (rx_port, rx_queue)->(tx_port, tx_queue)\n");
+       for (i = 0; i < MAX_LCORES; i++)
+               for (j = 0; j < MAX_STREAMS; j++) {
+                       /* No streams for this core */
+                       if (lcore_infos[i].streams[j].tx_port == -1)
+                               break;
+                       printf("Stream -> core id[%d]: (%d,%d)->(%d,%d)\n",
+                               i,
+                               lcore_infos[i].streams[j].rx_port,
+                               lcore_infos[i].streams[j].rx_queue,
+                               lcore_infos[i].streams[j].tx_port,
+                               lcore_infos[i].streams[j].tx_queue);
+               }
+}
+
 static void
 init_port(void)
 {
        int ret;
        uint16_t std_queue;
-       uint16_t hairpin_q;
+       uint16_t hairpin_queue;
        uint16_t port_id;
        uint16_t nr_ports;
        uint16_t nr_queues;
@@ -503,8 +948,8 @@ init_port(void)
        struct rte_eth_dev_info dev_info;
 
        nr_queues = RXQ_NUM;
-       if (hairpinq != 0)
-               nr_queues = RXQ_NUM + hairpinq;
+       if (hairpin_queues_num != 0)
+               nr_queues = RXQ_NUM + hairpin_queues_num;
 
        nr_ports = rte_eth_dev_count_avail();
        if (nr_ports == 0)
@@ -567,15 +1012,20 @@ init_port(void)
                                ":: promiscuous mode enable failed: err=%s, port=%u\n",
                                rte_strerror(-ret), port_id);
 
-               if (hairpinq != 0) {
-                       for (hairpin_q = RXQ_NUM, std_queue = 0;
-                                       std_queue < nr_queues;
-                                       hairpin_q++, std_queue++) {
+               if (hairpin_queues_num != 0) {
+                       /*
+                        * Configure peer which represents hairpin Tx.
+                        * Hairpin queue numbers start after standard queues
+                        * (RXQ_NUM and TXQ_NUM).
+                        */
+                       for (hairpin_queue = RXQ_NUM, std_queue = 0;
+                                       hairpin_queue < nr_queues;
+                                       hairpin_queue++, std_queue++) {
                                hairpin_conf.peers[0].port = port_id;
                                hairpin_conf.peers[0].queue =
                                        std_queue + TXQ_NUM;
                                ret = rte_eth_rx_hairpin_queue_setup(
-                                               port_id, hairpin_q,
+                                               port_id, hairpin_queue,
                                                NR_RXD, &hairpin_conf);
                                if (ret != 0)
                                        rte_exit(EXIT_FAILURE,
@@ -583,14 +1033,14 @@ init_port(void)
                                                ret, port_id);
                        }
 
-                       for (hairpin_q = TXQ_NUM, std_queue = 0;
-                                       std_queue < nr_queues;
-                                       hairpin_q++, std_queue++) {
+                       for (hairpin_queue = TXQ_NUM, std_queue = 0;
+                                       hairpin_queue < nr_queues;
+                                       hairpin_queue++, std_queue++) {
                                hairpin_conf.peers[0].port = port_id;
                                hairpin_conf.peers[0].queue =
                                        std_queue + RXQ_NUM;
                                ret = rte_eth_tx_hairpin_queue_setup(
-                                               port_id, hairpin_q,
+                                               port_id, hairpin_queue,
                                                NR_TXD, &hairpin_conf);
                                if (ret != 0)
                                        rte_exit(EXIT_FAILURE,
@@ -615,6 +1065,7 @@ main(int argc, char **argv)
        int ret;
        uint16_t port;
        struct rte_flow_error error;
+       int64_t alloc, last_alloc;
 
        ret = rte_eal_init(argc, argv);
        if (ret < 0)
@@ -624,6 +1075,8 @@ main(int argc, char **argv)
        dump_iterations = false;
        flows_count = DEFAULT_RULES_COUNT;
        iterations_number = DEFAULT_ITERATION;
+       delete_flag = false;
+       dump_socket_mem_flag = false;
        flow_group = 0;
 
        signal(SIGINT, signal_handler);
@@ -640,7 +1093,18 @@ main(int argc, char **argv)
        if (nb_lcores <= 1)
                rte_exit(EXIT_FAILURE, "This app needs at least two cores\n");
 
+       last_alloc = (int64_t)dump_socket_mem(stdout);
        flows_handler();
+       alloc = (int64_t)dump_socket_mem(stdout);
+
+       if (last_alloc)
+               fprintf(stdout, ":: Memory allocation change(M): %.6lf\n",
+               (alloc - last_alloc) / 1.0e6);
+
+       if (enable_fwd) {
+               init_lcore_info();
+               rte_eal_mp_remote_launch(start_forwarding, NULL, CALL_MASTER);
+       }
 
        RTE_ETH_FOREACH_DEV(port) {
                rte_flow_flush(port, &error);