X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;ds=sidebyside;f=app%2Ftest-flow-perf%2Fmain.c;h=99d04634566a46544e0244cd3be3055a89b7cc71;hb=92533e9dfe7501a64b54260420cf00ece7352056;hp=8b5a0c7eaf20ea8fda1fbd51a32669765781838a;hpb=c12f4f217d8519582d60050a928b68a965a7b37c;p=dpdk.git

diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
index 8b5a0c7eaf..99d0463456 100644
--- a/app/test-flow-perf/main.c
+++ b/app/test-flow-perf/main.c
@@ -27,53 +27,119 @@
 #include <stdbool.h>
 #include <sys/time.h>
 #include <signal.h>
+#include <unistd.h>
 
 #include <rte_malloc.h>
 #include <rte_mempool.h>
 #include <rte_mbuf.h>
 #include <rte_ethdev.h>
 #include <rte_flow.h>
+#include <rte_mtr.h>
 
 #include "config.h"
 #include "flow_gen.h"
 
-#define MAX_ITERATIONS             100
+#define MAX_BATCHES_COUNT          100
 #define DEFAULT_RULES_COUNT    4000000
-#define DEFAULT_ITERATION       100000
+#define DEFAULT_RULES_BATCH     100000
+#define DEFAULT_GROUP                0
 
 struct rte_flow *flow;
 static uint8_t flow_group;
 
-static uint64_t flow_items;
-static uint64_t flow_actions;
-static uint64_t flow_attrs;
+static uint64_t encap_data;
+static uint64_t decap_data;
+
+static uint64_t flow_items[MAX_ITEMS_NUM];
+static uint64_t flow_actions[MAX_ACTIONS_NUM];
+static uint64_t flow_attrs[MAX_ATTRS_NUM];
+static uint8_t items_idx, actions_idx, attrs_idx;
+
+static uint64_t ports_mask;
 static volatile bool force_quit;
 static bool dump_iterations;
 static bool delete_flag;
+static bool dump_socket_mem_flag;
+static bool enable_fwd;
+
 static struct rte_mempool *mbuf_mp;
 static uint32_t nb_lcores;
-static uint32_t flows_count;
-static uint32_t iterations_number;
-static uint32_t hairpinq;
+static uint32_t rules_count;
+static uint32_t rules_batch;
+static uint32_t hairpin_queues_num; /* total hairpin q number - default: 0 */
+static uint32_t nb_lcores;
+
+#define MAX_PKT_BURST    32
+#define LCORE_MODE_PKT    1
+#define LCORE_MODE_STATS  2
+#define MAX_STREAMS      64
+#define METER_CREATE	  1
+#define METER_DELETE	  2
+
+struct stream {
+	int tx_port;
+	int tx_queue;
+	int rx_port;
+	int rx_queue;
+};
+
+struct lcore_info {
+	int mode;
+	int streams_nb;
+	struct stream streams[MAX_STREAMS];
+	/* stats */
+	uint64_t tx_pkts;
+	uint64_t tx_drops;
+	uint64_t rx_pkts;
+	struct rte_mbuf *pkts[MAX_PKT_BURST];
+} __rte_cache_aligned;
+
+static struct lcore_info lcore_infos[RTE_MAX_LCORE];
+
+struct used_cpu_time {
+	double insertion[MAX_PORTS][RTE_MAX_LCORE];
+	double deletion[MAX_PORTS][RTE_MAX_LCORE];
+};
+
+struct multi_cores_pool {
+	uint32_t cores_count;
+	uint32_t rules_count;
+	struct used_cpu_time create_meter;
+	struct used_cpu_time create_flow;
+	int64_t last_alloc[RTE_MAX_LCORE];
+	int64_t current_alloc[RTE_MAX_LCORE];
+} __rte_cache_aligned;
+
+static struct multi_cores_pool mc_pool = {
+	.cores_count = 1,
+};
 
 static void
 usage(char *progname)
 {
 	printf("\nusage: %s\n", progname);
 	printf("\nControl configurations:\n");
-	printf("  --flows-count=N: to set the number of needed"
-		" flows to insert, default is 4,000,000\n");
+	printf("  --rules-count=N: to set the number of needed"
+		" rules to insert, default is %d\n", DEFAULT_RULES_COUNT);
+	printf("  --rules-batch=N: set number of batched rules,"
+		" default is %d\n", DEFAULT_RULES_BATCH);
 	printf("  --dump-iterations: To print rates for each"
 		" iteration\n");
 	printf("  --deletion-rate: Enable deletion rate"
 		" calculations\n");
+	printf("  --dump-socket-mem: To dump all socket memory\n");
+	printf("  --enable-fwd: To enable packets forwarding"
+		" after insertion\n");
+	printf("  --portmask=N: hexadecimal bitmask of ports used\n");
 
 	printf("To set flow attributes:\n");
 	printf("  --ingress: set ingress attribute in flows\n");
 	printf("  --egress: set egress attribute in flows\n");
 	printf("  --transfer: set transfer attribute in flows\n");
 	printf("  --group=N: set group for all flows,"
-		" default is 0\n");
+		" default is %d\n", DEFAULT_GROUP);
+	printf("  --cores=N: to set the number of needed "
+		"cores to insert rte_flow rules, default is 1\n");
 
 	printf("To set flow items:\n");
 	printf("  --ether: add ether layer in flow items\n");
@@ -89,6 +155,8 @@ usage(char *progname)
 	printf("  --gtp: add gtp layer in flow items\n");
 	printf("  --meta: add meta layer in flow items\n");
 	printf("  --tag: add tag layer in flow items\n");
+	printf("  --icmpv4: add icmpv4 layer in flow items\n");
+	printf("  --icmpv6: add icmpv6 layer in flow items\n");
 
 	printf("To set flow actions:\n");
 	printf("  --port-id: add port-id action in flow actions\n");
@@ -101,13 +169,60 @@ usage(char *progname)
 	printf("  --set-tag: add set tag action in flow actions\n");
 	printf("  --drop: add drop action in flow actions\n");
 	printf("  --hairpin-queue=N: add hairpin-queue action in flow actions\n");
-	printf("  --hairpin-rss=N: add hairping-rss action in flow actions\n");
+	printf("  --hairpin-rss=N: add hairpin-rss action in flow actions\n");
+	printf("  --set-src-mac: add set src mac action to flow actions\n"
+		"Src mac to be set is random each flow\n");
+	printf("  --set-dst-mac: add set dst mac action to flow actions\n"
+		 "Dst mac to be set is random each flow\n");
+	printf("  --set-src-ipv4: add set src ipv4 action to flow actions\n"
+		"Src ipv4 to be set is random each flow\n");
+	printf("  --set-dst-ipv4 add set dst ipv4 action to flow actions\n"
+		"Dst ipv4 to be set is random each flow\n");
+	printf("  --set-src-ipv6: add set src ipv6 action to flow actions\n"
+		"Src ipv6 to be set is random each flow\n");
+	printf("  --set-dst-ipv6: add set dst ipv6 action to flow actions\n"
+		"Dst ipv6 to be set is random each flow\n");
+	printf("  --set-src-tp: add set src tp action to flow actions\n"
+		"Src tp to be set is random each flow\n");
+	printf("  --set-dst-tp: add set dst tp action to flow actions\n"
+		"Dst tp to be set is random each flow\n");
+	printf("  --inc-tcp-ack: add inc tcp ack action to flow actions\n"
+		"tcp ack will be increments by 1\n");
+	printf("  --dec-tcp-ack: add dec tcp ack action to flow actions\n"
+		"tcp ack will be decrements by 1\n");
+	printf("  --inc-tcp-seq: add inc tcp seq action to flow actions\n"
+		"tcp seq will be increments by 1\n");
+	printf("  --dec-tcp-seq: add dec tcp seq action to flow actions\n"
+		"tcp seq will be decrements by 1\n");
+	printf("  --set-ttl: add set ttl action to flow actions\n"
+		"L3 ttl to be set is random each flow\n");
+	printf("  --dec-ttl: add dec ttl action to flow actions\n"
+		"L3 ttl will be decrements by 1\n");
+	printf("  --set-ipv4-dscp: add set ipv4 dscp action to flow actions\n"
+		"ipv4 dscp value to be set is random each flow\n");
+	printf("  --set-ipv6-dscp: add set ipv6 dscp action to flow actions\n"
+		"ipv6 dscp value to be set is random each flow\n");
+	printf("  --flag: add flag action to flow actions\n");
+	printf("  --meter: add meter action to flow actions\n");
+	printf("  --raw-encap=<data>: add raw encap action to flow actions\n"
+		"Data is the data needed to be encaped\n"
+		"Example: raw-encap=ether,ipv4,udp,vxlan\n");
+	printf("  --raw-decap=<data>: add raw decap action to flow actions\n"
+		"Data is the data needed to be decaped\n"
+		"Example: raw-decap=ether,ipv4,udp,vxlan\n");
+	printf("  --vxlan-encap: add vxlan-encap action to flow actions\n"
+		"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan\n"
+		"With fixed values\n");
+	printf("  --vxlan-decap: add vxlan_decap action to flow actions\n");
 }
 
 static void
 args_parse(int argc, char **argv)
 {
+	uint64_t pm;
 	char **argvopt;
+	char *token;
+	char *end;
 	int n, opt;
 	int opt_idx;
 	size_t i;
@@ -115,141 +230,345 @@ args_parse(int argc, char **argv)
 	static const struct option_dict {
 		const char *str;
 		const uint64_t mask;
-		uint64_t *bitmap;
+		uint64_t *map;
+		uint8_t *map_idx;
+
 	} flow_options[] = {
 		{
 			.str = "ether",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "ipv4",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "ipv6",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "vlan",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VLAN),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "tcp",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_TCP),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "udp",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "vxlan",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "vxlan-gpe",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN_GPE),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "gre",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GRE),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "geneve",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GENEVE),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "gtp",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GTP),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "meta",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_META),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "tag",
 			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_TAG),
-			.bitmap = &flow_items
+			.map = &flow_items[0],
+			.map_idx = &items_idx
+		},
+		{
+			.str = "icmpv4",
+			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ICMP),
+			.map = &flow_items[0],
+			.map_idx = &items_idx
+		},
+		{
+			.str = "icmpv6",
+			.mask = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ICMP6),
+			.map = &flow_items[0],
+			.map_idx = &items_idx
 		},
 		{
 			.str = "ingress",
 			.mask = INGRESS,
-			.bitmap = &flow_attrs
+			.map = &flow_attrs[0],
+			.map_idx = &attrs_idx
 		},
 		{
 			.str = "egress",
 			.mask = EGRESS,
-			.bitmap = &flow_attrs
+			.map = &flow_attrs[0],
+			.map_idx = &attrs_idx
 		},
 		{
 			.str = "transfer",
 			.mask = TRANSFER,
-			.bitmap = &flow_attrs
+			.map = &flow_attrs[0],
+			.map_idx = &attrs_idx
 		},
 		{
 			.str = "port-id",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "rss",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "queue",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "jump",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "mark",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "count",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "set-meta",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_META),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "set-tag",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG),
-			.bitmap = &flow_actions
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
 		},
 		{
 			.str = "drop",
 			.mask = FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP),
-			.bitmap = &flow_actions
-		}
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-src-mac",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_MAC_SRC
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-dst-mac",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_MAC_DST
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-src-ipv4",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-dst-ipv4",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_IPV4_DST
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-src-ipv6",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-dst-ipv6",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_IPV6_DST
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-src-tp",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_TP_SRC
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-dst-tp",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_TP_DST
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "inc-tcp-ack",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_INC_TCP_ACK
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "dec-tcp-ack",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "inc-tcp-seq",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "dec-tcp-seq",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-ttl",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_TTL
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "dec-ttl",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_DEC_TTL
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-ipv4-dscp",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "set-ipv6-dscp",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "flag",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_FLAG
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "meter",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_METER
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "vxlan-encap",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
+		{
+			.str = "vxlan-decap",
+			.mask = FLOW_ACTION_MASK(
+				RTE_FLOW_ACTION_TYPE_VXLAN_DECAP
+			),
+			.map = &flow_actions[0],
+			.map_idx = &actions_idx
+		},
 	};
 
 	static const struct option lgopts[] = {
 		/* Control */
 		{ "help",                       0, 0, 0 },
-		{ "flows-count",                1, 0, 0 },
+		{ "rules-count",                1, 0, 0 },
+		{ "rules-batch",                1, 0, 0 },
 		{ "dump-iterations",            0, 0, 0 },
 		{ "deletion-rate",              0, 0, 0 },
+		{ "dump-socket-mem",            0, 0, 0 },
+		{ "enable-fwd",                 0, 0, 0 },
+		{ "portmask",                   1, 0, 0 },
+		{ "cores",                      1, 0, 0 },
 		/* Attributes */
 		{ "ingress",                    0, 0, 0 },
 		{ "egress",                     0, 0, 0 },
@@ -269,6 +588,8 @@ args_parse(int argc, char **argv)
 		{ "gtp",                        0, 0, 0 },
 		{ "meta",                       0, 0, 0 },
 		{ "tag",                        0, 0, 0 },
+		{ "icmpv4",                     0, 0, 0 },
+		{ "icmpv6",                     0, 0, 0 },
 		/* Actions */
 		{ "port-id",                    0, 0, 0 },
 		{ "rss",                        0, 0, 0 },
@@ -281,12 +602,34 @@ args_parse(int argc, char **argv)
 		{ "drop",                       0, 0, 0 },
 		{ "hairpin-queue",              1, 0, 0 },
 		{ "hairpin-rss",                1, 0, 0 },
+		{ "set-src-mac",                0, 0, 0 },
+		{ "set-dst-mac",                0, 0, 0 },
+		{ "set-src-ipv4",               0, 0, 0 },
+		{ "set-dst-ipv4",               0, 0, 0 },
+		{ "set-src-ipv6",               0, 0, 0 },
+		{ "set-dst-ipv6",               0, 0, 0 },
+		{ "set-src-tp",                 0, 0, 0 },
+		{ "set-dst-tp",                 0, 0, 0 },
+		{ "inc-tcp-ack",                0, 0, 0 },
+		{ "dec-tcp-ack",                0, 0, 0 },
+		{ "inc-tcp-seq",                0, 0, 0 },
+		{ "dec-tcp-seq",                0, 0, 0 },
+		{ "set-ttl",                    0, 0, 0 },
+		{ "dec-ttl",                    0, 0, 0 },
+		{ "set-ipv4-dscp",              0, 0, 0 },
+		{ "set-ipv6-dscp",              0, 0, 0 },
+		{ "flag",                       0, 0, 0 },
+		{ "meter",		        0, 0, 0 },
+		{ "raw-encap",                  1, 0, 0 },
+		{ "raw-decap",                  1, 0, 0 },
+		{ "vxlan-encap",                0, 0, 0 },
+		{ "vxlan-decap",                0, 0, 0 },
 	};
 
-	flow_items = 0;
-	flow_actions = 0;
-	flow_attrs = 0;
-	hairpinq = 0;
+	RTE_ETH_FOREACH_DEV(i)
+		ports_mask |= 1 << i;
+
+	hairpin_queues_num = 0;
 	argvopt = argv;
 
 	printf(":: Flow -> ");
@@ -306,13 +649,14 @@ args_parse(int argc, char **argv)
 				else
 					rte_exit(EXIT_SUCCESS,
 						"flow group should be >= 0\n");
-				printf("group %d ", flow_group);
+				printf("group %d / ", flow_group);
 			}
 
 			for (i = 0; i < RTE_DIM(flow_options); i++)
 				if (strcmp(lgopts[opt_idx].name,
 						flow_options[i].str) == 0) {
-					*flow_options[i].bitmap |=
+					flow_options[i].map[
+					(*flow_options[i].map_idx)++] =
 						flow_options[i].mask;
 					printf("%s / ", flow_options[i].str);
 				}
@@ -321,45 +665,139 @@ args_parse(int argc, char **argv)
 					"hairpin-rss") == 0) {
 				n = atoi(optarg);
 				if (n > 0)
-					hairpinq = n;
+					hairpin_queues_num = n;
 				else
 					rte_exit(EXIT_SUCCESS,
 						"Hairpin queues should be > 0\n");
 
-				flow_actions |= HAIRPIN_RSS_ACTION;
+				flow_actions[actions_idx++] =
+					HAIRPIN_RSS_ACTION;
 				printf("hairpin-rss / ");
 			}
 			if (strcmp(lgopts[opt_idx].name,
 					"hairpin-queue") == 0) {
 				n = atoi(optarg);
 				if (n > 0)
-					hairpinq = n;
+					hairpin_queues_num = n;
 				else
 					rte_exit(EXIT_SUCCESS,
 						"Hairpin queues should be > 0\n");
 
-				flow_actions |= HAIRPIN_QUEUE_ACTION;
+				flow_actions[actions_idx++] =
+					HAIRPIN_QUEUE_ACTION;
 				printf("hairpin-queue / ");
 			}
 
+			if (strcmp(lgopts[opt_idx].name, "raw-encap") == 0) {
+				printf("raw-encap ");
+				flow_actions[actions_idx++] =
+					FLOW_ITEM_MASK(
+						RTE_FLOW_ACTION_TYPE_RAW_ENCAP
+					);
+
+				token = strtok(optarg, ",");
+				while (token != NULL) {
+					for (i = 0; i < RTE_DIM(flow_options); i++) {
+						if (strcmp(flow_options[i].str, token) == 0) {
+							printf("%s,", token);
+							encap_data |= flow_options[i].mask;
+							break;
+						}
+						/* Reached last item with no match */
+						if (i == (RTE_DIM(flow_options) - 1)) {
+							fprintf(stderr, "Invalid encap item: %s\n", token);
+							usage(argv[0]);
+							rte_exit(EXIT_SUCCESS, "Invalid encap item\n");
+						}
+					}
+					token = strtok(NULL, ",");
+				}
+				printf(" / ");
+			}
+			if (strcmp(lgopts[opt_idx].name, "raw-decap") == 0) {
+				printf("raw-decap ");
+				flow_actions[actions_idx++] =
+					FLOW_ITEM_MASK(
+						RTE_FLOW_ACTION_TYPE_RAW_DECAP
+					);
+
+				token = strtok(optarg, ",");
+				while (token != NULL) {
+					for (i = 0; i < RTE_DIM(flow_options); i++) {
+						if (strcmp(flow_options[i].str, token) == 0) {
+							printf("%s,", token);
+							encap_data |= flow_options[i].mask;
+							break;
+						}
+						/* Reached last item with no match */
+						if (i == (RTE_DIM(flow_options) - 1)) {
+							fprintf(stderr, "Invalid decap item: %s\n", token);
+							usage(argv[0]);
+							rte_exit(EXIT_SUCCESS, "Invalid decap item\n");
+						}
+					}
+					token = strtok(NULL, ",");
+				}
+				printf(" / ");
+			}
 			/* Control */
 			if (strcmp(lgopts[opt_idx].name,
-					"flows-count") == 0) {
+					"rules-batch") == 0) {
 				n = atoi(optarg);
-				if (n > (int) iterations_number)
-					flows_count = n;
+				if (n >= DEFAULT_RULES_BATCH)
+					rules_batch = n;
 				else {
-					printf("\n\nflows_count should be > %d\n",
-						iterations_number);
+					printf("\n\nrules_batch should be >= %d\n",
+						DEFAULT_RULES_BATCH);
 					rte_exit(EXIT_SUCCESS, " ");
 				}
 			}
+			if (strcmp(lgopts[opt_idx].name,
+					"rules-count") == 0) {
+				n = atoi(optarg);
+				if (n >= (int) rules_batch)
+					rules_count = n;
+				else {
+					printf("\n\nrules_count should be >= %d\n",
+						rules_batch);
+				}
+			}
 			if (strcmp(lgopts[opt_idx].name,
 					"dump-iterations") == 0)
 				dump_iterations = true;
 			if (strcmp(lgopts[opt_idx].name,
 					"deletion-rate") == 0)
 				delete_flag = true;
+			if (strcmp(lgopts[opt_idx].name,
+					"dump-socket-mem") == 0)
+				dump_socket_mem_flag = true;
+			if (strcmp(lgopts[opt_idx].name,
+					"enable-fwd") == 0)
+				enable_fwd = true;
+			if (strcmp(lgopts[opt_idx].name,
+					"portmask") == 0) {
+				/* parse hexadecimal string */
+				end = NULL;
+				pm = strtoull(optarg, &end, 16);
+				if ((optarg[0] == '\0') || (end == NULL) || (*end != '\0'))
+					rte_exit(EXIT_FAILURE, "Invalid fwd port mask\n");
+				ports_mask = pm;
+			}
+			if (strcmp(lgopts[opt_idx].name, "cores") == 0) {
+				n = atoi(optarg);
+				if ((int) rte_lcore_count() <= n) {
+					printf("\nError: you need %d cores to run on multi-cores\n"
+						"Existing cores are: %d\n", n, rte_lcore_count());
+					rte_exit(EXIT_FAILURE, " ");
+				}
+				if (n <= RTE_MAX_LCORE && n > 0)
+					mc_pool.cores_count = n;
+				else {
+					printf("Error: cores count must be > 0 "
+						" and < %d\n", RTE_MAX_LCORE);
+					rte_exit(EXIT_FAILURE, " ");
+				}
+			}
 			break;
 		default:
 			fprintf(stderr, "Invalid option: %s\n", argv[optind]);
@@ -371,6 +809,62 @@ args_parse(int argc, char **argv)
 	printf("end_flow\n");
 }
 
+/* Dump the socket memory statistics on console */
+static size_t
+dump_socket_mem(FILE *f)
+{
+	struct rte_malloc_socket_stats socket_stats;
+	unsigned int i = 0;
+	size_t total = 0;
+	size_t alloc = 0;
+	size_t free = 0;
+	unsigned int n_alloc = 0;
+	unsigned int n_free = 0;
+	bool active_nodes = false;
+
+
+	for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+		if (rte_malloc_get_socket_stats(i, &socket_stats) ||
+		    !socket_stats.heap_totalsz_bytes)
+			continue;
+		active_nodes = true;
+		total += socket_stats.heap_totalsz_bytes;
+		alloc += socket_stats.heap_allocsz_bytes;
+		free += socket_stats.heap_freesz_bytes;
+		n_alloc += socket_stats.alloc_count;
+		n_free += socket_stats.free_count;
+		if (dump_socket_mem_flag) {
+			fprintf(f, "::::::::::::::::::::::::::::::::::::::::");
+			fprintf(f,
+				"\nSocket %u:\nsize(M) total: %.6lf\nalloc:"
+				" %.6lf(%.3lf%%)\nfree: %.6lf"
+				"\nmax: %.6lf"
+				"\ncount alloc: %u\nfree: %u\n",
+				i,
+				socket_stats.heap_totalsz_bytes / 1.0e6,
+				socket_stats.heap_allocsz_bytes / 1.0e6,
+				(double)socket_stats.heap_allocsz_bytes * 100 /
+				(double)socket_stats.heap_totalsz_bytes,
+				socket_stats.heap_freesz_bytes / 1.0e6,
+				socket_stats.greatest_free_size / 1.0e6,
+				socket_stats.alloc_count,
+				socket_stats.free_count);
+				fprintf(f, "::::::::::::::::::::::::::::::::::::::::");
+		}
+	}
+	if (dump_socket_mem_flag && active_nodes) {
+		fprintf(f,
+			"\nTotal: size(M)\ntotal: %.6lf"
+			"\nalloc: %.6lf(%.3lf%%)\nfree: %.6lf"
+			"\ncount alloc: %u\nfree: %u\n",
+			total / 1.0e6, alloc / 1.0e6,
+			(double)alloc * 100 / (double)total, free / 1.0e6,
+			n_alloc, n_free);
+		fprintf(f, "::::::::::::::::::::::::::::::::::::::::\n");
+	}
+	return alloc;
+}
+
 static void
 print_flow_error(struct rte_flow_error error)
 {
@@ -380,179 +874,571 @@ print_flow_error(struct rte_flow_error error)
 }
 
 static inline void
-destroy_flows(int port_id, struct rte_flow **flow_list)
+print_rules_batches(double *cpu_time_per_batch)
+{
+	uint8_t idx;
+	double delta;
+	double rate;
+
+	for (idx = 0; idx < MAX_BATCHES_COUNT; idx++) {
+		if (!cpu_time_per_batch[idx])
+			break;
+		delta = (double)(rules_batch / cpu_time_per_batch[idx]);
+		rate = delta / 1000; /* Save rate in K unit. */
+		printf(":: Rules batch #%d: %d rules "
+			"in %f sec[ Rate = %f K Rule/Sec ]\n",
+			idx, rules_batch,
+			cpu_time_per_batch[idx], rate);
+	}
+}
+
+
+static inline int
+has_meter(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_ACTIONS_NUM; i++) {
+		if (flow_actions[i] == 0)
+			break;
+		if (flow_actions[i]
+				& FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER))
+			return 1;
+	}
+	return 0;
+}
+
+static void
+create_meter_rule(int port_id, uint32_t counter)
+{
+	int ret;
+	struct rte_mtr_params params;
+	uint32_t default_prof_id = 100;
+	struct rte_mtr_error error;
+
+	memset(&params, 0, sizeof(struct rte_mtr_params));
+	params.meter_enable = 1;
+	params.stats_mask = 0xffff;
+	params.use_prev_mtr_color = 0;
+	params.dscp_table = NULL;
+
+	/*create meter*/
+	params.meter_profile_id = default_prof_id;
+	params.action[RTE_COLOR_GREEN] =
+		MTR_POLICER_ACTION_COLOR_GREEN;
+	params.action[RTE_COLOR_YELLOW] =
+		MTR_POLICER_ACTION_COLOR_YELLOW;
+	params.action[RTE_COLOR_RED] =
+		MTR_POLICER_ACTION_DROP;
+
+	ret = rte_mtr_create(port_id, counter, &params, 1, &error);
+	if (ret != 0) {
+		printf("Port %u create meter idx(%d) error(%d) message: %s\n",
+			port_id, counter, error.type,
+			error.message ? error.message : "(no stated reason)");
+		rte_exit(EXIT_FAILURE, "error in creating meter");
+	}
+}
+
+static void
+destroy_meter_rule(int port_id, uint32_t counter)
+{
+	struct rte_mtr_error error;
+
+	if (rte_mtr_destroy(port_id, counter, &error)) {
+		printf("Port %u destroy meter(%d) error(%d) message: %s\n",
+			port_id, counter, error.type,
+			error.message ? error.message : "(no stated reason)");
+		rte_exit(EXIT_FAILURE, "Error in deleting meter rule");
+	}
+}
+
+static void
+meters_handler(int port_id, uint8_t core_id, uint8_t ops)
+{
+	uint64_t start_batch;
+	double cpu_time_used, insertion_rate;
+	int rules_count_per_core, rules_batch_idx;
+	uint32_t counter, start_counter = 0, end_counter;
+	double cpu_time_per_batch[MAX_BATCHES_COUNT] = { 0 };
+
+	rules_count_per_core = rules_count / mc_pool.cores_count;
+
+	if (core_id)
+		start_counter = core_id * rules_count_per_core;
+	end_counter = (core_id + 1) * rules_count_per_core;
+
+	cpu_time_used = 0;
+	start_batch = rte_rdtsc();
+	for (counter = start_counter; counter < end_counter; counter++) {
+		if (ops == METER_CREATE)
+			create_meter_rule(port_id, counter);
+		else
+			destroy_meter_rule(port_id, counter);
+		/*
+		 * Save the insertion rate for rules batch.
+		 * Check if the insertion reached the rules
+		 * patch counter, then save the insertion rate
+		 * for this batch.
+		 */
+		if (!((counter + 1) % rules_batch)) {
+			rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+			cpu_time_per_batch[rules_batch_idx] =
+				((double)(rte_rdtsc() - start_batch))
+				/ rte_get_tsc_hz();
+			cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+			start_batch = rte_rdtsc();
+		}
+	}
+
+	/* Print insertion rates for all batches */
+	if (dump_iterations)
+		print_rules_batches(cpu_time_per_batch);
+
+	insertion_rate =
+		((double) (rules_count_per_core / cpu_time_used) / 1000);
+
+	/* Insertion rate for all rules in one core */
+	printf(":: Port %d :: Core %d Meter %s :: start @[%d] - end @[%d],"
+		" use:%.02fs, rate:%.02fk Rule/Sec\n",
+		port_id, core_id, ops == METER_CREATE ? "create" : "delete",
+		start_counter, end_counter - 1,
+		cpu_time_used, insertion_rate);
+
+	if (ops == METER_CREATE)
+		mc_pool.create_meter.insertion[port_id][core_id]
+			= cpu_time_used;
+	else
+		mc_pool.create_meter.deletion[port_id][core_id]
+			= cpu_time_used;
+}
+
+static void
+destroy_meter_profile(void)
+{
+	struct rte_mtr_error error;
+	uint16_t nr_ports;
+	int port_id;
+
+	nr_ports = rte_eth_dev_count_avail();
+	for (port_id = 0; port_id < nr_ports; port_id++) {
+		/* If port outside portmask */
+		if (!((ports_mask >> port_id) & 0x1))
+			continue;
+
+		if (rte_mtr_meter_profile_delete
+			(port_id, DEFAULT_METER_PROF_ID, &error)) {
+			printf("Port %u del profile error(%d) message: %s\n",
+				port_id, error.type,
+				error.message ? error.message : "(no stated reason)");
+			rte_exit(EXIT_FAILURE, "Error: Destroy meter profile Failed!\n");
+		}
+	}
+}
+
+static void
+create_meter_profile(void)
+{
+	uint16_t nr_ports;
+	int ret, port_id;
+	struct rte_mtr_meter_profile mp;
+	struct rte_mtr_error error;
+
+	/*
+	 *currently , only create one meter file for one port
+	 *1 meter profile -> N meter rules -> N rte flows
+	 */
+	memset(&mp, 0, sizeof(struct rte_mtr_meter_profile));
+	nr_ports = rte_eth_dev_count_avail();
+	for (port_id = 0; port_id < nr_ports; port_id++) {
+		/* If port outside portmask */
+		if (!((ports_mask >> port_id) & 0x1))
+			continue;
+
+		mp.alg = RTE_MTR_SRTCM_RFC2697;
+		mp.srtcm_rfc2697.cir = METER_CIR;
+		mp.srtcm_rfc2697.cbs = METER_CIR / 8;
+		mp.srtcm_rfc2697.ebs = 0;
+
+		ret = rte_mtr_meter_profile_add
+			(port_id, DEFAULT_METER_PROF_ID, &mp, &error);
+		if (ret != 0) {
+			printf("Port %u create Profile error(%d) message: %s\n",
+				port_id, error.type,
+				error.message ? error.message : "(no stated reason)");
+			rte_exit(EXIT_FAILURE, "Error: Creation meter profile Failed!\n");
+		}
+	}
+}
+
+static inline void
+destroy_flows(int port_id, uint8_t core_id, struct rte_flow **flows_list)
 {
 	struct rte_flow_error error;
-	clock_t start_iter, end_iter;
+	clock_t start_batch, end_batch;
 	double cpu_time_used = 0;
-	double flows_rate;
-	double cpu_time_per_iter[MAX_ITERATIONS];
+	double deletion_rate;
+	double cpu_time_per_batch[MAX_BATCHES_COUNT] = { 0 };
 	double delta;
 	uint32_t i;
-	int iter_id;
+	int rules_batch_idx;
+	int rules_count_per_core;
 
-	for (i = 0; i < MAX_ITERATIONS; i++)
-		cpu_time_per_iter[i] = -1;
+	rules_count_per_core = rules_count / mc_pool.cores_count;
+	/* If group > 0 , should add 1 flow which created in group 0 */
+	if (flow_group > 0 && core_id == 0)
+		rules_count_per_core++;
 
-	if (iterations_number > flows_count)
-		iterations_number = flows_count;
-
-	/* Deletion Rate */
-	printf("Flows Deletion on port = %d\n", port_id);
-	start_iter = clock();
-	for (i = 0; i < flows_count; i++) {
-		if (flow_list[i] == 0)
+	start_batch = rte_rdtsc();
+	for (i = 0; i < (uint32_t) rules_count_per_core; i++) {
+		if (flows_list[i] == 0)
 			break;
 
 		memset(&error, 0x33, sizeof(error));
-		if (rte_flow_destroy(port_id, flow_list[i], &error)) {
+		if (rte_flow_destroy(port_id, flows_list[i], &error)) {
 			print_flow_error(error);
 			rte_exit(EXIT_FAILURE, "Error in deleting flow");
 		}
 
-		if (i && !((i + 1) % iterations_number)) {
-			/* Save the deletion rate of each iter */
-			end_iter = clock();
-			delta = (double) (end_iter - start_iter);
-			iter_id = ((i + 1) / iterations_number) - 1;
-			cpu_time_per_iter[iter_id] =
-				delta / CLOCKS_PER_SEC;
-			cpu_time_used += cpu_time_per_iter[iter_id];
-			start_iter = clock();
+		/*
+		 * Save the deletion rate for rules batch.
+		 * Check if the deletion reached the rules
+		 * patch counter, then save the deletion rate
+		 * for this batch.
+		 */
+		if (!((i + 1) % rules_batch)) {
+			end_batch = rte_rdtsc();
+			delta = (double) (end_batch - start_batch);
+			rules_batch_idx = ((i + 1) / rules_batch) - 1;
+			cpu_time_per_batch[rules_batch_idx] = delta / rte_get_tsc_hz();
+			cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+			start_batch = rte_rdtsc();
 		}
 	}
 
-	/* Deletion rate per iteration */
+	/* Print deletion rates for all batches */
 	if (dump_iterations)
-		for (i = 0; i < MAX_ITERATIONS; i++) {
-			if (cpu_time_per_iter[i] == -1)
-				continue;
-			delta = (double)(iterations_number /
-				cpu_time_per_iter[i]);
-			flows_rate = delta / 1000;
-			printf(":: Iteration #%d: %d flows "
-				"in %f sec[ Rate = %f K/Sec ]\n",
-				i, iterations_number,
-				cpu_time_per_iter[i], flows_rate);
-		}
+		print_rules_batches(cpu_time_per_batch);
+
+	/* Deletion rate for all rules */
+	deletion_rate = ((double) (rules_count_per_core / cpu_time_used) / 1000);
+	printf(":: Port %d :: Core %d :: Rules deletion rate -> %f K Rule/Sec\n",
+		port_id, core_id, deletion_rate);
+	printf(":: Port %d :: Core %d :: The time for deleting %d rules is %f seconds\n",
+		port_id, core_id, rules_count_per_core, cpu_time_used);
 
-	/* Deletion rate for all flows */
-	flows_rate = ((double) (flows_count / cpu_time_used) / 1000);
-	printf("\n:: Total flow deletion rate -> %f K/Sec\n",
-		flows_rate);
-	printf(":: The time for deleting %d in flows %f seconds\n",
-		flows_count, cpu_time_used);
+	mc_pool.create_flow.deletion[port_id][core_id] = cpu_time_used;
 }
 
-static inline void
-flows_handler(void)
+static struct rte_flow **
+insert_flows(int port_id, uint8_t core_id)
 {
-	struct rte_flow **flow_list;
+	struct rte_flow **flows_list;
 	struct rte_flow_error error;
-	clock_t start_iter, end_iter;
+	clock_t start_batch, end_batch;
 	double cpu_time_used;
-	double flows_rate;
-	double cpu_time_per_iter[MAX_ITERATIONS];
+	double insertion_rate;
+	double cpu_time_per_batch[MAX_BATCHES_COUNT] = { 0 };
 	double delta;
-	uint16_t nr_ports;
-	uint32_t i;
-	int port_id;
-	int iter_id;
 	uint32_t flow_index;
+	uint32_t counter, start_counter = 0, end_counter;
+	uint64_t global_items[MAX_ITEMS_NUM] = { 0 };
+	uint64_t global_actions[MAX_ACTIONS_NUM] = { 0 };
+	int rules_batch_idx;
+	int rules_count_per_core;
 
-	nr_ports = rte_eth_dev_count_avail();
-
-	for (i = 0; i < MAX_ITERATIONS; i++)
-		cpu_time_per_iter[i] = -1;
+	rules_count_per_core = rules_count / mc_pool.cores_count;
 
-	if (iterations_number > flows_count)
-		iterations_number = flows_count;
+	/* Set boundaries of rules for each core. */
+	if (core_id)
+		start_counter = core_id * rules_count_per_core;
+	end_counter = (core_id + 1) * rules_count_per_core;
 
-	printf(":: Flows Count per port: %d\n", flows_count);
+	global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
+	global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
 
-	flow_list = rte_zmalloc("flow_list",
-		(sizeof(struct rte_flow *) * flows_count) + 1, 0);
-	if (flow_list == NULL)
+	flows_list = rte_zmalloc("flows_list",
+		(sizeof(struct rte_flow *) * rules_count_per_core) + 1, 0);
+	if (flows_list == NULL)
 		rte_exit(EXIT_FAILURE, "No Memory available!");
 
+	cpu_time_used = 0;
+	flow_index = 0;
+	if (flow_group > 0 && core_id == 0) {
+		/*
+		 * Create global rule to jump into flow_group,
+		 * this way the app will avoid the default rules.
+		 *
+		 * This rule will be created only once.
+		 *
+		 * Global rule:
+		 * group 0 eth / end actions jump group <flow_group>
+		 */
+		flow = generate_flow(port_id, 0, flow_attrs,
+			global_items, global_actions,
+			flow_group, 0, 0, 0, 0, core_id, &error);
+
+		if (flow == NULL) {
+			print_flow_error(error);
+			rte_exit(EXIT_FAILURE, "error in creating flow");
+		}
+		flows_list[flow_index++] = flow;
+	}
+
+	start_batch = rte_rdtsc();
+	for (counter = start_counter; counter < end_counter; counter++) {
+		flow = generate_flow(port_id, flow_group,
+			flow_attrs, flow_items, flow_actions,
+			JUMP_ACTION_TABLE, counter,
+			hairpin_queues_num,
+			encap_data, decap_data,
+			core_id, &error);
+
+		if (force_quit)
+			counter = end_counter;
+
+		if (!flow) {
+			print_flow_error(error);
+			rte_exit(EXIT_FAILURE, "error in creating flow");
+		}
+
+		flows_list[flow_index++] = flow;
+
+		/*
+		 * Save the insertion rate for rules batch.
+		 * Check if the insertion reached the rules
+		 * patch counter, then save the insertion rate
+		 * for this batch.
+		 */
+		if (!((counter + 1) % rules_batch)) {
+			end_batch = rte_rdtsc();
+			delta = (double) (end_batch - start_batch);
+			rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+			cpu_time_per_batch[rules_batch_idx] = delta / rte_get_tsc_hz();
+			cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+			start_batch = rte_rdtsc();
+		}
+	}
+
+	/* Print insertion rates for all batches */
+	if (dump_iterations)
+		print_rules_batches(cpu_time_per_batch);
+
+	printf(":: Port %d :: Core %d boundaries :: start @[%d] - end @[%d]\n",
+		port_id, core_id, start_counter, end_counter - 1);
+
+	/* Insertion rate for all rules in one core */
+	insertion_rate = ((double) (rules_count_per_core / cpu_time_used) / 1000);
+	printf(":: Port %d :: Core %d :: Rules insertion rate -> %f K Rule/Sec\n",
+		port_id, core_id, insertion_rate);
+	printf(":: Port %d :: Core %d :: The time for creating %d in rules %f seconds\n",
+		port_id, core_id, rules_count_per_core, cpu_time_used);
+
+	mc_pool.create_flow.insertion[port_id][core_id] = cpu_time_used;
+	return flows_list;
+}
+
+static void
+flows_handler(uint8_t core_id)
+{
+	struct rte_flow **flows_list;
+	uint16_t nr_ports;
+	int port_id;
+
+	nr_ports = rte_eth_dev_count_avail();
+
+	if (rules_batch > rules_count)
+		rules_batch = rules_count;
+
+	printf(":: Rules Count per port: %d\n\n", rules_count);
+
 	for (port_id = 0; port_id < nr_ports; port_id++) {
-		cpu_time_used = 0;
-		flow_index = 0;
-		if (flow_group > 0) {
-			/*
-			 * Create global rule to jump into flow_group,
-			 * this way the app will avoid the default rules.
-			 *
-			 * Global rule:
-			 * group 0 eth / end actions jump group <flow_group>
-			 *
-			 */
-			flow = generate_flow(port_id, 0, flow_attrs,
-				FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH),
-				FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP),
-				flow_group, 0, 0, &error);
-
-			if (flow == NULL) {
-				print_flow_error(error);
-				rte_exit(EXIT_FAILURE, "error in creating flow");
-			}
-			flow_list[flow_index++] = flow;
+		/* If port outside portmask */
+		if (!((ports_mask >> port_id) & 0x1))
+			continue;
+
+		/* Insertion part. */
+		mc_pool.last_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
+		if (has_meter())
+			meters_handler(port_id, core_id, METER_CREATE);
+		flows_list = insert_flows(port_id, core_id);
+		if (flows_list == NULL)
+			rte_exit(EXIT_FAILURE, "Error: Insertion Failed!\n");
+		mc_pool.current_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
+
+		/* Deletion part. */
+		if (delete_flag) {
+			destroy_flows(port_id, core_id, flows_list);
+			if (has_meter())
+				meters_handler(port_id, core_id, METER_DELETE);
 		}
+	}
+}
 
-		/* Insertion Rate */
-		printf("Flows insertion on port = %d\n", port_id);
-		start_iter = clock();
-		for (i = 0; i < flows_count; i++) {
-			flow = generate_flow(port_id, flow_group,
-				flow_attrs, flow_items, flow_actions,
-				JUMP_ACTION_TABLE, i, hairpinq, &error);
+static void
+dump_used_cpu_time(const char *item,
+		uint16_t port, struct used_cpu_time *used_time)
+{
+	uint32_t i;
+	/* Latency: total count of rte rules divided
+	 * over max time used by thread between all
+	 * threads time.
+	 *
+	 * Throughput: total count of rte rules divided
+	 * over the average of the time cosumed by all
+	 * threads time.
+	 */
+	double insertion_latency_time;
+	double insertion_throughput_time;
+	double deletion_latency_time;
+	double deletion_throughput_time;
+	double insertion_latency, insertion_throughput;
+	double deletion_latency, deletion_throughput;
+
+	/* Save first insertion/deletion rates from first thread.
+	 * Start comparing with all threads, if any thread used
+	 * time more than current saved, replace it.
+	 *
+	 * Thus in the end we will have the max time used for
+	 * insertion/deletion by one thread.
+	 *
+	 * As for memory consumption, save the min of all threads
+	 * of last alloc, and save the max for all threads for
+	 * current alloc.
+	 */
+
+	insertion_latency_time = used_time->insertion[port][0];
+	deletion_latency_time = used_time->deletion[port][0];
+	insertion_throughput_time = used_time->insertion[port][0];
+	deletion_throughput_time = used_time->deletion[port][0];
+
+	i = mc_pool.cores_count;
+	while (i-- > 1) {
+		insertion_throughput_time += used_time->insertion[port][i];
+		deletion_throughput_time += used_time->deletion[port][i];
+		if (insertion_latency_time < used_time->insertion[port][i])
+			insertion_latency_time = used_time->insertion[port][i];
+		if (deletion_latency_time < used_time->deletion[port][i])
+			deletion_latency_time = used_time->deletion[port][i];
+	}
 
-			if (force_quit)
-				i = flows_count;
+	insertion_latency = ((double) (mc_pool.rules_count
+				/ insertion_latency_time) / 1000);
+	deletion_latency = ((double) (mc_pool.rules_count
+				/ deletion_latency_time) / 1000);
+
+	insertion_throughput_time /= mc_pool.cores_count;
+	deletion_throughput_time /= mc_pool.cores_count;
+	insertion_throughput = ((double) (mc_pool.rules_count
+				/ insertion_throughput_time) / 1000);
+	deletion_throughput = ((double) (mc_pool.rules_count
+				/ deletion_throughput_time) / 1000);
+
+	/* Latency stats */
+	printf("\n%s\n:: [Latency | Insertion] All Cores :: Port %d :: ",
+		item, port);
+	printf("Total flows insertion rate -> %f K Rules/Sec\n",
+		insertion_latency);
+	printf(":: [Latency | Insertion] All Cores :: Port %d :: ", port);
+	printf("The time for creating %d rules is %f seconds\n",
+		mc_pool.rules_count, insertion_latency_time);
+
+	/* Throughput stats */
+	printf(":: [Throughput | Insertion] All Cores :: Port %d :: ", port);
+	printf("Total flows insertion rate -> %f K Rules/Sec\n",
+		insertion_throughput);
+	printf(":: [Throughput | Insertion] All Cores :: Port %d :: ", port);
+	printf("The average time for creating %d rules is %f seconds\n",
+		mc_pool.rules_count, insertion_throughput_time);
+
+	if (delete_flag) {
+	/* Latency stats */
+		printf(":: [Latency | Deletion] All Cores :: Port %d :: Total "
+			"deletion rate -> %f K Rules/Sec\n",
+			port, deletion_latency);
+		printf(":: [Latency | Deletion] All Cores :: Port %d :: ",
+			port);
+		printf("The time for deleting %d rules is %f seconds\n",
+			mc_pool.rules_count, deletion_latency_time);
+
+		/* Throughput stats */
+		printf(":: [Throughput | Deletion] All Cores :: Port %d :: Total "
+			"deletion rate -> %f K Rules/Sec\n",
+			port, deletion_throughput);
+		printf(":: [Throughput | Deletion] All Cores :: Port %d :: ",
+			port);
+		printf("The average time for deleting %d rules is %f seconds\n",
+			mc_pool.rules_count, deletion_throughput_time);
+	}
+}
 
-			if (!flow) {
-				print_flow_error(error);
-				rte_exit(EXIT_FAILURE, "error in creating flow");
-			}
+static void
+dump_used_mem(uint16_t port)
+{
+	uint32_t i;
+	int64_t last_alloc, current_alloc;
+	int flow_size_in_bytes;
+
+	last_alloc = mc_pool.last_alloc[0];
+	current_alloc = mc_pool.current_alloc[0];
+
+	i = mc_pool.cores_count;
+	while (i-- > 1) {
+		if (last_alloc > mc_pool.last_alloc[i])
+			last_alloc = mc_pool.last_alloc[i];
+		if (current_alloc < mc_pool.current_alloc[i])
+			current_alloc = mc_pool.current_alloc[i];
+	}
 
-			flow_list[flow_index++] = flow;
-
-			if (i && !((i + 1) % iterations_number)) {
-				/* Save the insertion rate of each iter */
-				end_iter = clock();
-				delta = (double) (end_iter - start_iter);
-				iter_id = ((i + 1) / iterations_number) - 1;
-				cpu_time_per_iter[iter_id] =
-					delta / CLOCKS_PER_SEC;
-				cpu_time_used += cpu_time_per_iter[iter_id];
-				start_iter = clock();
-			}
+	flow_size_in_bytes = (current_alloc - last_alloc) / mc_pool.rules_count;
+	printf("\n:: Port %d :: rte_flow size in DPDK layer: %d Bytes\n",
+		port, flow_size_in_bytes);
+}
+
+static int
+run_rte_flow_handler_cores(void *data __rte_unused)
+{
+	uint16_t port;
+	int lcore_counter = 0;
+	int lcore_id = rte_lcore_id();
+	int i;
+
+	RTE_LCORE_FOREACH(i) {
+		/*  If core not needed return. */
+		if (lcore_id == i) {
+			printf(":: lcore %d mapped with index %d\n", lcore_id, lcore_counter);
+			if (lcore_counter >= (int) mc_pool.cores_count)
+				return 0;
+			break;
 		}
+		lcore_counter++;
+	}
+	lcore_id = lcore_counter;
 
-		/* Iteration rate per iteration */
-		if (dump_iterations)
-			for (i = 0; i < MAX_ITERATIONS; i++) {
-				if (cpu_time_per_iter[i] == -1)
-					continue;
-				delta = (double)(iterations_number /
-					cpu_time_per_iter[i]);
-				flows_rate = delta / 1000;
-				printf(":: Iteration #%d: %d flows "
-					"in %f sec[ Rate = %f K/Sec ]\n",
-					i, iterations_number,
-					cpu_time_per_iter[i], flows_rate);
-			}
+	if (lcore_id >= (int) mc_pool.cores_count)
+		return 0;
+
+	mc_pool.rules_count = rules_count;
 
-		/* Insertion rate for all flows */
-		flows_rate = ((double) (flows_count / cpu_time_used) / 1000);
-		printf("\n:: Total flow insertion rate -> %f K/Sec\n",
-						flows_rate);
-		printf(":: The time for creating %d in flows %f seconds\n",
-						flows_count, cpu_time_used);
+	flows_handler(lcore_id);
 
-		if (delete_flag)
-			destroy_flows(port_id, flow_list);
+	/* Only main core to print total results. */
+	if (lcore_id != 0)
+		return 0;
+
+	/* Make sure all cores finished insertion/deletion process. */
+	rte_eal_mp_wait_lcore();
+
+	RTE_ETH_FOREACH_DEV(port) {
+		if (has_meter())
+			dump_used_cpu_time("Meters:",
+				port, &mc_pool.create_meter);
+		dump_used_cpu_time("Flows:",
+			port, &mc_pool.create_flow);
+		dump_used_mem(port);
 	}
+
+	return 0;
 }
 
 static void
@@ -566,12 +1452,271 @@ signal_handler(int signum)
 	}
 }
 
+static inline uint16_t
+do_rx(struct lcore_info *li, uint16_t rx_port, uint16_t rx_queue)
+{
+	uint16_t cnt = 0;
+	cnt = rte_eth_rx_burst(rx_port, rx_queue, li->pkts, MAX_PKT_BURST);
+	li->rx_pkts += cnt;
+	return cnt;
+}
+
+static inline void
+do_tx(struct lcore_info *li, uint16_t cnt, uint16_t tx_port,
+			uint16_t tx_queue)
+{
+	uint16_t nr_tx = 0;
+	uint16_t i;
+
+	nr_tx = rte_eth_tx_burst(tx_port, tx_queue, li->pkts, cnt);
+	li->tx_pkts  += nr_tx;
+	li->tx_drops += cnt - nr_tx;
+
+	for (i = nr_tx; i < cnt; i++)
+		rte_pktmbuf_free(li->pkts[i]);
+}
+
+/*
+ * Method to convert numbers into pretty numbers that easy
+ * to read. The design here is to add comma after each three
+ * digits and set all of this inside buffer.
+ *
+ * For example if n = 1799321, the output will be
+ * 1,799,321 after this method which is easier to read.
+ */
+static char *
+pretty_number(uint64_t n, char *buf)
+{
+	char p[6][4];
+	int i = 0;
+	int off = 0;
+
+	while (n > 1000) {
+		sprintf(p[i], "%03d", (int)(n % 1000));
+		n /= 1000;
+		i += 1;
+	}
+
+	sprintf(p[i++], "%d", (int)n);
+
+	while (i--)
+		off += sprintf(buf + off, "%s,", p[i]);
+	buf[strlen(buf) - 1] = '\0';
+
+	return buf;
+}
+
+static void
+packet_per_second_stats(void)
+{
+	struct lcore_info *old;
+	struct lcore_info *li, *oli;
+	int nr_lines = 0;
+	int i;
+
+	old = rte_zmalloc("old",
+		sizeof(struct lcore_info) * RTE_MAX_LCORE, 0);
+	if (old == NULL)
+		rte_exit(EXIT_FAILURE, "No Memory available!");
+
+	memcpy(old, lcore_infos,
+		sizeof(struct lcore_info) * RTE_MAX_LCORE);
+
+	while (!force_quit) {
+		uint64_t total_tx_pkts = 0;
+		uint64_t total_rx_pkts = 0;
+		uint64_t total_tx_drops = 0;
+		uint64_t tx_delta, rx_delta, drops_delta;
+		char buf[3][32];
+		int nr_valid_core = 0;
+
+		sleep(1);
+
+		if (nr_lines) {
+			char go_up_nr_lines[16];
+
+			sprintf(go_up_nr_lines, "%c[%dA\r", 27, nr_lines);
+			printf("%s\r", go_up_nr_lines);
+		}
+
+		printf("\n%6s %16s %16s %16s\n", "core", "tx", "tx drops", "rx");
+		printf("%6s %16s %16s %16s\n", "------", "----------------",
+			"----------------", "----------------");
+		nr_lines = 3;
+		for (i = 0; i < RTE_MAX_LCORE; i++) {
+			li  = &lcore_infos[i];
+			oli = &old[i];
+			if (li->mode != LCORE_MODE_PKT)
+				continue;
+
+			tx_delta    = li->tx_pkts  - oli->tx_pkts;
+			rx_delta    = li->rx_pkts  - oli->rx_pkts;
+			drops_delta = li->tx_drops - oli->tx_drops;
+			printf("%6d %16s %16s %16s\n", i,
+				pretty_number(tx_delta,    buf[0]),
+				pretty_number(drops_delta, buf[1]),
+				pretty_number(rx_delta,    buf[2]));
+
+			total_tx_pkts  += tx_delta;
+			total_rx_pkts  += rx_delta;
+			total_tx_drops += drops_delta;
+
+			nr_valid_core++;
+			nr_lines += 1;
+		}
+
+		if (nr_valid_core > 1) {
+			printf("%6s %16s %16s %16s\n", "total",
+				pretty_number(total_tx_pkts,  buf[0]),
+				pretty_number(total_tx_drops, buf[1]),
+				pretty_number(total_rx_pkts,  buf[2]));
+			nr_lines += 1;
+		}
+
+		memcpy(old, lcore_infos,
+			sizeof(struct lcore_info) * RTE_MAX_LCORE);
+	}
+}
+
+static int
+start_forwarding(void *data __rte_unused)
+{
+	int lcore = rte_lcore_id();
+	int stream_id;
+	uint16_t cnt;
+	struct lcore_info *li = &lcore_infos[lcore];
+
+	if (!li->mode)
+		return 0;
+
+	if (li->mode == LCORE_MODE_STATS) {
+		printf(":: started stats on lcore %u\n", lcore);
+		packet_per_second_stats();
+		return 0;
+	}
+
+	while (!force_quit)
+		for (stream_id = 0; stream_id < MAX_STREAMS; stream_id++) {
+			if (li->streams[stream_id].rx_port == -1)
+				continue;
+
+			cnt = do_rx(li,
+					li->streams[stream_id].rx_port,
+					li->streams[stream_id].rx_queue);
+			if (cnt)
+				do_tx(li, cnt,
+					li->streams[stream_id].tx_port,
+					li->streams[stream_id].tx_queue);
+		}
+	return 0;
+}
+
+static void
+init_lcore_info(void)
+{
+	int i, j;
+	unsigned int lcore;
+	uint16_t nr_port;
+	uint16_t queue;
+	int port;
+	int stream_id = 0;
+	int streams_per_core;
+	int unassigned_streams;
+	int nb_fwd_streams;
+	nr_port = rte_eth_dev_count_avail();
+
+	/* First logical core is reserved for stats printing */
+	lcore = rte_get_next_lcore(-1, 0, 0);
+	lcore_infos[lcore].mode = LCORE_MODE_STATS;
+
+	/*
+	 * Initialize all cores
+	 * All cores at first must have -1 value in all streams
+	 * This means that this stream is not used, or not set
+	 * yet.
+	 */
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		for (j = 0; j < MAX_STREAMS; j++) {
+			lcore_infos[i].streams[j].tx_port = -1;
+			lcore_infos[i].streams[j].rx_port = -1;
+			lcore_infos[i].streams[j].tx_queue = -1;
+			lcore_infos[i].streams[j].rx_queue = -1;
+			lcore_infos[i].streams_nb = 0;
+		}
+
+	/*
+	 * Calculate the total streams count.
+	 * Also distribute those streams count between the available
+	 * logical cores except first core, since it's reserved for
+	 * stats prints.
+	 */
+	nb_fwd_streams = nr_port * RXQ_NUM;
+	if ((int)(nb_lcores - 1) >= nb_fwd_streams)
+		for (i = 0; i < (int)(nb_lcores - 1); i++) {
+			lcore = rte_get_next_lcore(lcore, 0, 0);
+			lcore_infos[lcore].streams_nb = 1;
+		}
+	else {
+		streams_per_core = nb_fwd_streams / (nb_lcores - 1);
+		unassigned_streams = nb_fwd_streams % (nb_lcores - 1);
+		for (i = 0; i < (int)(nb_lcores - 1); i++) {
+			lcore = rte_get_next_lcore(lcore, 0, 0);
+			lcore_infos[lcore].streams_nb = streams_per_core;
+			if (unassigned_streams) {
+				lcore_infos[lcore].streams_nb++;
+				unassigned_streams--;
+			}
+		}
+	}
+
+	/*
+	 * Set the streams for the cores according to each logical
+	 * core stream count.
+	 * The streams is built on the design of what received should
+	 * forward as well, this means that if you received packets on
+	 * port 0 queue 0 then the same queue should forward the
+	 * packets, using the same logical core.
+	 */
+	lcore = rte_get_next_lcore(-1, 0, 0);
+	for (port = 0; port < nr_port; port++) {
+		/* Create FWD stream */
+		for (queue = 0; queue < RXQ_NUM; queue++) {
+			if (!lcore_infos[lcore].streams_nb ||
+				!(stream_id % lcore_infos[lcore].streams_nb)) {
+				lcore = rte_get_next_lcore(lcore, 0, 0);
+				lcore_infos[lcore].mode = LCORE_MODE_PKT;
+				stream_id = 0;
+			}
+			lcore_infos[lcore].streams[stream_id].rx_queue = queue;
+			lcore_infos[lcore].streams[stream_id].tx_queue = queue;
+			lcore_infos[lcore].streams[stream_id].rx_port = port;
+			lcore_infos[lcore].streams[stream_id].tx_port = port;
+			stream_id++;
+		}
+	}
+
+	/* Print all streams */
+	printf(":: Stream -> core id[N]: (rx_port, rx_queue)->(tx_port, tx_queue)\n");
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		for (j = 0; j < MAX_STREAMS; j++) {
+			/* No streams for this core */
+			if (lcore_infos[i].streams[j].tx_port == -1)
+				break;
+			printf("Stream -> core id[%d]: (%d,%d)->(%d,%d)\n",
+				i,
+				lcore_infos[i].streams[j].rx_port,
+				lcore_infos[i].streams[j].rx_queue,
+				lcore_infos[i].streams[j].tx_port,
+				lcore_infos[i].streams[j].tx_queue);
+		}
+}
+
 static void
 init_port(void)
 {
 	int ret;
 	uint16_t std_queue;
-	uint16_t hairpin_q;
+	uint16_t hairpin_queue;
 	uint16_t port_id;
 	uint16_t nr_ports;
 	uint16_t nr_queues;
@@ -589,8 +1734,8 @@ init_port(void)
 	struct rte_eth_dev_info dev_info;
 
 	nr_queues = RXQ_NUM;
-	if (hairpinq != 0)
-		nr_queues = RXQ_NUM + hairpinq;
+	if (hairpin_queues_num != 0)
+		nr_queues = RXQ_NUM + hairpin_queues_num;
 
 	nr_ports = rte_eth_dev_count_avail();
 	if (nr_ports == 0)
@@ -653,15 +1798,20 @@ init_port(void)
 				":: promiscuous mode enable failed: err=%s, port=%u\n",
 				rte_strerror(-ret), port_id);
 
-		if (hairpinq != 0) {
-			for (hairpin_q = RXQ_NUM, std_queue = 0;
-					std_queue < nr_queues;
-					hairpin_q++, std_queue++) {
+		if (hairpin_queues_num != 0) {
+			/*
+			 * Configure peer which represents hairpin Tx.
+			 * Hairpin queue numbers start after standard queues
+			 * (RXQ_NUM and TXQ_NUM).
+			 */
+			for (hairpin_queue = RXQ_NUM, std_queue = 0;
+					hairpin_queue < nr_queues;
+					hairpin_queue++, std_queue++) {
 				hairpin_conf.peers[0].port = port_id;
 				hairpin_conf.peers[0].queue =
 					std_queue + TXQ_NUM;
 				ret = rte_eth_rx_hairpin_queue_setup(
-						port_id, hairpin_q,
+						port_id, hairpin_queue,
 						NR_RXD, &hairpin_conf);
 				if (ret != 0)
 					rte_exit(EXIT_FAILURE,
@@ -669,14 +1819,14 @@ init_port(void)
 						ret, port_id);
 			}
 
-			for (hairpin_q = TXQ_NUM, std_queue = 0;
-					std_queue < nr_queues;
-					hairpin_q++, std_queue++) {
+			for (hairpin_queue = TXQ_NUM, std_queue = 0;
+					hairpin_queue < nr_queues;
+					hairpin_queue++, std_queue++) {
 				hairpin_conf.peers[0].port = port_id;
 				hairpin_conf.peers[0].queue =
 					std_queue + RXQ_NUM;
 				ret = rte_eth_tx_hairpin_queue_setup(
-						port_id, hairpin_q,
+						port_id, hairpin_queue,
 						NR_TXD, &hairpin_conf);
 				if (ret != 0)
 					rte_exit(EXIT_FAILURE,
@@ -708,10 +1858,11 @@ main(int argc, char **argv)
 
 	force_quit = false;
 	dump_iterations = false;
-	flows_count = DEFAULT_RULES_COUNT;
-	iterations_number = DEFAULT_ITERATION;
+	rules_count = DEFAULT_RULES_COUNT;
+	rules_batch = DEFAULT_RULES_BATCH;
 	delete_flag = false;
-	flow_group = 0;
+	dump_socket_mem_flag = false;
+	flow_group = DEFAULT_GROUP;
 
 	signal(SIGINT, signal_handler);
 	signal(SIGTERM, signal_handler);
@@ -727,12 +1878,26 @@ main(int argc, char **argv)
 	if (nb_lcores <= 1)
 		rte_exit(EXIT_FAILURE, "This app needs at least two cores\n");
 
-	flows_handler();
+
+	printf(":: Flows Count per port: %d\n\n", rules_count);
+
+	if (has_meter())
+		create_meter_profile();
+	rte_eal_mp_remote_launch(run_rte_flow_handler_cores, NULL, CALL_MAIN);
+
+	if (enable_fwd) {
+		init_lcore_info();
+		rte_eal_mp_remote_launch(start_forwarding, NULL, CALL_MAIN);
+	}
+	if (has_meter() && delete_flag)
+		destroy_meter_profile();
 
 	RTE_ETH_FOREACH_DEV(port) {
 		rte_flow_flush(port, &error);
-		rte_eth_dev_stop(port);
+		if (rte_eth_dev_stop(port) != 0)
+			printf("Failed to stop device on port %u\n", port);
 		rte_eth_dev_close(port);
 	}
+	printf("\nBye ...\n");
 	return 0;
 }