X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=app%2Ftest-bbdev%2Ftest_bbdev_perf.c;h=0fa119a5028b921dd8684cb80dec67f616531a8d;hb=81c2337e044dc16f1d93745d2a1668cdebc37c81;hp=c45cdd2600b47ac4d9c2c21b3e7cac14d0537df9;hpb=f162c47533ed4a5150a5f4d2d0c026e7fb258e44;p=dpdk.git

diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index c45cdd2600..0fa119a502 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -24,9 +24,10 @@
 #define GET_SOCKET(socket_id) (((socket_id) == SOCKET_ID_ANY) ? 0 : (socket_id))
 
 #define MAX_QUEUES RTE_MAX_LCORE
-#define TEST_REPETITIONS 1000
+#define TEST_REPETITIONS 100
+#define WAIT_OFFLOAD_US 1000
 
-#ifdef RTE_LIBRTE_PMD_BBDEV_FPGA_LTE_FEC
+#ifdef RTE_BASEBAND_FPGA_LTE_FEC
 #include <fpga_lte_fec.h>
 #define FPGA_LTE_PF_DRIVER_NAME ("intel_fpga_lte_fec_pf")
 #define FPGA_LTE_VF_DRIVER_NAME ("intel_fpga_lte_fec_vf")
@@ -39,6 +40,31 @@
 #define FLR_4G_TIMEOUT 610
 #endif
 
+#ifdef RTE_BASEBAND_FPGA_5GNR_FEC
+#include <rte_pmd_fpga_5gnr_fec.h>
+#define FPGA_5GNR_PF_DRIVER_NAME ("intel_fpga_5gnr_fec_pf")
+#define FPGA_5GNR_VF_DRIVER_NAME ("intel_fpga_5gnr_fec_vf")
+#define VF_UL_5G_QUEUE_VALUE 4
+#define VF_DL_5G_QUEUE_VALUE 4
+#define UL_5G_BANDWIDTH 3
+#define DL_5G_BANDWIDTH 3
+#define UL_5G_LOAD_BALANCE 128
+#define DL_5G_LOAD_BALANCE 128
+#define FLR_5G_TIMEOUT 610
+#endif
+
+#ifdef RTE_BASEBAND_ACC100
+#include <rte_acc100_cfg.h>
+#define ACC100PF_DRIVER_NAME   ("intel_acc100_pf")
+#define ACC100VF_DRIVER_NAME   ("intel_acc100_vf")
+#define ACC100_QMGR_NUM_AQS 16
+#define ACC100_QMGR_NUM_QGS 2
+#define ACC100_QMGR_AQ_DEPTH 5
+#define ACC100_QMGR_INVALID_IDX -1
+#define ACC100_QMGR_RR 1
+#define ACC100_QOS_GBR 0
+#endif
+
 #define OPS_CACHE_SIZE 256U
 #define OPS_POOL_SIZE_MIN 511U /* 0.5K per queue */
 
@@ -107,7 +133,7 @@ struct test_op_params {
 	uint16_t num_to_process;
 	uint16_t num_lcores;
 	int vector_mask;
-	rte_atomic16_t sync;
+	uint16_t sync;
 	struct test_buffers q_bufs[RTE_MAX_NUMA_NODES][MAX_QUEUES];
 };
 
@@ -122,9 +148,9 @@ struct thread_params {
 	uint8_t iter_count;
 	double iter_average;
 	double bler;
-	rte_atomic16_t nb_dequeued;
-	rte_atomic16_t processing_status;
-	rte_atomic16_t burst_sz;
+	uint16_t nb_dequeued;
+	int16_t processing_status;
+	uint16_t burst_sz;
 	struct test_op_params *op_params;
 	struct rte_bbdev_dec_op *dec_ops[MAX_BURST];
 	struct rte_bbdev_enc_op *enc_ops[MAX_BURST];
@@ -201,6 +227,45 @@ clear_soft_out_cap(uint32_t *op_flags)
 	*op_flags &= ~RTE_BBDEV_TURBO_NEG_LLR_1_BIT_SOFT_OUT;
 }
 
+/* This API is to convert all the test vector op data entries
+ * to big endian format. It is used when the device supports
+ * the input in the big endian format.
+ */
+static inline void
+convert_op_data_to_be(void)
+{
+	struct op_data_entries *op;
+	enum op_data_type type;
+	uint8_t nb_segs, *rem_data, temp;
+	uint32_t *data, len;
+	int complete, rem, i, j;
+
+	for (type = DATA_INPUT; type < DATA_NUM_TYPES; ++type) {
+		nb_segs = test_vector.entries[type].nb_segments;
+		op = &test_vector.entries[type];
+
+		/* Invert byte endianness for all the segments */
+		for (i = 0; i < nb_segs; ++i) {
+			len = op->segments[i].length;
+			data = op->segments[i].addr;
+
+			/* Swap complete u32 bytes */
+			complete = len / 4;
+			for (j = 0; j < complete; j++)
+				data[j] = rte_bswap32(data[j]);
+
+			/* Swap any remaining bytes */
+			rem = len % 4;
+			rem_data = (uint8_t *)&data[j];
+			for (j = 0; j < rem/2; j++) {
+				temp = rem_data[j];
+				rem_data[j] = rem_data[rem - j - 1];
+				rem_data[rem - j - 1] = temp;
+			}
+		}
+	}
+}
+
 static int
 check_dev_cap(const struct rte_bbdev_info *dev_info)
 {
@@ -208,6 +273,7 @@ check_dev_cap(const struct rte_bbdev_info *dev_info)
 	unsigned int nb_inputs, nb_soft_outputs, nb_hard_outputs,
 		nb_harq_inputs, nb_harq_outputs;
 	const struct rte_bbdev_op_cap *op_cap = dev_info->drv.capabilities;
+	uint8_t dev_data_endianness = dev_info->drv.data_endianness;
 
 	nb_inputs = test_vector.entries[DATA_INPUT].nb_segments;
 	nb_soft_outputs = test_vector.entries[DATA_SOFT_OUTPUT].nb_segments;
@@ -219,6 +285,9 @@ check_dev_cap(const struct rte_bbdev_info *dev_info)
 		if (op_cap->type != test_vector.op_type)
 			continue;
 
+		if (dev_data_endianness == RTE_BIG_ENDIAN)
+			convert_op_data_to_be();
+
 		if (op_cap->type == RTE_BBDEV_OP_TURBO_DEC) {
 			const struct rte_bbdev_op_cap_turbo_dec *cap =
 					&op_cap->cap.turbo_dec;
@@ -346,14 +415,14 @@ check_dev_cap(const struct rte_bbdev_info *dev_info)
 			if (nb_harq_inputs > cap->num_buffers_hard_out) {
 				printf(
 					"Too many HARQ inputs defined: %u, max: %u\n",
-					nb_hard_outputs,
+					nb_harq_inputs,
 					cap->num_buffers_hard_out);
 				return TEST_FAILED;
 			}
 			if (nb_harq_outputs > cap->num_buffers_hard_out) {
 				printf(
 					"Too many HARQ outputs defined: %u, max: %u\n",
-					nb_hard_outputs,
+					nb_harq_outputs,
 					cap->num_buffers_hard_out);
 				return TEST_FAILED;
 			}
@@ -552,17 +621,17 @@ add_bbdev_dev(uint8_t dev_id, struct rte_bbdev_info *info,
 /* Configure fpga lte fec with PF & VF values
  * if '-i' flag is set and using fpga device
  */
-#ifdef RTE_LIBRTE_PMD_BBDEV_FPGA_LTE_FEC
+#ifdef RTE_BASEBAND_FPGA_LTE_FEC
 	if ((get_init_device() == true) &&
 		(!strcmp(info->drv.driver_name, FPGA_LTE_PF_DRIVER_NAME))) {
-		struct fpga_lte_fec_conf conf;
+		struct rte_fpga_lte_fec_conf conf;
 		unsigned int i;
 
 		printf("Configure FPGA LTE FEC Driver %s with default values\n",
 				info->drv.driver_name);
 
 		/* clear default configuration before initialization */
-		memset(&conf, 0, sizeof(struct fpga_lte_fec_conf));
+		memset(&conf, 0, sizeof(struct rte_fpga_lte_fec_conf));
 
 		/* Set PF mode :
 		 * true if PF is used for data plane
@@ -590,12 +659,115 @@ add_bbdev_dev(uint8_t dev_id, struct rte_bbdev_info *info,
 		conf.flr_time_out = FLR_4G_TIMEOUT;
 
 		/* setup FPGA PF with configuration information */
-		ret = fpga_lte_fec_configure(info->dev_name, &conf);
+		ret = rte_fpga_lte_fec_configure(info->dev_name, &conf);
 		TEST_ASSERT_SUCCESS(ret,
 				"Failed to configure 4G FPGA PF for bbdev %s",
 				info->dev_name);
 	}
 #endif
+#ifdef RTE_BASEBAND_FPGA_5GNR_FEC
+	if ((get_init_device() == true) &&
+		(!strcmp(info->drv.driver_name, FPGA_5GNR_PF_DRIVER_NAME))) {
+		struct rte_fpga_5gnr_fec_conf conf;
+		unsigned int i;
+
+		printf("Configure FPGA 5GNR FEC Driver %s with default values\n",
+				info->drv.driver_name);
+
+		/* clear default configuration before initialization */
+		memset(&conf, 0, sizeof(struct rte_fpga_5gnr_fec_conf));
+
+		/* Set PF mode :
+		 * true if PF is used for data plane
+		 * false for VFs
+		 */
+		conf.pf_mode_en = true;
+
+		for (i = 0; i < FPGA_5GNR_FEC_NUM_VFS; ++i) {
+			/* Number of UL queues per VF (fpga supports 8 VFs) */
+			conf.vf_ul_queues_number[i] = VF_UL_5G_QUEUE_VALUE;
+			/* Number of DL queues per VF (fpga supports 8 VFs) */
+			conf.vf_dl_queues_number[i] = VF_DL_5G_QUEUE_VALUE;
+		}
+
+		/* UL bandwidth. Needed for schedule algorithm */
+		conf.ul_bandwidth = UL_5G_BANDWIDTH;
+		/* DL bandwidth */
+		conf.dl_bandwidth = DL_5G_BANDWIDTH;
+
+		/* UL & DL load Balance Factor to 64 */
+		conf.ul_load_balance = UL_5G_LOAD_BALANCE;
+		conf.dl_load_balance = DL_5G_LOAD_BALANCE;
+
+		/**< FLR timeout value */
+		conf.flr_time_out = FLR_5G_TIMEOUT;
+
+		/* setup FPGA PF with configuration information */
+		ret = rte_fpga_5gnr_fec_configure(info->dev_name, &conf);
+		TEST_ASSERT_SUCCESS(ret,
+				"Failed to configure 5G FPGA PF for bbdev %s",
+				info->dev_name);
+	}
+#endif
+#ifdef RTE_BASEBAND_ACC100
+	if ((get_init_device() == true) &&
+		(!strcmp(info->drv.driver_name, ACC100PF_DRIVER_NAME))) {
+		struct rte_acc100_conf conf;
+		unsigned int i;
+
+		printf("Configure ACC100 FEC Driver %s with default values\n",
+				info->drv.driver_name);
+
+		/* clear default configuration before initialization */
+		memset(&conf, 0, sizeof(struct rte_acc100_conf));
+
+		/* Always set in PF mode for built-in configuration */
+		conf.pf_mode_en = true;
+		for (i = 0; i < RTE_ACC100_NUM_VFS; ++i) {
+			conf.arb_dl_4g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_dl_4g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_dl_4g[i].round_robin_weight = ACC100_QMGR_RR;
+			conf.arb_ul_4g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_ul_4g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_ul_4g[i].round_robin_weight = ACC100_QMGR_RR;
+			conf.arb_dl_5g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_dl_5g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_dl_5g[i].round_robin_weight = ACC100_QMGR_RR;
+			conf.arb_ul_5g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_ul_5g[i].gbr_threshold1 = ACC100_QOS_GBR;
+			conf.arb_ul_5g[i].round_robin_weight = ACC100_QMGR_RR;
+		}
+
+		conf.input_pos_llr_1_bit = true;
+		conf.output_pos_llr_1_bit = true;
+		conf.num_vf_bundles = 1; /**< Number of VF bundles to setup */
+
+		conf.q_ul_4g.num_qgroups = ACC100_QMGR_NUM_QGS;
+		conf.q_ul_4g.first_qgroup_index = ACC100_QMGR_INVALID_IDX;
+		conf.q_ul_4g.num_aqs_per_groups = ACC100_QMGR_NUM_AQS;
+		conf.q_ul_4g.aq_depth_log2 = ACC100_QMGR_AQ_DEPTH;
+		conf.q_dl_4g.num_qgroups = ACC100_QMGR_NUM_QGS;
+		conf.q_dl_4g.first_qgroup_index = ACC100_QMGR_INVALID_IDX;
+		conf.q_dl_4g.num_aqs_per_groups = ACC100_QMGR_NUM_AQS;
+		conf.q_dl_4g.aq_depth_log2 = ACC100_QMGR_AQ_DEPTH;
+		conf.q_ul_5g.num_qgroups = ACC100_QMGR_NUM_QGS;
+		conf.q_ul_5g.first_qgroup_index = ACC100_QMGR_INVALID_IDX;
+		conf.q_ul_5g.num_aqs_per_groups = ACC100_QMGR_NUM_AQS;
+		conf.q_ul_5g.aq_depth_log2 = ACC100_QMGR_AQ_DEPTH;
+		conf.q_dl_5g.num_qgroups = ACC100_QMGR_NUM_QGS;
+		conf.q_dl_5g.first_qgroup_index = ACC100_QMGR_INVALID_IDX;
+		conf.q_dl_5g.num_aqs_per_groups = ACC100_QMGR_NUM_AQS;
+		conf.q_dl_5g.aq_depth_log2 = ACC100_QMGR_AQ_DEPTH;
+
+		/* setup PF with configuration information */
+		ret = rte_acc100_configure(info->dev_name, &conf);
+		TEST_ASSERT_SUCCESS(ret,
+				"Failed to configure ACC100 PF for bbdev %s",
+				info->dev_name);
+	}
+#endif
+	/* Let's refresh this now this is configured */
+	rte_bbdev_info_get(dev_id, info);
 	nb_queues = RTE_MIN(rte_lcore_count(), info->drv.max_num_queues);
 	nb_queues = RTE_MIN(nb_queues, (unsigned int) MAX_QUEUES);
 
@@ -828,6 +1000,9 @@ init_op_data_objs(struct rte_bbdev_op_data *bufs,
 			if ((op_type == DATA_INPUT) && large_input) {
 				/* Allocate a fake overused mbuf */
 				data = rte_malloc(NULL, seg->length, 0);
+				TEST_ASSERT_NOT_NULL(data,
+					"rte malloc failed with %u bytes",
+					seg->length);
 				memcpy(data, seg->addr, seg->length);
 				m_head->buf_addr = data;
 				m_head->buf_iova = rte_malloc_virt2iova(data);
@@ -1129,7 +1304,7 @@ copy_reference_dec_op(struct rte_bbdev_dec_op **ops, unsigned int n,
 	struct rte_bbdev_op_turbo_dec *turbo_dec = &ref_op->turbo_dec;
 
 	for (i = 0; i < n; ++i) {
-		if (turbo_dec->code_block_mode == 0) {
+		if (turbo_dec->code_block_mode == RTE_BBDEV_TRANSPORT_BLOCK) {
 			ops[i]->turbo_dec.tb_params.ea =
 					turbo_dec->tb_params.ea;
 			ops[i]->turbo_dec.tb_params.eb =
@@ -1177,7 +1352,7 @@ copy_reference_enc_op(struct rte_bbdev_enc_op **ops, unsigned int n,
 	unsigned int i;
 	struct rte_bbdev_op_turbo_enc *turbo_enc = &ref_op->turbo_enc;
 	for (i = 0; i < n; ++i) {
-		if (turbo_enc->code_block_mode == 0) {
+		if (turbo_enc->code_block_mode == RTE_BBDEV_TRANSPORT_BLOCK) {
 			ops[i]->turbo_enc.tb_params.ea =
 					turbo_enc->tb_params.ea;
 			ops[i]->turbo_enc.tb_params.eb =
@@ -1532,7 +1707,7 @@ copy_reference_ldpc_dec_op(struct rte_bbdev_dec_op **ops, unsigned int n,
 	struct rte_bbdev_op_ldpc_dec *ldpc_dec = &ref_op->ldpc_dec;
 
 	for (i = 0; i < n; ++i) {
-		if (ldpc_dec->code_block_mode == 0) {
+		if (ldpc_dec->code_block_mode == RTE_BBDEV_TRANSPORT_BLOCK) {
 			ops[i]->ldpc_dec.tb_params.ea =
 					ldpc_dec->tb_params.ea;
 			ops[i]->ldpc_dec.tb_params.eb =
@@ -1586,7 +1761,7 @@ copy_reference_ldpc_enc_op(struct rte_bbdev_enc_op **ops, unsigned int n,
 	unsigned int i;
 	struct rte_bbdev_op_ldpc_enc *ldpc_enc = &ref_op->ldpc_enc;
 	for (i = 0; i < n; ++i) {
-		if (ldpc_enc->code_block_mode == 0) {
+		if (ldpc_enc->code_block_mode == RTE_BBDEV_TRANSPORT_BLOCK) {
 			ops[i]->ldpc_enc.tb_params.ea = ldpc_enc->tb_params.ea;
 			ops[i]->ldpc_enc.tb_params.eb = ldpc_enc->tb_params.eb;
 			ops[i]->ldpc_enc.tb_params.cab =
@@ -2110,7 +2285,7 @@ calc_dec_TB_size(struct rte_bbdev_dec_op *op)
 	uint8_t i;
 	uint32_t c, r, tb_size = 0;
 
-	if (op->turbo_dec.code_block_mode) {
+	if (op->turbo_dec.code_block_mode == RTE_BBDEV_CODE_BLOCK) {
 		tb_size = op->turbo_dec.tb_params.k_neg;
 	} else {
 		c = op->turbo_dec.tb_params.c;
@@ -2130,7 +2305,7 @@ calc_ldpc_dec_TB_size(struct rte_bbdev_dec_op *op)
 	uint32_t c, r, tb_size = 0;
 	uint16_t sys_cols = (op->ldpc_dec.basegraph == 1) ? 22 : 10;
 
-	if (op->ldpc_dec.code_block_mode) {
+	if (op->ldpc_dec.code_block_mode == RTE_BBDEV_CODE_BLOCK) {
 		tb_size = sys_cols * op->ldpc_dec.z_c - op->ldpc_dec.n_filler;
 	} else {
 		c = op->ldpc_dec.tb_params.c;
@@ -2148,7 +2323,7 @@ calc_enc_TB_size(struct rte_bbdev_enc_op *op)
 	uint8_t i;
 	uint32_t c, r, tb_size = 0;
 
-	if (op->turbo_enc.code_block_mode) {
+	if (op->turbo_enc.code_block_mode == RTE_BBDEV_CODE_BLOCK) {
 		tb_size = op->turbo_enc.tb_params.k_neg;
 	} else {
 		c = op->turbo_enc.tb_params.c;
@@ -2168,7 +2343,7 @@ calc_ldpc_enc_TB_size(struct rte_bbdev_enc_op *op)
 	uint32_t c, r, tb_size = 0;
 	uint16_t sys_cols = (op->ldpc_enc.basegraph == 1) ? 22 : 10;
 
-	if (op->turbo_enc.code_block_mode) {
+	if (op->ldpc_enc.code_block_mode == RTE_BBDEV_CODE_BLOCK) {
 		tb_size = sys_cols * op->ldpc_enc.z_c - op->ldpc_enc.n_filler;
 	} else {
 		c = op->turbo_enc.tb_params.c;
@@ -2339,7 +2514,7 @@ retrieve_harq_ddr(uint16_t dev_id, uint16_t queue_id,
 {
 	uint16_t j;
 	int save_status, ret;
-	uint32_t harq_offset = (uint32_t) queue_id * HARQ_INCR * 1024;
+	uint32_t harq_offset = (uint32_t) queue_id * HARQ_INCR * MAX_OPS;
 	struct rte_bbdev_dec_op *ops_deq[MAX_BURST];
 	uint32_t flags = ops[0]->ldpc_dec.op_flags;
 	bool loopback = flags & RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_LOOPBACK;
@@ -2385,20 +2560,20 @@ preload_harq_ddr(uint16_t dev_id, uint16_t queue_id,
 		bool preload)
 {
 	uint16_t j;
-	int ret;
-	uint32_t harq_offset = (uint32_t) queue_id * HARQ_INCR * 1024;
-	struct rte_bbdev_op_data save_hc_in, save_hc_out;
-	struct rte_bbdev_dec_op *ops_deq[MAX_BURST];
+	int deq;
+	uint32_t harq_offset = (uint32_t) queue_id * HARQ_INCR * MAX_OPS;
+	struct rte_bbdev_op_data save_hc_in[MAX_OPS], save_hc_out[MAX_OPS];
+	struct rte_bbdev_dec_op *ops_deq[MAX_OPS];
 	uint32_t flags = ops[0]->ldpc_dec.op_flags;
 	bool mem_in = flags & RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_IN_ENABLE;
 	bool hc_in = flags & RTE_BBDEV_LDPC_HQ_COMBINE_IN_ENABLE;
 	bool mem_out = flags & RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_OUT_ENABLE;
 	bool hc_out = flags & RTE_BBDEV_LDPC_HQ_COMBINE_OUT_ENABLE;
 	bool h_comp = flags & RTE_BBDEV_LDPC_HARQ_6BIT_COMPRESSION;
-	for (j = 0; j < n; ++j) {
-		if ((mem_in || hc_in) && preload) {
-			save_hc_in = ops[j]->ldpc_dec.harq_combined_input;
-			save_hc_out = ops[j]->ldpc_dec.harq_combined_output;
+	if ((mem_in || hc_in) && preload) {
+		for (j = 0; j < n; ++j) {
+			save_hc_in[j] = ops[j]->ldpc_dec.harq_combined_input;
+			save_hc_out[j] = ops[j]->ldpc_dec.harq_combined_output;
 			ops[j]->ldpc_dec.op_flags =
 				RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_LOOPBACK +
 				RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_OUT_ENABLE;
@@ -2408,16 +2583,23 @@ preload_harq_ddr(uint16_t dev_id, uint16_t queue_id,
 			ops[j]->ldpc_dec.harq_combined_output.offset =
 					harq_offset;
 			ops[j]->ldpc_dec.harq_combined_input.offset = 0;
-			rte_bbdev_enqueue_ldpc_dec_ops(dev_id, queue_id,
-					&ops[j], 1);
-			ret = 0;
-			while (ret == 0)
-				ret = rte_bbdev_dequeue_ldpc_dec_ops(
-					dev_id, queue_id, &ops_deq[j], 1);
+			harq_offset += HARQ_INCR;
+		}
+		rte_bbdev_enqueue_ldpc_dec_ops(dev_id, queue_id, &ops[0], n);
+		deq = 0;
+		while (deq != n)
+			deq += rte_bbdev_dequeue_ldpc_dec_ops(
+					dev_id, queue_id, &ops_deq[deq],
+					n - deq);
+		/* Restore the operations */
+		for (j = 0; j < n; ++j) {
 			ops[j]->ldpc_dec.op_flags = flags;
-			ops[j]->ldpc_dec.harq_combined_input = save_hc_in;
-			ops[j]->ldpc_dec.harq_combined_output = save_hc_out;
+			ops[j]->ldpc_dec.harq_combined_input = save_hc_in[j];
+			ops[j]->ldpc_dec.harq_combined_output = save_hc_out[j];
 		}
+	}
+	harq_offset = (uint32_t) queue_id * HARQ_INCR * MAX_OPS;
+	for (j = 0; j < n; ++j) {
 		/* Adjust HARQ offset when we reach external DDR */
 		if (mem_in || hc_in)
 			ops[j]->ldpc_dec.harq_combined_input.offset
@@ -2455,46 +2637,46 @@ dequeue_event_callback(uint16_t dev_id,
 	}
 
 	if (unlikely(event != RTE_BBDEV_EVENT_DEQUEUE)) {
-		rte_atomic16_set(&tp->processing_status, TEST_FAILED);
+		__atomic_store_n(&tp->processing_status, TEST_FAILED, __ATOMIC_RELAXED);
 		printf(
 			"Dequeue interrupt handler called for incorrect event!\n");
 		return;
 	}
 
-	burst_sz = rte_atomic16_read(&tp->burst_sz);
+	burst_sz = __atomic_load_n(&tp->burst_sz, __ATOMIC_RELAXED);
 	num_ops = tp->op_params->num_to_process;
 
 	if (test_vector.op_type == RTE_BBDEV_OP_TURBO_DEC)
 		deq = rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
 				&tp->dec_ops[
-					rte_atomic16_read(&tp->nb_dequeued)],
+					__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED)],
 				burst_sz);
 	else if (test_vector.op_type == RTE_BBDEV_OP_LDPC_DEC)
 		deq = rte_bbdev_dequeue_ldpc_dec_ops(dev_id, queue_id,
 				&tp->dec_ops[
-					rte_atomic16_read(&tp->nb_dequeued)],
+					__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED)],
 				burst_sz);
 	else if (test_vector.op_type == RTE_BBDEV_OP_LDPC_ENC)
 		deq = rte_bbdev_dequeue_ldpc_enc_ops(dev_id, queue_id,
 				&tp->enc_ops[
-					rte_atomic16_read(&tp->nb_dequeued)],
+					__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED)],
 				burst_sz);
 	else /*RTE_BBDEV_OP_TURBO_ENC*/
 		deq = rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
 				&tp->enc_ops[
-					rte_atomic16_read(&tp->nb_dequeued)],
+					__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED)],
 				burst_sz);
 
 	if (deq < burst_sz) {
 		printf(
 			"After receiving the interrupt all operations should be dequeued. Expected: %u, got: %u\n",
 			burst_sz, deq);
-		rte_atomic16_set(&tp->processing_status, TEST_FAILED);
+		__atomic_store_n(&tp->processing_status, TEST_FAILED, __ATOMIC_RELAXED);
 		return;
 	}
 
-	if (rte_atomic16_read(&tp->nb_dequeued) + deq < num_ops) {
-		rte_atomic16_add(&tp->nb_dequeued, deq);
+	if (__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED) + deq < num_ops) {
+		__atomic_fetch_add(&tp->nb_dequeued, deq, __ATOMIC_RELAXED);
 		return;
 	}
 
@@ -2531,7 +2713,7 @@ dequeue_event_callback(uint16_t dev_id,
 
 	if (ret) {
 		printf("Buffers validation failed\n");
-		rte_atomic16_set(&tp->processing_status, TEST_FAILED);
+		__atomic_store_n(&tp->processing_status, TEST_FAILED, __ATOMIC_RELAXED);
 	}
 
 	switch (test_vector.op_type) {
@@ -2552,7 +2734,7 @@ dequeue_event_callback(uint16_t dev_id,
 		break;
 	default:
 		printf("Unknown op type: %d\n", test_vector.op_type);
-		rte_atomic16_set(&tp->processing_status, TEST_FAILED);
+		__atomic_store_n(&tp->processing_status, TEST_FAILED, __ATOMIC_RELAXED);
 		return;
 	}
 
@@ -2561,7 +2743,7 @@ dequeue_event_callback(uint16_t dev_id,
 	tp->mbps += (((double)(num_ops * tb_len_bits)) / 1000000.0) /
 			((double)total_time / (double)rte_get_tsc_hz());
 
-	rte_atomic16_add(&tp->nb_dequeued, deq);
+	__atomic_fetch_add(&tp->nb_dequeued, deq, __ATOMIC_RELAXED);
 }
 
 static int
@@ -2599,11 +2781,10 @@ throughput_intr_lcore_ldpc_dec(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	rte_atomic16_clear(&tp->processing_status);
-	rte_atomic16_clear(&tp->nb_dequeued);
+	__atomic_store_n(&tp->processing_status, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops,
 				num_to_process);
@@ -2651,17 +2832,15 @@ throughput_intr_lcore_ldpc_dec(void *arg)
 			 * the number of operations is not a multiple of
 			 * burst size.
 			 */
-			rte_atomic16_set(&tp->burst_sz, num_to_enq);
+			__atomic_store_n(&tp->burst_sz, num_to_enq, __ATOMIC_RELAXED);
 
 			/* Wait until processing of previous batch is
 			 * completed
 			 */
-			while (rte_atomic16_read(&tp->nb_dequeued) !=
-					(int16_t) enqueued)
-				rte_pause();
+			rte_wait_until_equal_16(&tp->nb_dequeued, enqueued, __ATOMIC_RELAXED);
 		}
 		if (j != TEST_REPETITIONS - 1)
-			rte_atomic16_clear(&tp->nb_dequeued);
+			__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 	}
 
 	return TEST_SUCCESS;
@@ -2696,11 +2875,10 @@ throughput_intr_lcore_dec(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	rte_atomic16_clear(&tp->processing_status);
-	rte_atomic16_clear(&tp->nb_dequeued);
+	__atomic_store_n(&tp->processing_status, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops,
 				num_to_process);
@@ -2741,17 +2919,15 @@ throughput_intr_lcore_dec(void *arg)
 			 * the number of operations is not a multiple of
 			 * burst size.
 			 */
-			rte_atomic16_set(&tp->burst_sz, num_to_enq);
+			__atomic_store_n(&tp->burst_sz, num_to_enq, __ATOMIC_RELAXED);
 
 			/* Wait until processing of previous batch is
 			 * completed
 			 */
-			while (rte_atomic16_read(&tp->nb_dequeued) !=
-					(int16_t) enqueued)
-				rte_pause();
+			rte_wait_until_equal_16(&tp->nb_dequeued, enqueued, __ATOMIC_RELAXED);
 		}
 		if (j != TEST_REPETITIONS - 1)
-			rte_atomic16_clear(&tp->nb_dequeued);
+			__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 	}
 
 	return TEST_SUCCESS;
@@ -2786,11 +2962,10 @@ throughput_intr_lcore_enc(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	rte_atomic16_clear(&tp->processing_status);
-	rte_atomic16_clear(&tp->nb_dequeued);
+	__atomic_store_n(&tp->processing_status, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops,
 			num_to_process);
@@ -2830,17 +3005,15 @@ throughput_intr_lcore_enc(void *arg)
 			 * the number of operations is not a multiple of
 			 * burst size.
 			 */
-			rte_atomic16_set(&tp->burst_sz, num_to_enq);
+			__atomic_store_n(&tp->burst_sz, num_to_enq, __ATOMIC_RELAXED);
 
 			/* Wait until processing of previous batch is
 			 * completed
 			 */
-			while (rte_atomic16_read(&tp->nb_dequeued) !=
-					(int16_t) enqueued)
-				rte_pause();
+			rte_wait_until_equal_16(&tp->nb_dequeued, enqueued, __ATOMIC_RELAXED);
 		}
 		if (j != TEST_REPETITIONS - 1)
-			rte_atomic16_clear(&tp->nb_dequeued);
+			__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 	}
 
 	return TEST_SUCCESS;
@@ -2876,11 +3049,10 @@ throughput_intr_lcore_ldpc_enc(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	rte_atomic16_clear(&tp->processing_status);
-	rte_atomic16_clear(&tp->nb_dequeued);
+	__atomic_store_n(&tp->processing_status, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops,
 			num_to_process);
@@ -2922,17 +3094,15 @@ throughput_intr_lcore_ldpc_enc(void *arg)
 			 * the number of operations is not a multiple of
 			 * burst size.
 			 */
-			rte_atomic16_set(&tp->burst_sz, num_to_enq);
+			__atomic_store_n(&tp->burst_sz, num_to_enq, __ATOMIC_RELAXED);
 
 			/* Wait until processing of previous batch is
 			 * completed
 			 */
-			while (rte_atomic16_read(&tp->nb_dequeued) !=
-					(int16_t) enqueued)
-				rte_pause();
+			rte_wait_until_equal_16(&tp->nb_dequeued, enqueued, __ATOMIC_RELAXED);
 		}
 		if (j != TEST_REPETITIONS - 1)
-			rte_atomic16_clear(&tp->nb_dequeued);
+			__atomic_store_n(&tp->nb_dequeued, 0, __ATOMIC_RELAXED);
 	}
 
 	return TEST_SUCCESS;
@@ -2966,8 +3136,7 @@ throughput_pmd_lcore_dec(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops_enq, num_ops);
 	TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops", num_ops);
@@ -3070,8 +3239,7 @@ bler_pmd_lcore_ldpc_dec(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops_enq, num_ops);
 	TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops", num_ops);
@@ -3103,11 +3271,9 @@ bler_pmd_lcore_ldpc_dec(void *arg)
 				mbuf_reset(
 				ops_enq[j]->ldpc_dec.harq_combined_output.data);
 		}
-		if (extDdr) {
-			bool preload = i == (TEST_REPETITIONS - 1);
+		if (extDdr)
 			preload_harq_ddr(tp->dev_id, queue_id, ops_enq,
-					num_ops, preload);
-		}
+					num_ops, true);
 		start_time = rte_rdtsc_precise();
 
 		for (enq = 0, deq = 0; enq < num_ops;) {
@@ -3202,8 +3368,7 @@ throughput_pmd_lcore_ldpc_dec(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_dec_op_alloc_bulk(tp->op_params->mp, ops_enq, num_ops);
 	TEST_ASSERT_SUCCESS(ret, "Allocation failed for %d ops", num_ops);
@@ -3234,11 +3399,9 @@ throughput_pmd_lcore_ldpc_dec(void *arg)
 				mbuf_reset(
 				ops_enq[j]->ldpc_dec.harq_combined_output.data);
 		}
-		if (extDdr) {
-			bool preload = i == (TEST_REPETITIONS - 1);
+		if (extDdr)
 			preload_harq_ddr(tp->dev_id, queue_id, ops_enq,
-					num_ops, preload);
-		}
+					num_ops, true);
 		start_time = rte_rdtsc_precise();
 
 		for (enq = 0, deq = 0; enq < num_ops;) {
@@ -3321,8 +3484,7 @@ throughput_pmd_lcore_enc(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops_enq,
 			num_ops);
@@ -3412,8 +3574,7 @@ throughput_pmd_lcore_ldpc_enc(void *arg)
 
 	bufs = &tp->op_params->q_bufs[GET_SOCKET(info.socket_id)][queue_id];
 
-	while (rte_atomic16_read(&tp->op_params->sync) == SYNC_WAIT)
-		rte_pause();
+	rte_wait_until_equal_16(&tp->op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 
 	ret = rte_bbdev_enc_op_alloc_bulk(tp->op_params->mp, ops_enq,
 			num_ops);
@@ -3587,21 +3748,25 @@ bler_test(struct active_device *ad,
 			RTE_ALIGN(sizeof(struct thread_params) * num_lcores,
 				RTE_CACHE_LINE_SIZE));
 
-	if (test_vector.op_type == RTE_BBDEV_OP_LDPC_DEC)
+	if ((test_vector.op_type == RTE_BBDEV_OP_LDPC_DEC) &&
+			!check_bit(test_vector.ldpc_dec.op_flags,
+			RTE_BBDEV_LDPC_INTERNAL_HARQ_MEMORY_LOOPBACK)
+			&& !check_bit(test_vector.ldpc_dec.op_flags,
+			RTE_BBDEV_LDPC_LLR_COMPRESSION))
 		bler_function = bler_pmd_lcore_ldpc_dec;
 	else
 		return TEST_SKIPPED;
 
-	rte_atomic16_set(&op_params->sync, SYNC_WAIT);
+	__atomic_store_n(&op_params->sync, SYNC_WAIT, __ATOMIC_RELAXED);
 
-	/* Master core is set at first entry */
+	/* Main core is set at first entry */
 	t_params[0].dev_id = ad->dev_id;
 	t_params[0].lcore_id = rte_lcore_id();
 	t_params[0].op_params = op_params;
 	t_params[0].queue_id = ad->queue_ids[used_cores++];
 	t_params[0].iter_count = 0;
 
-	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
 		if (used_cores >= num_lcores)
 			break;
 
@@ -3615,10 +3780,10 @@ bler_test(struct active_device *ad,
 				&t_params[used_cores++], lcore_id);
 	}
 
-	rte_atomic16_set(&op_params->sync, SYNC_START);
+	__atomic_store_n(&op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 	ret = bler_function(&t_params[0]);
 
-	/* Master core is always used */
+	/* Main core is always used */
 	for (used_cores = 1; used_cores < num_lcores; used_cores++)
 		ret |= rte_eal_wait_lcore(t_params[used_cores].lcore_id);
 
@@ -3710,16 +3875,16 @@ throughput_test(struct active_device *ad,
 			throughput_function = throughput_pmd_lcore_enc;
 	}
 
-	rte_atomic16_set(&op_params->sync, SYNC_WAIT);
+	__atomic_store_n(&op_params->sync, SYNC_WAIT, __ATOMIC_RELAXED);
 
-	/* Master core is set at first entry */
+	/* Main core is set at first entry */
 	t_params[0].dev_id = ad->dev_id;
 	t_params[0].lcore_id = rte_lcore_id();
 	t_params[0].op_params = op_params;
 	t_params[0].queue_id = ad->queue_ids[used_cores++];
 	t_params[0].iter_count = 0;
 
-	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
 		if (used_cores >= num_lcores)
 			break;
 
@@ -3733,10 +3898,10 @@ throughput_test(struct active_device *ad,
 				&t_params[used_cores++], lcore_id);
 	}
 
-	rte_atomic16_set(&op_params->sync, SYNC_START);
+	__atomic_store_n(&op_params->sync, SYNC_START, __ATOMIC_RELAXED);
 	ret = throughput_function(&t_params[0]);
 
-	/* Master core is always used */
+	/* Main core is always used */
 	for (used_cores = 1; used_cores < num_lcores; used_cores++)
 		ret |= rte_eal_wait_lcore(t_params[used_cores].lcore_id);
 
@@ -3760,32 +3925,32 @@ throughput_test(struct active_device *ad,
 	/* In interrupt TC we need to wait for the interrupt callback to deqeue
 	 * all pending operations. Skip waiting for queues which reported an
 	 * error using processing_status variable.
-	 * Wait for master lcore operations.
+	 * Wait for main lcore operations.
 	 */
 	tp = &t_params[0];
-	while ((rte_atomic16_read(&tp->nb_dequeued) <
-			op_params->num_to_process) &&
-			(rte_atomic16_read(&tp->processing_status) !=
-			TEST_FAILED))
+	while ((__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED) <
+		op_params->num_to_process) &&
+		(__atomic_load_n(&tp->processing_status, __ATOMIC_RELAXED) !=
+		TEST_FAILED))
 		rte_pause();
 
 	tp->ops_per_sec /= TEST_REPETITIONS;
 	tp->mbps /= TEST_REPETITIONS;
-	ret |= (int)rte_atomic16_read(&tp->processing_status);
+	ret |= (int)__atomic_load_n(&tp->processing_status, __ATOMIC_RELAXED);
 
-	/* Wait for slave lcores operations */
+	/* Wait for worker lcores operations */
 	for (used_cores = 1; used_cores < num_lcores; used_cores++) {
 		tp = &t_params[used_cores];
 
-		while ((rte_atomic16_read(&tp->nb_dequeued) <
-				op_params->num_to_process) &&
-				(rte_atomic16_read(&tp->processing_status) !=
-				TEST_FAILED))
+		while ((__atomic_load_n(&tp->nb_dequeued, __ATOMIC_RELAXED) <
+			op_params->num_to_process) &&
+			(__atomic_load_n(&tp->processing_status, __ATOMIC_RELAXED) !=
+			TEST_FAILED))
 			rte_pause();
 
 		tp->ops_per_sec /= TEST_REPETITIONS;
 		tp->mbps /= TEST_REPETITIONS;
-		ret |= (int)rte_atomic16_read(&tp->processing_status);
+		ret |= (int)__atomic_load_n(&tp->processing_status, __ATOMIC_RELAXED);
 	}
 
 	/* Print throughput if test passed */
@@ -3871,12 +4036,14 @@ latency_test_dec(struct rte_mempool *mempool,
 	return i;
 }
 
+/* Test case for latency/validation for LDPC Decoder */
 static int
 latency_test_ldpc_dec(struct rte_mempool *mempool,
 		struct test_buffers *bufs, struct rte_bbdev_dec_op *ref_op,
 		int vector_mask, uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
-		uint64_t *total_time, uint64_t *min_time, uint64_t *max_time)
+		uint64_t *total_time, uint64_t *min_time, uint64_t *max_time,
+		bool disable_et)
 {
 	int ret = TEST_SUCCESS;
 	uint16_t i, j, dequeued;
@@ -3898,7 +4065,7 @@ latency_test_ldpc_dec(struct rte_mempool *mempool,
 				"rte_bbdev_dec_op_alloc_bulk() failed");
 
 		/* For latency tests we need to disable early termination */
-		if (check_bit(ref_op->ldpc_dec.op_flags,
+		if (disable_et && check_bit(ref_op->ldpc_dec.op_flags,
 				RTE_BBDEV_LDPC_ITERATION_STOP_ENABLE))
 			ref_op->ldpc_dec.op_flags -=
 					RTE_BBDEV_LDPC_ITERATION_STOP_ENABLE;
@@ -4093,9 +4260,10 @@ latency_test_ldpc_enc(struct rte_mempool *mempool,
 	return i;
 }
 
+/* Common function for running validation and latency test cases */
 static int
-latency_test(struct active_device *ad,
-		struct test_op_params *op_params)
+validation_latency_test(struct active_device *ad,
+		struct test_op_params *op_params, bool latency_flag)
 {
 	int iter;
 	uint16_t burst_sz = op_params->burst_sz;
@@ -4120,7 +4288,11 @@ latency_test(struct active_device *ad,
 	TEST_ASSERT_NOT_NULL(op_type_str, "Invalid op type: %u", op_type);
 
 	printf("+ ------------------------------------------------------- +\n");
-	printf("== test: validation/latency\ndev: %s, burst size: %u, num ops: %u, op type: %s\n",
+	if (latency_flag)
+		printf("== test: latency\ndev:");
+	else
+		printf("== test: validation\ndev:");
+	printf("%s, burst size: %u, num ops: %u, op type: %s\n",
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
 	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
@@ -4128,11 +4300,6 @@ latency_test(struct active_device *ad,
 				op_params->ref_dec_op, op_params->vector_mask,
 				ad->dev_id, queue_id, num_to_process,
 				burst_sz, &total_time, &min_time, &max_time);
-	else if (op_type == RTE_BBDEV_OP_TURBO_ENC)
-		iter = latency_test_enc(op_params->mp, bufs,
-				op_params->ref_enc_op, ad->dev_id, queue_id,
-				num_to_process, burst_sz, &total_time,
-				&min_time, &max_time);
 	else if (op_type == RTE_BBDEV_OP_LDPC_ENC)
 		iter = latency_test_ldpc_enc(op_params->mp, bufs,
 				op_params->ref_enc_op, ad->dev_id, queue_id,
@@ -4142,13 +4309,14 @@ latency_test(struct active_device *ad,
 		iter = latency_test_ldpc_dec(op_params->mp, bufs,
 				op_params->ref_dec_op, op_params->vector_mask,
 				ad->dev_id, queue_id, num_to_process,
-				burst_sz, &total_time, &min_time, &max_time);
-	else
+				burst_sz, &total_time, &min_time, &max_time,
+				latency_flag);
+	else /* RTE_BBDEV_OP_TURBO_ENC */
 		iter = latency_test_enc(op_params->mp, bufs,
-					op_params->ref_enc_op,
-					ad->dev_id, queue_id,
-					num_to_process, burst_sz, &total_time,
-					&min_time, &max_time);
+				op_params->ref_enc_op,
+				ad->dev_id, queue_id,
+				num_to_process, burst_sz, &total_time,
+				&min_time, &max_time);
 
 	if (iter <= 0)
 		return TEST_FAILED;
@@ -4167,6 +4335,18 @@ latency_test(struct active_device *ad,
 	return TEST_SUCCESS;
 }
 
+static int
+latency_test(struct active_device *ad, struct test_op_params *op_params)
+{
+	return validation_latency_test(ad, op_params, true);
+}
+
+static int
+validation_test(struct active_device *ad, struct test_op_params *op_params)
+{
+	return validation_latency_test(ad, op_params, false);
+}
+
 #ifdef RTE_BBDEV_OFFLOAD_COST
 static int
 get_bbdev_queue_stats(uint16_t dev_id, uint16_t queue_id,
@@ -4242,15 +4422,15 @@ offload_latency_test_dec(struct rte_mempool *mempool, struct test_buffers *bufs,
 		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* give time for device to process ops */
-		rte_delay_us(200);
+		rte_delay_us(WAIT_OFFLOAD_US);
 
 		/* Start time meas for dequeue function offload latency */
 		deq_start_time = rte_rdtsc_precise();
 		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_dec_ops(dev_id, queue_id,
-					&ops_deq[deq], 1);
-		} while (unlikely(deq != 1));
+					&ops_deq[deq], enq);
+		} while (unlikely(deq == 0));
 
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
@@ -4333,15 +4513,15 @@ offload_latency_test_ldpc_dec(struct rte_mempool *mempool,
 		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* give time for device to process ops */
-		rte_delay_us(200);
+		rte_delay_us(WAIT_OFFLOAD_US);
 
 		/* Start time meas for dequeue function offload latency */
 		deq_start_time = rte_rdtsc_precise();
 		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_ldpc_dec_ops(dev_id, queue_id,
-					&ops_deq[deq], 1);
-		} while (unlikely(deq != 1));
+					&ops_deq[deq], enq);
+		} while (unlikely(deq == 0));
 
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
@@ -4421,15 +4601,15 @@ offload_latency_test_enc(struct rte_mempool *mempool, struct test_buffers *bufs,
 		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* give time for device to process ops */
-		rte_delay_us(200);
+		rte_delay_us(WAIT_OFFLOAD_US);
 
 		/* Start time meas for dequeue function offload latency */
 		deq_start_time = rte_rdtsc_precise();
 		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_enc_ops(dev_id, queue_id,
-					&ops_deq[deq], 1);
-		} while (unlikely(deq != 1));
+					&ops_deq[deq], enq);
+		} while (unlikely(deq == 0));
 
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
@@ -4504,15 +4684,15 @@ offload_latency_test_ldpc_enc(struct rte_mempool *mempool,
 		time_st->enq_acc_total_time += stats.acc_offload_cycles;
 
 		/* give time for device to process ops */
-		rte_delay_us(200);
+		rte_delay_us(WAIT_OFFLOAD_US);
 
 		/* Start time meas for dequeue function offload latency */
 		deq_start_time = rte_rdtsc_precise();
 		/* Dequeue one operation */
 		do {
 			deq += rte_bbdev_dequeue_ldpc_enc_ops(dev_id, queue_id,
-					&ops_deq[deq], 1);
-		} while (unlikely(deq != 1));
+					&ops_deq[deq], enq);
+		} while (unlikely(deq == 0));
 
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		time_st->deq_max_time = RTE_MAX(time_st->deq_max_time,
@@ -4634,6 +4814,23 @@ offload_cost_test(struct active_device *ad,
 			(double)(time_st.deq_max_time * 1000000) /
 			rte_get_tsc_hz());
 
+	struct rte_bbdev_stats stats = {0};
+	get_bbdev_queue_stats(ad->dev_id, queue_id, &stats);
+	if (op_type != RTE_BBDEV_OP_LDPC_DEC) {
+		TEST_ASSERT_SUCCESS(stats.enqueued_count != num_to_process,
+				"Mismatch in enqueue count %10"PRIu64" %d",
+				stats.enqueued_count, num_to_process);
+		TEST_ASSERT_SUCCESS(stats.dequeued_count != num_to_process,
+				"Mismatch in dequeue count %10"PRIu64" %d",
+				stats.dequeued_count, num_to_process);
+	}
+	TEST_ASSERT_SUCCESS(stats.enqueue_err_count != 0,
+			"Enqueue count Error %10"PRIu64"",
+			stats.enqueue_err_count);
+	TEST_ASSERT_SUCCESS(stats.dequeue_err_count != 0,
+			"Dequeue count Error (%10"PRIu64"",
+			stats.dequeue_err_count);
+
 	return TEST_SUCCESS;
 #endif
 }
@@ -4643,7 +4840,7 @@ static int
 offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
 		uint64_t *deq_total_time, uint64_t *deq_min_time,
-		uint64_t *deq_max_time)
+		uint64_t *deq_max_time, const enum rte_bbdev_op_type op_type)
 {
 	int i, deq_total;
 	struct rte_bbdev_dec_op *ops[MAX_BURST];
@@ -4657,7 +4854,12 @@ offload_latency_empty_q_test_dec(uint16_t dev_id, uint16_t queue_id,
 
 		if (unlikely(num_to_process - deq_total < burst_sz))
 			burst_sz = num_to_process - deq_total;
-		rte_bbdev_dequeue_dec_ops(dev_id, queue_id, ops, burst_sz);
+		if (op_type == RTE_BBDEV_OP_LDPC_DEC)
+			rte_bbdev_dequeue_ldpc_dec_ops(dev_id, queue_id, ops,
+					burst_sz);
+		else
+			rte_bbdev_dequeue_dec_ops(dev_id, queue_id, ops,
+					burst_sz);
 
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
@@ -4672,7 +4874,7 @@ static int
 offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 		const uint16_t num_to_process, uint16_t burst_sz,
 		uint64_t *deq_total_time, uint64_t *deq_min_time,
-		uint64_t *deq_max_time)
+		uint64_t *deq_max_time, const enum rte_bbdev_op_type op_type)
 {
 	int i, deq_total;
 	struct rte_bbdev_enc_op *ops[MAX_BURST];
@@ -4685,7 +4887,12 @@ offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 
 		if (unlikely(num_to_process - deq_total < burst_sz))
 			burst_sz = num_to_process - deq_total;
-		rte_bbdev_dequeue_enc_ops(dev_id, queue_id, ops, burst_sz);
+		if (op_type == RTE_BBDEV_OP_LDPC_ENC)
+			rte_bbdev_dequeue_ldpc_enc_ops(dev_id, queue_id, ops,
+					burst_sz);
+		else
+			rte_bbdev_dequeue_enc_ops(dev_id, queue_id, ops,
+					burst_sz);
 
 		deq_last_time = rte_rdtsc_precise() - deq_start_time;
 		*deq_max_time = RTE_MAX(*deq_max_time, deq_last_time);
@@ -4695,6 +4902,7 @@ offload_latency_empty_q_test_enc(uint16_t dev_id, uint16_t queue_id,
 
 	return i;
 }
+
 #endif
 
 static int
@@ -4732,14 +4940,15 @@ offload_latency_empty_q_test(struct active_device *ad,
 	printf("== test: offload latency empty dequeue\ndev: %s, burst size: %u, num ops: %u, op type: %s\n",
 			info.dev_name, burst_sz, num_to_process, op_type_str);
 
-	if (op_type == RTE_BBDEV_OP_TURBO_DEC)
+	if (op_type == RTE_BBDEV_OP_TURBO_DEC ||
+			op_type == RTE_BBDEV_OP_LDPC_DEC)
 		iter = offload_latency_empty_q_test_dec(ad->dev_id, queue_id,
 				num_to_process, burst_sz, &deq_total_time,
-				&deq_min_time, &deq_max_time);
+				&deq_min_time, &deq_max_time, op_type);
 	else
 		iter = offload_latency_empty_q_test_enc(ad->dev_id, queue_id,
 				num_to_process, burst_sz, &deq_total_time,
-				&deq_min_time, &deq_max_time);
+				&deq_min_time, &deq_max_time, op_type);
 
 	if (iter <= 0)
 		return TEST_FAILED;
@@ -4789,6 +4998,12 @@ latency_tc(void)
 	return run_test_case(latency_test);
 }
 
+static int
+validation_tc(void)
+{
+	return run_test_case(validation_test);
+}
+
 static int
 interrupt_tc(void)
 {
@@ -4820,7 +5035,7 @@ static struct unit_test_suite bbdev_validation_testsuite = {
 	.setup = testsuite_setup,
 	.teardown = testsuite_teardown,
 	.unit_test_cases = {
-		TEST_CASE_ST(ut_setup, ut_teardown, latency_tc),
+		TEST_CASE_ST(ut_setup, ut_teardown, validation_tc),
 		TEST_CASES_END() /**< NULL terminate unit test array */
 	}
 };