/* Maximum number of DS in WQE. */
 #define MLX5_DSEG_MAX 63
 
+/* Completion mode bit offset in dword 2 of the WQE control segment. */
+#define MLX5_COMP_MODE_OFFSET 2
+
+/* Completion mode. */
+enum mlx5_completion_mode {
+       MLX5_COMP_ONLY_ERR = 0x0,
+       MLX5_COMP_ONLY_FIRST_ERR = 0x1,
+       MLX5_COMP_ALWAYS = 0x2,
+       MLX5_COMP_CQE_AND_EQE = 0x3,
+};
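+
+/*
+ * The completion mode occupies bits [3:2] of dword 2 of the WQE control
+ * segment: MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET is the value 8 that
+ * was previously hard-coded when requesting a completion, and
+ * MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET is 4.
+ */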
+
 /* Subset of struct mlx5_wqe_eth_seg. */
 struct mlx5_wqe_eth_seg_small {
        uint32_t rsvd0;
 
        fclose(fd);
 }
 
+/**
+ * Move QP from error state to running state.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param qp
+ *   Pointer to the QP to recover.
+ *
+ * @return
+ *   0 on success, else errno value.
+ */
+static int
+tx_recover_qp(struct mlx5_txq_data *txq, struct ibv_qp *qp)
+{
+       int ret;
+       struct ibv_qp_attr mod = {
+                                       .qp_state = IBV_QPS_RESET,
+                                       .port_num = 1,
+                               };
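+       /*
+        * Walk the QP through the RESET, INIT, RTR and RTS states so the
+        * send queue can be reused after the error.
+        */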
+       ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+       if (ret) {
+               DRV_LOG(ERR, "Cannot change the Tx QP state to RESET %d",
+                       ret);
+               return ret;
+       }
+       mod.qp_state = IBV_QPS_INIT;
+       ret = mlx5_glue->modify_qp(qp, &mod,
+                                  (IBV_QP_STATE | IBV_QP_PORT));
+       if (ret) {
+               DRV_LOG(ERR, "Cannot change Tx QP state to INIT %d", ret);
+               return ret;
+       }
+       mod.qp_state = IBV_QPS_RTR;
+       ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+       if (ret) {
+               DRV_LOG(ERR, "Cannot change Tx QP state to RTR %d", ret);
+               return ret;
+       }
+       mod.qp_state = IBV_QPS_RTS;
+       ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+       if (ret) {
+               DRV_LOG(ERR, "Cannot change Tx QP state to RTS %d", ret);
+               return ret;
+       }
+       txq->wqe_ci = 0;
+       txq->wqe_pi = 0;
+       txq->elts_comp = 0;
+       return 0;
+}
+
+/* Return 1 if the error CQE was already seen, else mark it and return 0. */
+static int
+check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
+{
+       static const uint8_t magic[] = "seen";
+       int ret = 1;
+       unsigned int i;
+
+       for (i = 0; i < sizeof(magic); ++i)
+               if (!ret || err_cqe->rsvd1[i] != magic[i]) {
+                       ret = 0;
+                       err_cqe->rsvd1[i] = magic[i];
+               }
+       return ret;
+}
+
+/**
+ * Handle error CQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param err_cqe
+ *   Pointer to the error CQE.
+ *
+ * @return
+ *   The last Tx buffer element to free.
+ */
+uint16_t
+mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
+                        volatile struct mlx5_err_cqe *err_cqe)
+{
+       if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
+               const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
+               struct mlx5_txq_ctrl *txq_ctrl =
+                               container_of(txq, struct mlx5_txq_ctrl, txq);
+               uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
+               int seen = check_err_cqe_seen(err_cqe);
+
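+               /*
+                * Dump debug data and count the error only the first time
+                * this error CQE is polled; subsequent polls go straight to
+                * the recovery attempt.
+                */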
+               if (!seen && txq_ctrl->dump_file_n <
+                   txq_ctrl->priv->config.max_dump_files_num) {
+                       MKSTR(err_str, "Unexpected CQE error syndrome "
+                             "0x%02x CQN = %u SQN = %u wqe_counter = %u "
+                             "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
+                             txq_ctrl->cqn, txq->qp_num_8s >> 8,
+                             rte_be_to_cpu_16(err_cqe->wqe_counter),
+                             txq->wqe_ci, txq->cq_ci);
+                       MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
+                             PORT_ID(txq_ctrl->priv), txq->idx,
+                             txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
+                       mlx5_dump_debug_information(name, NULL, err_str, 0);
+                       mlx5_dump_debug_information(name, "MLX5 Error CQ:",
+                                                   (const void *)((uintptr_t)
+                                                   &(*txq->cqes)[0]),
+                                                   sizeof(*err_cqe) *
+                                                   (1 << txq->cqe_n));
+                       mlx5_dump_debug_information(name, "MLX5 Error SQ:",
+                                                   (const void *)((uintptr_t)
+                                                   tx_mlx5_wqe(txq, 0)),
+                                                   MLX5_WQE_SIZE *
+                                                   (1 << txq->wqe_n));
+                       txq_ctrl->dump_file_n++;
+               }
+               if (!seen)
+                       /*
+                        * Count errors in units of WQEs.
+                        * Later this can be improved to count error packets,
+                        * for example, by parsing the SQ to find how many
+                        * packets should be counted for each WQE.
+                        */
+                       txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
+                                               new_wqe_pi) & wqe_m;
+               if ((rte_eal_process_type() == RTE_PROC_PRIMARY) &&
+                   tx_recover_qp(txq, txq_ctrl->ibv->qp) == 0) {
+                       txq->cq_ci++;
+                       /* Release all the remaining buffers. */
+                       return txq->elts_head;
+               }
+               /* Recovery failed - try again later on the same WQE. */
+       } else {
+               txq->cq_ci++;
+       }
+       /* Do not release buffers. */
+       return txq->elts_tail;
+}
+
 /**
  * DPDK callback for TX.
  *
                                wqe->ctrl = (rte_v128u32_t){
                                        rte_cpu_to_be_32(txq->wqe_ci << 8),
                                        rte_cpu_to_be_32(txq->qp_num_8s | 1),
-                                       0,
+                                       rte_cpu_to_be_32
+                                               (MLX5_COMP_ONLY_FIRST_ERR <<
+                                                MLX5_COMP_MODE_OFFSET),
                                        0,
                                };
                                ds = 1;
                                rte_cpu_to_be_32((txq->wqe_ci << 8) |
                                                 MLX5_OPCODE_TSO),
                                rte_cpu_to_be_32(txq->qp_num_8s | ds),
-                               0,
+                               rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
+                                                MLX5_COMP_MODE_OFFSET),
                                0,
                        };
                        wqe->eseg = (rte_v128u32_t){
                                rte_cpu_to_be_32((txq->wqe_ci << 8) |
                                                 MLX5_OPCODE_SEND),
                                rte_cpu_to_be_32(txq->qp_num_8s | ds),
-                               0,
+                               rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
+                                                MLX5_COMP_MODE_OFFSET),
                                0,
                        };
                        wqe->eseg = (rte_v128u32_t){
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
                /* Request completion on last WQE. */
-               last_wqe->ctrl2 = rte_cpu_to_be_32(8);
+               last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+                                                  MLX5_COMP_MODE_OFFSET);
                /* Save elts_head in unused "immediate" field of WQE. */
                last_wqe->ctrl3 = txq->elts_head;
                txq->elts_comp = 0;
        mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
                                             (txq->wqe_ci << 8) |
                                             MLX5_OPCODE_TSO);
-       mpw->wqe->ctrl[2] = 0;
+       mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
+                                            MLX5_COMP_MODE_OFFSET);
        mpw->wqe->ctrl[3] = 0;
        mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
                (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
                /* Request completion on last WQE. */
-               wqe->ctrl[2] = rte_cpu_to_be_32(8);
+               wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+                                               MLX5_COMP_MODE_OFFSET);
                /* Save elts_head in unused "immediate" field of WQE. */
                wqe->ctrl[3] = elts_head;
                txq->elts_comp = 0;
        mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
                                             (txq->wqe_ci << 8) |
                                             MLX5_OPCODE_TSO);
-       mpw->wqe->ctrl[2] = 0;
+       mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
+                                            MLX5_COMP_MODE_OFFSET);
        mpw->wqe->ctrl[3] = 0;
        mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
        mpw->wqe->eseg.inline_hdr_sz = 0;
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
                /* Request completion on last WQE. */
-               wqe->ctrl[2] = rte_cpu_to_be_32(8);
+               wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+                                               MLX5_COMP_MODE_OFFSET);
                /* Save elts_head in unused "immediate" field of WQE. */
                wqe->ctrl[3] = elts_head;
                txq->elts_comp = 0;
                rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
                                 (txq->wqe_ci << 8) |
                                 MLX5_OPCODE_ENHANCED_MPSW);
-       mpw->wqe->ctrl[2] = 0;
+       mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
+                                            MLX5_COMP_MODE_OFFSET);
        mpw->wqe->ctrl[3] = 0;
        memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
        if (unlikely(padding)) {
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
                /* Request completion on last WQE. */
-               wqe->ctrl[2] = rte_cpu_to_be_32(8);
+               wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+                                               MLX5_COMP_MODE_OFFSET);
                /* Save elts_head in unused "immediate" field of WQE. */
                wqe->ctrl[3] = elts_head;
                txq->elts_comp = 0;
 
        struct mlx5_priv *priv; /* Back pointer to private data. */
        off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
        void *bf_reg; /* BlueFlame register from Verbs. */
+       uint32_t cqn; /* CQ number. */
+       uint16_t dump_file_n; /* Number of dump files. */
 };
 
 #define MLX5_TX_BFREG(txq) \
                                  uint16_t pkts_n);
 uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
                            uint16_t pkts_n);
+__rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
+                                       volatile struct mlx5_err_cqe *err_cqe);
 uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
 void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
 __rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
        return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
 }
 
+/**
+ * Handle the next CQE.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ *
+ * @return
+ *   The last Tx buffer element to free.
+ */
+static __rte_always_inline uint16_t
+mlx5_tx_cqe_handle(struct mlx5_txq_data *txq)
+{
+       const unsigned int cqe_n = 1 << txq->cqe_n;
+       const unsigned int cqe_cnt = cqe_n - 1;
+       uint16_t last_elts;
+       union {
+               volatile struct mlx5_cqe *cqe;
+               volatile struct mlx5_err_cqe *err_cqe;
+       } u = {
+               .cqe = &(*txq->cqes)[txq->cq_ci & cqe_cnt],
+       };
+       int ret = check_cqe(u.cqe, cqe_n, txq->cq_ci);
+
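+       /*
+        * A CQE owned by software completes a burst, an error CQE invokes
+        * the error handler, and a CQE still owned by hardware means there
+        * is nothing to release yet.
+        */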
+       if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+               if (unlikely(ret == MLX5_CQE_STATUS_ERR))
+                       last_elts = mlx5_tx_error_cqe_handle(txq, u.err_cqe);
+               else
+                       /* Do not release buffers. */
+                       return txq->elts_tail;
+       } else {
+               uint16_t new_wqe_pi = rte_be_to_cpu_16(u.cqe->wqe_counter);
+               volatile struct mlx5_wqe_ctrl *ctrl =
+                               (volatile struct mlx5_wqe_ctrl *)
+                                       tx_mlx5_wqe(txq, new_wqe_pi);
+
+               /* Release completion burst buffers. */
+               last_elts = ctrl->ctrl3;
+               txq->wqe_pi = new_wqe_pi;
+               txq->cq_ci++;
+       }
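+       /* Update the consumer index. */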
+       rte_compiler_barrier();
+       *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
+       return last_elts;
+}
+
 /**
  * Manage TX completions.
  *
 {
        const uint16_t elts_n = 1 << txq->elts_n;
        const uint16_t elts_m = elts_n - 1;
-       const unsigned int cqe_n = 1 << txq->cqe_n;
-       const unsigned int cqe_cnt = cqe_n - 1;
        uint16_t elts_free = txq->elts_tail;
        uint16_t elts_tail;
-       uint16_t cq_ci = txq->cq_ci;
-       volatile struct mlx5_cqe *cqe = NULL;
-       volatile struct mlx5_wqe_ctrl *ctrl;
        struct rte_mbuf *m, *free[elts_n];
        struct rte_mempool *pool = NULL;
        unsigned int blk_n = 0;
 
-       cqe = &(*txq->cqes)[cq_ci & cqe_cnt];
-       if (unlikely(check_cqe(cqe, cqe_n, cq_ci)))
-               return;
-#ifndef NDEBUG
-       if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
-           (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
-               if (!check_cqe_seen(cqe)) {
-                       DRV_LOG(ERR, "unexpected error CQE, Tx stopped");
-                       rte_hexdump(stderr, "MLX5 TXQ:",
-                                   (const void *)((uintptr_t)txq->wqes),
-                                   ((1 << txq->wqe_n) *
-                                    MLX5_WQE_SIZE));
-               }
-               return;
-       }
-#endif /* NDEBUG */
-       ++cq_ci;
-       rte_cio_rmb();
-       txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
-       ctrl = (volatile struct mlx5_wqe_ctrl *)
-               tx_mlx5_wqe(txq, txq->wqe_pi);
-       elts_tail = ctrl->ctrl3;
+       elts_tail = mlx5_tx_cqe_handle(txq);
        assert((elts_tail & elts_m) < (1 << txq->wqe_n));
        /* Free buffers. */
        while (elts_free != elts_tail) {
                ++elts_free;
        }
 #endif
-       txq->cq_ci = cq_ci;
        txq->elts_tail = elts_tail;
-       /* Update the consumer index. */
-       rte_compiler_barrier();
-       *txq->cq_db = rte_cpu_to_be_32(cq_ci);
 }
 
 /**
 
                ctrl = vreinterpretq_u8_u32((uint32x4_t) {
                                MLX5_OPC_MOD_MPW << 24 |
                                txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
-                               txq->qp_num_8s | ds, 0, 0});
+                               txq->qp_num_8s | ds, 4, 0});
                ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
                vst1q_u8((void *)t_wqe, ctrl);
                /* Fill ESEG in the header. */
        if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-               wqe->ctrl[2] = rte_cpu_to_be_32(8);
+               wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+                                               MLX5_COMP_MODE_OFFSET);
                wqe->ctrl[3] = txq->elts_head;
                txq->elts_comp = 0;
        }
        unsigned int pos;
        uint16_t max_elts;
        uint16_t max_wqe;
-       uint32_t comp_req = 0;
+       uint32_t comp_req;
        const uint16_t wq_n = 1 << txq->wqe_n;
        const uint16_t wq_mask = wq_n - 1;
        uint16_t wq_idx = txq->wqe_ci & wq_mask;
        }
        if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
                txq->elts_comp += pkts_n;
+               comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
        } else {
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
                /* Request a completion. */
                txq->elts_comp = 0;
-               comp_req = 8;
+               comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
        }
        /* Fill CTRL in the header. */
        ctrl = vreinterpretq_u8_u32((uint32x4_t) {
 
                } while (--segs_n);
                ++wqe_ci;
                /* Fill CTRL in the header. */
-               ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds,
+               ctrl = _mm_set_epi32(0, 4, txq->qp_num_8s | ds,
                                     MLX5_OPC_MOD_MPW << 24 |
                                     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
                ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
        if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
-               wqe->ctrl[2] = rte_cpu_to_be_32(8);
+               wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+                                               MLX5_COMP_MODE_OFFSET);
                wqe->ctrl[3] = txq->elts_head;
                txq->elts_comp = 0;
        }
        unsigned int pos;
        uint16_t max_elts;
        uint16_t max_wqe;
-       uint32_t comp_req = 0;
+       uint32_t comp_req;
        const uint16_t wq_n = 1 << txq->wqe_n;
        const uint16_t wq_mask = wq_n - 1;
        uint16_t wq_idx = txq->wqe_ci & wq_mask;
        }
        if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
                txq->elts_comp += pkts_n;
+               comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
        } else {
                /* A CQE slot must always be available. */
                assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
                /* Request a completion. */
                txq->elts_comp = 0;
-               comp_req = 8;
+               comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
        }
        /* Fill CTRL in the header. */
        ctrl = _mm_set_epi32(txq->elts_head, comp_req,
 
        attr.cq = (struct ibv_cq_init_attr_ex){
                .comp_mask = 0,
        };
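+       /*
+        * Each completion request consumes a CQE slot, so one CQE per
+        * MLX5_TX_COMP_THRESH descriptors plus a spare slot should keep a
+        * free CQE always available (see the asserts in the Tx burst
+        * routines).
+        */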
-       cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
-               ((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
+       cqe_n = desc / MLX5_TX_COMP_THRESH + 1;
        if (is_empw_burst_func(tx_pkt_burst))
                cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
        tmpl.cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
        txq_ibv->cq = tmpl.cq;
        rte_atomic32_inc(&txq_ibv->refcnt);
        txq_ctrl->bf_reg = qp.bf.reg;
+       txq_ctrl->cqn = cq_info.cqn;
        txq_uar_init(txq_ctrl);
        if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
                txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;