#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
+#include <rte_cycles.h>
#include "mlx5.h"
#include "mlx5_utils.h"
cq_ci = rxq->cq_ci;
}
cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
- while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
+ while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
int8_t op_own;
unsigned int n;
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
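/*
 * Editor's note -- illustrative sketch, not part of the patch: the loop
 * condition above changes because check_cqe() now returns a tri-state
 * enum mlx5_cqe_status (introduced in the header changes below) instead of
 * 0/1. "Owned by software" covers both valid and erroneous completions, so
 * error CQEs are counted as used ring entries too. A hypothetical equivalent
 * spelling of the new condition:
 */
static inline int
example_cqe_is_sw_owned(volatile struct mlx5_cqe *cqe, uint16_t cqe_n,
			uint16_t ci)
{
	enum mlx5_cqe_status st = check_cqe(cqe, cqe_n, ci);

	return st == MLX5_CQE_STATUS_SW_OWN || st == MLX5_CQE_STATUS_ERR;
}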
+/**
+ * Handle a Rx error.
+ * The function moves the RQ state to reset when the first error CQE is
+ * shown, then relies on the caller's loop to drain the CQ. Once the CQ is
+ * empty, it moves the RQ state to ready and reinitializes the RQ.
+ * Identifying the next CQE and counting errors remain the caller's
+ * responsibility.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ * @param[in] mbuf_prepare
+ * Whether to prepare mbufs for the RQ.
+ *
+ * @return
+ * -1 in case of recovery error, otherwise the CQE status.
+ */
+int
+mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
+{
+ const uint16_t cqe_n = 1 << rxq->cqe_n;
+ const uint16_t cqe_mask = cqe_n - 1;
+ const unsigned int wqe_n = 1 << rxq->elts_n;
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct ibv_wq_attr mod = {
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ };
+ union {
+ volatile struct mlx5_cqe *cqe;
+ volatile struct mlx5_err_cqe *err_cqe;
+ } u = {
+ .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
+ };
+ int ret;
+
+ switch (rxq->err_state) {
+ case MLX5_RXQ_ERR_STATE_NO_ERROR:
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
+ /* Fall-through */
+ case MLX5_RXQ_ERR_STATE_NEED_RESET:
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return -1;
+ mod.wq_state = IBV_WQS_RESET;
+ ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Rx WQ state to RESET %s\n",
+ strerror(errno));
+ return -1;
+ }
+ if (rxq_ctrl->dump_file_n <
+ rxq_ctrl->priv->config.max_dump_files_num) {
+ MKSTR(err_str, "Unexpected CQE error syndrome "
+ "0x%02x CQN = %u RQN = %u wqe_counter = %u"
+ " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
+ rxq->cqn, rxq_ctrl->ibv->wq->wq_num,
+ rte_be_to_cpu_16(u.err_cqe->wqe_counter),
+ rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
+ MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
+ rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
+ mlx5_dump_debug_information(name, NULL, err_str, 0);
+ mlx5_dump_debug_information(name, "MLX5 Error CQ:",
+ (const void *)((uintptr_t)
+ rxq->cqes),
+ sizeof(*u.cqe) * cqe_n);
+ mlx5_dump_debug_information(name, "MLX5 Error RQ:",
+ (const void *)((uintptr_t)
+ rxq->wqes),
+ 16 * wqe_n);
+ rxq_ctrl->dump_file_n++;
+ }
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
+ /* Fall-through */
+ case MLX5_RXQ_ERR_STATE_NEED_READY:
+ ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
+ if (ret == MLX5_CQE_STATUS_HW_OWN) {
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ rte_cio_wmb();
+ /*
+ * The RQ consumer index must be zeroed while moving
+ * from RESET state to RDY state.
+ */
+ *rxq->rq_db = rte_cpu_to_be_32(0);
+ rte_cio_wmb();
+ mod.wq_state = IBV_WQS_RDY;
+ ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Rx WQ state to RDY"
+ " %s\n", strerror(errno));
+ return -1;
+ }
+ if (mbuf_prepare) {
+ const uint16_t q_mask = wqe_n - 1;
+ uint16_t elt_idx;
+ struct rte_mbuf **elt;
+ int i;
+ unsigned int n = wqe_n - (rxq->rq_ci -
+ rxq->rq_pi);
+
+ for (i = 0; i < (int)n; ++i) {
+ elt_idx = (rxq->rq_ci + i) & q_mask;
+ elt = &(*rxq->elts)[elt_idx];
+ *elt = rte_mbuf_raw_alloc(rxq->mp);
+ if (!*elt) {
+ for (i--; i >= 0; --i) {
+ elt_idx = (rxq->rq_ci +
+ i) & q_mask;
+ elt = &(*rxq->elts)
+ [elt_idx];
+ rte_pktmbuf_free_seg
+ (*elt);
+ }
+ return -1;
+ }
+ }
+ }
+ mlx5_rxq_initialize(rxq);
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
+ }
+ return ret;
+ default:
+ return -1;
+ }
+}
+
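/*
 * Editor's note -- illustrative sketch, not part of the patch: the recovery
 * contract of mlx5_rx_err_handle() as seen from a caller. The function
 * example_recover_rxq() is hypothetical; every identifier it uses is
 * introduced by this patch. Passing mbuf_prepare = 1 asks the handler to
 * also re-allocate mbufs for the re-armed RQ.
 */
static int
example_recover_rxq(struct mlx5_rxq_data *rxq)
{
	/*
	 * Keep invoking the handler until it has drained the CQ, moved the
	 * WQ back to ready and cleared err_state, or reports failure (-1).
	 */
	while (rxq->err_state != MLX5_RXQ_ERR_STATE_NO_ERROR) {
		int ret = mlx5_rx_err_handle(rxq, 1);

		if (ret == -1)
			return -1; /* Secondary process or Verbs failure. */
		if (ret != MLX5_CQE_STATUS_HW_OWN) {
			/* A stale completion is still pending: drop it. */
			++rxq->cq_ci;
			++rxq->stats.idropped;
		}
	}
	return 0;
}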
/**
* Get size of the next packet for a given CQE. For compressed CQEs, the
* consumer index is updated only once all packets of the current one have
* been written.
*
* @return
- * Packet size in bytes (0 if there is none), -1 in case of completion
- * with error.
+ * 0 in case of empty CQE, otherwise the packet size in bytes.
*/
static inline int
mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
{
struct rxq_zip *zip = &rxq->zip;
uint16_t cqe_n = cqe_cnt + 1;
- int len = 0;
+ int len;
uint16_t idx, end;
- /* Process compressed data in the CQE and mini arrays. */
- if (zip->ai) {
- volatile struct mlx5_mini_cqe8 (*mc)[8] =
- (volatile struct mlx5_mini_cqe8 (*)[8])
- (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
-
- len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
- *mcqe = &(*mc)[zip->ai & 7];
- if ((++zip->ai & 7) == 0) {
- /* Invalidate consumed CQEs */
- idx = zip->ca;
- end = zip->na;
- while (idx != end) {
- (*rxq->cqes)[idx & cqe_cnt].op_own =
- MLX5_CQE_INVALIDATE;
- ++idx;
- }
- /*
- * Increment consumer index to skip the number of
- * CQEs consumed. Hardware leaves holes in the CQ
- * ring for software use.
- */
- zip->ca = zip->na;
- zip->na += 8;
- }
- if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
- /* Invalidate the rest */
- idx = zip->ca;
- end = zip->cq_ci;
-
- while (idx != end) {
- (*rxq->cqes)[idx & cqe_cnt].op_own =
- MLX5_CQE_INVALIDATE;
- ++idx;
- }
- rxq->cq_ci = zip->cq_ci;
- zip->ai = 0;
- }
- /* No compressed data, get next CQE and verify if it is compressed. */
- } else {
- int ret;
- int8_t op_own;
-
- ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
- if (unlikely(ret == 1))
- return 0;
- ++rxq->cq_ci;
- op_own = cqe->op_own;
- rte_cio_rmb();
- if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ do {
+ len = 0;
+ /* Process compressed data in the CQE and mini arrays. */
+ if (zip->ai) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
- (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
+ (uintptr_t)(&(*rxq->cqes)[zip->ca &
cqe_cnt].pkt_info);
- /* Fix endianness. */
- zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
- /*
- * Current mini array position is the one returned by
- * check_cqe64().
- *
- * If completion comprises several mini arrays, as a
- * special case the second one is located 7 CQEs after
- * the initial CQE instead of 8 for subsequent ones.
- */
- zip->ca = rxq->cq_ci;
- zip->na = zip->ca + 7;
- /* Compute the next non compressed CQE. */
- --rxq->cq_ci;
- zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
- /* Get packet size to return. */
- len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
- *mcqe = &(*mc)[0];
- zip->ai = 1;
- /* Prefetch all the entries to be invalidated */
- idx = zip->ca;
- end = zip->cq_ci;
- while (idx != end) {
- rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
- ++idx;
+ len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+ *mcqe = &(*mc)[zip->ai & 7];
+ if ((++zip->ai & 7) == 0) {
+ /* Invalidate consumed CQEs */
+ idx = zip->ca;
+ end = zip->na;
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ /*
+ * Increment consumer index to skip the number
+ * of CQEs consumed. Hardware leaves holes in
+ * the CQ ring for software use.
+ */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ /* Invalidate the rest */
+ idx = zip->ca;
+ end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
+ }
+ /*
+ * No compressed data, get next CQE and verify if it is
+ * compressed.
+ */
+ } else {
+ int ret;
+ int8_t op_own;
+
+ ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
+ if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+ if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
+ rxq->err_state)) {
+ ret = mlx5_rx_err_handle(rxq, 0);
+ if (ret == MLX5_CQE_STATUS_HW_OWN ||
+ ret == -1)
+ return 0;
+ } else {
+ return 0;
+ }
}
+ ++rxq->cq_ci;
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)
+ [rxq->cq_ci &
+ cqe_cnt].pkt_info);
+
+ /* Fix endianness. */
+ zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
+ /*
+ * Current mini array position is the one
+ * returned by check_cqe64().
+ *
+ * If completion comprises several mini arrays,
+ * as a special case the second one is located
+ * 7 CQEs after the initial CQE instead of 8
+ * for subsequent ones.
+ */
+ zip->ca = rxq->cq_ci;
+ zip->na = zip->ca + 7;
+ /* Compute the next non compressed CQE. */
+ --rxq->cq_ci;
+ zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+ /* Get packet size to return. */
+ len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+ *mcqe = &(*mc)[0];
+ zip->ai = 1;
+ /* Prefetch all to be invalidated */
+ idx = zip->ca;
+ end = zip->cq_ci;
+ while (idx != end) {
+ rte_prefetch0(&(*rxq->cqes)[(idx) &
+ cqe_cnt]);
+ ++idx;
+ }
+ } else {
+ len = rte_be_to_cpu_32(cqe->byte_cnt);
+ }
+ }
+ if (unlikely(rxq->err_state)) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ ++rxq->stats.idropped;
} else {
- len = rte_be_to_cpu_32(cqe->byte_cnt);
+ return len;
}
- /* Error while receiving packet. */
- if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
- return -1;
- }
- return len;
+ } while (1);
}
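/*
 * Editor's note -- illustrative sketch, not part of the patch: the mini-CQE
 * bookkeeping opened above when a compressed "title" CQE is met, extracted
 * into a hypothetical helper for clarity. Here cq_ci is rxq->cq_ci after it
 * has been advanced past the title CQE, i.e. it points at the slot holding
 * the first mini-CQE array.
 */
static inline void
example_zip_open(struct rxq_zip *zip, uint16_t cq_ci, uint32_t mini_cqe_num)
{
	zip->cqe_cnt = mini_cqe_num; /* Mini-CQEs carried by the session. */
	zip->ca = cq_ci; /* Slot of the current mini-CQE array. */
	zip->na = zip->ca + 7; /* Second array sits 7, not 8, slots later. */
	zip->cq_ci = cq_ci - 1 + mini_cqe_num; /* First CQE after the session. */
	zip->ai = 1; /* Entry [0] is returned to the caller immediately. */
}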
/**
rte_mbuf_raw_free(rep);
break;
}
- if (unlikely(len == -1)) {
- /* RX error, packet is likely too large. */
- rte_mbuf_raw_free(rep);
- ++rxq->stats.idropped;
- goto skip;
- }
pkt = seg;
assert(len >= (rxq->crc_present << 2));
pkt->ol_flags = 0;
pkt = NULL;
--pkts_n;
++i;
-skip:
/* Align consumer index to the next stride. */
rq_ci >>= sges_n;
++rq_ci;
ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
if (!ret)
break;
- if (unlikely(ret == -1)) {
- /* RX error, packet is likely too large. */
- ++rxq->stats.idropped;
- continue;
- }
byte_cnt = ret;
strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
MLX5_MPRQ_STRIDE_NUM_SHIFT;
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"
+#include "mlx5_glue.h"
/* Support tunnel matching. */
#define MLX5_FLOW_TUNNEL 5
/* Get pointer to the first stride. */
#define mlx5_mprq_buf_addr(ptr) ((ptr) + 1)
+enum mlx5_rxq_err_state {
+ MLX5_RXQ_ERR_STATE_NO_ERROR = 0,
+ MLX5_RXQ_ERR_STATE_NEED_RESET,
+ MLX5_RXQ_ERR_STATE_NEED_READY,
+};
+
/* RX queue descriptor. */
struct mlx5_rxq_data {
unsigned int csum:1; /* Enable checksum offloading. */
unsigned int strd_num_n:5; /* Log 2 of the number of stride. */
unsigned int strd_sz_n:4; /* Log 2 of stride size. */
unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */
- unsigned int :6; /* Remaining bits. */
+ unsigned int err_state:2; /* enum mlx5_rxq_err_state. */
+ unsigned int :4; /* Remaining bits. */
volatile uint32_t *rq_db;
volatile uint32_t *cq_db;
uint16_t port_id;
unsigned int irq:1; /* Whether IRQ is enabled. */
uint32_t flow_mark_n; /* Number of Mark/Flag flows using this Queue. */
uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */
+ uint16_t dump_file_n; /* Number of dump files. */
};
/* Indirection table. */
uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
+void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
+__rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
+ uint8_t mbuf_prepare);
void mlx5_mprq_buf_free_cb(void *addr, void *opaque);
void mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf);
uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts,
#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock)
#endif
-#ifndef NDEBUG
-/**
- * Verify or set magic value in CQE.
- *
- * @param cqe
- * Pointer to CQE.
- *
- * @return
- * 0 the first time.
- */
-static inline int
-check_cqe_seen(volatile struct mlx5_cqe *cqe)
-{
- static const uint8_t magic[] = "seen";
- volatile uint8_t (*buf)[sizeof(cqe->rsvd1)] = &cqe->rsvd1;
- int ret = 1;
- unsigned int i;
-
- for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
- if (!ret || (*buf)[i] != magic[i]) {
- ret = 0;
- (*buf)[i] = magic[i];
- }
- return ret;
-}
-#endif /* NDEBUG */
+/* CQE status. */
+enum mlx5_cqe_status {
+ MLX5_CQE_STATUS_SW_OWN,
+ MLX5_CQE_STATUS_HW_OWN,
+ MLX5_CQE_STATUS_ERR,
+};
/**
* Check whether CQE is valid.
* Consumer index.
*
* @return
- * 0 on success, 1 on failure.
+ * The CQE status.
*/
-static __rte_always_inline int
-check_cqe(volatile struct mlx5_cqe *cqe,
- unsigned int cqes_n, const uint16_t ci)
+static __rte_always_inline enum mlx5_cqe_status
+check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
+ const uint16_t ci)
{
- uint16_t idx = ci & cqes_n;
- uint8_t op_own = cqe->op_own;
- uint8_t op_owner = MLX5_CQE_OWNER(op_own);
- uint8_t op_code = MLX5_CQE_OPCODE(op_own);
+ const uint16_t idx = ci & cqes_n;
+ const uint8_t op_own = cqe->op_own;
+ const uint8_t op_owner = MLX5_CQE_OWNER(op_own);
+ const uint8_t op_code = MLX5_CQE_OPCODE(op_own);
if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
- return 1; /* No CQE. */
-#ifndef NDEBUG
- if ((op_code == MLX5_CQE_RESP_ERR) ||
- (op_code == MLX5_CQE_REQ_ERR)) {
- volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
- uint8_t syndrome = err_cqe->syndrome;
-
- if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
- (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
- return 0;
- if (!check_cqe_seen(cqe)) {
- DRV_LOG(ERR,
- "unexpected CQE error %u (0x%02x) syndrome"
- " 0x%02x",
- op_code, op_code, syndrome);
- rte_hexdump(stderr, "MLX5 Error CQE:",
- (const void *)((uintptr_t)err_cqe),
- sizeof(*cqe));
- }
- return 1;
- } else if ((op_code != MLX5_CQE_RESP_SEND) &&
- (op_code != MLX5_CQE_REQ)) {
- if (!check_cqe_seen(cqe)) {
- DRV_LOG(ERR, "unexpected CQE opcode %u (0x%02x)",
- op_code, op_code);
- rte_hexdump(stderr, "MLX5 CQE:",
- (const void *)((uintptr_t)cqe),
- sizeof(*cqe));
- }
- return 1;
- }
-#endif /* NDEBUG */
- return 0;
+ return MLX5_CQE_STATUS_HW_OWN;
+ rte_cio_rmb();
+ if (unlikely(op_code == MLX5_CQE_RESP_ERR ||
+ op_code == MLX5_CQE_REQ_ERR))
+ return MLX5_CQE_STATUS_ERR;
+ return MLX5_CQE_STATUS_SW_OWN;
}
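/*
 * Editor's note -- illustrative sketch, not part of the patch: typical
 * consumption of the tri-state status now returned by check_cqe(). The
 * wrapper example_poll_cqe() is hypothetical; the identifiers it calls come
 * from this patch. Note that cqes_n is the CQ size (a power of two), so
 * "ci & cqes_n" inside check_cqe() yields the ownership parity bit, while
 * "ci & (cqes_n - 1)" below is the ring index.
 */
static inline int
example_poll_cqe(struct mlx5_rxq_data *rxq)
{
	const uint16_t cqe_n = 1 << rxq->cqe_n;
	volatile struct mlx5_cqe *cqe =
		&(*rxq->cqes)[rxq->cq_ci & (cqe_n - 1)];

	switch (check_cqe(cqe, cqe_n, rxq->cq_ci)) {
	case MLX5_CQE_STATUS_HW_OWN:
		return 0; /* Hardware still owns the entry: nothing to do. */
	case MLX5_CQE_STATUS_ERR:
		return mlx5_rx_err_handle(rxq, 0); /* Start/continue recovery. */
	case MLX5_CQE_STATUS_SW_OWN:
	default:
		++rxq->cq_ci; /* Valid completion: consume it. */
		return 1;
	}
}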
/**