+ * TX with Enhanced MPW support.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+static inline uint16_t
+txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n)
+{
+ uint16_t elts_head = txq->elts_head;
+ const uint16_t elts_n = 1 << txq->elts_n;
+ const uint16_t elts_m = elts_n - 1;
+ unsigned int i = 0;
+ unsigned int j = 0;
+ uint16_t max_elts;
+ uint16_t max_wqe;
+ unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
+ unsigned int mpw_room = 0;
+ unsigned int inl_pad = 0;
+ uint32_t inl_hdr;
+ uint64_t addr_64;
+ struct mlx5_mpw mpw = {
+ .state = MLX5_MPW_STATE_CLOSED,
+ };
+
+ if (unlikely(!pkts_n))
+ return 0;
+ /* Start processing. */
+ mlx5_tx_complete(txq);
+ max_elts = (elts_n - (elts_head - txq->elts_tail));
+ max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+ if (unlikely(!max_wqe))
+ return 0;
+ do {
+ struct rte_mbuf *buf = *(pkts++);
+ uintptr_t addr;
+ unsigned int do_inline = 0; /* Whether inline is possible. */
+ uint32_t length;
+ uint8_t cs_flags;
+ rte_be32_t metadata;
+
+ /* Multi-segmented packet is handled in slow-path outside. */
+ assert(NB_SEGS(buf) == 1);
+ /* Make sure there is enough room to store this packet. */
+ if (max_elts - j == 0)
+ break;
+ cs_flags = txq_ol_cksum_to_cs(buf);
+ /* Copy metadata from mbuf if valid */
+ metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
+ 0;
+ /* Retrieve packet information. */
+ length = PKT_LEN(buf);
+ /* Start new session if:
+ * - multi-segment packet
+ * - no space left even for a dseg
+ * - next packet can be inlined with a new WQE
+ * - cs_flag differs
+ */
+ if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
+ if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
+ mpw_room) ||
+ (length <= txq->inline_max_packet_sz &&
+ inl_pad + sizeof(inl_hdr) + length >
+ mpw_room) ||
+ (mpw.wqe->eseg.flow_table_metadata != metadata) ||
+ (mpw.wqe->eseg.cs_flags != cs_flags))
+ max_wqe -= mlx5_empw_close(txq, &mpw);
+ }
+ if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
+ /* In Enhanced MPW, inline as much as the budget is
+ * allowed. The remaining space is to be filled with
+ * dsegs. If the title WQEBB isn't padded, it will have
+ * 2 dsegs there.
+ */
+ mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
+ (max_inline ? max_inline :
+ pkts_n * MLX5_WQE_DWORD_SIZE) +
+ MLX5_WQE_SIZE);
+ if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
+ break;
+ /* Don't pad the title WQEBB to not waste WQ. */
+ mlx5_empw_new(txq, &mpw, 0);
+ mpw_room -= mpw.total_len;
+ inl_pad = 0;
+ do_inline = length <= txq->inline_max_packet_sz &&
+ sizeof(inl_hdr) + length <= mpw_room &&
+ !txq->mpw_hdr_dseg;
+ mpw.wqe->eseg.cs_flags = cs_flags;
+ mpw.wqe->eseg.flow_table_metadata = metadata;
+ } else {
+ /* Evaluate whether the next packet can be inlined.
+ * Inlininig is possible when:
+ * - length is less than configured value
+ * - length fits for remaining space
+ * - not required to fill the title WQEBB with dsegs
+ */
+ do_inline =
+ length <= txq->inline_max_packet_sz &&
+ inl_pad + sizeof(inl_hdr) + length <=
+ mpw_room &&
+ (!txq->mpw_hdr_dseg ||
+ mpw.total_len >= MLX5_WQE_SIZE);
+ }
+ if (max_inline && do_inline) {
+ /* Inline packet into WQE. */
+ unsigned int max;
+
+ assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
+ assert(length == DATA_LEN(buf));
+ inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ mpw.data.raw = (volatile void *)
+ ((uintptr_t)mpw.data.raw + inl_pad);
+ max = tx_mlx5_wq_tailroom(txq,
+ (void *)(uintptr_t)mpw.data.raw);
+ /* Copy inline header. */
+ mpw.data.raw = (volatile void *)
+ mlx5_copy_to_wq(
+ (void *)(uintptr_t)mpw.data.raw,
+ &inl_hdr,
+ sizeof(inl_hdr),
+ (void *)(uintptr_t)txq->wqes,
+ max);
+ max = tx_mlx5_wq_tailroom(txq,
+ (void *)(uintptr_t)mpw.data.raw);
+ /* Copy packet data. */
+ mpw.data.raw = (volatile void *)
+ mlx5_copy_to_wq(
+ (void *)(uintptr_t)mpw.data.raw,
+ (void *)addr,
+ length,
+ (void *)(uintptr_t)txq->wqes,
+ max);
+ ++mpw.pkts_n;
+ mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
+ /* No need to get completion as the entire packet is
+ * copied to WQ. Free the buf right away.
+ */
+ rte_pktmbuf_free_seg(buf);
+ mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
+ /* Add pad in the next packet if any. */
+ inl_pad = (((uintptr_t)mpw.data.raw +
+ (MLX5_WQE_DWORD_SIZE - 1)) &
+ ~(MLX5_WQE_DWORD_SIZE - 1)) -
+ (uintptr_t)mpw.data.raw;
+ } else {
+ /* No inline. Load a dseg of packet pointer. */
+ volatile rte_v128u32_t *dseg;
+
+ assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
+ assert((inl_pad + sizeof(*dseg)) <= mpw_room);
+ assert(length == DATA_LEN(buf));
+ if (!tx_mlx5_wq_tailroom(txq,
+ (void *)((uintptr_t)mpw.data.raw
+ + inl_pad)))
+ dseg = (volatile void *)txq->wqes;
+ else
+ dseg = (volatile void *)
+ ((uintptr_t)mpw.data.raw +
+ inl_pad);
+ (*txq->elts)[elts_head++ & elts_m] = buf;
+ addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+ uintptr_t));
+ *dseg = (rte_v128u32_t) {
+ rte_cpu_to_be_32(length),
+ mlx5_tx_mb2mr(txq, buf),
+ addr_64,
+ addr_64 >> 32,
+ };
+ mpw.data.raw = (volatile void *)(dseg + 1);
+ mpw.total_len += (inl_pad + sizeof(*dseg));
+ ++j;
+ ++mpw.pkts_n;
+ mpw_room -= (inl_pad + sizeof(*dseg));
+ inl_pad = 0;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += length;
+#endif
+ ++i;
+ } while (i < pkts_n);
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Check whether completion threshold has been reached. */
+ if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
+ (uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
+ (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
+ volatile struct mlx5_wqe *wqe = mpw.wqe;
+
+ /* A CQE slot must always be available. */
+ assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
+ /* Request completion on last WQE. */
+ wqe->ctrl[2] = rte_cpu_to_be_32(8);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ wqe->ctrl[3] = elts_head;
+ txq->elts_comp = 0;
+ txq->mpw_comp = txq->wqe_ci;
+ } else {
+ txq->elts_comp += j;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+#endif
+ if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
+ mlx5_empw_close(txq, &mpw);
+ /* Ring QP doorbell. */
+ mlx5_tx_dbrec(txq, mpw.wqe);
+ txq->elts_head = elts_head;
+ return i;
+}
+
+/**
+ * DPDK callback for TX with Enhanced MPW support.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
+ uint16_t nb_tx = 0;
+
+ while (pkts_n > nb_tx) {
+ uint16_t n;
+ uint16_t ret;
+
+ n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
+ if (n) {
+ ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
+ if (!ret)
+ break;
+ nb_tx += ret;
+ }
+ n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
+ if (n) {
+ ret = txq_burst_empw(txq, &pkts[nb_tx], n);
+ if (!ret)
+ break;
+ nb_tx += ret;
+ }
+ }
+ return nb_tx;
+}
+
+/**
+ * Translate RX completion flags to packet type.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ * @param[in] cqe
+ * Pointer to CQE.
+ *
+ * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
+ *
+ * @return
+ * Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+{
+ uint8_t idx;
+ uint8_t pinfo = cqe->pkt_info;
+ uint16_t ptype = cqe->hdr_type_etc;
+
+ /*
+ * The index to the array should have:
+ * bit[1:0] = l3_hdr_type
+ * bit[4:2] = l4_hdr_type
+ * bit[5] = ip_frag
+ * bit[6] = tunneled
+ * bit[7] = outer_l3_type
+ */
+ idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+ return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
+}
+
+/**
+ * Initialize Rx WQ and indexes.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ */
+void
+mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
+{
+ const unsigned int wqe_n = 1 << rxq->elts_n;
+ unsigned int i;
+
+ for (i = 0; (i != wqe_n); ++i) {
+ volatile struct mlx5_wqe_data_seg *scat;
+ uintptr_t addr;
+ uint32_t byte_count;
+
+ if (mlx5_rxq_mprq_enabled(rxq)) {
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
+
+ scat = &((volatile struct mlx5_wqe_mprq *)
+ rxq->wqes)[i].dseg;
+ addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
+ byte_count = (1 << rxq->strd_sz_n) *
+ (1 << rxq->strd_num_n);
+ } else {
+ struct rte_mbuf *buf = (*rxq->elts)[i];
+
+ scat = &((volatile struct mlx5_wqe_data_seg *)
+ rxq->wqes)[i];
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ byte_count = DATA_LEN(buf);
+ }
+ /* scat->addr must be able to store a pointer. */
+ assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+ *scat = (struct mlx5_wqe_data_seg){
+ .addr = rte_cpu_to_be_64(addr),
+ .byte_count = rte_cpu_to_be_32(byte_count),
+ .lkey = mlx5_rx_addr2mr(rxq, addr),
+ };
+ }
+ rxq->consumed_strd = 0;
+ rxq->decompressed = 0;
+ rxq->rq_pi = 0;
+ rxq->zip = (struct rxq_zip){
+ .ai = 0,
+ };
+ /* Update doorbell counter. */
+ rxq->rq_ci = wqe_n >> rxq->sges_n;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+}
+
+/**
+ * Handle a Rx error.
+ * The function inserts the RQ state to reset when the first error CQE is
+ * shown, then drains the CQ by the caller function loop. When the CQ is empty,
+ * it moves the RQ state to ready and initializes the RQ.
+ * Next CQE identification and error counting are in the caller responsibility.
+ *
+ * @param[in] rxq
+ * Pointer to RX queue structure.
+ * @param[in] mbuf_prepare
+ * Whether to prepare mbufs for the RQ.
+ *
+ * @return
+ * -1 in case of recovery error, otherwise the CQE status.
+ */
+int
+mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
+{
+ const uint16_t cqe_n = 1 << rxq->cqe_n;
+ const uint16_t cqe_mask = cqe_n - 1;
+ const unsigned int wqe_n = 1 << rxq->elts_n;
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+ struct ibv_wq_attr mod = {
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ };
+ union {
+ volatile struct mlx5_cqe *cqe;
+ volatile struct mlx5_err_cqe *err_cqe;
+ } u = {
+ .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
+ };
+ int ret;
+
+ switch (rxq->err_state) {
+ case MLX5_RXQ_ERR_STATE_NO_ERROR:
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
+ /* Fall-through */
+ case MLX5_RXQ_ERR_STATE_NEED_RESET:
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return -1;
+ mod.wq_state = IBV_WQS_RESET;
+ ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Rx WQ state to RESET %s\n",
+ strerror(errno));
+ return -1;
+ }
+ if (rxq_ctrl->dump_file_n <
+ rxq_ctrl->priv->config.max_dump_files_num) {
+ MKSTR(err_str, "Unexpected CQE error syndrome "
+ "0x%02x CQN = %u RQN = %u wqe_counter = %u"
+ " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
+ rxq->cqn, rxq_ctrl->ibv->wq->wq_num,
+ rte_be_to_cpu_16(u.err_cqe->wqe_counter),
+ rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
+ MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
+ rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
+ mlx5_dump_debug_information(name, NULL, err_str, 0);
+ mlx5_dump_debug_information(name, "MLX5 Error CQ:",
+ (const void *)((uintptr_t)
+ rxq->cqes),
+ sizeof(*u.cqe) * cqe_n);
+ mlx5_dump_debug_information(name, "MLX5 Error RQ:",
+ (const void *)((uintptr_t)
+ rxq->wqes),
+ 16 * wqe_n);
+ rxq_ctrl->dump_file_n++;
+ }
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
+ /* Fall-through */
+ case MLX5_RXQ_ERR_STATE_NEED_READY:
+ ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
+ if (ret == MLX5_CQE_STATUS_HW_OWN) {
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ rte_cio_wmb();
+ /*
+ * The RQ consumer index must be zeroed while moving
+ * from RESET state to RDY state.
+ */
+ *rxq->rq_db = rte_cpu_to_be_32(0);
+ rte_cio_wmb();
+ mod.wq_state = IBV_WQS_RDY;
+ ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Rx WQ state to RDY"
+ " %s\n", strerror(errno));
+ return -1;
+ }
+ if (mbuf_prepare) {
+ const uint16_t q_mask = wqe_n - 1;
+ uint16_t elt_idx;
+ struct rte_mbuf **elt;
+ int i;
+ unsigned int n = wqe_n - (rxq->rq_ci -
+ rxq->rq_pi);
+
+ for (i = 0; i < (int)n; ++i) {
+ elt_idx = (rxq->rq_ci + i) & q_mask;
+ elt = &(*rxq->elts)[elt_idx];
+ *elt = rte_mbuf_raw_alloc(rxq->mp);
+ if (!*elt) {
+ for (i--; i >= 0; --i) {
+ elt_idx = (rxq->rq_ci +
+ i) & q_mask;
+ elt = &(*rxq->elts)
+ [elt_idx];
+ rte_pktmbuf_free_seg
+ (*elt);
+ }
+ return -1;
+ }
+ }
+ }
+ mlx5_rxq_initialize(rxq);
+ rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
+ }
+ return ret;
+ default:
+ return -1;
+ }
+}
+
+/**
+ * Get size of the next packet for a given CQE. For compressed CQEs, the
+ * consumer index is updated only once all packets of the current one have
+ * been processed.
+ *
+ * @param rxq
+ * Pointer to RX queue.
+ * @param cqe
+ * CQE to process.
+ * @param[out] mcqe
+ * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
+ * written.
+ *
+ * @return
+ * 0 in case of empty CQE, otherwise the packet size in bytes.
+ */
+static inline int
+mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+ uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
+{
+ struct rxq_zip *zip = &rxq->zip;
+ uint16_t cqe_n = cqe_cnt + 1;
+ int len;
+ uint16_t idx, end;
+
+ do {
+ len = 0;
+ /* Process compressed data in the CQE and mini arrays. */
+ if (zip->ai) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)[zip->ca &
+ cqe_cnt].pkt_info);
+
+ len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+ *mcqe = &(*mc)[zip->ai & 7];
+ if ((++zip->ai & 7) == 0) {
+ /* Invalidate consumed CQEs */
+ idx = zip->ca;
+ end = zip->na;
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ /*
+ * Increment consumer index to skip the number
+ * of CQEs consumed. Hardware leaves holes in
+ * the CQ ring for software use.
+ */
+ zip->ca = zip->na;
+ zip->na += 8;
+ }
+ if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
+ /* Invalidate the rest */
+ idx = zip->ca;
+ end = zip->cq_ci;
+
+ while (idx != end) {
+ (*rxq->cqes)[idx & cqe_cnt].op_own =
+ MLX5_CQE_INVALIDATE;
+ ++idx;
+ }
+ rxq->cq_ci = zip->cq_ci;
+ zip->ai = 0;
+ }
+ /*
+ * No compressed data, get next CQE and verify if it is
+ * compressed.
+ */
+ } else {
+ int ret;
+ int8_t op_own;
+
+ ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
+ if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+ if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
+ rxq->err_state)) {
+ ret = mlx5_rx_err_handle(rxq, 0);
+ if (ret == MLX5_CQE_STATUS_HW_OWN ||
+ ret == -1)
+ return 0;
+ } else {
+ return 0;
+ }
+ }
+ ++rxq->cq_ci;
+ op_own = cqe->op_own;
+ if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
+ volatile struct mlx5_mini_cqe8 (*mc)[8] =
+ (volatile struct mlx5_mini_cqe8 (*)[8])
+ (uintptr_t)(&(*rxq->cqes)
+ [rxq->cq_ci &
+ cqe_cnt].pkt_info);
+
+ /* Fix endianness. */
+ zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
+ /*
+ * Current mini array position is the one
+ * returned by check_cqe64().
+ *
+ * If completion comprises several mini arrays,
+ * as a special case the second one is located
+ * 7 CQEs after the initial CQE instead of 8
+ * for subsequent ones.
+ */
+ zip->ca = rxq->cq_ci;
+ zip->na = zip->ca + 7;
+ /* Compute the next non compressed CQE. */
+ --rxq->cq_ci;
+ zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
+ /* Get packet size to return. */
+ len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+ *mcqe = &(*mc)[0];
+ zip->ai = 1;
+ /* Prefetch all to be invalidated */
+ idx = zip->ca;
+ end = zip->cq_ci;
+ while (idx != end) {
+ rte_prefetch0(&(*rxq->cqes)[(idx) &
+ cqe_cnt]);
+ ++idx;
+ }
+ } else {
+ len = rte_be_to_cpu_32(cqe->byte_cnt);
+ }
+ }
+ if (unlikely(rxq->err_state)) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ ++rxq->stats.idropped;
+ } else {
+ return len;
+ }
+ } while (1);
+}
+
+/**
+ * Translate RX completion flags to offload flags.
+ *
+ * @param[in] cqe
+ * Pointer to CQE.