From fdcb0f53053be6e65b385a33b791e66a1760e833 Mon Sep 17 00:00:00 2001 From: =?utf8?q?N=C3=A9lio=20Laranjeiro?= Date: Thu, 24 Nov 2016 17:03:31 +0100 Subject: [PATCH] net/mlx5: use work queue buffer as a raw buffer Define a single work queue element type that encompasses them all. It includes control, Ethernet segment and raw data all grouped in a single place. Signed-off-by: Nelio Laranjeiro Acked-by: Adrien Mazarguil --- drivers/net/mlx5/mlx5_prm.h | 13 ++++- drivers/net/mlx5/mlx5_rxtx.c | 103 ++++++++++++++++++----------------- drivers/net/mlx5/mlx5_rxtx.h | 2 +- drivers/net/mlx5/mlx5_txq.c | 8 +-- 4 files changed, 68 insertions(+), 58 deletions(-) diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h index d5155fdf07..3cc5af9aad 100644 --- a/drivers/net/mlx5/mlx5_prm.h +++ b/drivers/net/mlx5/mlx5_prm.h @@ -125,12 +125,19 @@ struct mlx5_wqe_eth_seg_small { uint32_t rsvd2; uint16_t inline_hdr_sz; uint8_t inline_hdr[2]; -}; +} __rte_aligned(MLX5_WQE_DWORD_SIZE); struct mlx5_wqe_inl_small { uint32_t byte_cnt; uint8_t raw; -}; +} __rte_aligned(MLX5_WQE_DWORD_SIZE); + +struct mlx5_wqe_ctrl { + uint32_t ctrl0; + uint32_t ctrl1; + uint32_t ctrl2; + uint32_t ctrl3; +} __rte_aligned(MLX5_WQE_DWORD_SIZE); /* Small common part of the WQE. */ struct mlx5_wqe { @@ -142,7 +149,7 @@ struct mlx5_wqe { struct mlx5_wqe64 { struct mlx5_wqe hdr; uint8_t raw[32]; -} __rte_aligned(64); +} __rte_aligned(MLX5_WQE_SIZE); /* MPW session status. */ enum mlx5_mpw_state { diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 0a76dd2364..e6c0595f4c 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -152,6 +152,24 @@ check_cqe(volatile struct mlx5_cqe *cqe, return 0; } +/** + * Return the address of the WQE. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe_ci + * WQE consumer index. + * + * @return + * WQE address. + */ +static inline uintptr_t * +tx_mlx5_wqe(struct txq *txq, uint16_t ci) +{ + ci &= ((1 << txq->wqe_n) - 1); + return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE); +} + static inline void txq_complete(struct txq *txq) __attribute__((always_inline)); @@ -173,7 +191,7 @@ txq_complete(struct txq *txq) uint16_t elts_tail; uint16_t cq_ci = txq->cq_ci; volatile struct mlx5_cqe *cqe = NULL; - volatile struct mlx5_wqe *wqe; + volatile struct mlx5_wqe_ctrl *ctrl; do { volatile struct mlx5_cqe *tmp; @@ -199,9 +217,9 @@ txq_complete(struct txq *txq) } while (1); if (unlikely(cqe == NULL)) return; - wqe = &(*txq->wqes)[ntohs(cqe->wqe_counter) & - ((1 << txq->wqe_n) - 1)].hdr; - elts_tail = wqe->ctrl[3]; + ctrl = (volatile struct mlx5_wqe_ctrl *) + tx_mlx5_wqe(txq, ntohs(cqe->wqe_counter)); + elts_tail = ctrl->ctrl3; assert(elts_tail < (1 << txq->wqe_n)); /* Free buffers. */ while (elts_free != elts_tail) { @@ -328,23 +346,6 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci) rte_prefetch0(cqe); } -/** - * Prefetch a WQE. - * - * @param txq - * Pointer to TX queue structure. - * @param wqe_ci - * WQE consumer index. - */ -static inline void -tx_prefetch_wqe(struct txq *txq, uint16_t ci) -{ - volatile struct mlx5_wqe64 *wqe; - - wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)]; - rte_prefetch0(wqe); -} - /** * DPDK callback for TX. * @@ -409,9 +410,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) --segs_n; if (!segs_n) --pkts_n; - wqe = &(*txq->wqes)[txq->wqe_ci & - ((1 << txq->wqe_n) - 1)].hdr; - tx_prefetch_wqe(txq, txq->wqe_ci + 1); + wqe = (volatile struct mlx5_wqe *) + tx_mlx5_wqe(txq, txq->wqe_ci); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); if (pkts_n > 1) rte_prefetch0(*pkts); addr = rte_pktmbuf_mtod(buf, uintptr_t); @@ -462,8 +463,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) } /* Inline if enough room. */ if (txq->max_inline != 0) { - uintptr_t end = - (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n]; + uintptr_t end = (uintptr_t) + (((uintptr_t)txq->wqes) + + (1 << txq->wqe_n) * MLX5_WQE_SIZE); uint16_t max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE; uint16_t room; @@ -494,12 +496,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) */ ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2); if (length > 0) { - dseg = (struct mlx5_wqe_data_seg *) + dseg = (volatile struct mlx5_wqe_data_seg *) ((uintptr_t)wqe + (ds * MLX5_WQE_DWORD_SIZE)); if ((uintptr_t)dseg >= end) - dseg = (struct mlx5_wqe_data_seg *) - ((uintptr_t)&(*txq->wqes)[0]); + dseg = (volatile struct + mlx5_wqe_data_seg *) + txq->wqes; goto use_dseg; } else if (!segs_n) { goto next_pkt; @@ -512,12 +515,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) * Ethernet Header as been stored. */ wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE); - dseg = (struct mlx5_wqe_data_seg *) + dseg = (volatile struct mlx5_wqe_data_seg *) ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE)); ds = 3; use_dseg: /* Add the remaining packet as a simple ds. */ - *dseg = (struct mlx5_wqe_data_seg) { + *dseg = (volatile struct mlx5_wqe_data_seg) { .addr = htonll(addr), .byte_count = htonl(length), .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), @@ -540,9 +543,9 @@ next_seg: unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) & ((1 << txq->wqe_n) - 1); - dseg = (struct mlx5_wqe_data_seg *) - ((uintptr_t)&(*txq->wqes)[n]); - tx_prefetch_wqe(txq, n + 1); + dseg = (volatile struct mlx5_wqe_data_seg *) + tx_mlx5_wqe(txq, n); + rte_prefetch0(tx_mlx5_wqe(txq, n + 1)); } else { ++dseg; } @@ -554,7 +557,7 @@ next_seg: total_length += length; #endif /* Store segment information. */ - *dseg = (struct mlx5_wqe_data_seg) { + *dseg = (volatile struct mlx5_wqe_data_seg) { .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)), .byte_count = htonl(length), .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), @@ -627,13 +630,13 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1); volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] = (volatile struct mlx5_wqe_data_seg (*)[]) - (uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)]; + tx_mlx5_wqe(txq, idx + 1); mpw->state = MLX5_MPW_STATE_OPENED; mpw->pkts_n = 0; mpw->len = length; mpw->total_len = 0; - mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr; + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); mpw->wqe->eseg.mss = htons(length); mpw->wqe->eseg.inline_hdr_sz = 0; mpw->wqe->eseg.rsvd0 = 0; @@ -675,8 +678,8 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw) ++txq->wqe_ci; else txq->wqe_ci += 2; - tx_prefetch_wqe(txq, txq->wqe_ci); - tx_prefetch_wqe(txq, txq->wqe_ci + 1); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); } /** @@ -710,8 +713,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) return 0; /* Prefetch first packet cacheline. */ tx_prefetch_cqe(txq, txq->cq_ci); - tx_prefetch_wqe(txq, txq->wqe_ci); - tx_prefetch_wqe(txq, txq->wqe_ci + 1); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); /* Start processing. */ txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); @@ -839,7 +842,7 @@ mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length) mpw->pkts_n = 0; mpw->len = length; mpw->total_len = 0; - mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr; + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx); mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) | (txq->wqe_ci << 8) | MLX5_OPCODE_TSO); @@ -915,8 +918,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, return 0; /* Prefetch first packet cacheline. */ tx_prefetch_cqe(txq, txq->cq_ci); - tx_prefetch_wqe(txq, txq->wqe_ci); - tx_prefetch_wqe(txq, txq->wqe_ci + 1); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci)); + rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1)); /* Start processing. */ txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); @@ -1017,14 +1020,15 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, addr = rte_pktmbuf_mtod(buf, uintptr_t); (*txq->elts)[elts_head] = buf; /* Maximum number of bytes before wrapping. */ - max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] - + max = ((((uintptr_t)(txq->wqes)) + + (1 << txq->wqe_n) * + MLX5_WQE_SIZE) - (uintptr_t)mpw.data.raw); if (length > max) { rte_memcpy((void *)(uintptr_t)mpw.data.raw, (void *)addr, max); - mpw.data.raw = - (volatile void *)&(*txq->wqes)[0]; + mpw.data.raw = (volatile void *)txq->wqes; rte_memcpy((void *)(uintptr_t)mpw.data.raw, (void *)(addr + max), length - max); @@ -1036,9 +1040,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, mpw.data.raw += length; } if ((uintptr_t)mpw.data.raw == - (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n]) - mpw.data.raw = - (volatile void *)&(*txq->wqes)[0]; + (uintptr_t)tx_mlx5_wqe(txq, 1 << txq->wqe_n)) + mpw.data.raw = (volatile void *)txq->wqes; ++mpw.pkts_n; ++j; if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) { diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index eec409ded5..23a2548531 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -257,7 +257,7 @@ struct txq { uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */ uint32_t qp_num_8s; /* QP number shifted by 8. */ volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ - volatile struct mlx5_wqe64 (*wqes)[]; /* Work queue. */ + volatile void *wqes; /* Work queue (use volatile to write into). */ volatile uint32_t *qp_db; /* Work queue doorbell. */ volatile uint32_t *cq_db; /* Completion queue doorbell. */ volatile void *bf_reg; /* Blueflame register. */ diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index 053665d559..f4c6682a2b 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -82,7 +82,9 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n) for (i = 0; (i != elts_n); ++i) (*txq_ctrl->txq.elts)[i] = NULL; for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) { - volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i]; + volatile struct mlx5_wqe64 *wqe = + (volatile struct mlx5_wqe64 *) + txq_ctrl->txq.wqes + i; memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe)); } @@ -214,9 +216,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl) } tmpl->txq.cqe_n = log2above(ibcq->cqe); tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8; - tmpl->txq.wqes = - (volatile struct mlx5_wqe64 (*)[]) - (uintptr_t)qp->gen_data.sqstart; + tmpl->txq.wqes = qp->gen_data.sqstart; tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt); tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR]; tmpl->txq.bf_reg = qp->gen_data.bf->reg; -- 2.20.1