From fb870be5a879c6617fecabf47873ae2b576e6e69 Mon Sep 17 00:00:00 2001 From: Yongseok Koh Date: Tue, 24 Oct 2017 17:27:25 -0700 Subject: [PATCH] net/mlx5: fix Tx doorbell memory barrier Configuring UAR as IO-mapped makes maximum throughput decline by noticeable amount. If UAR is configured as write-combining register, a write memory barrier is needed on ringing a doorbell. rte_wmb() is mostly effective when the size of a burst is comparatively small. Revert the register back to write-combining and enforce a write memory barrier instead, except for vectorized Tx burst routines. Application can change it by setting MLX5_SHUT_UP_BF under its own necessity. Fixes: 9f9bebae5530 ("net/mlx5: don't map doorbell register to write combining") Signed-off-by: Yongseok Koh Acked-by: Shahaf Shuler Acked-by: Nelio Laranjeiro --- doc/guides/nics/mlx5.rst | 17 +++++++++++++++++ drivers/net/mlx5/mlx5.c | 2 -- drivers/net/mlx5/mlx5_rxtx.h | 23 +++++++++++++++++++++-- drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 2 +- drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 2 +- 5 files changed, 40 insertions(+), 6 deletions(-) diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index bcc39f66ce..cdb880a4c6 100644 --- a/doc/guides/nics/mlx5.rst +++ b/doc/guides/nics/mlx5.rst @@ -173,6 +173,23 @@ Environment variables This is disabled by default since this can also decrease performance for unaligned packet sizes. +- ``MLX5_SHUT_UP_BF`` + + Configures HW Tx doorbell register as IO-mapped. + + By default, the HW Tx doorbell is configured as a write-combining register. + The register would be flushed to HW usually when the write-combining buffer + becomes full, but it depends on CPU design. + + Except for vectorized Tx burst routines, a write memory barrier is enforced + after updating the register so that the update can be immediately visible to + HW. + + When vectorized Tx burst is called, the barrier is set only if the burst size + is not aligned to MLX5_VPMD_TX_MAX_BURST. However, setting this environmental + variable will bring better latency even though the maximum throughput can + slightly decline. + Run-time configuration ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index e3c9b8995b..b7f7fa8143 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -1036,8 +1036,6 @@ rte_mlx5_pmd_init(void) * using this PMD, which is not supported in forked processes. */ setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); - /* Don't map UAR to WC if BlueFlame is not used.*/ - setenv("MLX5_SHUT_UP_BF", "1", 1); /* Match the size of Rx completion entry to the size of a cacheline. */ if (RTE_CACHE_LINE_SIZE == 128) setenv("MLX5_CQE_SIZE", "128", 0); diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index ea037427b8..d34f3cc040 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -578,15 +578,18 @@ mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb) } /** - * Ring TX queue doorbell. + * Ring TX queue doorbell and flush the update if requested. * * @param txq * Pointer to TX queue structure. * @param wqe * Pointer to the last WQE posted in the NIC. + * @param cond + * Request for write memory barrier after BlueFlame update. */ static __rte_always_inline void -mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) +mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, + int cond) { uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg); volatile uint64_t *src = ((volatile uint64_t *)wqe); @@ -596,6 +599,22 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) /* Ensure ordering between DB record and BF copy. */ rte_wmb(); *dst = *src; + if (cond) + rte_wmb(); +} + +/** + * Ring TX queue doorbell and flush the update by write memory barrier. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the last WQE posted in the NIC. + */ +static __rte_always_inline void +mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe) +{ + mlx5_tx_dbrec_cond_wmb(txq, wqe, 1); } #endif /* RTE_PMD_MLX5_RXTX_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index da3c96f4a8..55f0983ec6 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -345,7 +345,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) / nb_dword_per_wqebb; /* Ring QP doorbell. */ - mlx5_tx_dbrec(txq, wqe); + mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST); return pkts_n; } diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h index 95e41d51af..a669edb610 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h @@ -344,7 +344,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) / nb_dword_per_wqebb; /* Ring QP doorbell. */ - mlx5_tx_dbrec(txq, wqe); + mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST); return pkts_n; } -- 2.20.1