From 0b77e8f3d3643337465e951198497f0c521bfa4c Mon Sep 17 00:00:00 2001
From: Chengwen Feng
Date: Tue, 16 Nov 2021 09:22:12 +0800
Subject: [PATCH] net/hns3: optimize Tx performance

This patch uses tx_free_thresh to control when mbufs are freed in the
common xmit algorithm.

This patch also modifies the implementation of the PMD's tx_done_cleanup
because the mbuf free algorithm has changed.

In the testpmd single-core MAC forwarding scenario, performance improves
by 10% at 64B on the Kunpeng920 platform.

Cc: stable@dpdk.org

Signed-off-by: Chengwen Feng
Signed-off-by: Min Hu (Connor)
---
 drivers/net/hns3/hns3_rxtx.c | 118 ++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 64 deletions(-)

diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index f0a57611ec..40cc4e9c1a 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -3077,40 +3077,51 @@ hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static void
+static int
 hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 {
 	uint16_t tx_next_clean = txq->next_to_clean;
-	uint16_t tx_next_use   = txq->next_to_use;
-	uint16_t tx_bd_ready   = txq->tx_bd_ready;
-	uint16_t tx_bd_max     = txq->nb_tx_desc;
-	struct hns3_entry *tx_bak_pkt = &txq->sw_ring[tx_next_clean];
+	uint16_t tx_next_use = txq->next_to_use;
+	struct hns3_entry *tx_entry = &txq->sw_ring[tx_next_clean];
 	struct hns3_desc *desc = &txq->tx_ring[tx_next_clean];
-	struct rte_mbuf *mbuf;
+	int i;
 
-	while ((!(desc->tx.tp_fe_sc_vld_ra_ri &
-		rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))) &&
-		tx_next_use != tx_next_clean) {
-		mbuf = tx_bak_pkt->mbuf;
-		if (mbuf) {
-			rte_pktmbuf_free_seg(mbuf);
-			tx_bak_pkt->mbuf = NULL;
-		}
+	if (tx_next_use >= tx_next_clean &&
+	    tx_next_use < tx_next_clean + txq->tx_rs_thresh)
+		return -1;
 
-		desc++;
-		tx_bak_pkt++;
-		tx_next_clean++;
-		tx_bd_ready++;
-
-		if (tx_next_clean >= tx_bd_max) {
-			tx_next_clean = 0;
-			desc = txq->tx_ring;
-			tx_bak_pkt = txq->sw_ring;
-		}
+	/*
+	 * All mbufs can be released only when the VLD bits of all
+	 * descriptors in a batch are cleared.
+	 */
+	for (i = 0; i < txq->tx_rs_thresh; i++) {
+		if (desc[i].tx.tp_fe_sc_vld_ra_ri &
+			rte_le_to_cpu_16(BIT(HNS3_TXD_VLD_B)))
+			return -1;
 	}
 
-	txq->next_to_clean = tx_next_clean;
-	txq->tx_bd_ready   = tx_bd_ready;
+	for (i = 0; i < txq->tx_rs_thresh; i++) {
+		rte_pktmbuf_free_seg(tx_entry[i].mbuf);
+		tx_entry[i].mbuf = NULL;
+	}
+
+	/* Update numbers of available descriptor due to buffer freed */
+	txq->tx_bd_ready += txq->tx_rs_thresh;
+	txq->next_to_clean += txq->tx_rs_thresh;
+	if (txq->next_to_clean >= txq->nb_tx_desc)
+		txq->next_to_clean = 0;
+
+	return 0;
+}
+
+static inline int
+hns3_tx_free_required_buffer(struct hns3_tx_queue *txq, uint16_t required_bds)
+{
+	while (required_bds > txq->tx_bd_ready) {
+		if (hns3_tx_free_useless_buffer(txq) != 0)
+			return -1;
+	}
+	return 0;
 }
 
 int
@@ -4147,8 +4158,8 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t nb_tx;
 	uint16_t i;
 
-	/* free useless buffer */
-	hns3_tx_free_useless_buffer(txq);
+	if (txq->tx_bd_ready < txq->tx_free_thresh)
+		(void)hns3_tx_free_useless_buffer(txq);
 
 	tx_next_use   = txq->next_to_use;
 	tx_bd_max     = txq->nb_tx_desc;
@@ -4163,11 +4174,14 @@ hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		nb_buf = tx_pkt->nb_segs;
 
 		if (nb_buf > txq->tx_bd_ready) {
-			txq->dfx_stats.queue_full_cnt++;
-			if (nb_tx == 0)
-				return 0;
-
-			goto end_of_tx;
+			/* Try to release the required MBUF, but avoid releasing
+			 * all MBUFs, otherwise, the MBUFs will be released for
+			 * a long time and may cause jitter.
+			 */
+			if (hns3_tx_free_required_buffer(txq, nb_buf) != 0) {
+				txq->dfx_stats.queue_full_cnt++;
+				goto end_of_tx;
+			}
 		}
 
 		/*
@@ -4577,46 +4591,22 @@ hns3_dev_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
 static int
 hns3_tx_done_cleanup_full(struct hns3_tx_queue *txq, uint32_t free_cnt)
 {
-	uint16_t next_to_clean = txq->next_to_clean;
-	uint16_t next_to_use   = txq->next_to_use;
-	uint16_t tx_bd_ready   = txq->tx_bd_ready;
-	struct hns3_entry *tx_pkt = &txq->sw_ring[next_to_clean];
-	struct hns3_desc *desc = &txq->tx_ring[next_to_clean];
+	uint16_t round_free_cnt;
 	uint32_t idx;
 
 	if (free_cnt == 0 || free_cnt > txq->nb_tx_desc)
 		free_cnt = txq->nb_tx_desc;
 
-	for (idx = 0; idx < free_cnt; idx++) {
-		if (next_to_clean == next_to_use)
-			break;
+	if (txq->tx_rs_thresh == 0)
+		return 0;
 
-		if (desc->tx.tp_fe_sc_vld_ra_ri &
-		    rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))
+	round_free_cnt = roundup(free_cnt, txq->tx_rs_thresh);
+	for (idx = 0; idx < round_free_cnt; idx += txq->tx_rs_thresh) {
+		if (hns3_tx_free_useless_buffer(txq) != 0)
 			break;
-
-		if (tx_pkt->mbuf != NULL) {
-			rte_pktmbuf_free_seg(tx_pkt->mbuf);
-			tx_pkt->mbuf = NULL;
-		}
-
-		next_to_clean++;
-		tx_bd_ready++;
-		tx_pkt++;
-		desc++;
-		if (next_to_clean == txq->nb_tx_desc) {
-			tx_pkt = txq->sw_ring;
-			desc = txq->tx_ring;
-			next_to_clean = 0;
-		}
-	}
-
-	if (idx > 0) {
-		txq->next_to_clean = next_to_clean;
-		txq->tx_bd_ready   = tx_bd_ready;
 	}
 
-	return (int)idx;
+	return RTE_MIN(idx, free_cnt);
 }
 
 int
-- 
2.20.1
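
For context, the thresholds this patch relies on are supplied by the application
when the Tx queue is configured, and the reworked cleanup path is what services
rte_eth_tx_done_cleanup(). Below is a minimal sketch, not part of the patch, of
how an application might set tx_free_thresh/tx_rs_thresh and request a cleanup;
the port id, queue id, descriptor count and threshold values are illustrative
assumptions, and the threshold comments describe how hns3 uses them in this patch.

#include <stdio.h>
#include <stdint.h>

#include <rte_ethdev.h>

/*
 * Illustrative sketch only (not part of the patch): configure the Tx
 * thresholds that drive the batched mbuf freeing above, then ask the PMD
 * to reclaim completed Tx mbufs. Port/queue ids, descriptor count and
 * threshold values are assumptions for the example.
 */
static int
setup_tx_queue_with_thresholds(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_txconf txconf;
	int ret;

	ret = rte_eth_dev_info_get(port_id, &dev_info);
	if (ret != 0)
		return ret;

	/* Start from the driver defaults, then override the thresholds. */
	txconf = dev_info.default_txconf;
	txconf.tx_free_thresh = 32;	/* start freeing when ready BDs drop below this */
	txconf.tx_rs_thresh = 32;	/* number of BDs freed per batch */

	return rte_eth_tx_queue_setup(port_id, queue_id, 1024,
				      rte_eth_dev_socket_id(port_id), &txconf);
}

static void
reclaim_tx_mbufs(uint16_t port_id, uint16_t queue_id)
{
	/* Ask the PMD to free up to 64 completed Tx mbufs on this queue. */
	int nb_freed = rte_eth_tx_done_cleanup(port_id, queue_id, 64);

	if (nb_freed < 0)
		printf("tx_done_cleanup failed: %d\n", nb_freed);
}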