#include "hns3_logs.h"
#define HNS3_CFG_DESC_NUM(num) ((num) / 8 - 1)
-#define DEFAULT_RX_FREE_THRESH 16
+#define DEFAULT_RX_FREE_THRESH 32
static void
hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
rxq->next_to_use = 0;
rxq->next_to_clean = 0;
+ rxq->nb_rx_hold = 0;
hns3_init_rx_queue_hw(rxq);
return 0;
rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
rxq->next_to_use = 0;
rxq->next_to_clean = 0;
+ rxq->nb_rx_hold = 0;
hns3_init_rx_queue_hw(rxq);
}
txq->next_to_use = 0;
txq->next_to_clean = 0;
- txq->tx_bd_ready = txq->nb_tx_desc;
+ txq->tx_bd_ready = txq->nb_tx_desc - 1;
hns3_init_tx_queue_hw(txq);
}
rxq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
idx * HNS3_TQP_REG_SIZE);
rxq->rx_buf_len = hw->rx_buf_len;
- rxq->non_vld_descs = 0;
rxq->l2_errors = 0;
rxq->pkt_len_errors = 0;
rxq->l3_csum_erros = 0;
uint16_t
hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
+ volatile struct hns3_desc *rx_ring; /* RX ring (desc) */
+ volatile struct hns3_desc *rxdp; /* pointer of the current desc */
struct hns3_rx_queue *rxq; /* RX queue */
- struct hns3_desc *rx_ring; /* RX ring (desc) */
struct hns3_entry *sw_ring;
struct hns3_entry *rxe;
- struct hns3_desc *rxdp; /* pointer of the current desc */
struct rte_mbuf *first_seg;
struct rte_mbuf *last_seg;
+ struct hns3_desc rxd;
struct rte_mbuf *nmb; /* pointer of the new mbuf */
struct rte_mbuf *rxm;
struct rte_eth_dev *dev;
uint16_t pkt_len;
uint16_t nb_rx;
uint16_t rx_id;
- int num; /* num of desc in ring */
int ret;
nb_rx = 0;
last_seg = rxq->pkt_last_seg;
sw_ring = rxq->sw_ring;
- /* Get num of packets in descriptor ring */
- num = hns3_read_dev(rxq, HNS3_RING_RX_FBDNUM_REG);
- while (nb_rx_bd < num && nb_rx < nb_pkts) {
+ while (nb_rx < nb_pkts) {
rxdp = &rx_ring[rx_id];
bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
- if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
- rxq->non_vld_descs++;
+ if (unlikely(!hns3_get_bit(bd_base_info, HNS3_RXD_VLD_B)))
break;
- }
+ /*
+ * The interactive process between software and hardware of
+ * receiving a new packet in hns3 network engine:
+ * 1. Hardware network engine firstly writes the packet content
+ * to the memory pointed by the 'addr' field of the Rx Buffer
+ * Descriptor, secondly fills the result of parsing the
+ * packet include the valid field into the Rx Buffer
+ * Descriptor in one write operation.
+ * 2. Driver reads the Rx BD's valid field in the loop to check
+ * whether it's valid, if valid then assign a new address to
+ * the addr field, clear the valid field, get the other
+ * information of the packet by parsing Rx BD's other fields,
+ * finally write back the number of Rx BDs processed by the
+ * driver to the HNS3_RING_RX_HEAD_REG register to inform
+ * hardware.
+ * In the above process, the ordering is very important. We must
+ * make sure that CPU read Rx BD's other fields only after the
+ * Rx BD is valid.
+ *
+ * There are two type of re-ordering: compiler re-ordering and
+ * CPU re-ordering under the ARMv8 architecture.
+ * 1. we use volatile to deal with compiler re-ordering, so you
+ * can see that rx_ring/rxdp defined with volatile.
+ * 2. we commonly use memory barrier to deal with CPU
+ * re-ordering, but the cost is high.
+ *
+ * In order to solve the high cost of using memory barrier, we
+ * use the data dependency order under the ARMv8 architecture,
+ * for example:
+ * instr01: load A
+ * instr02: load B <- A
+ * the instr02 will always execute after instr01.
+ *
+ * To construct the data dependency ordering, we use the
+ * following assignment:
+ * rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+ * (1u<<HNS3_RXD_VLD_B)]
+ * Using gcc compiler under the ARMv8 architecture, the related
+ * assembly code example as follows:
+ * note: (1u << HNS3_RXD_VLD_B) equal 0x10
+ * instr01: ldr w26, [x22, #28] --read bd_base_info
+ * instr02: and w0, w26, #0x10 --calc bd_base_info & 0x10
+ * instr03: sub w0, w0, #0x10 --calc (bd_base_info &
+ * 0x10) - 0x10
+ * instr04: add x0, x22, x0, lsl #5 --calc copy source addr
+ * instr05: ldp x2, x3, [x0]
+ * instr06: stp x2, x3, [x29, #256] --copy BD's [0 ~ 15]B
+ * instr07: ldp x4, x5, [x0, #16]
+ * instr08: stp x4, x5, [x29, #272] --copy BD's [16 ~ 31]B
+ * the instr05~08 depend on x0's value, x0 depent on w26's
+ * value, the w26 is the bd_base_info, this form the data
+ * dependency ordering.
+ * note: if BD is valid, (bd_base_info & (1u<<HNS3_RXD_VLD_B)) -
+ * (1u<<HNS3_RXD_VLD_B) will always zero, so the
+ * assignment is correct.
+ *
+ * So we use the data dependency ordering instead of memory
+ * barrier to improve receive performance.
+ */
+ rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+ (1u << HNS3_RXD_VLD_B)];
nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
if (unlikely(nmb == NULL)) {
nb_rx_bd++;
rxe = &sw_ring[rx_id];
rx_id++;
- if (rx_id == rxq->nb_rx_desc)
+ if (unlikely(rx_id == rxq->nb_rx_desc))
rx_id = 0;
rte_prefetch0(sw_ring[rx_id].mbuf);
rxe->mbuf = nmb;
dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
- rxdp->addr = dma_addr;
rxdp->rx.bd_base_info = 0;
+ rxdp->addr = dma_addr;
- rte_cio_rmb();
/* Load remained descriptor data and extract necessary fields */
- data_len = (uint16_t)(rte_le_to_cpu_16(rxdp->rx.size));
- l234_info = rte_le_to_cpu_32(rxdp->rx.l234_info);
- ol_info = rte_le_to_cpu_32(rxdp->rx.ol_info);
+ data_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.size));
+ l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+ ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
if (first_seg == NULL) {
first_seg = rxm;
}
/* The last buffer of the received packet */
- pkt_len = (uint16_t)(rte_le_to_cpu_16(rxdp->rx.pkt_len));
+ pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len));
first_seg->pkt_len = pkt_len;
first_seg->port = rxq->port_id;
- first_seg->hash.rss = rte_le_to_cpu_32(rxdp->rx.rss_hash);
+ first_seg->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
first_seg->ol_flags |= PKT_RX_RSS_HASH;
if (unlikely(hns3_get_bit(bd_base_info, HNS3_RXD_LUM_B))) {
first_seg->hash.fdir.hi =
- rte_le_to_cpu_32(rxdp->rx.fd_id);
+ rte_le_to_cpu_32(rxd.rx.fd_id);
first_seg->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
}
rxm->next = NULL;
hns3_rx_set_cksum_flag(rxm, first_seg->packet_type,
cksum_err);
- first_seg->vlan_tci = rte_le_to_cpu_16(rxdp->rx.vlan_tag);
+ first_seg->vlan_tci = rte_le_to_cpu_16(rxd.rx.vlan_tag);
first_seg->vlan_tci_outer =
- rte_le_to_cpu_16(rxdp->rx.ot_vlan_tag);
+ rte_le_to_cpu_16(rxd.rx.ot_vlan_tag);
rx_pkts[nb_rx++] = first_seg;
first_seg = NULL;
continue;
rxq->next_to_clean = rx_id;
rxq->pkt_first_seg = first_seg;
rxq->pkt_last_seg = last_seg;
- hns3_clean_rx_buffers(rxq, nb_rx_bd);
+
+ nb_rx_bd = nb_rx_bd + rxq->nb_rx_hold;
+ if (nb_rx_bd > rxq->rx_free_thresh) {
+ hns3_clean_rx_buffers(rxq, nb_rx_bd);
+ nb_rx_bd = 0;
+ }
+ rxq->nb_rx_hold = nb_rx_bd;
return nb_rx;
}
txq->hns = hns;
txq->next_to_use = 0;
txq->next_to_clean = 0;
- txq->tx_bd_ready = txq->nb_tx_desc;
+ txq->tx_bd_ready = txq->nb_tx_desc - 1;
txq->port_id = dev->data->port_id;
txq->configured = true;
txq->io_base = (void *)((char *)hw->io_base + HNS3_TQP_REG_OFFSET +
return 0;
}
-static inline int
-tx_ring_dist(struct hns3_tx_queue *txq, int begin, int end)
-{
- return (end - begin + txq->nb_tx_desc) % txq->nb_tx_desc;
-}
-
-static inline int
-tx_ring_space(struct hns3_tx_queue *txq)
-{
- return txq->nb_tx_desc -
- tx_ring_dist(txq, txq->next_to_clean, txq->next_to_use) - 1;
-}
-
static inline void
hns3_queue_xmit(struct hns3_tx_queue *txq, uint32_t buf_num)
{
struct rte_mbuf *mbuf;
while ((!hns3_get_bit(desc->tx.tp_fe_sc_vld_ra_ri, HNS3_TXD_VLD_B)) &&
- (tx_next_use != tx_next_clean || tx_bd_ready < tx_bd_max)) {
+ tx_next_use != tx_next_clean) {
mbuf = tx_bak_pkt->mbuf;
if (mbuf) {
rte_pktmbuf_free_seg(mbuf);
struct rte_mbuf *m_seg;
uint32_t nb_hold = 0;
uint16_t tx_next_use;
- uint16_t tx_bd_ready;
uint16_t tx_pkt_num;
uint16_t tx_bd_max;
uint16_t nb_buf;
/* free useless buffer */
hns3_tx_free_useless_buffer(txq);
- tx_bd_ready = txq->tx_bd_ready;
- if (tx_bd_ready == 0)
- return 0;
tx_next_use = txq->next_to_use;
tx_bd_max = txq->nb_tx_desc;
- tx_pkt_num = (tx_bd_ready < nb_pkts) ? tx_bd_ready : nb_pkts;
+ tx_pkt_num = nb_pkts;
/* send packets */
tx_bak_pkt = &txq->sw_ring[tx_next_use];
nb_buf = tx_pkt->nb_segs;
- if (nb_buf > tx_ring_space(txq)) {
+ if (nb_buf > txq->tx_bd_ready) {
if (nb_tx == 0)
return 0;
nb_hold += i;
txq->next_to_use = tx_next_use;
+ txq->tx_bd_ready -= i;
}
end_of_tx:
- if (likely(nb_tx)) {
+ if (likely(nb_tx))
hns3_queue_xmit(txq, nb_hold);
- txq->tx_bd_ready = tx_bd_ready - nb_hold;
- }
return nb_tx;
}