+ rxq->next_to_use = rx_id;
+ rxq->rx_free_hold += nb_rx_bd;
+ if (rxq->rx_free_hold > rxq->rx_free_thresh) {
+ hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
+ rxq->rx_free_hold = 0;
+ }
+
+ return nb_rx;
+}
+
+uint16_t
+hns3_recv_scattered_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ volatile struct hns3_desc *rx_ring; /* RX ring (desc) */
+ volatile struct hns3_desc *rxdp; /* pointer of the current desc */
+ struct hns3_rx_queue *rxq; /* RX queue */
+ struct hns3_entry *sw_ring;
+ struct hns3_entry *rxe;
+ struct rte_mbuf *first_seg;
+ struct rte_mbuf *last_seg;
+ struct hns3_desc rxd;
+ struct rte_mbuf *nmb; /* pointer of the new mbuf */
+ struct rte_mbuf *rxm;
+ struct rte_eth_dev *dev;
+ uint32_t bd_base_info;
+ uint32_t cksum_err;
+ uint32_t l234_info;
+ uint32_t gro_size;
+ uint32_t ol_info;
+ uint64_t dma_addr;
+ uint16_t nb_rx_bd;
+ uint16_t nb_rx;
+ uint16_t rx_id;
+ int ret;
+
+ nb_rx = 0;
+ nb_rx_bd = 0;
+ rxq = rx_queue;
+
+ rx_id = rxq->next_to_use;
+ rx_ring = rxq->rx_ring;
+ sw_ring = rxq->sw_ring;
+ first_seg = rxq->pkt_first_seg;
+ last_seg = rxq->pkt_last_seg;
+
+ while (nb_rx < nb_pkts) {
+ rxdp = &rx_ring[rx_id];
+ bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
+ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
+ break;
+
+ /*
+ * The interactive process between software and hardware of
+ * receiving a new packet in hns3 network engine:
+ * 1. Hardware network engine firstly writes the packet content
+ * to the memory pointed to by the 'addr' field of the Rx
+ * Buffer Descriptor, secondly fills the result of parsing
+ * the packet, including the valid field, into the Rx Buffer
+ * Descriptor in one write operation.
+ * 2. Driver reads the Rx BD's valid field in a loop to check
+ * whether it's valid; if valid, it then assigns a new address
+ * to the addr field, clears the valid field, gets the other
+ * information of the packet by parsing the Rx BD's other
+ * fields, and finally writes back the number of Rx BDs
+ * processed by the driver to the HNS3_RING_RX_HEAD_REG
+ * register to inform hardware.
+ * In the above process, the ordering is very important. We must
+ * make sure that the CPU reads the Rx BD's other fields only
+ * after the Rx BD is valid.
+ *
+ * There are two types of re-ordering: compiler re-ordering and
+ * CPU re-ordering under the ARMv8 architecture.
+ * 1. we use volatile to deal with compiler re-ordering, so you
+ * can see that rx_ring/rxdp defined with volatile.
+ * 2. we commonly use memory barrier to deal with CPU
+ * re-ordering, but the cost is high.
+ *
+ * In order to solve the high cost of using memory barrier, we
+ * use the data dependency order under the ARMv8 architecture,
+ * for example:
+ * instr01: load A
+ * instr02: load B <- A
+ * the instr02 will always execute after instr01.
+ *
+ * To construct the data dependency ordering, we use the
+ * following assignment:
+ * rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+ * (1u<<HNS3_RXD_VLD_B)]
+ * Using gcc compiler under the ARMv8 architecture, the related
+ * assembly code example as follows:
+ * note: (1u << HNS3_RXD_VLD_B) equals 0x10
+ * instr01: ldr w26, [x22, #28] --read bd_base_info
+ * instr02: and w0, w26, #0x10 --calc bd_base_info & 0x10
+ * instr03: sub w0, w0, #0x10 --calc (bd_base_info &
+ * 0x10) - 0x10
+ * instr04: add x0, x22, x0, lsl #5 --calc copy source addr
+ * instr05: ldp x2, x3, [x0]
+ * instr06: stp x2, x3, [x29, #256] --copy BD's [0 ~ 15]B
+ * instr07: ldp x4, x5, [x0, #16]
+ * instr08: stp x4, x5, [x29, #272] --copy BD's [16 ~ 31]B
+ * the instr05~08 depend on x0's value, and x0 depends on w26's
+ * value; the w26 is the bd_base_info. This forms the data
+ * dependency ordering.
+ * note: if BD is valid, (bd_base_info & (1u<<HNS3_RXD_VLD_B)) -
+ * (1u<<HNS3_RXD_VLD_B) will always be zero, so the
+ * assignment is correct.
+ *
+ * So we use the data dependency ordering instead of memory
+ * barrier to improve receive performance.
+ */
+ rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+ (1u << HNS3_RXD_VLD_B)];
+
+ nmb = hns3_rx_alloc_buffer(rxq);
+ if (unlikely(nmb == NULL)) {
+ dev = &rte_eth_devices[rxq->port_id];
+ dev->data->rx_mbuf_alloc_failed++;
+ break;
+ }
+
+ nb_rx_bd++;
+ rxe = &sw_ring[rx_id];
+ rx_id++;
+ if (unlikely(rx_id == rxq->nb_rx_desc))
+ rx_id = 0;
+
+ rte_prefetch0(sw_ring[rx_id].mbuf);
+ if ((rx_id & HNS3_RX_RING_PREFETCTH_MASK) == 0) {
+ rte_prefetch0(&rx_ring[rx_id]);
+ rte_prefetch0(&sw_ring[rx_id]);
+ }
+
+ rxm = rxe->mbuf;
+ rxe->mbuf = nmb;
+
+ dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
+ rxdp->rx.bd_base_info = 0;
+ rxdp->addr = dma_addr;
+
+ if (first_seg == NULL) {
+ first_seg = rxm;
+ first_seg->nb_segs = 1;
+ } else {
+ first_seg->nb_segs++;
+ last_seg->next = rxm;
+ }
+
+ rxm->data_off = RTE_PKTMBUF_HEADROOM;
+ rxm->data_len = rte_le_to_cpu_16(rxd.rx.size);
+
+ if (!(bd_base_info & BIT(HNS3_RXD_FE_B))) {
+ last_seg = rxm;
+ rxm->next = NULL;
+ continue;
+ }
+
+ /*
+ * The last buffer of the received packet. packet len from
+ * buffer description may contains CRC len, packet len should
+ * subtract it, same as data len.
+ */
+ first_seg->pkt_len = rte_le_to_cpu_16(rxd.rx.pkt_len);
+
+ /*
+ * This is the last buffer of the received packet. If the CRC
+ * is not stripped by the hardware:
+ * - Subtract the CRC length from the total packet length.
+ * - If the last buffer only contains the whole CRC or a part
+ * of it, free the mbuf associated to the last buffer. If part
+ * of the CRC is also contained in the previous mbuf, subtract
+ * the length of that CRC part from the data length of the
+ * previous mbuf.
+ */
+ rxm->next = NULL;
+ if (unlikely(rxq->crc_len > 0)) {
+ first_seg->pkt_len -= rxq->crc_len;
+ recalculate_data_len(first_seg, last_seg, rxm, rxq,
+ rxm->data_len);
+ }
+
+ first_seg->port = rxq->port_id;
+ first_seg->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
+ first_seg->ol_flags = PKT_RX_RSS_HASH;
+ if (unlikely(bd_base_info & BIT(HNS3_RXD_LUM_B))) {
+ first_seg->hash.fdir.hi =
+ rte_le_to_cpu_16(rxd.rx.fd_id);
+ first_seg->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+ }
+
+ gro_size = hns3_get_field(bd_base_info, HNS3_RXD_GRO_SIZE_M,
+ HNS3_RXD_GRO_SIZE_S);
+ if (gro_size != 0) {
+ first_seg->ol_flags |= PKT_RX_LRO;
+ first_seg->tso_segsz = gro_size;
+ }
+
+ l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+ ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
+ ret = hns3_handle_bdinfo(rxq, first_seg, bd_base_info,
+ l234_info, &cksum_err);
+ if (unlikely(ret))
+ goto pkt_err;
+
+ first_seg->packet_type = hns3_rx_calc_ptype(rxq,
+ l234_info, ol_info);
+
+ if (bd_base_info & BIT(HNS3_RXD_L3L4P_B))
+ hns3_rx_set_cksum_flag(first_seg,
+ first_seg->packet_type,
+ cksum_err);
+ hns3_rxd_to_vlan_tci(rxq, first_seg, l234_info, &rxd);
+
+ rx_pkts[nb_rx++] = first_seg;
+ first_seg = NULL;
+ continue;
+pkt_err:
+ rte_pktmbuf_free(first_seg);
+ first_seg = NULL;
+ }
+
+ rxq->next_to_use = rx_id;
+ rxq->pkt_first_seg = first_seg;