- /*
- * The interactive process between software and hardware of
- * receiving a new packet in hns3 network engine:
- * 1. Hardware network engine firstly writes the packet content
- * to the memory pointed by the 'addr' field of the Rx Buffer
- * Descriptor, secondly fills the result of parsing the
- * packet include the valid field into the Rx Buffer
- * Descriptor in one write operation.
- * 2. Driver reads the Rx BD's valid field in the loop to check
- * whether it's valid, if valid then assign a new address to
- * the addr field, clear the valid field, get the other
- * information of the packet by parsing Rx BD's other fields,
- * finally write back the number of Rx BDs processed by the
- * driver to the HNS3_RING_RX_HEAD_REG register to inform
- * hardware.
- * In the above process, the ordering is very important. We must
- * make sure that CPU read Rx BD's other fields only after the
- * Rx BD is valid.
- *
- * There are two type of re-ordering: compiler re-ordering and
- * CPU re-ordering under the ARMv8 architecture.
- * 1. we use volatile to deal with compiler re-ordering, so you
- * can see that rx_ring/rxdp defined with volatile.
- * 2. we commonly use memory barrier to deal with CPU
- * re-ordering, but the cost is high.
- *
- * In order to solve the high cost of using memory barrier, we
- * use the data dependency order under the ARMv8 architecture,
- * for example:
- * instr01: load A
- * instr02: load B <- A
- * the instr02 will always execute after instr01.
- *
- * To construct the data dependency ordering, we use the
- * following assignment:
- * rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
- * (1u<<HNS3_RXD_VLD_B)]
- * Using gcc compiler under the ARMv8 architecture, the related
- * assembly code example as follows:
- * note: (1u << HNS3_RXD_VLD_B) equal 0x10
- * instr01: ldr w26, [x22, #28] --read bd_base_info
- * instr02: and w0, w26, #0x10 --calc bd_base_info & 0x10
- * instr03: sub w0, w0, #0x10 --calc (bd_base_info &
- * 0x10) - 0x10
- * instr04: add x0, x22, x0, lsl #5 --calc copy source addr
- * instr05: ldp x2, x3, [x0]
- * instr06: stp x2, x3, [x29, #256] --copy BD's [0 ~ 15]B
- * instr07: ldp x4, x5, [x0, #16]
+
+ rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+ (1u << HNS3_RXD_VLD_B)];
+
+ nmb = hns3_rx_alloc_buffer(rxq);
+ if (unlikely(nmb == NULL)) {
+ uint16_t port_id;
+
+ port_id = rxq->port_id;
+ rte_eth_devices[port_id].data->rx_mbuf_alloc_failed++;
+ break;
+ }
+
+ nb_rx_bd++;
+ rxe = &sw_ring[rx_id];
+ rx_id++;
+ if (unlikely(rx_id == rxq->nb_rx_desc))
+ rx_id = 0;
+
+ rte_prefetch0(sw_ring[rx_id].mbuf);
+ if ((rx_id & HNS3_RX_RING_PREFETCTH_MASK) == 0) {
+ rte_prefetch0(&rx_ring[rx_id]);
+ rte_prefetch0(&sw_ring[rx_id]);
+ }
+
+ rxm = rxe->mbuf;
+ rxe->mbuf = nmb;
+
+ dma_addr = rte_mbuf_data_iova_default(nmb);
+ rxdp->addr = rte_cpu_to_le_64(dma_addr);
+ rxdp->rx.bd_base_info = 0;
+
+ rxm->data_off = RTE_PKTMBUF_HEADROOM;
+ rxm->pkt_len = (uint16_t)(rte_le_to_cpu_16(rxd.rx.pkt_len)) -
+ rxq->crc_len;
+ rxm->data_len = rxm->pkt_len;
+ rxm->port = rxq->port_id;
+ rxm->hash.rss = rte_le_to_cpu_32(rxd.rx.rss_hash);
+ rxm->ol_flags = PKT_RX_RSS_HASH;
+ if (unlikely(bd_base_info & BIT(HNS3_RXD_LUM_B))) {
+ rxm->hash.fdir.hi =
+ rte_le_to_cpu_16(rxd.rx.fd_id);
+ rxm->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
+ }
+ rxm->nb_segs = 1;
+ rxm->next = NULL;
+
+ /* Load remained descriptor data and extract necessary fields */
+ l234_info = rte_le_to_cpu_32(rxd.rx.l234_info);
+ ol_info = rte_le_to_cpu_32(rxd.rx.ol_info);
+ ret = hns3_handle_bdinfo(rxq, rxm, bd_base_info,
+ l234_info, &cksum_err);
+ if (unlikely(ret))
+ goto pkt_err;
+
+ rxm->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+
+ if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+ hns3_rx_set_cksum_flag(rxm, rxm->packet_type,
+ cksum_err);
+ hns3_rxd_to_vlan_tci(rxq, rxm, l234_info, &rxd);
+
+ rx_pkts[nb_rx++] = rxm;
+ continue;
+pkt_err:
+ rte_pktmbuf_free(rxm);
+ }
+
+ rxq->next_to_use = rx_id;
+ rxq->rx_free_hold += nb_rx_bd;
+ if (rxq->rx_free_hold > rxq->rx_free_thresh) {
+ hns3_write_reg_opt(rxq->io_head_reg, rxq->rx_free_hold);
+ rxq->rx_free_hold = 0;
+ }
+
+ return nb_rx;
+}
+
+uint16_t
+hns3_recv_scattered_pkts(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ volatile struct hns3_desc *rx_ring; /* RX ring (desc) */
+ volatile struct hns3_desc *rxdp; /* pointer of the current desc */
+ struct hns3_rx_queue *rxq; /* RX queue */
+ struct hns3_entry *sw_ring;
+ struct hns3_entry *rxe;
+ struct rte_mbuf *first_seg;
+ struct rte_mbuf *last_seg;
+ struct hns3_desc rxd;
+ struct rte_mbuf *nmb; /* pointer of the new mbuf */
+ struct rte_mbuf *rxm;
+ struct rte_eth_dev *dev;
+ uint32_t bd_base_info;
+ uint32_t cksum_err;
+ uint32_t l234_info;
+ uint32_t gro_size;
+ uint32_t ol_info;
+ uint64_t dma_addr;
+ uint16_t nb_rx_bd;
+ uint16_t nb_rx;
+ uint16_t rx_id;
+ int ret;
+
+ nb_rx = 0;
+ nb_rx_bd = 0;
+ rxq = rx_queue;
+
+ rx_id = rxq->next_to_use;
+ rx_ring = rxq->rx_ring;
+ sw_ring = rxq->sw_ring;
+ first_seg = rxq->pkt_first_seg;
+ last_seg = rxq->pkt_last_seg;
+
+ while (nb_rx < nb_pkts) {
+ rxdp = &rx_ring[rx_id];
+ bd_base_info = rte_le_to_cpu_32(rxdp->rx.bd_base_info);
+ if (unlikely(!(bd_base_info & BIT(HNS3_RXD_VLD_B))))
+ break;
+
+ /*
+ * The interactive process between software and hardware of
+ * receiving a new packet in hns3 network engine:
+ * 1. Hardware network engine firstly writes the packet content
+ * to the memory pointed by the 'addr' field of the Rx Buffer
+ * Descriptor, secondly fills the result of parsing the
+ * packet, including the valid field, into the Rx Buffer
+ * Descriptor in one write operation.
+ * 2. Driver reads the Rx BD's valid field in the loop to check
+ * whether it's valid, if valid then assign a new address to
+ * the addr field, clear the valid field, get the other
+ * information of the packet by parsing Rx BD's other fields,
+ * finally write back the number of Rx BDs processed by the
+ * driver to the HNS3_RING_RX_HEAD_REG register to inform
+ * hardware.
+ * In the above process, the ordering is very important. We must
+ * make sure that CPU read Rx BD's other fields only after the
+ * Rx BD is valid.
+ *
+ * There are two types of re-ordering: compiler re-ordering and
+ * CPU re-ordering under the ARMv8 architecture.
+ * 1. we use volatile to deal with compiler re-ordering, so you
+ * can see that rx_ring/rxdp defined with volatile.
+ * 2. we commonly use memory barrier to deal with CPU
+ * re-ordering, but the cost is high.
+ *
+ * In order to solve the high cost of using memory barrier, we
+ * use the data dependency order under the ARMv8 architecture,
+ * for example:
+ * instr01: load A
+ * instr02: load B <- A
+ * the instr02 will always execute after instr01.
+ *
+ * To construct the data dependency ordering, we use the
+ * following assignment:
+ * rxd = rxdp[(bd_base_info & (1u << HNS3_RXD_VLD_B)) -
+ * (1u<<HNS3_RXD_VLD_B)]
+ * Using gcc compiler under the ARMv8 architecture, the related
+ * assembly code example is as follows:
+ * note: (1u << HNS3_RXD_VLD_B) equals 0x10
+ * instr01: ldr w26, [x22, #28] --read bd_base_info
+ * instr02: and w0, w26, #0x10 --calc bd_base_info & 0x10
+ * instr03: sub w0, w0, #0x10 --calc (bd_base_info &
+ * 0x10) - 0x10
+ * instr04: add x0, x22, x0, lsl #5 --calc copy source addr
+ * instr05: ldp x2, x3, [x0]
+ * instr06: stp x2, x3, [x29, #256] --copy BD's [0 ~ 15]B
+ * instr07: ldp x4, x5, [x0, #16]