This patch adds NEON vector instructions to optimize Rx burst process.
Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
goto cfg_err;
hns->rx_simple_allowed = true;
+ hns->rx_vec_allowed = true;
hns->tx_simple_allowed = true;
hns->tx_vec_allowed = true;
};
bool rx_simple_allowed;
+ bool rx_vec_allowed;
bool tx_simple_allowed;
bool tx_vec_allowed;
goto cfg_err;
hns->rx_simple_allowed = true;
+ hns->rx_vec_allowed = true;
hns->tx_simple_allowed = true;
hns->tx_vec_allowed = true;
if (rxq->sw_ring == NULL)
return;
- for (i = 0; i < rxq->nb_rx_desc; i++)
- if (rxq->sw_ring[i].mbuf)
- rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+ if (rxq->rx_rearm_nb == 0) {
+ for (i = 0; i < rxq->nb_rx_desc; i++) {
+ if (rxq->sw_ring[i].mbuf != NULL)
+ rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+ }
+ } else {
+ for (i = rxq->next_to_use;
+ i != rxq->rx_rearm_start;
+ i = (i + 1) % rxq->nb_rx_desc) {
+ if (rxq->sw_ring[i].mbuf != NULL)
+ rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+ }
+ }
for (i = 0; i < rxq->bulk_mbuf_num; i++)
rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
}
rxq->next_to_use = 0;
+ rxq->rx_rearm_start = 0;
rxq->rx_free_hold = 0;
+ rxq->rx_rearm_nb = 0;
rxq->pkt_first_seg = NULL;
rxq->pkt_last_seg = NULL;
hns3_init_rx_queue_hw(rxq);
+ hns3_rxq_vec_setup(rxq);
return 0;
}
rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
rxq->next_to_use = 0;
rxq->rx_free_hold = 0;
+ rxq->rx_rearm_start = 0;
+ rxq->rx_rearm_nb = 0;
hns3_init_rx_queue_hw(rxq);
}
return 0;
}
+/*
+ * Iterate over all Rx Queue, and call the callback() function for each Rx
+ * queue.
+ *
+ * @param[in] dev
+ * The target eth dev.
+ * @param[in] callback
+ * The function to call for each queue.
+ * if callback function return nonzero will stop iterate and return it's value
+ * @param[in] arg
+ * The arguments to provide the callback function with.
+ *
+ * @return
+ * 0 on success, otherwise with errno set.
+ */
+int
+hns3_rxq_iterate(struct rte_eth_dev *dev,
+ int (*callback)(struct hns3_rx_queue *, void *), void *arg)
+{
+ uint32_t i;
+ int ret;
+
+ if (dev->data->rx_queues == NULL)
+ return -EINVAL;
+
+ for (i = 0; i < dev->data->nb_rx_queues; i++) {
+ ret = callback(dev->data->rx_queues[i], arg);
+ if (ret != 0)
+ return ret;
+ }
+
+ return 0;
+}
+
static void*
hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
struct hns3_queue_info *q_info)
/* Allocate rx ring hardware descriptors. */
rxq->queue_id = q_info->idx;
rxq->nb_rx_desc = q_info->nb_desc;
- rx_desc = rxq->nb_rx_desc * sizeof(struct hns3_desc);
+
+ /*
+ * Allocate a litter more memory because rx vector functions
+ * don't check boundaries each time.
+ */
+ rx_desc = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+ sizeof(struct hns3_desc);
rx_mz = rte_eth_dma_zone_reserve(dev, q_info->ring_name, q_info->idx,
rx_desc, HNS3_RING_BASE_ALIGN,
q_info->socket_id);
conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
rxq->rx_deferred_start = conf->rx_deferred_start;
- rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
+ rx_entry_len = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+ sizeof(struct hns3_entry);
rxq->sw_ring = rte_zmalloc_socket("hns3 RX sw ring", rx_entry_len,
RTE_CACHE_LINE_SIZE, socket_id);
if (rxq->sw_ring == NULL) {
rxq->next_to_use = 0;
rxq->rx_free_hold = 0;
+ rxq->rx_rearm_start = 0;
+ rxq->rx_rearm_nb = 0;
rxq->pkt_first_seg = NULL;
rxq->pkt_last_seg = NULL;
rxq->port_id = dev->data->port_id;
};
if (dev->rx_pkt_burst == hns3_recv_pkts ||
- dev->rx_pkt_burst == hns3_recv_scattered_pkts)
+ dev->rx_pkt_burst == hns3_recv_scattered_pkts ||
+ dev->rx_pkt_burst == hns3_recv_pkts_vec)
return ptypes;
return NULL;
return nb_rx;
}
+void __rte_weak
+hns3_rxq_vec_setup(__rte_unused struct hns3_rx_queue *rxq)
+{
+}
+
+int __rte_weak
+hns3_rx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+ return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_recv_pkts_vec(__rte_unused void *tx_queue,
+ __rte_unused struct rte_mbuf **tx_pkts,
+ __rte_unused uint16_t nb_pkts)
+{
+ return 0;
+}
+
int
hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
struct rte_eth_burst_mode *mode)
} burst_infos[] = {
{ hns3_recv_pkts, "Scalar" },
{ hns3_recv_scattered_pkts, "Scalar Scattered" },
+ { hns3_recv_pkts_vec, "Vector Neon" },
};
eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
struct hns3_adapter *hns = dev->data->dev_private;
uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
+ if (hns->rx_vec_allowed && hns3_rx_check_vec_support(dev) == 0)
+ return hns3_recv_pkts_vec;
+
if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
(offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
return hns3_recv_pkts;
#define HNS3_DEFAULT_TX_RS_THRESH 32
#define HNS3_TX_FAST_FREE_AHEAD 64
+#define HNS3_DEFAULT_RX_BURST 32
+#if (HNS3_DEFAULT_RX_BURST > 64)
+#error "PMD HNS3: HNS3_DEFAULT_RX_BURST must <= 64\n"
+#endif
+#define HNS3_DEFAULT_DESCS_PER_LOOP 4
+#define HNS3_SVE_DEFAULT_DESCS_PER_LOOP 8
+#if (HNS3_DEFAULT_DESCS_PER_LOOP > HNS3_SVE_DEFAULT_DESCS_PER_LOOP)
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN HNS3_DEFAULT_DESCS_PER_LOOP
+#else
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN HNS3_SVE_DEFAULT_DESCS_PER_LOOP
+#endif
+#define HNS3_DEFAULT_RXQ_REARM_THRESH 64
#define HNS3_UINT8_BIT 8
#define HNS3_UINT16_BIT 16
#define HNS3_UINT32_BIT 32
uint16_t ot_vlan_tag;
};
};
- uint32_t bd_base_info;
+ union {
+ uint32_t bd_base_info;
+ struct {
+ uint16_t bdtype_vld_udp0;
+ uint16_t fe_lum_crcp_l3l4p;
+ };
+ };
} rx;
};
} __rte_packed;
uint16_t rx_free_thresh;
uint16_t next_to_use; /* index of next BD to be polled */
uint16_t rx_free_hold; /* num of BDs waited to passed to hardware */
-
+ uint16_t rx_rearm_start; /* index of BD that driver re-arming from */
+ uint16_t rx_rearm_nb; /* number of remaining BDs to be re-armed */
/*
* port based vlan configuration state.
* value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
uint16_t bulk_mbuf_num;
+
+ /* offset_table: used for vector, to solve execute re-order problem */
+ uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+ uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+ struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
};
struct hns3_tx_queue {
void hns3_enable_all_queues(struct hns3_hw *hw, bool en);
int hns3_start_queues(struct hns3_adapter *hns, bool reset_queue);
int hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue);
+int hns3_rxq_iterate(struct rte_eth_dev *dev,
+ int (*callback)(struct hns3_rx_queue *, void *), void *arg);
void hns3_dev_release_mbufs(struct hns3_adapter *hns);
int hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
unsigned int socket, const struct rte_eth_rxconf *conf,
uint16_t nb_pkts);
uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
__rte_unused uint16_t queue_id,
struct rte_eth_burst_mode *mode);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
+void hns3_rxq_vec_setup(struct hns3_rx_queue *rxq);
void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
struct rte_eth_rxq_info *qinfo);
void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
return nb_tx;
}
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM 4
+ struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+ struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+ uint64_t dma_addr;
+ int i;
+
+ if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+ HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+ return;
+ }
+
+ for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+ rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+ if (likely(i <
+ HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+ rte_prefetch_non_temporal(rxep[4].mbuf);
+ rte_prefetch_non_temporal(rxep[5].mbuf);
+ rte_prefetch_non_temporal(rxep[6].mbuf);
+ rte_prefetch_non_temporal(rxep[7].mbuf);
+ }
+
+ dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+ rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+ rxdp[0].rx.bd_base_info = 0;
+
+ dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+ rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+ rxdp[1].rx.bd_base_info = 0;
+
+ dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+ rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+ rxdp[2].rx.bd_base_info = 0;
+
+ dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+ rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+ rxdp[3].rx.bd_base_info = 0;
+ }
+
+ rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+ if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+ rxq->rx_rearm_start = 0;
+
+ rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+ hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
+
+uint16_t
+hns3_recv_pkts_vec(void *__restrict rx_queue,
+ struct rte_mbuf **__restrict rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct hns3_rx_queue *rxq = rx_queue;
+ struct hns3_desc *rxdp = &rxq->rx_ring[rxq->next_to_use];
+ uint64_t bd_err_mask; /* bit mask indicate whick pkts is error */
+ uint16_t nb_rx;
+
+ nb_pkts = RTE_MIN(nb_pkts, HNS3_DEFAULT_RX_BURST);
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_DEFAULT_DESCS_PER_LOOP);
+
+ rte_prefetch_non_temporal(rxdp);
+
+ if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
+ hns3_rxq_rearm_mbuf(rxq);
+
+ if (unlikely(!(rxdp->rx.bd_base_info &
+ rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
+ return 0;
+
+ rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 0].mbuf);
+ rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 1].mbuf);
+ rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 2].mbuf);
+ rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 3].mbuf);
+
+ bd_err_mask = 0;
+ nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts, &bd_err_mask);
+ if (unlikely(bd_err_mask))
+ nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx, bd_err_mask);
+
+ return nb_rx;
+}
+
+static void
+hns3_rxq_vec_setup_rearm_data(struct hns3_rx_queue *rxq)
+{
+ uintptr_t p;
+ struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+ mb_def.nb_segs = 1;
+ mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+ mb_def.port = rxq->port_id;
+ rte_mbuf_refcnt_set(&mb_def, 1);
+
+ /* prevent compiler reordering: rearm_data covers previous fields */
+ rte_compiler_barrier();
+ p = (uintptr_t)&mb_def.rearm_data;
+ rxq->mbuf_initializer = *(uint64_t *)p;
+}
+
+void
+hns3_rxq_vec_setup(struct hns3_rx_queue *rxq)
+{
+ struct hns3_entry *sw_ring = &rxq->sw_ring[rxq->nb_rx_desc];
+ unsigned int i;
+
+ memset(&rxq->rx_ring[rxq->nb_rx_desc], 0,
+ sizeof(struct hns3_desc) * HNS3_DEFAULT_RX_BURST);
+
+ memset(&rxq->fake_mbuf, 0, sizeof(rxq->fake_mbuf));
+ for (i = 0; i < HNS3_DEFAULT_RX_BURST; i++)
+ sw_ring[i].mbuf = &rxq->fake_mbuf;
+
+ hns3_rxq_vec_setup_rearm_data(rxq);
+
+ memset(rxq->offset_table, 0, sizeof(rxq->offset_table));
+}
+
+#ifndef RTE_LIBRTE_IEEE1588
+static int
+hns3_rxq_vec_check(struct hns3_rx_queue *rxq, void *arg)
+{
+ uint32_t min_vec_bds = HNS3_DEFAULT_RXQ_REARM_THRESH +
+ HNS3_DEFAULT_RX_BURST;
+
+ if (rxq->nb_rx_desc < min_vec_bds)
+ return -ENOTSUP;
+
+ if (rxq->nb_rx_desc % HNS3_DEFAULT_RXQ_REARM_THRESH)
+ return -ENOTSUP;
+
+ RTE_SET_USED(arg);
+ return 0;
+}
+#endif
+
+int
+hns3_rx_check_vec_support(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+ struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+ struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+ uint64_t offloads_mask = DEV_RX_OFFLOAD_TCP_LRO |
+ DEV_RX_OFFLOAD_VLAN;
+
+ if (dev->data->scattered_rx)
+ return -ENOTSUP;
+
+ if (fconf->mode != RTE_FDIR_MODE_NONE)
+ return -ENOTSUP;
+
+ if (rxmode->offloads & offloads_mask)
+ return -ENOTSUP;
+
+ if (hns3_rxq_iterate(dev, hns3_rxq_vec_check, NULL) != 0)
+ return -ENOTSUP;
+
+ return 0;
+#else
+ RTE_SET_USED(dev);
+ return -ENOTSUP;
+#endif
+}
if (txq->next_to_clean >= txq->nb_tx_desc)
txq->next_to_clean = 0;
}
+
+static inline uint16_t
+hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts,
+ uint64_t pkt_err_mask)
+{
+ uint16_t count, i;
+ uint64_t mask;
+
+ count = 0;
+ for (i = 0; i < nb_pkts; i++) {
+ mask = ((uint64_t)1u) << i;
+ if (pkt_err_mask & mask)
+ rte_pktmbuf_free_seg(rx_pkts[i]);
+ else
+ rx_pkts[count++] = rx_pkts[i];
+ }
+
+ return count;
+}
#endif /* _HNS3_RXTX_VEC_H_ */
return nb_tx;
}
+
+static inline uint32_t
+hns3_desc_parse_field(struct hns3_rx_queue *rxq,
+ struct hns3_entry *sw_ring,
+ struct hns3_desc *rxdp,
+ uint32_t bd_vld_num)
+{
+ uint32_t l234_info, ol_info, bd_base_info;
+ struct rte_mbuf *pkt;
+ uint32_t retcode = 0;
+ uint32_t cksum_err;
+ int ret, i;
+
+ for (i = 0; i < (int)bd_vld_num; i++) {
+ pkt = sw_ring[i].mbuf;
+
+ /* init rte_mbuf.rearm_data last 64-bit */
+ pkt->ol_flags = PKT_RX_RSS_HASH;
+
+ l234_info = rxdp[i].rx.l234_info;
+ ol_info = rxdp[i].rx.ol_info;
+ bd_base_info = rxdp[i].rx.bd_base_info;
+ ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
+ l234_info, &cksum_err);
+ if (unlikely(ret)) {
+ retcode |= 1u << i;
+ continue;
+ }
+
+ pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+ if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+ hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
+ cksum_err);
+ }
+
+ return retcode;
+}
+
+static inline uint16_t
+hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
+ struct rte_mbuf **__restrict rx_pkts,
+ uint16_t nb_pkts,
+ uint64_t *bd_err_mask)
+{
+ uint16_t rx_id = rxq->next_to_use;
+ struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
+ struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
+ uint32_t bd_valid_num, parse_retcode;
+ uint16_t nb_rx = 0;
+ int pos, offset;
+
+ /* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
+ uint8x16_t shuf_desc_fields_msk = {
+ 0xff, 0xff, 0xff, 0xff, /* packet type init zero */
+ 22, 23, 0xff, 0xff, /* rx.pkt_len to rte_mbuf.pkt_len */
+ 20, 21, /* size to rte_mbuf.data_len */
+ 0xff, 0xff, /* rte_mbuf.vlan_tci init zero */
+ 8, 9, 10, 11, /* rx.rss_hash to rte_mbuf.hash.rss */
+ };
+
+ uint16x8_t crc_adjust = {
+ 0, 0, /* ignore pkt_type field */
+ rxq->crc_len, /* sub crc on pkt_len */
+ 0, /* ignore high-16bits of pkt_len */
+ rxq->crc_len, /* sub crc on data_len */
+ 0, 0, 0, /* ignore non-length fields */
+ };
+
+ for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
+ rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
+ uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
+ uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
+ uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+ uint64x2_t mbp1, mbp2;
+ uint16x4_t bd_vld = {0};
+ uint16x8_t tmp;
+ uint64_t stat;
+
+ /* calc how many bd valid */
+ bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
+ bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
+ bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
+ bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
+
+ /* load 2 mbuf pointer */
+ mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+ bd_vld = vshl_n_u16(bd_vld,
+ HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
+ bd_vld = vreinterpret_u16_s16(
+ vshr_n_s16(vreinterpret_s16_u16(bd_vld),
+ HNS3_UINT16_BIT - 1));
+ stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
+
+ /* load 2 mbuf pointer again */
+ mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+ if (likely(stat == 0))
+ bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
+ else
+ bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
+ if (bd_valid_num == 0)
+ break;
+
+ /* use offset to control below data load oper ordering */
+ offset = rxq->offset_table[bd_valid_num];
+
+ /* store 2 mbuf pointer into rx_pkts */
+ vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+ /* read first two descs */
+ descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
+ descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
+
+ /* store 2 mbuf pointer into rx_pkts again */
+ vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+ /* read remains two descs */
+ descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
+ descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
+
+ pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
+ pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
+ pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
+ pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
+
+ /* pkt 1,2 convert format from desc to pktmbuf */
+ pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
+ pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);
+
+ /* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
+ *(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
+ rxq->mbuf_initializer;
+ *(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
+ rxq->mbuf_initializer;
+
+ /* pkt 1,2 remove crc */
+ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+ pkt_mb1 = vreinterpretq_u8_u16(tmp);
+ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+ pkt_mb2 = vreinterpretq_u8_u16(tmp);
+
+ pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
+ pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
+ pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
+ pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
+
+ /* pkt 3,4 convert format from desc to pktmbuf */
+ pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
+ pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);
+
+ /* pkt 1,2 save to rx_pkts mbuf */
+ vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
+ pkt_mb1);
+ vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
+ pkt_mb2);
+
+ /* pkt 3,4 remove crc */
+ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+ pkt_mb3 = vreinterpretq_u8_u16(tmp);
+ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+ pkt_mb4 = vreinterpretq_u8_u16(tmp);
+
+ /* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
+ *(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
+ rxq->mbuf_initializer;
+ *(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
+ rxq->mbuf_initializer;
+
+ /* pkt 3,4 save to rx_pkts mbuf */
+ vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
+ pkt_mb3);
+ vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
+ pkt_mb4);
+
+ rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
+
+ parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
+ &rxdp[offset], bd_valid_num);
+ if (unlikely(parse_retcode))
+ (*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
+
+ rte_prefetch0(sw_ring[pos +
+ HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
+ rte_prefetch0(sw_ring[pos +
+ HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
+ rte_prefetch0(sw_ring[pos +
+ HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
+ rte_prefetch0(sw_ring[pos +
+ HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
+
+ nb_rx += bd_valid_num;
+ if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
+ break;
+ }
+
+ rxq->rx_rearm_nb += nb_rx;
+ rxq->next_to_use += nb_rx;
+ if (rxq->next_to_use >= rxq->nb_rx_desc)
+ rxq->next_to_use = 0;
+
+ return nb_rx;
+}
#endif /* _HNS3_RXTX_VEC_NEON_H_ */