net/hns3: support NEON Rx
authorWei Hu (Xavier) <xavier.huwei@huawei.com>
Wed, 9 Sep 2020 09:23:37 +0000 (17:23 +0800)
committerFerruh Yigit <ferruh.yigit@intel.com>
Mon, 21 Sep 2020 16:05:38 +0000 (18:05 +0200)
This patch adds NEON vector instructions to optimize Rx burst process.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
drivers/net/hns3/hns3_ethdev.c
drivers/net/hns3/hns3_ethdev.h
drivers/net/hns3/hns3_ethdev_vf.c
drivers/net/hns3/hns3_rxtx.c
drivers/net/hns3/hns3_rxtx.h
drivers/net/hns3/hns3_rxtx_vec.c
drivers/net/hns3/hns3_rxtx_vec.h
drivers/net/hns3/hns3_rxtx_vec_neon.h

index 796716e..73d5042 100644 (file)
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
                goto cfg_err;
 
        hns->rx_simple_allowed = true;
+       hns->rx_vec_allowed = true;
        hns->tx_simple_allowed = true;
        hns->tx_vec_allowed = true;
 
index 098b6ce..fd6a9f9 100644 (file)
@@ -643,6 +643,7 @@ struct hns3_adapter {
        };
 
        bool rx_simple_allowed;
+       bool rx_vec_allowed;
        bool tx_simple_allowed;
        bool tx_vec_allowed;
 
index f1f32a4..d7f5024 100644 (file)
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
                goto cfg_err;
 
        hns->rx_simple_allowed = true;
+       hns->rx_vec_allowed = true;
        hns->tx_simple_allowed = true;
        hns->tx_vec_allowed = true;
 
index 3e708b5..ada02de 100644 (file)
@@ -41,9 +41,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
        if (rxq->sw_ring == NULL)
                return;
 
-       for (i = 0; i < rxq->nb_rx_desc; i++)
-               if (rxq->sw_ring[i].mbuf)
-                       rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+       if (rxq->rx_rearm_nb == 0) {
+               for (i = 0; i < rxq->nb_rx_desc; i++) {
+                       if (rxq->sw_ring[i].mbuf != NULL)
+                               rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+               }
+       } else {
+               for (i = rxq->next_to_use;
+                    i != rxq->rx_rearm_start;
+                    i = (i + 1) % rxq->nb_rx_desc) {
+                       if (rxq->sw_ring[i].mbuf != NULL)
+                               rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+               }
+       }
 
        for (i = 0; i < rxq->bulk_mbuf_num; i++)
                rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
@@ -661,10 +671,13 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
        }
 
        rxq->next_to_use = 0;
+       rxq->rx_rearm_start = 0;
        rxq->rx_free_hold = 0;
+       rxq->rx_rearm_nb = 0;
        rxq->pkt_first_seg = NULL;
        rxq->pkt_last_seg = NULL;
        hns3_init_rx_queue_hw(rxq);
+       hns3_rxq_vec_setup(rxq);
 
        return 0;
 }
@@ -678,6 +691,8 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
        rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
        rxq->next_to_use = 0;
        rxq->rx_free_hold = 0;
+       rxq->rx_rearm_start = 0;
+       rxq->rx_rearm_nb = 0;
        hns3_init_rx_queue_hw(rxq);
 }
 
@@ -860,6 +875,40 @@ hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue)
        return 0;
 }
 
+/*
+ * Iterate over all Rx Queue, and call the callback() function for each Rx
+ * queue.
+ *
+ * @param[in] dev
+ *   The target eth dev.
+ * @param[in] callback
+ *   The function to call for each queue.
+ *   if callback function return nonzero will stop iterate and return it's value
+ * @param[in] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   0 on success, otherwise with errno set.
+ */
+int
+hns3_rxq_iterate(struct rte_eth_dev *dev,
+                int (*callback)(struct hns3_rx_queue *, void *), void *arg)
+{
+       uint32_t i;
+       int ret;
+
+       if (dev->data->rx_queues == NULL)
+               return -EINVAL;
+
+       for (i = 0; i < dev->data->nb_rx_queues; i++) {
+               ret = callback(dev->data->rx_queues[i], arg);
+               if (ret != 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
 static void*
 hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
                            struct hns3_queue_info *q_info)
@@ -880,7 +929,13 @@ hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
        /* Allocate rx ring hardware descriptors. */
        rxq->queue_id = q_info->idx;
        rxq->nb_rx_desc = q_info->nb_desc;
-       rx_desc = rxq->nb_rx_desc * sizeof(struct hns3_desc);
+
+       /*
+        * Allocate a litter more memory because rx vector functions
+        * don't check boundaries each time.
+        */
+       rx_desc = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+                       sizeof(struct hns3_desc);
        rx_mz = rte_eth_dma_zone_reserve(dev, q_info->ring_name, q_info->idx,
                                         rx_desc, HNS3_RING_BASE_ALIGN,
                                         q_info->socket_id);
@@ -1329,7 +1384,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
                conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
        rxq->rx_deferred_start = conf->rx_deferred_start;
 
-       rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
+       rx_entry_len = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+                       sizeof(struct hns3_entry);
        rxq->sw_ring = rte_zmalloc_socket("hns3 RX sw ring", rx_entry_len,
                                          RTE_CACHE_LINE_SIZE, socket_id);
        if (rxq->sw_ring == NULL) {
@@ -1340,6 +1396,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
        rxq->next_to_use = 0;
        rxq->rx_free_hold = 0;
+       rxq->rx_rearm_start = 0;
+       rxq->rx_rearm_nb = 0;
        rxq->pkt_first_seg = NULL;
        rxq->pkt_last_seg = NULL;
        rxq->port_id = dev->data->port_id;
@@ -1431,7 +1489,8 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
        };
 
        if (dev->rx_pkt_burst == hns3_recv_pkts ||
-           dev->rx_pkt_burst == hns3_recv_scattered_pkts)
+           dev->rx_pkt_burst == hns3_recv_scattered_pkts ||
+           dev->rx_pkt_burst == hns3_recv_pkts_vec)
                return ptypes;
 
        return NULL;
@@ -1915,6 +1974,25 @@ pkt_err:
        return nb_rx;
 }
 
+void __rte_weak
+hns3_rxq_vec_setup(__rte_unused struct hns3_rx_queue *rxq)
+{
+}
+
+int __rte_weak
+hns3_rx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+       return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_recv_pkts_vec(__rte_unused void *tx_queue,
+                  __rte_unused struct rte_mbuf **tx_pkts,
+                  __rte_unused uint16_t nb_pkts)
+{
+       return 0;
+}
+
 int
 hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
                       struct rte_eth_burst_mode *mode)
@@ -1925,6 +2003,7 @@ hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
        } burst_infos[] = {
                { hns3_recv_pkts,               "Scalar" },
                { hns3_recv_scattered_pkts,     "Scalar Scattered" },
+               { hns3_recv_pkts_vec,           "Vector Neon" },
        };
 
        eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
@@ -1949,6 +2028,9 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
        struct hns3_adapter *hns = dev->data->dev_private;
        uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
 
+       if (hns->rx_vec_allowed && hns3_rx_check_vec_support(dev) == 0)
+               return hns3_recv_pkts_vec;
+
        if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
            (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
                return hns3_recv_pkts;
index b471bf5..27041ab 100644 (file)
 #define HNS3_DEFAULT_TX_RS_THRESH      32
 #define HNS3_TX_FAST_FREE_AHEAD                64
 
+#define HNS3_DEFAULT_RX_BURST          32
+#if (HNS3_DEFAULT_RX_BURST > 64)
+#error "PMD HNS3: HNS3_DEFAULT_RX_BURST must <= 64\n"
+#endif
+#define HNS3_DEFAULT_DESCS_PER_LOOP    4
+#define HNS3_SVE_DEFAULT_DESCS_PER_LOOP        8
+#if (HNS3_DEFAULT_DESCS_PER_LOOP > HNS3_SVE_DEFAULT_DESCS_PER_LOOP)
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN        HNS3_DEFAULT_DESCS_PER_LOOP
+#else
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN        HNS3_SVE_DEFAULT_DESCS_PER_LOOP
+#endif
+#define HNS3_DEFAULT_RXQ_REARM_THRESH  64
 #define HNS3_UINT8_BIT                 8
 #define HNS3_UINT16_BIT                        16
 #define HNS3_UINT32_BIT                        32
@@ -236,7 +248,13 @@ struct hns3_desc {
                                        uint16_t ot_vlan_tag;
                                };
                        };
-                       uint32_t bd_base_info;
+                       union {
+                               uint32_t bd_base_info;
+                               struct {
+                                       uint16_t bdtype_vld_udp0;
+                                       uint16_t fe_lum_crcp_l3l4p;
+                               };
+                       };
                } rx;
        };
 } __rte_packed;
@@ -270,7 +288,8 @@ struct hns3_rx_queue {
        uint16_t rx_free_thresh;
        uint16_t next_to_use;    /* index of next BD to be polled */
        uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
-
+       uint16_t rx_rearm_start; /* index of BD that driver re-arming from */
+       uint16_t rx_rearm_nb;    /* number of remaining BDs to be re-armed */
        /*
         * port based vlan configuration state.
         * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -292,6 +311,11 @@ struct hns3_rx_queue {
 
        struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
        uint16_t bulk_mbuf_num;
+
+       /* offset_table: used for vector, to solve execute re-order problem */
+       uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+       uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+       struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
 };
 
 struct hns3_tx_queue {
@@ -554,6 +578,8 @@ int hns3_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
 void hns3_enable_all_queues(struct hns3_hw *hw, bool en);
 int hns3_start_queues(struct hns3_adapter *hns, bool reset_queue);
 int hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue);
+int hns3_rxq_iterate(struct rte_eth_dev *dev,
+                int (*callback)(struct hns3_rx_queue *, void *), void *arg);
 void hns3_dev_release_mbufs(struct hns3_adapter *hns);
 int hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
                        unsigned int socket, const struct rte_eth_rxconf *conf,
@@ -564,9 +590,12 @@ uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                        uint16_t nb_pkts);
 uint16_t hns3_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                                  uint16_t nb_pkts);
+uint16_t hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+                           uint16_t nb_pkts);
 int hns3_rx_burst_mode_get(struct rte_eth_dev *dev,
                           __rte_unused uint16_t queue_id,
                           struct rte_eth_burst_mode *mode);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
                        uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -594,7 +623,9 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
+void hns3_rxq_vec_setup(struct hns3_rx_queue *rxq);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
                       struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
index 1154b6f..a26c83d 100644 (file)
@@ -45,3 +45,170 @@ hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
        return nb_tx;
 }
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM    4
+       struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+       struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+       uint64_t dma_addr;
+       int i;
+
+       if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+                                         HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+               rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+               return;
+       }
+
+       for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+               rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+               if (likely(i <
+                       HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+                       rte_prefetch_non_temporal(rxep[4].mbuf);
+                       rte_prefetch_non_temporal(rxep[5].mbuf);
+                       rte_prefetch_non_temporal(rxep[6].mbuf);
+                       rte_prefetch_non_temporal(rxep[7].mbuf);
+               }
+
+               dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+               rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+               rxdp[0].rx.bd_base_info = 0;
+
+               dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+               rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+               rxdp[1].rx.bd_base_info = 0;
+
+               dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+               rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+               rxdp[2].rx.bd_base_info = 0;
+
+               dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+               rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+               rxdp[3].rx.bd_base_info = 0;
+       }
+
+       rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+       if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+               rxq->rx_rearm_start = 0;
+
+       rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+       hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
+
+uint16_t
+hns3_recv_pkts_vec(void *__restrict rx_queue,
+                  struct rte_mbuf **__restrict rx_pkts,
+                  uint16_t nb_pkts)
+{
+       struct hns3_rx_queue *rxq = rx_queue;
+       struct hns3_desc *rxdp = &rxq->rx_ring[rxq->next_to_use];
+       uint64_t bd_err_mask;  /* bit mask indicate whick pkts is error */
+       uint16_t nb_rx;
+
+       nb_pkts = RTE_MIN(nb_pkts, HNS3_DEFAULT_RX_BURST);
+       nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_DEFAULT_DESCS_PER_LOOP);
+
+       rte_prefetch_non_temporal(rxdp);
+
+       if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
+               hns3_rxq_rearm_mbuf(rxq);
+
+       if (unlikely(!(rxdp->rx.bd_base_info &
+                       rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
+               return 0;
+
+       rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 0].mbuf);
+       rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 1].mbuf);
+       rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 2].mbuf);
+       rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 3].mbuf);
+
+       bd_err_mask = 0;
+       nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts, &bd_err_mask);
+       if (unlikely(bd_err_mask))
+               nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx, bd_err_mask);
+
+       return nb_rx;
+}
+
+static void
+hns3_rxq_vec_setup_rearm_data(struct hns3_rx_queue *rxq)
+{
+       uintptr_t p;
+       struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+       mb_def.nb_segs = 1;
+       mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+       mb_def.port = rxq->port_id;
+       rte_mbuf_refcnt_set(&mb_def, 1);
+
+       /* prevent compiler reordering: rearm_data covers previous fields */
+       rte_compiler_barrier();
+       p = (uintptr_t)&mb_def.rearm_data;
+       rxq->mbuf_initializer = *(uint64_t *)p;
+}
+
+void
+hns3_rxq_vec_setup(struct hns3_rx_queue *rxq)
+{
+       struct hns3_entry *sw_ring = &rxq->sw_ring[rxq->nb_rx_desc];
+       unsigned int i;
+
+       memset(&rxq->rx_ring[rxq->nb_rx_desc], 0,
+               sizeof(struct hns3_desc) * HNS3_DEFAULT_RX_BURST);
+
+       memset(&rxq->fake_mbuf, 0, sizeof(rxq->fake_mbuf));
+       for (i = 0; i < HNS3_DEFAULT_RX_BURST; i++)
+               sw_ring[i].mbuf = &rxq->fake_mbuf;
+
+       hns3_rxq_vec_setup_rearm_data(rxq);
+
+       memset(rxq->offset_table, 0, sizeof(rxq->offset_table));
+}
+
+#ifndef RTE_LIBRTE_IEEE1588
+static int
+hns3_rxq_vec_check(struct hns3_rx_queue *rxq, void *arg)
+{
+       uint32_t min_vec_bds = HNS3_DEFAULT_RXQ_REARM_THRESH +
+                               HNS3_DEFAULT_RX_BURST;
+
+       if (rxq->nb_rx_desc < min_vec_bds)
+               return -ENOTSUP;
+
+       if (rxq->nb_rx_desc % HNS3_DEFAULT_RXQ_REARM_THRESH)
+               return -ENOTSUP;
+
+       RTE_SET_USED(arg);
+       return 0;
+}
+#endif
+
+int
+hns3_rx_check_vec_support(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+       struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+       struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+       uint64_t offloads_mask = DEV_RX_OFFLOAD_TCP_LRO |
+                                DEV_RX_OFFLOAD_VLAN;
+
+       if (dev->data->scattered_rx)
+               return -ENOTSUP;
+
+       if (fconf->mode != RTE_FDIR_MODE_NONE)
+               return -ENOTSUP;
+
+       if (rxmode->offloads & offloads_mask)
+               return -ENOTSUP;
+
+       if (hns3_rxq_iterate(dev, hns3_rxq_vec_check, NULL) != 0)
+               return -ENOTSUP;
+
+       return 0;
+#else
+       RTE_SET_USED(dev);
+       return -ENOTSUP;
+#endif
+}
index 90679bf..c6df36d 100644 (file)
@@ -54,4 +54,24 @@ hns3_tx_free_buffers(struct hns3_tx_queue *txq)
        if (txq->next_to_clean >= txq->nb_tx_desc)
                txq->next_to_clean = 0;
 }
+
+static inline uint16_t
+hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
+                       uint16_t nb_pkts,
+                       uint64_t pkt_err_mask)
+{
+       uint16_t count, i;
+       uint64_t mask;
+
+       count = 0;
+       for (i = 0; i < nb_pkts; i++) {
+               mask = ((uint64_t)1u) << i;
+               if (pkt_err_mask & mask)
+                       rte_pktmbuf_free_seg(rx_pkts[i]);
+               else
+                       rx_pkts[count++] = rx_pkts[i];
+       }
+
+       return count;
+}
 #endif /* _HNS3_RXTX_VEC_H_ */
index e878ee1..8d7721b 100644 (file)
@@ -82,4 +82,207 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 
        return nb_tx;
 }
+
+static inline uint32_t
+hns3_desc_parse_field(struct hns3_rx_queue *rxq,
+                     struct hns3_entry *sw_ring,
+                     struct hns3_desc *rxdp,
+                     uint32_t   bd_vld_num)
+{
+       uint32_t l234_info, ol_info, bd_base_info;
+       struct rte_mbuf *pkt;
+       uint32_t retcode = 0;
+       uint32_t cksum_err;
+       int ret, i;
+
+       for (i = 0; i < (int)bd_vld_num; i++) {
+               pkt = sw_ring[i].mbuf;
+
+               /* init rte_mbuf.rearm_data last 64-bit */
+               pkt->ol_flags = PKT_RX_RSS_HASH;
+
+               l234_info = rxdp[i].rx.l234_info;
+               ol_info = rxdp[i].rx.ol_info;
+               bd_base_info = rxdp[i].rx.bd_base_info;
+               ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
+                                        l234_info, &cksum_err);
+               if (unlikely(ret)) {
+                       retcode |= 1u << i;
+                       continue;
+               }
+
+               pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+               if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+                       hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
+                                              cksum_err);
+       }
+
+       return retcode;
+}
+
+static inline uint16_t
+hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
+                   struct rte_mbuf **__restrict rx_pkts,
+                   uint16_t nb_pkts,
+                   uint64_t *bd_err_mask)
+{
+       uint16_t rx_id = rxq->next_to_use;
+       struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
+       struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
+       uint32_t bd_valid_num, parse_retcode;
+       uint16_t nb_rx = 0;
+       int pos, offset;
+
+       /* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
+       uint8x16_t shuf_desc_fields_msk = {
+               0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
+               22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
+               20, 21,                  /* size to rte_mbuf.data_len */
+               0xff, 0xff,              /* rte_mbuf.vlan_tci init zero */
+               8, 9, 10, 11,            /* rx.rss_hash to rte_mbuf.hash.rss */
+       };
+
+       uint16x8_t crc_adjust = {
+               0, 0,         /* ignore pkt_type field */
+               rxq->crc_len, /* sub crc on pkt_len */
+               0,            /* ignore high-16bits of pkt_len */
+               rxq->crc_len, /* sub crc on data_len */
+               0, 0, 0,      /* ignore non-length fields */
+       };
+
+       for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
+                                    rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
+               uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
+               uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
+               uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+               uint64x2_t mbp1, mbp2;
+               uint16x4_t bd_vld = {0};
+               uint16x8_t tmp;
+               uint64_t stat;
+
+               /* calc how many bd valid */
+               bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
+               bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
+               bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
+               bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
+
+               /* load 2 mbuf pointer */
+               mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+               bd_vld = vshl_n_u16(bd_vld,
+                                   HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
+               bd_vld = vreinterpret_u16_s16(
+                               vshr_n_s16(vreinterpret_s16_u16(bd_vld),
+                                          HNS3_UINT16_BIT - 1));
+               stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
+
+               /* load 2 mbuf pointer again */
+               mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+               if (likely(stat == 0))
+                       bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
+               else
+                       bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
+               if (bd_valid_num == 0)
+                       break;
+
+               /* use offset to control below data load oper ordering */
+               offset = rxq->offset_table[bd_valid_num];
+
+               /* store 2 mbuf pointer into rx_pkts */
+               vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+               /* read first two descs */
+               descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
+               descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
+
+               /* store 2 mbuf pointer into rx_pkts again */
+               vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+               /* read remains two descs */
+               descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
+               descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
+
+               pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
+               pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
+               pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
+               pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
+
+               /* pkt 1,2 convert format from desc to pktmbuf */
+               pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
+               pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);
+
+               /* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
+               *(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
+                       rxq->mbuf_initializer;
+               *(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
+                       rxq->mbuf_initializer;
+
+               /* pkt 1,2 remove crc */
+               tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+               pkt_mb1 = vreinterpretq_u8_u16(tmp);
+               tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+               pkt_mb2 = vreinterpretq_u8_u16(tmp);
+
+               pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
+               pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
+               pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
+               pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
+
+               /* pkt 3,4 convert format from desc to pktmbuf */
+               pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
+               pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);
+
+               /* pkt 1,2 save to rx_pkts mbuf */
+               vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
+                        pkt_mb1);
+               vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
+                        pkt_mb2);
+
+               /* pkt 3,4 remove crc */
+               tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+               pkt_mb3 = vreinterpretq_u8_u16(tmp);
+               tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+               pkt_mb4 = vreinterpretq_u8_u16(tmp);
+
+               /* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
+               *(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
+                       rxq->mbuf_initializer;
+               *(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
+                       rxq->mbuf_initializer;
+
+               /* pkt 3,4 save to rx_pkts mbuf */
+               vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
+                        pkt_mb3);
+               vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
+                        pkt_mb4);
+
+               rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
+
+               parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
+                       &rxdp[offset], bd_valid_num);
+               if (unlikely(parse_retcode))
+                       (*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
+
+               rte_prefetch0(sw_ring[pos +
+                                     HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
+               rte_prefetch0(sw_ring[pos +
+                                     HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
+               rte_prefetch0(sw_ring[pos +
+                                     HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
+               rte_prefetch0(sw_ring[pos +
+                                     HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
+
+               nb_rx += bd_valid_num;
+               if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
+                       break;
+       }
+
+       rxq->rx_rearm_nb += nb_rx;
+       rxq->next_to_use += nb_rx;
+       if (rxq->next_to_use >= rxq->nb_rx_desc)
+               rxq->next_to_use = 0;
+
+       return nb_rx;
+}
 #endif /* _HNS3_RXTX_VEC_NEON_H_ */