From: Wei Hu (Xavier)
Date: Wed, 9 Sep 2020 09:23:37 +0000 (+0800)
Subject: net/hns3: support NEON Rx
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=a3d4f4d291d7;p=dpdk.git

net/hns3: support NEON Rx

This patch adds NEON vector instructions to optimize Rx burst process.

Signed-off-by: Chengwen Feng
Signed-off-by: Wei Hu (Xavier)
Signed-off-by: Huisong Li
---

diff --git a/drivers/net/hns3/hns3_ethdev.c b/drivers/net/hns3/hns3_ethdev.c
index 796716eef4..73d504253d 100644
--- a/drivers/net/hns3/hns3_ethdev.c
+++ b/drivers/net/hns3/hns3_ethdev.c
@@ -2352,6 +2352,7 @@ hns3_dev_configure(struct rte_eth_dev *dev)
                 goto cfg_err;
 
         hns->rx_simple_allowed = true;
+        hns->rx_vec_allowed = true;
         hns->tx_simple_allowed = true;
         hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 098b6ce2c6..fd6a9f998f 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -643,6 +643,7 @@ struct hns3_adapter {
         };
 
         bool rx_simple_allowed;
+        bool rx_vec_allowed;
         bool tx_simple_allowed;
         bool tx_vec_allowed;
 
diff --git a/drivers/net/hns3/hns3_ethdev_vf.c b/drivers/net/hns3/hns3_ethdev_vf.c
index f1f32a40c4..d7f502442c 100644
--- a/drivers/net/hns3/hns3_ethdev_vf.c
+++ b/drivers/net/hns3/hns3_ethdev_vf.c
@@ -822,6 +822,7 @@ hns3vf_dev_configure(struct rte_eth_dev *dev)
                 goto cfg_err;
 
         hns->rx_simple_allowed = true;
+        hns->rx_vec_allowed = true;
         hns->tx_simple_allowed = true;
         hns->tx_vec_allowed = true;
 
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 3e708b5dd8..ada02de0c2 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -41,9 +41,19 @@ hns3_rx_queue_release_mbufs(struct hns3_rx_queue *rxq)
         if (rxq->sw_ring == NULL)
                 return;
 
-        for (i = 0; i < rxq->nb_rx_desc; i++)
-                if (rxq->sw_ring[i].mbuf)
-                        rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+        if (rxq->rx_rearm_nb == 0) {
+                for (i = 0; i < rxq->nb_rx_desc; i++) {
+                        if (rxq->sw_ring[i].mbuf != NULL)
+                                rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+                }
+        } else {
+                for (i = rxq->next_to_use;
+                     i != rxq->rx_rearm_start;
+                     i = (i + 1) % rxq->nb_rx_desc) {
+                        if (rxq->sw_ring[i].mbuf != NULL)
+                                rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+                }
+        }
 
         for (i = 0; i < rxq->bulk_mbuf_num; i++)
                 rte_pktmbuf_free_seg(rxq->bulk_mbuf[i]);
@@ -661,10 +671,13 @@ hns3_dev_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
         }
 
         rxq->next_to_use = 0;
+        rxq->rx_rearm_start = 0;
         rxq->rx_free_hold = 0;
+        rxq->rx_rearm_nb = 0;
         rxq->pkt_first_seg = NULL;
         rxq->pkt_last_seg = NULL;
         hns3_init_rx_queue_hw(rxq);
+        hns3_rxq_vec_setup(rxq);
 
         return 0;
 }
@@ -678,6 +691,8 @@ hns3_fake_rx_queue_start(struct hns3_adapter *hns, uint16_t idx)
         rxq = (struct hns3_rx_queue *)hw->fkq_data.rx_queues[idx];
         rxq->next_to_use = 0;
         rxq->rx_free_hold = 0;
+        rxq->rx_rearm_start = 0;
+        rxq->rx_rearm_nb = 0;
         hns3_init_rx_queue_hw(rxq);
 }
 
@@ -860,6 +875,40 @@ hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue)
         return 0;
 }
 
+/*
+ * Iterate over all Rx queues, and call the callback() function for each Rx
+ * queue.
+ *
+ * @param[in] dev
+ *   The target eth dev.
+ * @param[in] callback
+ *   The function to call for each queue. If the callback returns nonzero,
+ *   the iteration stops and that value is returned.
+ * @param[in] arg
+ *   The arguments to provide the callback function with.
+ *
+ * @return
+ *   0 on success, otherwise with errno set.
+ */
+int
+hns3_rxq_iterate(struct rte_eth_dev *dev,
+                 int (*callback)(struct hns3_rx_queue *, void *), void *arg)
+{
+        uint32_t i;
+        int ret;
+
+        if (dev->data->rx_queues == NULL)
+                return -EINVAL;
+
+        for (i = 0; i < dev->data->nb_rx_queues; i++) {
+                ret = callback(dev->data->rx_queues[i], arg);
+                if (ret != 0)
+                        return ret;
+        }
+
+        return 0;
+}
+
 static void*
 hns3_alloc_rxq_and_dma_zone(struct rte_eth_dev *dev,
                             struct hns3_queue_info *q_info)
@@ -880,7 +929,13 @@ hns3_alloc_rxq_and_dma_zone,
         /* Allocate rx ring hardware descriptors. */
         rxq->queue_id = q_info->idx;
         rxq->nb_rx_desc = q_info->nb_desc;
-        rx_desc = rxq->nb_rx_desc * sizeof(struct hns3_desc);
+
+        /*
+         * Allocate a little more memory because Rx vector functions
+         * don't check boundaries each time.
+         */
+        rx_desc = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+                        sizeof(struct hns3_desc);
         rx_mz = rte_eth_dma_zone_reserve(dev, q_info->ring_name, q_info->idx,
                                          rx_desc, HNS3_RING_BASE_ALIGN,
                                          q_info->socket_id);
@@ -1329,7 +1384,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
                 conf->rx_free_thresh : HNS3_DEFAULT_RX_FREE_THRESH;
         rxq->rx_deferred_start = conf->rx_deferred_start;
 
-        rx_entry_len = sizeof(struct hns3_entry) * rxq->nb_rx_desc;
+        rx_entry_len = (rxq->nb_rx_desc + HNS3_DEFAULT_RX_BURST) *
+                        sizeof(struct hns3_entry);
         rxq->sw_ring = rte_zmalloc_socket("hns3 RX sw ring", rx_entry_len,
                                           RTE_CACHE_LINE_SIZE, socket_id);
         if (rxq->sw_ring == NULL) {
@@ -1340,6 +1396,8 @@ hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 
         rxq->next_to_use = 0;
         rxq->rx_free_hold = 0;
+        rxq->rx_rearm_start = 0;
+        rxq->rx_rearm_nb = 0;
         rxq->pkt_first_seg = NULL;
         rxq->pkt_last_seg = NULL;
         rxq->port_id = dev->data->port_id;
@@ -1431,7 +1489,8 @@ hns3_dev_supported_ptypes_get(struct rte_eth_dev *dev)
         };
 
         if (dev->rx_pkt_burst == hns3_recv_pkts ||
-            dev->rx_pkt_burst == hns3_recv_scattered_pkts)
+            dev->rx_pkt_burst == hns3_recv_scattered_pkts ||
+            dev->rx_pkt_burst == hns3_recv_pkts_vec)
                 return ptypes;
 
         return NULL;
@@ -1915,6 +1974,25 @@ pkt_err:
         return nb_rx;
 }
 
+void __rte_weak
+hns3_rxq_vec_setup(__rte_unused struct hns3_rx_queue *rxq)
+{
+}
+
+int __rte_weak
+hns3_rx_check_vec_support(__rte_unused struct rte_eth_dev *dev)
+{
+        return -ENOTSUP;
+}
+
+uint16_t __rte_weak
+hns3_recv_pkts_vec(__rte_unused void *rx_queue,
+                   __rte_unused struct rte_mbuf **rx_pkts,
+                   __rte_unused uint16_t nb_pkts)
+{
+        return 0;
+}
+
 int
 hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
                        struct rte_eth_burst_mode *mode)
@@ -1925,6 +2003,7 @@ hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
         } burst_infos[] = {
                 { hns3_recv_pkts, "Scalar" },
                 { hns3_recv_scattered_pkts, "Scalar Scattered" },
+                { hns3_recv_pkts_vec, "Vector Neon" },
         };
 
         eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
@@ -1949,6 +2028,9 @@ hns3_get_rx_function(struct rte_eth_dev *dev)
         struct hns3_adapter *hns = dev->data->dev_private;
         uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
 
+        if (hns->rx_vec_allowed && hns3_rx_check_vec_support(dev) == 0)
+                return hns3_recv_pkts_vec;
+
         if (hns->rx_simple_allowed && !dev->data->scattered_rx &&
             (offloads & DEV_RX_OFFLOAD_TCP_LRO) == 0)
                 return hns3_recv_pkts;
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index b471bf505c..27041ab7c7 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -17,6 +17,18 @@
 #define HNS3_DEFAULT_TX_RS_THRESH 32
 #define HNS3_TX_FAST_FREE_AHEAD 64
 
+#define HNS3_DEFAULT_RX_BURST 32
+#if (HNS3_DEFAULT_RX_BURST > 64)
+#error "PMD HNS3: HNS3_DEFAULT_RX_BURST must <= 64\n"
+#endif
+#define HNS3_DEFAULT_DESCS_PER_LOOP 4
+#define HNS3_SVE_DEFAULT_DESCS_PER_LOOP 8
+#if (HNS3_DEFAULT_DESCS_PER_LOOP > HNS3_SVE_DEFAULT_DESCS_PER_LOOP)
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN HNS3_DEFAULT_DESCS_PER_LOOP
+#else
+#define HNS3_VECTOR_RX_OFFSET_TABLE_LEN HNS3_SVE_DEFAULT_DESCS_PER_LOOP
+#endif
+#define HNS3_DEFAULT_RXQ_REARM_THRESH 64
 #define HNS3_UINT8_BIT 8
 #define HNS3_UINT16_BIT 16
 #define HNS3_UINT32_BIT 32
@@ -236,7 +248,13 @@ struct hns3_desc {
                                         uint16_t ot_vlan_tag;
                                 };
                         };
-                        uint32_t bd_base_info;
+                        union {
+                                uint32_t bd_base_info;
+                                struct {
+                                        uint16_t bdtype_vld_udp0;
+                                        uint16_t fe_lum_crcp_l3l4p;
+                                };
+                        };
                 } rx;
         };
 } __rte_packed;
@@ -270,7 +288,8 @@ struct hns3_rx_queue {
         uint16_t rx_free_thresh;
         uint16_t next_to_use;    /* index of next BD to be polled */
         uint16_t rx_free_hold;   /* num of BDs waited to passed to hardware */
-
+        uint16_t rx_rearm_start; /* index of BD that driver re-arming from */
+        uint16_t rx_rearm_nb;    /* number of remaining BDs to be re-armed */
         /*
          * port based vlan configuration state.
          * value range: HNS3_PORT_BASE_VLAN_DISABLE / HNS3_PORT_BASE_VLAN_ENABLE
@@ -292,6 +311,11 @@ struct hns3_rx_queue {
 
         struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
         uint16_t bulk_mbuf_num;
+
+        /* offset_table: used for vector, to solve execute re-order problem */
+        uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+        uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+        struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
 };
 
 struct hns3_tx_queue {
@@ -554,6 +578,8 @@ int hns3_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id);
 void hns3_enable_all_queues(struct hns3_hw *hw, bool en);
 int hns3_start_queues(struct hns3_adapter *hns, bool reset_queue);
 int hns3_stop_queues(struct hns3_adapter *hns, bool reset_queue);
+int hns3_rxq_iterate(struct rte_eth_dev *dev,
+                 int (*callback)(struct hns3_rx_queue *, void *), void *arg);
 void hns3_dev_release_mbufs(struct hns3_adapter *hns);
 int hns3_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
                         unsigned int socket, const struct rte_eth_rxconf *conf,
@@ -564,9 +590,12 @@ uint16_t hns3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                         uint16_t nb_pkts);
 uint16_t hns3_recv_scattered_pkts(void *rx_queue,
                                   struct rte_mbuf **rx_pkts,
                                   uint16_t nb_pkts);
+uint16_t hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+                            uint16_t nb_pkts);
 int hns3_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id,
                            struct rte_eth_burst_mode *mode);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 uint16_t hns3_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
                         uint16_t nb_pkts);
 uint16_t hns3_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -594,7 +623,9 @@ int hns3_restore_gro_conf(struct hns3_hw *hw);
 void hns3_update_all_queues_pvid_state(struct hns3_hw *hw);
 void hns3_rx_scattered_reset(struct rte_eth_dev *dev);
 void hns3_rx_scattered_calc(struct rte_eth_dev *dev);
+int hns3_rx_check_vec_support(struct rte_eth_dev *dev);
 int hns3_tx_check_vec_support(struct rte_eth_dev *dev);
+void hns3_rxq_vec_setup(struct hns3_rx_queue *rxq);
 void hns3_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
                        struct rte_eth_rxq_info *qinfo);
 void hns3_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
diff --git a/drivers/net/hns3/hns3_rxtx_vec.c b/drivers/net/hns3/hns3_rxtx_vec.c
index 1154b6fdfb..a26c83d146 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.c
+++ b/drivers/net/hns3/hns3_rxtx_vec.c
@@ -45,3 +45,170 @@ hns3_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
         return nb_tx;
 }
+
+static inline void
+hns3_rxq_rearm_mbuf(struct hns3_rx_queue *rxq)
+{
+#define REARM_LOOP_STEP_NUM 4
+        struct hns3_entry *rxep = &rxq->sw_ring[rxq->rx_rearm_start];
+        struct hns3_desc *rxdp = rxq->rx_ring + rxq->rx_rearm_start;
+        uint64_t dma_addr;
+        int i;
+
+        if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep,
+                                          HNS3_DEFAULT_RXQ_REARM_THRESH) < 0)) {
+                rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
+                return;
+        }
+
+        for (i = 0; i < HNS3_DEFAULT_RXQ_REARM_THRESH; i += REARM_LOOP_STEP_NUM,
+                rxep += REARM_LOOP_STEP_NUM, rxdp += REARM_LOOP_STEP_NUM) {
+                if (likely(i <
+                        HNS3_DEFAULT_RXQ_REARM_THRESH - REARM_LOOP_STEP_NUM)) {
+                        rte_prefetch_non_temporal(rxep[4].mbuf);
+                        rte_prefetch_non_temporal(rxep[5].mbuf);
+                        rte_prefetch_non_temporal(rxep[6].mbuf);
+                        rte_prefetch_non_temporal(rxep[7].mbuf);
+                }
+
+                dma_addr = rte_mbuf_data_iova_default(rxep[0].mbuf);
+                rxdp[0].addr = rte_cpu_to_le_64(dma_addr);
+                rxdp[0].rx.bd_base_info = 0;
+
+                dma_addr = rte_mbuf_data_iova_default(rxep[1].mbuf);
+                rxdp[1].addr = rte_cpu_to_le_64(dma_addr);
+                rxdp[1].rx.bd_base_info = 0;
+
+                dma_addr = rte_mbuf_data_iova_default(rxep[2].mbuf);
+                rxdp[2].addr = rte_cpu_to_le_64(dma_addr);
+                rxdp[2].rx.bd_base_info = 0;
+
+                dma_addr = rte_mbuf_data_iova_default(rxep[3].mbuf);
+                rxdp[3].addr = rte_cpu_to_le_64(dma_addr);
+                rxdp[3].rx.bd_base_info = 0;
+        }
+
+        rxq->rx_rearm_start += HNS3_DEFAULT_RXQ_REARM_THRESH;
+        if (rxq->rx_rearm_start >= rxq->nb_rx_desc)
+                rxq->rx_rearm_start = 0;
+
+        rxq->rx_rearm_nb -= HNS3_DEFAULT_RXQ_REARM_THRESH;
+
+        hns3_write_reg_opt(rxq->io_head_reg, HNS3_DEFAULT_RXQ_REARM_THRESH);
+}
+
+uint16_t
+hns3_recv_pkts_vec(void *__restrict rx_queue,
+                   struct rte_mbuf **__restrict rx_pkts,
+                   uint16_t nb_pkts)
+{
+        struct hns3_rx_queue *rxq = rx_queue;
+        struct hns3_desc *rxdp = &rxq->rx_ring[rxq->next_to_use];
+        uint64_t bd_err_mask; /* bit mask indicating which pkts are in error */
+        uint16_t nb_rx;
+
+        nb_pkts = RTE_MIN(nb_pkts, HNS3_DEFAULT_RX_BURST);
+        nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, HNS3_DEFAULT_DESCS_PER_LOOP);
+
+        rte_prefetch_non_temporal(rxdp);
+
+        if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
+                hns3_rxq_rearm_mbuf(rxq);
+
+        if (unlikely(!(rxdp->rx.bd_base_info &
+                        rte_cpu_to_le_32(1u << HNS3_RXD_VLD_B))))
+                return 0;
+
+        rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 0].mbuf);
+        rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 1].mbuf);
+        rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 2].mbuf);
+        rte_prefetch0(rxq->sw_ring[rxq->next_to_use + 3].mbuf);
+
+        bd_err_mask = 0;
+        nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts, &bd_err_mask);
+        if (unlikely(bd_err_mask))
+                nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx, bd_err_mask);
+
+        return nb_rx;
+}
+
+static void
+hns3_rxq_vec_setup_rearm_data(struct hns3_rx_queue *rxq)
+{
+        uintptr_t p;
+        struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+        mb_def.nb_segs = 1;
+        mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+        mb_def.port = rxq->port_id;
+        rte_mbuf_refcnt_set(&mb_def, 1);
+
+        /* prevent compiler reordering: rearm_data covers previous fields */
+        rte_compiler_barrier();
+        p = (uintptr_t)&mb_def.rearm_data;
+        rxq->mbuf_initializer = *(uint64_t *)p;
+}
+
+void
+hns3_rxq_vec_setup(struct hns3_rx_queue *rxq)
+{
+        struct hns3_entry *sw_ring = &rxq->sw_ring[rxq->nb_rx_desc];
+        unsigned int i;
+
+        memset(&rxq->rx_ring[rxq->nb_rx_desc], 0,
+                sizeof(struct hns3_desc) * HNS3_DEFAULT_RX_BURST);
+
+        memset(&rxq->fake_mbuf, 0, sizeof(rxq->fake_mbuf));
+        for (i = 0; i < HNS3_DEFAULT_RX_BURST; i++)
+                sw_ring[i].mbuf = &rxq->fake_mbuf;
+
+        hns3_rxq_vec_setup_rearm_data(rxq);
+
+        memset(rxq->offset_table, 0, sizeof(rxq->offset_table));
+}
+
+#ifndef RTE_LIBRTE_IEEE1588
+static int
+hns3_rxq_vec_check(struct hns3_rx_queue *rxq, void *arg)
+{
+        uint32_t min_vec_bds = HNS3_DEFAULT_RXQ_REARM_THRESH +
+                                HNS3_DEFAULT_RX_BURST;
+
+        if (rxq->nb_rx_desc < min_vec_bds)
+                return -ENOTSUP;
+
+        if (rxq->nb_rx_desc % HNS3_DEFAULT_RXQ_REARM_THRESH)
+                return -ENOTSUP;
+
+        RTE_SET_USED(arg);
+        return 0;
+}
+#endif
+
+int
+hns3_rx_check_vec_support(struct rte_eth_dev *dev)
+{
+#ifndef RTE_LIBRTE_IEEE1588
+        struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;
+        struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+        uint64_t offloads_mask = DEV_RX_OFFLOAD_TCP_LRO |
+                                 DEV_RX_OFFLOAD_VLAN;
+
+        if (dev->data->scattered_rx)
+                return -ENOTSUP;
+
+        if (fconf->mode != RTE_FDIR_MODE_NONE)
+                return -ENOTSUP;
+
+        if (rxmode->offloads & offloads_mask)
+                return -ENOTSUP;
+
+        if (hns3_rxq_iterate(dev, hns3_rxq_vec_check, NULL) != 0)
+                return -ENOTSUP;
+
+        return 0;
+#else
+        RTE_SET_USED(dev);
+        return -ENOTSUP;
+#endif
+}
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
index 90679bf335..c6df36d017 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.h
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -54,4 +54,24 @@ hns3_tx_free_buffers(struct hns3_tx_queue *txq)
         if (txq->next_to_clean >= txq->nb_tx_desc)
                 txq->next_to_clean = 0;
 }
+
+static inline uint16_t
+hns3_rx_reassemble_pkts(struct rte_mbuf **rx_pkts,
+                        uint16_t nb_pkts,
+                        uint64_t pkt_err_mask)
+{
+        uint16_t count, i;
+        uint64_t mask;
+
+        count = 0;
+        for (i = 0; i < nb_pkts; i++) {
+                mask = ((uint64_t)1u) << i;
+                if (pkt_err_mask & mask)
+                        rte_pktmbuf_free_seg(rx_pkts[i]);
+                else
+                        rx_pkts[count++] = rx_pkts[i];
+        }
+
+        return count;
+}
 #endif /* _HNS3_RXTX_VEC_H_ */
diff --git a/drivers/net/hns3/hns3_rxtx_vec_neon.h b/drivers/net/hns3/hns3_rxtx_vec_neon.h
index e878ee1d24..8d7721baf3 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_neon.h
+++ b/drivers/net/hns3/hns3_rxtx_vec_neon.h
@@ -82,4 +82,207 @@ hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
 
         return nb_tx;
 }
+
+static inline uint32_t
+hns3_desc_parse_field(struct hns3_rx_queue *rxq,
+                      struct hns3_entry *sw_ring,
+                      struct hns3_desc *rxdp,
+                      uint32_t bd_vld_num)
+{
+        uint32_t l234_info, ol_info, bd_base_info;
+        struct rte_mbuf *pkt;
+        uint32_t retcode = 0;
+        uint32_t cksum_err;
+        int ret, i;
+
+        for (i = 0; i < (int)bd_vld_num; i++) {
+                pkt = sw_ring[i].mbuf;
+
+                /* init rte_mbuf.rearm_data last 64-bit */
+                pkt->ol_flags = PKT_RX_RSS_HASH;
+
+                l234_info = rxdp[i].rx.l234_info;
+                ol_info = rxdp[i].rx.ol_info;
+                bd_base_info = rxdp[i].rx.bd_base_info;
+                ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
+                                         l234_info, &cksum_err);
+                if (unlikely(ret)) {
+                        retcode |= 1u << i;
+                        continue;
+                }
+
+                pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
+                if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
+                        hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
+                                               cksum_err);
+        }
+
+        return retcode;
+}
+
+static inline uint16_t
+hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
+                    struct rte_mbuf **__restrict rx_pkts,
+                    uint16_t nb_pkts,
+                    uint64_t *bd_err_mask)
+{
+        uint16_t rx_id = rxq->next_to_use;
+        struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
+        struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
+        uint32_t bd_valid_num, parse_retcode;
+        uint16_t nb_rx = 0;
+        int pos, offset;
+
+        /* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
+        uint8x16_t shuf_desc_fields_msk = {
+                0xff, 0xff, 0xff, 0xff, /* packet type init zero */
+                22, 23, 0xff, 0xff, /* rx.pkt_len to rte_mbuf.pkt_len */
+                20, 21, /* size to rte_mbuf.data_len */
+                0xff, 0xff, /* rte_mbuf.vlan_tci init zero */
+                8, 9, 10, 11, /* rx.rss_hash to rte_mbuf.hash.rss */
+        };
+
+        uint16x8_t crc_adjust = {
+                0, 0, /* ignore pkt_type field */
+                rxq->crc_len, /* sub crc on pkt_len */
+                0, /* ignore high-16bits of pkt_len */
+                rxq->crc_len, /* sub crc on data_len */
+                0, 0, 0, /* ignore non-length fields */
+        };
+
+        for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
+                                     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
+                uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
+                uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
+                uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+                uint64x2_t mbp1, mbp2;
+                uint16x4_t bd_vld = {0};
+                uint16x8_t tmp;
+                uint64_t stat;
+
+                /* calc how many BDs are valid */
+                bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
+                bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
+                bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
+                bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);
+
+                /* load 2 mbuf pointers */
+                mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
+
+                bd_vld = vshl_n_u16(bd_vld,
+                                    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
+                bd_vld = vreinterpret_u16_s16(
+                                vshr_n_s16(vreinterpret_s16_u16(bd_vld),
+                                           HNS3_UINT16_BIT - 1));
+                stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);
+
+                /* load 2 mbuf pointers again */
+                mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
+
+                if (likely(stat == 0))
+                        bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
+                else
+                        bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
+                if (bd_valid_num == 0)
+                        break;
+
+                /* use offset to control the ordering of the data loads below */
+                offset = rxq->offset_table[bd_valid_num];
+
+                /* store 2 mbuf pointers into rx_pkts */
+                vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+                /* read first two descs */
+                descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
+                descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));
+
+                /* store 2 mbuf pointers into rx_pkts again */
+                vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+                /* read the remaining two descs */
+                descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
+                descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));
+
+                pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
+                pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
+                pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
+                pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);
+
+                /* pkt 1,2 convert format from desc to pktmbuf */
+                pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
+                pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);
+
+                /* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
+                *(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
+                        rxq->mbuf_initializer;
+                *(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
+                        rxq->mbuf_initializer;
+
+                /* pkt 1,2 remove crc */
+                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
+                pkt_mb1 = vreinterpretq_u8_u16(tmp);
+                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
+                pkt_mb2 = vreinterpretq_u8_u16(tmp);
+
+                pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
+                pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
+                pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
+                pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);
+
+                /* pkt 3,4 convert format from desc to pktmbuf */
+                pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
+                pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);
+
+                /* pkt 1,2 save to rx_pkts mbuf */
+                vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
+                         pkt_mb1);
+                vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
+                         pkt_mb2);
+
+                /* pkt 3,4 remove crc */
+                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
+                pkt_mb3 = vreinterpretq_u8_u16(tmp);
+                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
+                pkt_mb4 = vreinterpretq_u8_u16(tmp);
+
+                /* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
+                *(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
+                        rxq->mbuf_initializer;
+                *(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
+                        rxq->mbuf_initializer;
+
+                /* pkt 3,4 save to rx_pkts mbuf */
+                vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
+                         pkt_mb3);
+                vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
+                         pkt_mb4);
+
+                rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);
+
+                parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
+                                        &rxdp[offset], bd_valid_num);
+                if (unlikely(parse_retcode))
+                        (*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
+
+                rte_prefetch0(sw_ring[pos +
+                              HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
+                rte_prefetch0(sw_ring[pos +
+                              HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
+                rte_prefetch0(sw_ring[pos +
+                              HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
+                rte_prefetch0(sw_ring[pos +
+                              HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);
+
+                nb_rx += bd_valid_num;
+                if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
+                        break;
+        }
+
+        rxq->rx_rearm_nb += nb_rx;
+        rxq->next_to_use += nb_rx;
+        if (rxq->next_to_use >= rxq->nb_rx_desc)
+                rxq->next_to_use = 0;
+
+        return nb_rx;
+}
 #endif /* _HNS3_RXTX_VEC_NEON_H_ */