From: Lance Richardson Date: Tue, 7 Jul 2020 22:22:25 +0000 (-0700) Subject: net/bnxt: support NEON X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=398358341419ce2666fec38eff8fff98d5be5941;p=dpdk.git net/bnxt: support NEON Add bnxt vector PMD support using NEON SIMD instructions. Also update the 20.08 release notes with this information. Signed-off-by: Lance Richardson Reviewed-by: Ajit Khaparde Reviewed-by: Ruifeng Wang --- diff --git a/doc/guides/nics/bnxt.rst b/doc/guides/nics/bnxt.rst index a53cdad21d..6ff75d0a25 100644 --- a/doc/guides/nics/bnxt.rst +++ b/doc/guides/nics/bnxt.rst @@ -774,9 +774,9 @@ Vector processing provides significantly improved performance over scalar processing (see Vector Processor, here). The BNXT PMD supports the vector processing using SSE (Streaming SIMD -Extensions) instructions on x86 platforms. The BNXT vPMD (vector mode PMD) is -currently limited to Intel/AMD CPU architecture. Support for ARM is *not* -currently implemented. +Extensions) instructions on x86 platforms. It also supports NEON intrinsics for +vector processing on ARM CPUs. The BNXT vPMD (vector mode PMD) is available for +Intel/AMD and ARM CPU architectures. This improved performance comes from several optimizations: diff --git a/doc/guides/rel_notes/release_20_08.rst b/doc/guides/rel_notes/release_20_08.rst index dc32edb584..7d8f46232d 100644 --- a/doc/guides/rel_notes/release_20_08.rst +++ b/doc/guides/rel_notes/release_20_08.rst @@ -97,6 +97,7 @@ New Features * Added support for VXLAN encap/decap. * Added support for rte_flow_query for COUNT action. * Added support for rx_burst_mode_get and tx_burst_mode_get. + * Added vector mode support for ARM CPUs. * **Updated Mellanox mlx5 driver.** diff --git a/drivers/net/bnxt/Makefile b/drivers/net/bnxt/Makefile index 6b9544b5d6..c54fd108d1 100644 --- a/drivers/net/bnxt/Makefile +++ b/drivers/net/bnxt/Makefile @@ -44,6 +44,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += rte_pmd_bnxt.c ifeq ($(CONFIG_RTE_ARCH_X86), y) SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += bnxt_rxtx_vec_sse.c endif +ifeq ($(CONFIG_RTE_ARCH_ARM64), y) +SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += bnxt_rxtx_vec_neon.c +endif ifeq ($(CONFIG_RTE_LIBRTE_BNXT_PMD), y) CFLAGS += -I$(SRCDIR) -I$(SRCDIR)/tf_ulp -I$(SRCDIR)/tf_core -I$(SRCDIR)/hcapi diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c index 1dc85e6dff..3b9d3fbe73 100644 --- a/drivers/net/bnxt/bnxt_ethdev.c +++ b/drivers/net/bnxt/bnxt_ethdev.c @@ -1076,7 +1076,7 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev) { struct bnxt *bp = eth_dev->data->dev_private; -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) #ifndef RTE_LIBRTE_IEEE1588 /* * Vector mode receive can be enabled only if scatter rx is not @@ -1116,7 +1116,7 @@ bnxt_receive_function(struct rte_eth_dev *eth_dev) static eth_tx_burst_t bnxt_transmit_function(__rte_unused struct rte_eth_dev *eth_dev) { -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) #ifndef RTE_LIBRTE_IEEE1588 struct bnxt *bp = eth_dev->data->dev_private; @@ -2679,7 +2679,7 @@ bnxt_rx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id, "Scalar"); return 0; } -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) if (pkt_burst == bnxt_recv_pkts_vec) { snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE"); @@ -2701,7 +2701,7 @@ bnxt_tx_burst_mode_get(struct rte_eth_dev *dev, __rte_unused uint16_t queue_id, "Scalar"); return 0; } -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) if (pkt_burst == bnxt_xmit_pkts_vec) { snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE"); @@ -2730,7 +2730,7 @@ int bnxt_mtu_set_op(struct rte_eth_dev *eth_dev, uint16_t new_mtu) new_pkt_size = new_mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + VLAN_TAG_SIZE * BNXT_NUM_VLANS; -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) /* * If vector-mode tx/rx is active, disallow any MTU change that would * require scattered receive support. diff --git a/drivers/net/bnxt/bnxt_ring.c b/drivers/net/bnxt/bnxt_ring.c index 24a947f278..54f6547442 100644 --- a/drivers/net/bnxt/bnxt_ring.c +++ b/drivers/net/bnxt/bnxt_ring.c @@ -608,7 +608,7 @@ int bnxt_alloc_hwrm_rx_ring(struct bnxt *bp, int queue_index) bnxt_db_write(&rxr->ag_db, rxr->ag_prod); } rxq->index = queue_index; -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) bnxt_rxq_vec_setup(rxq); #endif @@ -713,7 +713,7 @@ int bnxt_alloc_hwrm_rings(struct bnxt *bp) bnxt_db_write(&rxr->rx_db, rxr->rx_prod); bnxt_db_write(&rxr->ag_db, rxr->ag_prod); rxq->index = i; -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) bnxt_rxq_vec_setup(rxq); #endif } diff --git a/drivers/net/bnxt/bnxt_rxq.h b/drivers/net/bnxt/bnxt_rxq.h index 4f5182d9e9..d5ce3b6d58 100644 --- a/drivers/net/bnxt/bnxt_rxq.h +++ b/drivers/net/bnxt/bnxt_rxq.h @@ -22,7 +22,7 @@ struct bnxt_rx_queue { uint16_t nb_rx_hold; /* num held free RX desc */ uint16_t rx_free_thresh; /* max free RX desc to hold */ uint16_t queue_id; /* RX queue index */ -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) uint16_t rxrearm_nb; /* number of descs to reinit. */ uint16_t rxrearm_start; /* next desc index to reinit. */ #endif diff --git a/drivers/net/bnxt/bnxt_rxr.c b/drivers/net/bnxt/bnxt_rxr.c index eb6f33c7be..baf73cb251 100644 --- a/drivers/net/bnxt/bnxt_rxr.c +++ b/drivers/net/bnxt/bnxt_rxr.c @@ -782,7 +782,7 @@ uint16_t bnxt_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, !rte_spinlock_trylock(&rxq->lock))) return 0; -#if defined(RTE_ARCH_X86) +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) /* * Replenish buffers if needed when a transition has been made from * vector- to non-vector- receive processing. diff --git a/drivers/net/bnxt/bnxt_rxr.h b/drivers/net/bnxt/bnxt_rxr.h index e60c97fa19..2bf46cd910 100644 --- a/drivers/net/bnxt/bnxt_rxr.h +++ b/drivers/net/bnxt/bnxt_rxr.h @@ -221,7 +221,7 @@ int bnxt_init_one_rx_ring(struct bnxt_rx_queue *rxq); int bnxt_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id); int bnxt_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id); -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) uint16_t bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); int bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq); diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_common.h b/drivers/net/bnxt/bnxt_rxtx_vec_common.h new file mode 100644 index 0000000000..3da3c48f4e --- /dev/null +++ b/drivers/net/bnxt/bnxt_rxtx_vec_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Broadcom + * All rights reserved. + */ + +#ifndef _BNXT_RXTX_VEC_COMMON_H_ +#define _BNXT_RXTX_VEC_COMMON_H_ + +#define RTE_BNXT_MAX_RX_BURST 32 +#define RTE_BNXT_MAX_TX_BURST 32 +#define RTE_BNXT_RXQ_REARM_THRESH 32 +#define RTE_BNXT_DESCS_PER_LOOP 4 + +#define TX_BD_FLAGS_CMPL ((1 << TX_BD_LONG_FLAGS_BD_CNT_SFT) | \ + TX_BD_SHORT_FLAGS_COAL_NOW | \ + TX_BD_SHORT_TYPE_TX_BD_SHORT | \ + TX_BD_LONG_FLAGS_PACKET_END) + +#define TX_BD_FLAGS_NOCMPL (TX_BD_FLAGS_CMPL | TX_BD_LONG_FLAGS_NO_CMPL) + +static inline uint32_t +bnxt_xmit_flags_len(uint16_t len, uint16_t flags) +{ + switch (len >> 9) { + case 0: + return flags | TX_BD_LONG_FLAGS_LHINT_LT512; + case 1: + return flags | TX_BD_LONG_FLAGS_LHINT_LT1K; + case 2: + return flags | TX_BD_LONG_FLAGS_LHINT_LT2K; + case 3: + return flags | TX_BD_LONG_FLAGS_LHINT_LT2K; + default: + return flags | TX_BD_LONG_FLAGS_LHINT_GTE2K; + } +} + +static inline int +bnxt_rxq_vec_setup_common(struct bnxt_rx_queue *rxq) +{ + uintptr_t p; + struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */ + + mb_def.nb_segs = 1; + mb_def.data_off = RTE_PKTMBUF_HEADROOM; + mb_def.port = rxq->port_id; + rte_mbuf_refcnt_set(&mb_def, 1); + + /* prevent compiler reordering: rearm_data covers previous fields */ + rte_compiler_barrier(); + p = (uintptr_t)&mb_def.rearm_data; + rxq->mbuf_initializer = *(uint64_t *)p; + rxq->rxrearm_nb = 0; + rxq->rxrearm_start = 0; + return 0; +} +#endif /* _BNXT_RXTX_VEC_COMMON_H_ */ diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c new file mode 100644 index 0000000000..488a0b4667 --- /dev/null +++ b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c @@ -0,0 +1,469 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright(c) 2019-2020 Broadcom All rights reserved. */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "bnxt.h" +#include "bnxt_cpr.h" +#include "bnxt_ring.h" +#include "bnxt_rxr.h" +#include "bnxt_rxq.h" +#include "hsi_struct_def_dpdk.h" +#include "bnxt_rxtx_vec_common.h" + +#include "bnxt_txq.h" +#include "bnxt_txr.h" + +/* + * RX Ring handling + */ + +static inline void +bnxt_rxq_rearm(struct bnxt_rx_queue *rxq, struct bnxt_rx_ring_info *rxr) +{ + struct rx_prod_pkt_bd *rxbds = &rxr->rx_desc_ring[rxq->rxrearm_start]; + struct bnxt_sw_rx_bd *rx_bufs = &rxr->rx_buf_ring[rxq->rxrearm_start]; + struct rte_mbuf *mb0, *mb1; + int i; + + const uint64x2_t hdr_room = {0, RTE_PKTMBUF_HEADROOM}; + const uint64x2_t addrmask = {0, UINT64_MAX}; + + /* Pull RTE_BNXT_RXQ_REARM_THRESH more mbufs into the software ring */ + if (rte_mempool_get_bulk(rxq->mb_pool, + (void *)rx_bufs, + RTE_BNXT_RXQ_REARM_THRESH) < 0) { + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += + RTE_BNXT_RXQ_REARM_THRESH; + + return; + } + + /* Initialize the mbufs in vector, process 2 mbufs in one loop */ + for (i = 0; i < RTE_BNXT_RXQ_REARM_THRESH; i += 2, rx_bufs += 2) { + uint64x2_t buf_addr0, buf_addr1; + uint64x2_t rxbd0, rxbd1; + + mb0 = rx_bufs[0].mbuf; + mb1 = rx_bufs[1].mbuf; + + /* Load address fields from both mbufs */ + buf_addr0 = vld1q_u64((uint64_t *)&mb0->buf_addr); + buf_addr1 = vld1q_u64((uint64_t *)&mb1->buf_addr); + + /* Load both rx descriptors (preserving some existing fields) */ + rxbd0 = vld1q_u64((uint64_t *)(rxbds + 0)); + rxbd1 = vld1q_u64((uint64_t *)(rxbds + 1)); + + /* Add default offset to buffer address. */ + buf_addr0 = vaddq_u64(buf_addr0, hdr_room); + buf_addr1 = vaddq_u64(buf_addr1, hdr_room); + + /* Clear all fields except address. */ + buf_addr0 = vandq_u64(buf_addr0, addrmask); + buf_addr1 = vandq_u64(buf_addr1, addrmask); + + /* Clear address field in descriptor. */ + rxbd0 = vbicq_u64(rxbd0, addrmask); + rxbd1 = vbicq_u64(rxbd1, addrmask); + + /* Set address field in descriptor. */ + rxbd0 = vaddq_u64(rxbd0, buf_addr0); + rxbd1 = vaddq_u64(rxbd1, buf_addr1); + + /* Store descriptors to memory. */ + vst1q_u64((uint64_t *)(rxbds++), rxbd0); + vst1q_u64((uint64_t *)(rxbds++), rxbd1); + } + + rxq->rxrearm_start += RTE_BNXT_RXQ_REARM_THRESH; + bnxt_db_write(&rxr->rx_db, rxq->rxrearm_start - 1); + if (rxq->rxrearm_start >= rxq->nb_rx_desc) + rxq->rxrearm_start = 0; + + rxq->rxrearm_nb -= RTE_BNXT_RXQ_REARM_THRESH; +} + +static uint32_t +bnxt_parse_pkt_type(struct rx_pkt_cmpl *rxcmp, struct rx_pkt_cmpl_hi *rxcmp1) +{ + uint32_t l3, pkt_type = 0; + uint32_t t_ipcs = 0, ip6 = 0, vlan = 0; + uint32_t flags_type; + + vlan = !!(rxcmp1->flags2 & + rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN)); + pkt_type |= vlan ? RTE_PTYPE_L2_ETHER_VLAN : RTE_PTYPE_L2_ETHER; + + t_ipcs = !!(rxcmp1->flags2 & + rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC)); + ip6 = !!(rxcmp1->flags2 & + rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS2_IP_TYPE)); + + flags_type = rxcmp->flags_type & + rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS_ITYPE_MASK); + + if (!t_ipcs && !ip6) + l3 = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; + else if (!t_ipcs && ip6) + l3 = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; + else if (t_ipcs && !ip6) + l3 = RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN; + else + l3 = RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN; + + switch (flags_type) { + case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_ICMP): + if (!t_ipcs) + pkt_type |= l3 | RTE_PTYPE_L4_ICMP; + else + pkt_type |= l3 | RTE_PTYPE_INNER_L4_ICMP; + break; + + case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_TCP): + if (!t_ipcs) + pkt_type |= l3 | RTE_PTYPE_L4_TCP; + else + pkt_type |= l3 | RTE_PTYPE_INNER_L4_TCP; + break; + + case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_UDP): + if (!t_ipcs) + pkt_type |= l3 | RTE_PTYPE_L4_UDP; + else + pkt_type |= l3 | RTE_PTYPE_INNER_L4_UDP; + break; + + case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_IP): + pkt_type |= l3; + break; + } + + return pkt_type; +} + +static void +bnxt_parse_csum(struct rte_mbuf *mbuf, struct rx_pkt_cmpl_hi *rxcmp1) +{ + uint32_t flags; + + flags = flags2_0xf(rxcmp1); + /* IP Checksum */ + if (likely(IS_IP_NONTUNNEL_PKT(flags))) { + if (unlikely(RX_CMP_IP_CS_ERROR(rxcmp1))) + mbuf->ol_flags |= PKT_RX_IP_CKSUM_BAD; + else + mbuf->ol_flags |= PKT_RX_IP_CKSUM_GOOD; + } else if (IS_IP_TUNNEL_PKT(flags)) { + if (unlikely(RX_CMP_IP_OUTER_CS_ERROR(rxcmp1) || + RX_CMP_IP_CS_ERROR(rxcmp1))) + mbuf->ol_flags |= PKT_RX_IP_CKSUM_BAD; + else + mbuf->ol_flags |= PKT_RX_IP_CKSUM_GOOD; + } else if (unlikely(RX_CMP_IP_CS_UNKNOWN(rxcmp1))) { + mbuf->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN; + } + + /* L4 Checksum */ + if (likely(IS_L4_NONTUNNEL_PKT(flags))) { + if (unlikely(RX_CMP_L4_INNER_CS_ERR2(rxcmp1))) + mbuf->ol_flags |= PKT_RX_L4_CKSUM_BAD; + else + mbuf->ol_flags |= PKT_RX_L4_CKSUM_GOOD; + } else if (IS_L4_TUNNEL_PKT(flags)) { + if (unlikely(RX_CMP_L4_INNER_CS_ERR2(rxcmp1))) + mbuf->ol_flags |= PKT_RX_L4_CKSUM_BAD; + else + mbuf->ol_flags |= PKT_RX_L4_CKSUM_GOOD; + if (unlikely(RX_CMP_L4_OUTER_CS_ERR2(rxcmp1))) { + mbuf->ol_flags |= PKT_RX_OUTER_L4_CKSUM_BAD; + } else if (unlikely(IS_L4_TUNNEL_PKT_ONLY_INNER_L4_CS + (flags))) { + mbuf->ol_flags |= PKT_RX_OUTER_L4_CKSUM_UNKNOWN; + } else { + mbuf->ol_flags |= PKT_RX_OUTER_L4_CKSUM_GOOD; + } + } else if (unlikely(RX_CMP_L4_CS_UNKNOWN(rxcmp1))) { + mbuf->ol_flags |= PKT_RX_L4_CKSUM_UNKNOWN; + } +} + +uint16_t +bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct bnxt_rx_queue *rxq = rx_queue; + struct bnxt_cp_ring_info *cpr = rxq->cp_ring; + struct bnxt_rx_ring_info *rxr = rxq->rx_ring; + uint32_t raw_cons = cpr->cp_raw_cons; + uint32_t cons; + int nb_rx_pkts = 0; + struct rx_pkt_cmpl *rxcmp; + bool evt = false; + const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0}; + const uint8x16_t shuf_msk = { + 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type (zeroes) */ + 2, 3, 0xFF, 0xFF, /* pkt_len */ + 2, 3, /* data_len */ + 0xFF, 0xFF, /* vlan_tci (zeroes) */ + 12, 13, 14, 15 /* rss hash */ + }; + + /* If Rx Q was stopped return */ + if (unlikely(!rxq->rx_started)) + return 0; + + if (rxq->rxrearm_nb >= RTE_BNXT_RXQ_REARM_THRESH) + bnxt_rxq_rearm(rxq, rxr); + + /* Return no more than RTE_BNXT_MAX_RX_BURST per call. */ + nb_pkts = RTE_MIN(nb_pkts, RTE_BNXT_MAX_RX_BURST); + + /* Make nb_pkts an integer multiple of RTE_BNXT_DESCS_PER_LOOP */ + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP); + if (!nb_pkts) + return 0; + + /* Handle RX burst request */ + while (1) { + cons = RING_CMP(cpr->cp_ring_struct, raw_cons); + + rxcmp = (struct rx_pkt_cmpl *)&cpr->cp_desc_ring[cons]; + + if (!CMP_VALID(rxcmp, raw_cons, cpr->cp_ring_struct)) + break; + + if (likely(CMP_TYPE(rxcmp) == RX_PKT_CMPL_TYPE_RX_L2)) { + struct rx_pkt_cmpl_hi *rxcmp1; + uint32_t tmp_raw_cons; + uint16_t cp_cons; + struct rte_mbuf *mbuf; + uint64x2_t mm_rxcmp; + uint8x16_t pkt_mb; + + tmp_raw_cons = NEXT_RAW_CMP(raw_cons); + cp_cons = RING_CMP(cpr->cp_ring_struct, tmp_raw_cons); + rxcmp1 = (struct rx_pkt_cmpl_hi *) + &cpr->cp_desc_ring[cp_cons]; + + if (!CMP_VALID(rxcmp1, tmp_raw_cons, + cpr->cp_ring_struct)) + break; + + raw_cons = tmp_raw_cons; + cons = rxcmp->opaque; + + mbuf = rxr->rx_buf_ring[cons].mbuf; + rte_prefetch0(mbuf); + rxr->rx_buf_ring[cons].mbuf = NULL; + + /* Set constant fields from mbuf initializer. */ + vst1q_u64((uint64_t *)&mbuf->rearm_data, mbuf_init); + + /* Set mbuf pkt_len, data_len, and rss_hash fields. */ + mm_rxcmp = vld1q_u64((uint64_t *)rxcmp); + pkt_mb = vqtbl1q_u8(vreinterpretq_u8_u64(mm_rxcmp), + shuf_msk); + vst1q_u64((uint64_t *)&mbuf->rx_descriptor_fields1, + vreinterpretq_u64_u8(pkt_mb)); + + rte_compiler_barrier(); + + if (rxcmp->flags_type & RX_PKT_CMPL_FLAGS_RSS_VALID) + mbuf->ol_flags |= PKT_RX_RSS_HASH; + + if (rxcmp1->flags2 & + RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN) { + mbuf->vlan_tci = rxcmp1->metadata & + (RX_PKT_CMPL_METADATA_VID_MASK | + RX_PKT_CMPL_METADATA_DE | + RX_PKT_CMPL_METADATA_PRI_MASK); + mbuf->ol_flags |= + PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + } + + bnxt_parse_csum(mbuf, rxcmp1); + mbuf->packet_type = bnxt_parse_pkt_type(rxcmp, rxcmp1); + + rx_pkts[nb_rx_pkts++] = mbuf; + } else if (!BNXT_NUM_ASYNC_CPR(rxq->bp)) { + evt = + bnxt_event_hwrm_resp_handler(rxq->bp, + (struct cmpl_base *)rxcmp); + } + + raw_cons = NEXT_RAW_CMP(raw_cons); + if (nb_rx_pkts == nb_pkts || evt) + break; + } + rxr->rx_prod = RING_ADV(rxr->rx_ring_struct, rxr->rx_prod, nb_rx_pkts); + + rxq->rxrearm_nb += nb_rx_pkts; + cpr->cp_raw_cons = raw_cons; + cpr->valid = !!(cpr->cp_raw_cons & cpr->cp_ring_struct->ring_size); + if (nb_rx_pkts || evt) + bnxt_db_cq(cpr); + + return nb_rx_pkts; +} + +static void +bnxt_tx_cmp_vec(struct bnxt_tx_queue *txq, int nr_pkts) +{ + struct bnxt_tx_ring_info *txr = txq->tx_ring; + struct rte_mbuf **free = txq->free; + uint16_t cons = txr->tx_cons; + unsigned int blk = 0; + + while (nr_pkts--) { + struct bnxt_sw_tx_bd *tx_buf; + struct rte_mbuf *mbuf; + + tx_buf = &txr->tx_buf_ring[cons]; + cons = RING_NEXT(txr->tx_ring_struct, cons); + mbuf = rte_pktmbuf_prefree_seg(tx_buf->mbuf); + tx_buf->mbuf = NULL; + + if (blk && mbuf->pool != free[0]->pool) { + rte_mempool_put_bulk(free[0]->pool, (void **)free, blk); + blk = 0; + } + free[blk++] = mbuf; + } + if (blk) + rte_mempool_put_bulk(free[0]->pool, (void **)free, blk); + + txr->tx_cons = cons; +} + +static void +bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq) +{ + struct bnxt_cp_ring_info *cpr = txq->cp_ring; + uint32_t raw_cons = cpr->cp_raw_cons; + uint32_t cons; + uint32_t nb_tx_pkts = 0; + struct tx_cmpl *txcmp; + struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring; + struct bnxt_ring *cp_ring_struct = cpr->cp_ring_struct; + uint32_t ring_mask = cp_ring_struct->ring_mask; + + do { + cons = RING_CMPL(ring_mask, raw_cons); + txcmp = (struct tx_cmpl *)&cp_desc_ring[cons]; + + if (!CMP_VALID(txcmp, raw_cons, cp_ring_struct)) + break; + + if (likely(CMP_TYPE(txcmp) == TX_CMPL_TYPE_TX_L2)) + nb_tx_pkts += txcmp->opaque; + else + RTE_LOG_DP(ERR, PMD, + "Unhandled CMP type %02x\n", + CMP_TYPE(txcmp)); + raw_cons = NEXT_RAW_CMP(raw_cons); + } while (nb_tx_pkts < ring_mask); + + cpr->valid = !!(raw_cons & cp_ring_struct->ring_size); + if (nb_tx_pkts) { + bnxt_tx_cmp_vec(txq, nb_tx_pkts); + cpr->cp_raw_cons = raw_cons; + bnxt_db_cq(cpr); + } +} + +static uint16_t +bnxt_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + struct bnxt_tx_queue *txq = tx_queue; + struct bnxt_tx_ring_info *txr = txq->tx_ring; + uint16_t prod = txr->tx_prod; + struct rte_mbuf *tx_mbuf; + struct tx_bd_long *txbd = NULL; + struct bnxt_sw_tx_bd *tx_buf; + uint16_t to_send; + + nb_pkts = RTE_MIN(nb_pkts, bnxt_tx_avail(txq)); + + if (unlikely(nb_pkts == 0)) + return 0; + + /* Handle TX burst request */ + to_send = nb_pkts; + while (to_send) { + tx_mbuf = *tx_pkts++; + rte_prefetch0(tx_mbuf); + + tx_buf = &txr->tx_buf_ring[prod]; + tx_buf->mbuf = tx_mbuf; + tx_buf->nr_bds = 1; + + txbd = &txr->tx_desc_ring[prod]; + txbd->address = tx_mbuf->buf_iova + tx_mbuf->data_off; + txbd->len = tx_mbuf->data_len; + txbd->flags_type = bnxt_xmit_flags_len(tx_mbuf->data_len, + TX_BD_FLAGS_NOCMPL); + prod = RING_NEXT(txr->tx_ring_struct, prod); + to_send--; + } + + /* Request a completion for last packet in burst */ + if (txbd) { + txbd->opaque = nb_pkts; + txbd->flags_type &= ~TX_BD_LONG_FLAGS_NO_CMPL; + } + + rte_compiler_barrier(); + bnxt_db_write(&txr->tx_db, prod); + + txr->tx_prod = prod; + + return nb_pkts; +} + +uint16_t +bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + int nb_sent = 0; + struct bnxt_tx_queue *txq = tx_queue; + + /* Tx queue was stopped; wait for it to be restarted */ + if (unlikely(!txq->tx_started)) { + PMD_DRV_LOG(DEBUG, "Tx q stopped;return\n"); + return 0; + } + + /* Handle TX completions */ + if (bnxt_tx_bds_in_hw(txq) >= txq->tx_free_thresh) + bnxt_handle_tx_cp_vec(txq); + + while (nb_pkts) { + uint16_t ret, num; + + num = RTE_MIN(nb_pkts, RTE_BNXT_MAX_TX_BURST); + ret = bnxt_xmit_fixed_burst_vec(tx_queue, + &tx_pkts[nb_sent], + num); + nb_sent += ret; + nb_pkts -= ret; + if (ret < num) + break; + } + + return nb_sent; +} + +int __rte_cold +bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq) +{ + return bnxt_rxq_vec_setup_common(rxq); +} diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c index 8f73add9be..c4ca5cf2dd 100644 --- a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c +++ b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c @@ -20,6 +20,7 @@ #include "bnxt_rxr.h" #include "bnxt_rxq.h" #include "hsi_struct_def_dpdk.h" +#include "bnxt_rxtx_vec_common.h" #include "bnxt_txq.h" #include "bnxt_txr.h" @@ -28,11 +29,6 @@ * RX Ring handling */ -#define RTE_BNXT_MAX_RX_BURST 32 -#define RTE_BNXT_MAX_TX_BURST 32 -#define RTE_BNXT_RXQ_REARM_THRESH 32 -#define RTE_BNXT_DESCS_PER_LOOP 4 - static inline void bnxt_rxq_rearm(struct bnxt_rx_queue *rxq, struct bnxt_rx_ring_info *rxr) { @@ -388,30 +384,6 @@ bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq) } } -#define TX_BD_FLAGS_CMPL ((1 << TX_BD_LONG_FLAGS_BD_CNT_SFT) | \ - TX_BD_SHORT_FLAGS_COAL_NOW | \ - TX_BD_SHORT_TYPE_TX_BD_SHORT | \ - TX_BD_LONG_FLAGS_PACKET_END) - -#define TX_BD_FLAGS_NOCMPL (TX_BD_FLAGS_CMPL | TX_BD_LONG_FLAGS_NO_CMPL) - -static inline uint32_t -bnxt_xmit_flags_len(uint16_t len, uint16_t flags) -{ - switch (len >> 9) { - case 0: - return flags | TX_BD_LONG_FLAGS_LHINT_LT512; - case 1: - return flags | TX_BD_LONG_FLAGS_LHINT_LT1K; - case 2: - return flags | TX_BD_LONG_FLAGS_LHINT_LT2K; - case 3: - return flags | TX_BD_LONG_FLAGS_LHINT_LT2K; - default: - return flags | TX_BD_LONG_FLAGS_LHINT_GTE2K; - } -} - static uint16_t bnxt_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) @@ -498,19 +470,5 @@ bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, int __rte_cold bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq) { - uintptr_t p; - struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */ - - mb_def.nb_segs = 1; - mb_def.data_off = RTE_PKTMBUF_HEADROOM; - mb_def.port = rxq->port_id; - rte_mbuf_refcnt_set(&mb_def, 1); - - /* prevent compiler reordering: rearm_data covers previous fields */ - rte_compiler_barrier(); - p = (uintptr_t)&mb_def.rearm_data; - rxq->mbuf_initializer = *(uint64_t *)p; - rxq->rxrearm_nb = 0; - rxq->rxrearm_start = 0; - return 0; + return bnxt_rxq_vec_setup_common(rxq); } diff --git a/drivers/net/bnxt/bnxt_txr.h b/drivers/net/bnxt/bnxt_txr.h index e7f43f9d1d..7715c11b83 100644 --- a/drivers/net/bnxt/bnxt_txr.h +++ b/drivers/net/bnxt/bnxt_txr.h @@ -59,7 +59,7 @@ uint16_t bnxt_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); uint16_t bnxt_dummy_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); -#ifdef RTE_ARCH_X86 +#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64) uint16_t bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); #endif diff --git a/drivers/net/bnxt/meson.build b/drivers/net/bnxt/meson.build index 5fb0ed3805..0b93c31716 100644 --- a/drivers/net/bnxt/meson.build +++ b/drivers/net/bnxt/meson.build @@ -68,4 +68,6 @@ sources = files('bnxt_cpr.c', if arch_subdir == 'x86' sources += files('bnxt_rxtx_vec_sse.c') +elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64') + sources += files('bnxt_rxtx_vec_neon.c') endif