From bc4a000f2f53c30ccd051778d91ebf993ae31121 Mon Sep 17 00:00:00 2001
From: Lance Richardson
Date: Wed, 29 May 2019 17:02:23 -0400
Subject: [PATCH] net/bnxt: implement SSE vector mode

Introduce SSE vector mode support for the bnxt PMD.

Signed-off-by: Lance Richardson
Signed-off-by: Ajit Khaparde
---
 doc/guides/rel_notes/release_19_08.rst |   1 +
 drivers/net/bnxt/Makefile              |   3 +
 drivers/net/bnxt/bnxt_ethdev.c         |  95 ++++-
 drivers/net/bnxt/bnxt_ring.h           |   3 +-
 drivers/net/bnxt/bnxt_rxq.c            |   5 +
 drivers/net/bnxt/bnxt_rxq.h            |   4 +
 drivers/net/bnxt/bnxt_rxr.h            |   9 +-
 drivers/net/bnxt/bnxt_rxtx_vec_sse.c   | 481 +++++++++++++++++++++++++
 drivers/net/bnxt/bnxt_txr.h            |   5 +
 drivers/net/bnxt/meson.build           |   4 +
 10 files changed, 602 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/bnxt/bnxt_rxtx_vec_sse.c

diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 1d684c1384..e1a1fdee91 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -59,6 +59,7 @@ New Features
   Updated the bnxt PMD. The major enhancements include:
 
   * Performance optimizations in non-vector Tx path
+  * Added support for SSE vector mode
 
 
 Removed Items
diff --git a/drivers/net/bnxt/Makefile b/drivers/net/bnxt/Makefile
index 8be3cb0e4c..21a5650630 100644
--- a/drivers/net/bnxt/Makefile
+++ b/drivers/net/bnxt/Makefile
@@ -41,6 +41,9 @@ SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += bnxt_vnic.c
 SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += bnxt_irq.c
 SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += bnxt_util.c
 SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += rte_pmd_bnxt.c
+ifeq ($(CONFIG_RTE_ARCH_X86), y)
+SRCS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += bnxt_rxtx_vec_sse.c
+endif
 
 #
 # Export include files
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 43722601cb..41771d8e25 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -644,6 +644,67 @@ static int bnxt_scattered_rx(struct rte_eth_dev *eth_dev)
 	return 0;
 }
 
+static eth_rx_burst_t
+bnxt_receive_function(__rte_unused struct rte_eth_dev *eth_dev)
+{
+#ifdef RTE_ARCH_X86
+	/*
+	 * Vector mode receive can be enabled only if scattered rx is not
+	 * in use and the requested rx offloads are limited to the set
+	 * supported by the vector path (the mask below).
+	 */
+	if (!eth_dev->data->scattered_rx &&
+	    !(eth_dev->data->dev_conf.rxmode.offloads &
+	      ~(DEV_RX_OFFLOAD_VLAN_STRIP |
+		DEV_RX_OFFLOAD_KEEP_CRC |
+		DEV_RX_OFFLOAD_JUMBO_FRAME |
+		DEV_RX_OFFLOAD_IPV4_CKSUM |
+		DEV_RX_OFFLOAD_UDP_CKSUM |
+		DEV_RX_OFFLOAD_TCP_CKSUM |
+		DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM |
+		DEV_RX_OFFLOAD_VLAN_FILTER))) {
+		PMD_DRV_LOG(INFO, "Using vector mode receive for port %d\n",
+			    eth_dev->data->port_id);
+		return bnxt_recv_pkts_vec;
+	}
+	PMD_DRV_LOG(INFO, "Vector mode receive disabled for port %d\n",
+		    eth_dev->data->port_id);
+	PMD_DRV_LOG(INFO,
+		    "Port %d scatter: %d rx offload: %" PRIX64 "\n",
+		    eth_dev->data->port_id,
+		    eth_dev->data->scattered_rx,
+		    eth_dev->data->dev_conf.rxmode.offloads);
+#endif
+	return bnxt_recv_pkts;
+}
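In practice the mask above is the entire rx-side eligibility test: bnxt_receive_function() picks the vector handler whenever scattered rx is off and the application requests nothing outside that set. A minimal sketch of a port configuration that stays eligible (illustrative only, not part of this patch; port_id, mb_pool and the queue/descriptor counts are assumptions):

    #include <rte_ethdev.h>
    #include <rte_mempool.h>

    /* Keep requested offloads inside the vector-supported mask. */
    static int
    configure_for_vector_path(uint16_t port_id, struct rte_mempool *mb_pool)
    {
        struct rte_eth_conf conf = {
            .rxmode = {
                .offloads = DEV_RX_OFFLOAD_VLAN_STRIP |
                            DEV_RX_OFFLOAD_IPV4_CKSUM |
                            DEV_RX_OFFLOAD_UDP_CKSUM |
                            DEV_RX_OFFLOAD_TCP_CKSUM,
            },
            /* Nothing beyond VLAN insert, so vector tx qualifies too. */
            .txmode = { .offloads = DEV_TX_OFFLOAD_VLAN_INSERT },
        };
        int rc;

        rc = rte_eth_dev_configure(port_id, 1, 1, &conf);
        if (rc != 0)
            return rc;
        rc = rte_eth_rx_queue_setup(port_id, 0, 512,
                                    rte_eth_dev_socket_id(port_id),
                                    NULL, mb_pool);
        if (rc != 0)
            return rc;
        rc = rte_eth_tx_queue_setup(port_id, 0, 512,
                                    rte_eth_dev_socket_id(port_id), NULL);
        if (rc != 0)
            return rc;
        /* Burst handlers are chosen in bnxt_dev_start_op() below. */
        return rte_eth_dev_start(port_id);
    }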
+
+static eth_tx_burst_t
+bnxt_transmit_function(__rte_unused struct rte_eth_dev *eth_dev)
+{
+#ifdef RTE_ARCH_X86
+	/*
+	 * Vector mode transmit can be enabled only if scattered rx is not
+	 * in use and no tx offloads other than VLAN insertion are in use.
+	 */
+	if (!eth_dev->data->scattered_rx &&
+	    !(eth_dev->data->dev_conf.txmode.offloads &
+	      ~DEV_TX_OFFLOAD_VLAN_INSERT)) {
+		PMD_DRV_LOG(INFO, "Using vector mode transmit for port %d\n",
+			    eth_dev->data->port_id);
+		return bnxt_xmit_pkts_vec;
+	}
+	PMD_DRV_LOG(INFO, "Vector mode transmit disabled for port %d\n",
+		    eth_dev->data->port_id);
+	PMD_DRV_LOG(INFO,
+		    "Port %d scatter: %d tx offload: %" PRIX64 "\n",
+		    eth_dev->data->port_id,
+		    eth_dev->data->scattered_rx,
+		    eth_dev->data->dev_conf.txmode.offloads);
+#endif
+	return bnxt_xmit_pkts;
+}
+
 static int bnxt_dev_start_op(struct rte_eth_dev *eth_dev)
 {
 	struct bnxt *bp = (struct bnxt *)eth_dev->data->dev_private;
@@ -674,6 +735,8 @@ static int bnxt_dev_start_op(struct rte_eth_dev *eth_dev)
 	if (rc)
 		goto error;
 
+	eth_dev->rx_pkt_burst = bnxt_receive_function(eth_dev);
+	eth_dev->tx_pkt_burst = bnxt_transmit_function(eth_dev);
 	bp->flags |= BNXT_FLAG_INIT_DONE;
 	return 0;
 
@@ -1600,9 +1663,13 @@ static int bnxt_mtu_set_op(struct rte_eth_dev *eth_dev, uint16_t new_mtu)
 {
 	struct bnxt *bp = eth_dev->data->dev_private;
 	struct rte_eth_dev_info dev_info;
+	uint32_t new_pkt_size;
 	uint32_t rc = 0;
 	uint32_t i;
 
+	new_pkt_size = new_mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN +
+		       VLAN_TAG_SIZE * BNXT_NUM_VLANS;
+
 	bnxt_dev_info_get_op(eth_dev, &dev_info);
 
 	if (new_mtu < RTE_ETHER_MIN_MTU || new_mtu > BNXT_MAX_MTU) {
@@ -1611,6 +1678,23 @@ static int bnxt_mtu_set_op(struct rte_eth_dev *eth_dev, uint16_t new_mtu)
 		return -EINVAL;
 	}
 
+#ifdef RTE_ARCH_X86
+	/*
+	 * If vector-mode tx/rx is active, disallow any MTU change that would
+	 * require scattered receive support.
+	 */
+	if (eth_dev->data->dev_started &&
+	    (eth_dev->rx_pkt_burst == bnxt_recv_pkts_vec ||
+	     eth_dev->tx_pkt_burst == bnxt_xmit_pkts_vec) &&
+	    (new_pkt_size >
+	     eth_dev->data->min_rx_buf_size - RTE_PKTMBUF_HEADROOM)) {
+		PMD_DRV_LOG(ERR,
+			    "MTU change would require scattered rx support.\n");
+		PMD_DRV_LOG(ERR, "Stop port before changing MTU.\n");
+		return -EINVAL;
+	}
+#endif
+
 	if (new_mtu > RTE_ETHER_MTU) {
 		bp->flags |= BNXT_FLAG_JUMBO;
 		bp->eth_dev->data->dev_conf.rxmode.offloads |=
@@ -1621,9 +1705,7 @@ static int bnxt_mtu_set_op(struct rte_eth_dev *eth_dev, uint16_t new_mtu)
 		bp->flags &= ~BNXT_FLAG_JUMBO;
 	}
 
-	eth_dev->data->dev_conf.rxmode.max_rx_pkt_len =
-		new_mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN +
-		VLAN_TAG_SIZE * 2;
+	eth_dev->data->dev_conf.rxmode.max_rx_pkt_len = new_pkt_size;
 	eth_dev->data->mtu = new_mtu;
 	PMD_DRV_LOG(INFO, "New MTU is %d\n", eth_dev->data->mtu);
 
@@ -2658,9 +2740,10 @@ bnxt_dev_supported_ptypes_get_op(struct rte_eth_dev *dev)
 		RTE_PTYPE_UNKNOWN
 	};
 
-	if (dev->rx_pkt_burst == bnxt_recv_pkts)
-		return ptypes;
-	return NULL;
+	if (!dev->rx_pkt_burst)
+		return NULL;
+
+	return ptypes;
 }
 
 static int bnxt_map_regs(struct bnxt *bp, uint32_t *reg_arr, int count,
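To make the new MTU check concrete: new_pkt_size adds L2 framing to the MTU, and a started vector-mode port refuses any size that would no longer fit a single rx buffer. With the usual constants (RTE_ETHER_HDR_LEN 14, RTE_ETHER_CRC_LEN 4, VLAN_TAG_SIZE 4, BNXT_NUM_VLANS 2) and the default 2176-byte mbuf data room (2048 bytes plus a 128-byte headroom), the arithmetic works out as below; a worked sketch under those assumptions, not patch code:

    #include <stdint.h>

    #define L2_OVERHEAD (14 /* Ethernet */ + 4 /* CRC */ + 4 * 2 /* VLANs */)

    /* Largest MTU that still fits one rx buffer, i.e. needs no scatter. */
    static inline uint32_t
    max_vector_mtu(uint32_t min_rx_buf_size, uint32_t headroom)
    {
        return min_rx_buf_size - headroom - L2_OVERHEAD;
    }

    /* max_vector_mtu(2176, 128) == 2022; a 1500-byte MTU needs 1526. */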
"); + PMD_DRV_LOG(ERR, "Stop port before changing MTU.\n"); + return -EINVAL; + } +#endif + if (new_mtu > RTE_ETHER_MTU) { bp->flags |= BNXT_FLAG_JUMBO; bp->eth_dev->data->dev_conf.rxmode.offloads |= @@ -1621,9 +1705,7 @@ static int bnxt_mtu_set_op(struct rte_eth_dev *eth_dev, uint16_t new_mtu) bp->flags &= ~BNXT_FLAG_JUMBO; } - eth_dev->data->dev_conf.rxmode.max_rx_pkt_len = - new_mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + - VLAN_TAG_SIZE * 2; + eth_dev->data->dev_conf.rxmode.max_rx_pkt_len = new_pkt_size; eth_dev->data->mtu = new_mtu; PMD_DRV_LOG(INFO, "New MTU is %d\n", eth_dev->data->mtu); @@ -2658,9 +2740,10 @@ bnxt_dev_supported_ptypes_get_op(struct rte_eth_dev *dev) RTE_PTYPE_UNKNOWN }; - if (dev->rx_pkt_burst == bnxt_recv_pkts) - return ptypes; - return NULL; + if (!dev->rx_pkt_burst) + return NULL; + + return ptypes; } static int bnxt_map_regs(struct bnxt *bp, uint32_t *reg_arr, int count, diff --git a/drivers/net/bnxt/bnxt_ring.h b/drivers/net/bnxt/bnxt_ring.h index 1446d784f6..fd3d0bd73f 100644 --- a/drivers/net/bnxt/bnxt_ring.h +++ b/drivers/net/bnxt/bnxt_ring.h @@ -10,7 +10,8 @@ #include -#define RING_NEXT(ring, idx) (((idx) + 1) & (ring)->ring_mask) +#define RING_ADV(ring, idx, n) (((idx) + (n)) & (ring)->ring_mask) +#define RING_NEXT(ring, idx) RING_ADV(ring, idx, 1) #define DB_IDX_MASK 0xffffff #define DB_IDX_VALID (0x1 << 26) diff --git a/drivers/net/bnxt/bnxt_rxq.c b/drivers/net/bnxt/bnxt_rxq.c index f1ee9eedea..2ce2ef427a 100644 --- a/drivers/net/bnxt/bnxt_rxq.c +++ b/drivers/net/bnxt/bnxt_rxq.c @@ -355,6 +355,11 @@ int bnxt_rx_queue_setup_op(struct rte_eth_dev *eth_dev, RTE_ETH_QUEUE_STATE_STARTED; eth_dev->data->rx_queue_state[queue_idx] = queue_state; rte_spinlock_init(&rxq->lock); + +#ifdef RTE_ARCH_X86 + bnxt_rxq_vec_setup(rxq); +#endif + out: return rc; } diff --git a/drivers/net/bnxt/bnxt_rxq.h b/drivers/net/bnxt/bnxt_rxq.h index e5d6001d36..7c6b4dec90 100644 --- a/drivers/net/bnxt/bnxt_rxq.h +++ b/drivers/net/bnxt/bnxt_rxq.h @@ -22,6 +22,10 @@ struct bnxt_rx_queue { uint16_t nb_rx_hold; /* num held free RX desc */ uint16_t rx_free_thresh; /* max free RX desc to hold */ uint16_t queue_id; /* RX queue index */ +#ifdef RTE_ARCH_X86 + uint16_t rxrearm_nb; /* number of descs to reinit. */ + uint16_t rxrearm_start; /* next desc index to reinit. 
diff --git a/drivers/net/bnxt/bnxt_rxq.c b/drivers/net/bnxt/bnxt_rxq.c
index f1ee9eedea..2ce2ef427a 100644
--- a/drivers/net/bnxt/bnxt_rxq.c
+++ b/drivers/net/bnxt/bnxt_rxq.c
@@ -355,6 +355,11 @@ int bnxt_rx_queue_setup_op(struct rte_eth_dev *eth_dev,
 			RTE_ETH_QUEUE_STATE_STARTED;
 	eth_dev->data->rx_queue_state[queue_idx] = queue_state;
 	rte_spinlock_init(&rxq->lock);
+
+#ifdef RTE_ARCH_X86
+	bnxt_rxq_vec_setup(rxq);
+#endif
+
 out:
 	return rc;
 }
diff --git a/drivers/net/bnxt/bnxt_rxq.h b/drivers/net/bnxt/bnxt_rxq.h
index e5d6001d36..7c6b4dec90 100644
--- a/drivers/net/bnxt/bnxt_rxq.h
+++ b/drivers/net/bnxt/bnxt_rxq.h
@@ -22,6 +22,10 @@ struct bnxt_rx_queue {
 	uint16_t		nb_rx_hold; /* num held free RX desc */
 	uint16_t		rx_free_thresh; /* max free RX desc to hold */
 	uint16_t		queue_id; /* RX queue index */
+#ifdef RTE_ARCH_X86
+	uint16_t		rxrearm_nb; /* number of descs to reinit. */
+	uint16_t		rxrearm_start; /* next desc index to reinit. */
+#endif
 	uint16_t		reg_idx; /* RX queue register index */
 	uint16_t		port_id; /* Device port identifier */
 	uint8_t			crc_len; /* 0 if CRC stripped, 4 otherwise */
diff --git a/drivers/net/bnxt/bnxt_rxr.h b/drivers/net/bnxt/bnxt_rxr.h
index 3815a21991..d10cefb938 100644
--- a/drivers/net/bnxt/bnxt_rxr.h
+++ b/drivers/net/bnxt/bnxt_rxr.h
@@ -7,7 +7,7 @@
 #define _BNXT_RXR_H_
 
 #define B_RX_DB(db, prod)						\
-		(*(uint32_t *)db = (DB_KEY_RX | prod))
+		(*(uint32_t *)db = (DB_KEY_RX | (prod)))
 
 #define BNXT_TPA_L4_SIZE(x)	\
 	{ \
@@ -110,4 +110,11 @@ int bnxt_init_rx_ring_struct(struct bnxt_rx_queue *rxq,
 			     unsigned int socket_id);
 int bnxt_init_one_rx_ring(struct bnxt_rx_queue *rxq);
 int bnxt_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id);
 int bnxt_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id);
+
+#ifdef RTE_ARCH_X86
+uint16_t bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts);
+int bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq);
+#endif
+
 #endif
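The added parentheses around prod in B_RX_DB are defensive macro hygiene, relevant now that the vector rx path passes an expression (rxq->rxrearm_start - 1) rather than a plain variable: with a low-precedence operator in the argument, the | in the macro body would otherwise mis-bind. A standalone demonstration of the hazard (the DB_KEY_RX value here is a stand-in, not the driver's definition):

    #include <assert.h>

    #define DB_KEY_RX   (1U << 28)  /* stand-in value for illustration */

    #define DB_BAD(v)   (DB_KEY_RX | v)
    #define DB_GOOD(v)  (DB_KEY_RX | (v))

    int main(void)
    {
        int empty = 0;

        /* '?:' binds looser than '|': the unparenthesized form is   */
        /* (DB_KEY_RX | empty) ? 10 : 20, which is always 10 here.   */
        assert(DB_BAD(empty ? 10 : 20) == 10);
        assert(DB_GOOD(empty ? 10 : 20) == (DB_KEY_RX | 20));
        return 0;
    }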
diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
new file mode 100644
index 0000000000..1c32c986ce
--- /dev/null
+++ b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright(c) 2019 Broadcom All rights reserved. */
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include <rte_bitmap.h>
+#include <rte_byteorder.h>
+#include <rte_malloc.h>
+#include <rte_memory.h>
+#if defined(RTE_ARCH_X86)
+#include <tmmintrin.h>
+#else
+#error "bnxt vector pmd: unsupported target."
+#endif
+
+#include "bnxt.h"
+#include "bnxt_cpr.h"
+#include "bnxt_ring.h"
+#include "bnxt_rxr.h"
+#include "bnxt_rxq.h"
+#include "hsi_struct_def_dpdk.h"
+
+#include "bnxt_txq.h"
+#include "bnxt_txr.h"
+
+/*
+ * RX Ring handling
+ */
+
+#define RTE_BNXT_MAX_RX_BURST		32
+#define RTE_BNXT_MAX_TX_BURST		32
+#define RTE_BNXT_RXQ_REARM_THRESH	32
+#define RTE_BNXT_DESCS_PER_LOOP		4
+
+static inline void
+bnxt_rxq_rearm(struct bnxt_rx_queue *rxq, struct bnxt_rx_ring_info *rxr)
+{
+	struct rx_prod_pkt_bd *rxbds = &rxr->rx_desc_ring[rxq->rxrearm_start];
+	struct bnxt_sw_rx_bd *rx_bufs = &rxr->rx_buf_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+	int i;
+
+	const __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, 0);
+	const __m128i addrmask = _mm_set_epi64x(UINT64_MAX, 0);
+
+	/* Pull RTE_BNXT_RXQ_REARM_THRESH more mbufs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mb_pool,
+				 (void *)rx_bufs,
+				 RTE_BNXT_RXQ_REARM_THRESH) < 0) {
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_BNXT_RXQ_REARM_THRESH;
+
+		return;
+	}
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < RTE_BNXT_RXQ_REARM_THRESH; i += 2, rx_bufs += 2) {
+		__m128i buf_addr0, buf_addr1;
+		__m128i rxbd0, rxbd1;
+
+		mb0 = rx_bufs[0].mbuf;
+		mb1 = rx_bufs[1].mbuf;
+
+		/* Load address fields from both mbufs */
+		buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* Load both rx descriptors (preserving some existing fields) */
+		rxbd0 = _mm_loadu_si128((__m128i *)(rxbds + 0));
+		rxbd1 = _mm_loadu_si128((__m128i *)(rxbds + 1));
+
+		/* Add default offset to buffer address. */
+		buf_addr0 = _mm_add_epi64(buf_addr0, hdr_room);
+		buf_addr1 = _mm_add_epi64(buf_addr1, hdr_room);
+
+		/* Clear all fields except address. */
+		buf_addr0 = _mm_and_si128(buf_addr0, addrmask);
+		buf_addr1 = _mm_and_si128(buf_addr1, addrmask);
+
+		/* Clear address field in descriptor. */
+		rxbd0 = _mm_andnot_si128(addrmask, rxbd0);
+		rxbd1 = _mm_andnot_si128(addrmask, rxbd1);
+
+		/* Set address field in descriptor. */
+		rxbd0 = _mm_add_epi64(rxbd0, buf_addr0);
+		rxbd1 = _mm_add_epi64(rxbd1, buf_addr1);
+
+		/* Store descriptors to memory. */
+		_mm_store_si128((__m128i *)(rxbds++), rxbd0);
+		_mm_store_si128((__m128i *)(rxbds++), rxbd1);
+	}
+
+	rxq->rxrearm_start += RTE_BNXT_RXQ_REARM_THRESH;
+	B_RX_DB(rxr->rx_doorbell, rxq->rxrearm_start - 1);
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= RTE_BNXT_RXQ_REARM_THRESH;
+}
+
+static uint32_t
+bnxt_parse_pkt_type(struct rx_pkt_cmpl *rxcmp, struct rx_pkt_cmpl_hi *rxcmp1)
+{
+	uint32_t l3, pkt_type = 0;
+	uint32_t t_ipcs = 0, ip6 = 0, vlan = 0;
+	uint32_t flags_type;
+
+	vlan = !!(rxcmp1->flags2 &
+		rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN));
+	pkt_type |= vlan ? RTE_PTYPE_L2_ETHER_VLAN : RTE_PTYPE_L2_ETHER;
+
+	t_ipcs = !!(rxcmp1->flags2 &
+		rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS2_T_IP_CS_CALC));
+	ip6 = !!(rxcmp1->flags2 &
+		 rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS2_IP_TYPE));
+
+	flags_type = rxcmp->flags_type &
+		     rte_cpu_to_le_32(RX_PKT_CMPL_FLAGS_ITYPE_MASK);
+
+	if (!t_ipcs && !ip6)
+		l3 = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
+	else if (!t_ipcs && ip6)
+		l3 = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
+	else if (t_ipcs && !ip6)
+		l3 = RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN;
+	else
+		l3 = RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN;
+
+	switch (flags_type) {
+	case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_ICMP):
+		if (!t_ipcs)
+			pkt_type |= l3 | RTE_PTYPE_L4_ICMP;
+		else
+			pkt_type |= l3 | RTE_PTYPE_INNER_L4_ICMP;
+		break;
+
+	case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_TCP):
+		if (!t_ipcs)
+			pkt_type |= l3 | RTE_PTYPE_L4_TCP;
+		else
+			pkt_type |= l3 | RTE_PTYPE_INNER_L4_TCP;
+		break;
+
+	case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_UDP):
+		if (!t_ipcs)
+			pkt_type |= l3 | RTE_PTYPE_L4_UDP;
+		else
+			pkt_type |= l3 | RTE_PTYPE_INNER_L4_UDP;
+		break;
+
+	case RTE_LE32(RX_PKT_CMPL_FLAGS_ITYPE_IP):
+		pkt_type |= l3;
+		break;
+	}
+
+	return pkt_type;
+}
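bnxt_parse_pkt_type() folds the two completion records into the standard mbuf packet-type encoding, so applications can branch on ptype masks instead of reparsing headers. A typical consumer, using only the generic mbuf API (not part of this patch):

    #include <rte_mbuf.h>
    #include <rte_mbuf_ptype.h>

    /* Count TCP-over-IPv4 packets using the ptype set by the rx path. */
    static uint32_t
    count_ipv4_tcp(struct rte_mbuf **pkts, uint16_t n)
    {
        uint32_t hits = 0;
        uint16_t i;

        for (i = 0; i < n; i++) {
            uint32_t pt = pkts[i]->packet_type;

            if ((pt & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4_EXT_UNKNOWN &&
                (pt & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP)
                hits++;
        }
        return hits;
    }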
+
+uint16_t
+bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts)
+{
+	struct bnxt_rx_queue *rxq = rx_queue;
+	struct bnxt_cp_ring_info *cpr = rxq->cp_ring;
+	struct bnxt_rx_ring_info *rxr = rxq->rx_ring;
+	uint32_t raw_cons = cpr->cp_raw_cons;
+	uint32_t cons;
+	int nb_rx_pkts = 0;
+	struct rx_pkt_cmpl *rxcmp;
+	bool evt = false;
+	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
+	const __m128i shuf_msk =
+		_mm_set_epi8(15, 14, 13, 12,          /* rss */
+			     0xFF, 0xFF,              /* vlan_tci (zeroes) */
+			     3, 2,                    /* data_len */
+			     0xFF, 0xFF, 3, 2,        /* pkt_len */
+			     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
+
+	/* If Rx Q was stopped return */
+	if (rxq->rx_deferred_start)
+		return 0;
+
+	if (rxq->rxrearm_nb >= RTE_BNXT_RXQ_REARM_THRESH)
+		bnxt_rxq_rearm(rxq, rxr);
+
+	/* Return no more than RTE_BNXT_MAX_RX_BURST per call. */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_BNXT_MAX_RX_BURST);
+
+	/* Make nb_pkts an integer multiple of RTE_BNXT_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_BNXT_DESCS_PER_LOOP);
+
+	/* Handle RX burst request */
+	while (1) {
+		cons = RING_CMP(cpr->cp_ring_struct, raw_cons);
+
+		rxcmp = (struct rx_pkt_cmpl *)&cpr->cp_desc_ring[cons];
+
+		if (!CMP_VALID(rxcmp, raw_cons, cpr->cp_ring_struct))
+			break;
+
+		cpr->valid = FLIP_VALID(cons,
+					cpr->cp_ring_struct->ring_mask,
+					cpr->valid);
+
+		if (likely(CMP_TYPE(rxcmp) == RX_PKT_CMPL_TYPE_RX_L2)) {
+			struct rx_pkt_cmpl_hi *rxcmp1;
+			uint32_t tmp_raw_cons;
+			uint16_t cp_cons;
+			struct rte_mbuf *mbuf;
+			__m128i mm_rxcmp, pkt_mb;
+
+			tmp_raw_cons = NEXT_RAW_CMP(raw_cons);
+			cp_cons = RING_CMP(cpr->cp_ring_struct, tmp_raw_cons);
+			rxcmp1 = (struct rx_pkt_cmpl_hi *)
+						&cpr->cp_desc_ring[cp_cons];
+
+			if (!CMP_VALID(rxcmp1, tmp_raw_cons,
+				       cpr->cp_ring_struct))
+				break;
+
+			raw_cons = tmp_raw_cons;
+			cons = rxcmp->opaque;
+
+			mbuf = rxr->rx_buf_ring[cons].mbuf;
+			rte_prefetch0(mbuf);
+			rxr->rx_buf_ring[cons].mbuf = NULL;
+
+			cpr->valid = FLIP_VALID(cp_cons,
+						cpr->cp_ring_struct->ring_mask,
+						cpr->valid);
+
+			/* Set constant fields from mbuf initializer. */
+			_mm_store_si128((__m128i *)&mbuf->rearm_data,
+					mbuf_init);
+
+			/* Set mbuf pkt_len, data_len, and rss_hash fields. */
+			mm_rxcmp = _mm_load_si128((__m128i *)rxcmp);
+			pkt_mb = _mm_shuffle_epi8(mm_rxcmp, shuf_msk);
+			_mm_storeu_si128((void *)&mbuf->rx_descriptor_fields1,
+					 pkt_mb);
+
+			rte_compiler_barrier();
+
+			if (rxcmp->flags_type & RX_PKT_CMPL_FLAGS_RSS_VALID)
+				mbuf->ol_flags |= PKT_RX_RSS_HASH;
+
+			if (rxcmp1->flags2 &
+			    RX_PKT_CMPL_FLAGS2_META_FORMAT_VLAN) {
+				mbuf->vlan_tci = rxcmp1->metadata &
+					(RX_PKT_CMPL_METADATA_VID_MASK |
+					RX_PKT_CMPL_METADATA_DE |
+					RX_PKT_CMPL_METADATA_PRI_MASK);
+				mbuf->ol_flags |= PKT_RX_VLAN;
+			}
+
+			mbuf->packet_type = bnxt_parse_pkt_type(rxcmp, rxcmp1);
+
+			rx_pkts[nb_rx_pkts++] = mbuf;
+		} else {
+			evt =
+			bnxt_event_hwrm_resp_handler(rxq->bp,
+						     (struct cmpl_base *)rxcmp);
+		}
+
+		raw_cons = NEXT_RAW_CMP(raw_cons);
+		if (nb_rx_pkts == nb_pkts || evt)
+			break;
+	}
+	rxr->rx_prod = RING_ADV(rxr->rx_ring_struct, rxr->rx_prod, nb_rx_pkts);
+
+	rxq->rxrearm_nb += nb_rx_pkts;
+	cpr->cp_raw_cons = raw_cons;
+	if (nb_rx_pkts || evt)
+		B_CP_DIS_DB(cpr, cpr->cp_raw_cons);
+
+	return nb_rx_pkts;
+}
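The shuf_msk above is a per-byte gather: each byte of the mask names the source byte of the completion record to copy, and 0xFF writes a zero, so pkt_len, data_len and the RSS hash land in rx_descriptor_fields1 with a single SSSE3 instruction. The mechanism in isolation (a self-contained illustration with a made-up source vector; build with -mssse3):

    #include <assert.h>
    #include <stdint.h>
    #include <tmmintrin.h>  /* SSSE3 _mm_shuffle_epi8 */

    int main(void)
    {
        /* Source vector whose byte i holds the value i. */
        const __m128i src = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                         7, 6, 5, 4, 3, 2, 1, 0);
        /* Gather bytes 3,2 into the low word; 0xFF zeroes the rest. */
        const __m128i msk = _mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFF,
                                         0xFF, 0xFF, 0xFF, 0xFF,
                                         0xFF, 0xFF, 0xFF, 0xFF,
                                         0xFF, 0xFF, 3, 2);
        union { __m128i v; uint8_t b[16]; } out;

        out.v = _mm_shuffle_epi8(src, msk);
        assert(out.b[0] == 2 && out.b[1] == 3);  /* gathered bytes */
        assert(out.b[2] == 0 && out.b[15] == 0); /* zeroed bytes   */
        return 0;
    }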
+
+static inline void bnxt_next_cmpl(struct bnxt_cp_ring_info *cpr, uint32_t *idx,
+				  bool *v, uint32_t inc)
+{
+	*idx += inc;
+	if (unlikely(*idx == cpr->cp_ring_struct->ring_size)) {
+		*v = !*v;
+		*idx = 0;
+	}
+}
+
+static void
+bnxt_tx_cmp_vec(struct bnxt_tx_queue *txq, int nr_pkts)
+{
+	struct bnxt_tx_ring_info *txr = txq->tx_ring;
+	struct rte_mbuf **free = txq->free;
+	uint16_t cons = txr->tx_cons;
+	unsigned int blk = 0;
+
+	while (nr_pkts--) {
+		struct bnxt_sw_tx_bd *tx_buf;
+		struct rte_mbuf *mbuf;
+
+		tx_buf = &txr->tx_buf_ring[cons];
+		cons = RING_NEXT(txr->tx_ring_struct, cons);
+		mbuf = rte_pktmbuf_prefree_seg(tx_buf->mbuf);
+		tx_buf->mbuf = NULL;
+
+		if (blk && mbuf->pool != free[0]->pool) {
+			rte_mempool_put_bulk(free[0]->pool, (void **)free, blk);
+			blk = 0;
+		}
+		free[blk++] = mbuf;
+	}
+	if (blk)
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, blk);
+
+	txr->tx_cons = cons;
+}
+
+static void
+bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq)
+{
+	struct bnxt_cp_ring_info *cpr = txq->cp_ring;
+	uint32_t raw_cons = cpr->cp_raw_cons;
+	uint32_t cons;
+	uint32_t nb_tx_pkts = 0;
+	struct tx_cmpl *txcmp;
+	struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;
+	struct bnxt_ring *cp_ring_struct = cpr->cp_ring_struct;
+	uint32_t ring_mask = cp_ring_struct->ring_mask;
+
+	do {
+		cons = RING_CMPL(ring_mask, raw_cons);
+		txcmp = (struct tx_cmpl *)&cp_desc_ring[cons];
+
+		if (!CMPL_VALID(txcmp, cpr->valid))
+			break;
+		bnxt_next_cmpl(cpr, &cons, &cpr->valid, 1);
+		rte_prefetch0(&cp_desc_ring[cons]);
+
+		if (likely(CMP_TYPE(txcmp) == TX_CMPL_TYPE_TX_L2))
+			nb_tx_pkts += txcmp->opaque;
+		else
+			RTE_LOG_DP(ERR, PMD,
+				   "Unhandled CMP type %02x\n",
+				   CMP_TYPE(txcmp));
+		raw_cons = cons;
+	} while (nb_tx_pkts < ring_mask);
+
+	if (nb_tx_pkts) {
+		bnxt_tx_cmp_vec(txq, nb_tx_pkts);
+		cpr->cp_raw_cons = raw_cons;
+		B_CP_DB(cpr, raw_cons, ring_mask);
+	}
+}
+
+#define TX_BD_FLAGS_CMPL ((1 << TX_BD_LONG_FLAGS_BD_CNT_SFT) | \
+			  TX_BD_SHORT_FLAGS_COAL_NOW | \
+			  TX_BD_SHORT_TYPE_TX_BD_SHORT | \
+			  TX_BD_LONG_FLAGS_PACKET_END)
+
+#define TX_BD_FLAGS_NOCMPL (TX_BD_FLAGS_CMPL | TX_BD_LONG_FLAGS_NO_CMPL)
+
+static inline uint32_t
+bnxt_xmit_flags_len(uint16_t len, uint16_t flags)
+{
+	switch (len >> 9) {
+	case 0:
+		return flags | TX_BD_LONG_FLAGS_LHINT_LT512;
+	case 1:
+		return flags | TX_BD_LONG_FLAGS_LHINT_LT1K;
+	case 2:
+		return flags | TX_BD_LONG_FLAGS_LHINT_LT2K;
+	case 3:
+		return flags | TX_BD_LONG_FLAGS_LHINT_LT2K;
+	default:
+		return flags | TX_BD_LONG_FLAGS_LHINT_GTE2K;
+	}
+}
+
+static uint16_t
+bnxt_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	struct bnxt_tx_queue *txq = tx_queue;
+	struct bnxt_tx_ring_info *txr = txq->tx_ring;
+	uint16_t prod = txr->tx_prod;
+	struct rte_mbuf *tx_mbuf;
+	struct tx_bd_long *txbd = NULL;
+	struct bnxt_sw_tx_bd *tx_buf;
+	uint16_t to_send;
+
+	nb_pkts = RTE_MIN(nb_pkts, bnxt_tx_avail(txq));
+
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	/* Handle TX burst request */
+	to_send = nb_pkts;
+	while (to_send) {
+		tx_mbuf = *tx_pkts++;
+		rte_prefetch0(tx_mbuf);
+
+		tx_buf = &txr->tx_buf_ring[prod];
+		tx_buf->mbuf = tx_mbuf;
+		tx_buf->nr_bds = 1;
+
+		txbd = &txr->tx_desc_ring[prod];
+		txbd->address = tx_mbuf->buf_iova + tx_mbuf->data_off;
+		txbd->len = tx_mbuf->data_len;
+		txbd->flags_type = bnxt_xmit_flags_len(tx_mbuf->data_len,
+						       TX_BD_FLAGS_NOCMPL);
+		prod = RING_NEXT(txr->tx_ring_struct, prod);
+		to_send--;
+	}
+
+	/* Request a completion for last packet in burst */
+	if (txbd) {
+		txbd->opaque = nb_pkts;
+		txbd->flags_type &= ~TX_BD_LONG_FLAGS_NO_CMPL;
+	}
+
+	rte_compiler_barrier();
+	B_TX_DB(txr->tx_doorbell, prod);
+
+	txr->tx_prod = prod;
+
+	return nb_pkts;
+}
+
+uint16_t
+bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+		   uint16_t nb_pkts)
+{
+	int nb_sent = 0;
+	struct bnxt_tx_queue *txq = tx_queue;
+
+	/* Tx queue was stopped; wait for it to be restarted */
+	if (unlikely(txq->tx_deferred_start)) {
+		PMD_DRV_LOG(DEBUG, "Tx queue stopped; return\n");
+		return 0;
+	}
+
+	/* Handle TX completions */
+	if (bnxt_tx_bds_in_hw(txq) >= txq->tx_free_thresh)
+		bnxt_handle_tx_cp_vec(txq);
+
+	while (nb_pkts) {
+		uint16_t ret, num;
+
+		num = RTE_MIN(nb_pkts, RTE_BNXT_MAX_TX_BURST);
+		ret = bnxt_xmit_fixed_burst_vec(tx_queue,
+						&tx_pkts[nb_sent],
+						num);
+		nb_sent += ret;
+		nb_pkts -= ret;
+		if (ret < num)
+			break;
+	}
+
+	return nb_sent;
+}
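bnxt_xmit_flags_len() above buckets the frame length into the descriptor's length hint by dividing by 512 (len >> 9); cases 2 and 3 intentionally collapse into LT2K because both cover lengths below 2048. Worked values per that switch:

    /* len >> 9 buckets (512-byte granularity):                     */
    /*     60 >> 9 == 0  ->  TX_BD_LONG_FLAGS_LHINT_LT512           */
    /*    600 >> 9 == 1  ->  TX_BD_LONG_FLAGS_LHINT_LT1K            */
    /*   1514 >> 9 == 2  ->  TX_BD_LONG_FLAGS_LHINT_LT2K            */
    /*   2000 >> 9 == 3  ->  TX_BD_LONG_FLAGS_LHINT_LT2K            */
    /*   4096 >> 9 == 8  ->  TX_BD_LONG_FLAGS_LHINT_GTE2K (default) */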
+
+int __attribute__((cold))
+bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+	rxq->rxrearm_nb = 0;
+	rxq->rxrearm_start = 0;
+	return 0;
+}
diff --git a/drivers/net/bnxt/bnxt_txr.h b/drivers/net/bnxt/bnxt_txr.h
index 13ca046763..58678a1c83 100644
--- a/drivers/net/bnxt/bnxt_txr.h
+++ b/drivers/net/bnxt/bnxt_txr.h
@@ -57,6 +57,11 @@ int bnxt_init_one_tx_ring(struct bnxt_tx_queue *txq);
 int bnxt_init_tx_ring_struct(struct bnxt_tx_queue *txq, unsigned int socket_id);
 uint16_t bnxt_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts);
+#ifdef RTE_ARCH_X86
+uint16_t bnxt_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+			    uint16_t nb_pkts);
+#endif
+
 int bnxt_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id);
 int bnxt_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id);
diff --git a/drivers/net/bnxt/meson.build b/drivers/net/bnxt/meson.build
index e130f27122..4dda28f5b0 100644
--- a/drivers/net/bnxt/meson.build
+++ b/drivers/net/bnxt/meson.build
@@ -18,3 +18,7 @@ sources = files('bnxt_cpr.c',
 	'bnxt_util.c',
 	'bnxt_vnic.c',
 	'rte_pmd_bnxt.c')
+
+if arch_subdir == 'x86'
+	sources += files('bnxt_rxtx_vec_sse.c')
+endif
-- 
2.20.1
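A closing note on bnxt_rxq_vec_setup(): the mbuf_initializer it computes is the same single-quadword rearm trick used by other vector PMDs. The 64-bit region starting at rearm_data covers data_off, refcnt, nb_segs and port, so one store re-initializes all four fields per received mbuf. The technique outside the driver (a sketch; the helper names are hypothetical):

    #include <assert.h>
    #include <rte_atomic.h>
    #include <rte_mbuf.h>

    /* Capture a template mbuf's rearm_data region as one quadword. */
    static uint64_t
    make_initializer(uint16_t port_id)
    {
        struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */

        mb_def.nb_segs = 1;
        mb_def.data_off = RTE_PKTMBUF_HEADROOM;
        mb_def.port = port_id;
        rte_mbuf_refcnt_set(&mb_def, 1);

        /* As in the patch: keep the field writes before the read. */
        rte_compiler_barrier();
        return *(uint64_t *)&mb_def.rearm_data;
    }

    /* Replay the template onto a freshly dequeued mbuf in one store. */
    static void
    rearm_one(struct rte_mbuf *m, uint64_t initializer)
    {
        *(uint64_t *)&m->rearm_data = initializer;
        assert(m->nb_segs == 1 && m->data_off == RTE_PKTMBUF_HEADROOM);
    }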