From: Pavan Nikhilesh
Date: Tue, 29 Jun 2021 07:44:22 +0000 (+0530)
Subject: net/cnxk: enable TSO processing in vector Tx
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=ee71c9d840dfca161de83726fb1ef5a2c0572e51;p=dpdk.git

net/cnxk: enable TSO processing in vector Tx

Enable TSO offload in the vector Tx burst function.

Signed-off-by: Pavan Nikhilesh
Acked-by: Nithin Dabilpuram
---

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index c4c3e65704..d06879163f 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,7 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8af6799ff6..26797581e7 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -689,6 +689,46 @@ again:
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		      union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		      const uint64_t flags, const uint64_t lso_tun_fmt)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel TSO */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+		uint8_t shift = is_udp_tun ? 32 : 0;
+
+		shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
+		shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+		/* Update format for UDP tunneled packet */
+
+		w0->lso_format = (lso_tun_fmt >> shift);
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -723,6 +763,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
 
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
 	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
@@ -781,6 +826,13 @@ again:
 		sendmem23_w1 = sendmem01_w1;
 	}
 
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		/* Clear the LSO enable bit. */
+		sendext01_w0 = vbicq_u64(sendext01_w0,
+					 vdupq_n_u64(BIT_ULL(14)));
+		sendext23_w0 = sendext01_w0;
+	}
+
 	/* Move mbufs to iova */
 	mbuf0 = (uint64_t *)tx_pkts[0];
 	mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1430,6 +1482,51 @@ again:
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			const uint64_t lso_fmt = txq->lso_tun_fmt;
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn10k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 0b4a4c7bae..34e3737501 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)                            \
+		/* For TSO, inner checksum is a must */                        \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index c32681ed44..735e21cc60 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,7 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index cb574a1c1d..dca732a9fa 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -545,6 +545,43 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		     union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		     uint64_t flags)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel TSO */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+		/* Update format for UDP tunneled packet */
+		w0->lso_format += is_udp_tun ? 2 : 6;
+
+		w0->lso_format += !!(ol_flags & PKT_TX_OUTER_IPV6) << 1;
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -580,6 +617,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
 
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn9k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
+
 	/* Lets commit any changes in the packet here as no further changes
 	 * to the packet will be done unless no fast free is enabled.
 	 */
@@ -637,6 +680,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendmem23_w1 = sendmem01_w1;
 	}
 
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		/* Clear the LSO enable bit. */
+		sendext01_w0 = vbicq_u64(sendext01_w0,
+					 vdupq_n_u64(BIT_ULL(14)));
+		sendext23_w0 = sendext01_w0;
+	}
+
 	/* Move mbufs to iova */
 	mbuf0 = (uint64_t *)tx_pkts[0];
 	mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1286,6 +1336,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn9k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 9ade66db2b..56a3e2514a 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)                            \
+		/* For TSO, inner checksum is a must */                        \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						  (flags));                    \
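
For readers unfamiliar with the branchless idiom in cn9k_nix_prepare_tso() and
cn10k_nix_prepare_tso() above, the LSO start-byte selection can be read in
isolation. Below is a minimal, self-contained sketch; struct hdr_w1 and its
fields are simplified stand-ins for the driver's nix_send_hdr_w1_u union, not
the real descriptor layout.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the NIX send header W1 fields used above. */
struct hdr_w1 {
	uint16_t ol4ptr;  /* outer L4 header offset from frame start */
	uint16_t il4ptr;  /* inner L4 header offset, tunnels only */
	uint8_t il3type;  /* non-zero iff an inner L3 header is present */
};

/*
 * Branchless selection as in cn*_nix_prepare_tso(): mask is all-ones when
 * there is no inner L3 header (plain TSO) and all-zeros for tunnel TSO,
 * so exactly one of the two L4 pointers contributes to the sum.
 */
static uint16_t lso_start_byte(const struct hdr_w1 *w1, uint16_t l4_len)
{
	uint64_t mask = -(uint64_t)(!w1->il3type);

	return (uint16_t)((mask & w1->ol4ptr) + (~mask & w1->il4ptr) + l4_len);
}

int main(void)
{
	/* Plain TCP over IPv4: Ethernet (14) + IP (20) -> L4 at byte 34. */
	struct hdr_w1 plain = { .ol4ptr = 34, .il4ptr = 0, .il3type = 0 };
	/* Tunneled packet with the inner TCP header at byte 84. */
	struct hdr_w1 tun = { .ol4ptr = 34, .il4ptr = 84, .il3type = 4 };

	printf("plain LSO start: %u\n", lso_start_byte(&plain, 20)); /* 54 */
	printf("tunnel LSO start: %u\n", lso_start_byte(&tun, 20));  /* 104 */
	return 0;
}

Computing the start byte this way avoids a data-dependent branch in the hot
path, which matters because the vector burst function calls prepare_tso() for
all four mbufs of every descriptor group.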
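The two SoCs also derive the tunnel LSO format differently: cn9k adjusts the
index arithmetically (w0->lso_format += is_udp_tun ? 2 : 6, plus the
outer-IPv6 bit), while cn10k selects a byte from the precomputed
txq->lso_tun_fmt table using a shift built from the UDP-tunnel and IPv6 bits.
A sketch of the cn10k byte-select follows; the packed table values here are
illustrative placeholders, not the hardware's actual format indices.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical packed table in the spirit of txq->lso_tun_fmt: eight 8-bit
 * LSO format indices, one per (udp_tun, outer_ipv6, inner_ipv6) combination.
 */
static const uint64_t lso_tun_fmt =
	(uint64_t)2 << 0 |  /* non-UDP tun, outer v4, inner v4 */
	(uint64_t)3 << 8 |  /* non-UDP tun, outer v4, inner v6 */
	(uint64_t)4 << 16 | /* non-UDP tun, outer v6, inner v4 */
	(uint64_t)5 << 24 | /* non-UDP tun, outer v6, inner v6 */
	(uint64_t)6 << 32 | /* UDP tun, outer v4, inner v4 */
	(uint64_t)7 << 40 | /* UDP tun, outer v4, inner v6 */
	(uint64_t)8 << 48 | /* UDP tun, outer v6, inner v4 */
	(uint64_t)9 << 56;  /* UDP tun, outer v6, inner v6 */

/* Same shift arithmetic as in cn10k_nix_prepare_tso() above. */
static uint8_t tun_lso_format(bool is_udp_tun, bool outer_v6, bool inner_v6)
{
	uint8_t shift = is_udp_tun ? 32 : 0;

	shift += (uint8_t)outer_v6 << 4; /* +16 bits for outer IPv6 */
	shift += (uint8_t)inner_v6 << 3; /* +8 bits for inner IPv6 */
	return (uint8_t)(lso_tun_fmt >> shift);
}

int main(void)
{
	/* UDP tunnel, outer IPv4, inner IPv6 -> byte 5 of the table (7). */
	printf("format = %u\n", tun_lso_format(true, false, true));
	return 0;
}

Packing eight 8-bit indices into one uint64_t lets the transmit path pick the
right format with a single shift instead of a memory lookup per packet.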