From 12443386a0b0ccf5c2d9e1e5dbaf43a516b7f8aa Mon Sep 17 00:00:00 2001 From: Junyu Jiang Date: Wed, 16 Sep 2020 03:09:58 +0000 Subject: [PATCH] net/ice: support flex Rx descriptor RxDID22 This patch supports RxDID #22 by the following changes: - add structure and macro definition for RxDID #22. - support RxDID #22 format in normal path. - change RSS hash parsing from RxDID #22 in AVX/SSE data path. Signed-off-by: Junyu Jiang Acked-by: Leyi Rong --- drivers/net/ice/ice_ethdev.c | 20 ++++++ drivers/net/ice/ice_ethdev.h | 4 ++ drivers/net/ice/ice_rxtx.c | 23 ++++--- drivers/net/ice/ice_rxtx.h | 42 +++++++++++++ drivers/net/ice/ice_rxtx_vec_avx2.c | 98 +++++++++++++++++++++++++++-- drivers/net/ice/ice_rxtx_vec_sse.c | 89 +++++++++++++++++++++----- 6 files changed, 249 insertions(+), 27 deletions(-) diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c index c42581ea71..097b72023c 100644 --- a/drivers/net/ice/ice_ethdev.c +++ b/drivers/net/ice/ice_ethdev.c @@ -2147,6 +2147,24 @@ ice_rss_ctx_init(struct ice_pf *pf) ICE_HASH_CFG_RESET(&pf->gtpu_hash_ctx.ipv6_tcp); } +static uint64_t +ice_get_supported_rxdid(struct ice_hw *hw) +{ + uint64_t supported_rxdid = 0; /* bitmap for supported RXDID */ + uint32_t regval; + int i; + + supported_rxdid |= BIT(ICE_RXDID_LEGACY_1); + + for (i = ICE_RXDID_FLEX_NIC; i < ICE_FLEX_DESC_RXDID_MAX_NUM; i++) { + regval = ICE_READ_REG(hw, GLFLXP_RXDID_FLAGS(i, 0)); + if ((regval >> GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) + & GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) + supported_rxdid |= BIT(i); + } + return supported_rxdid; +} + static int ice_dev_init(struct rte_eth_dev *dev) { @@ -2298,6 +2316,8 @@ ice_dev_init(struct rte_eth_dev *dev) return ret; } + pf->supported_rxdid = ice_get_supported_rxdid(hw); + return 0; err_pf_setup: diff --git a/drivers/net/ice/ice_ethdev.h b/drivers/net/ice/ice_ethdev.h index 243a023e60..e8c9971fb0 100644 --- a/drivers/net/ice/ice_ethdev.h +++ b/drivers/net/ice/ice_ethdev.h @@ -136,6 +136,9 @@ #define ICE_RXTX_BYTES_HIGH(bytes) ((bytes) & ~ICE_40_BIT_MASK) #define ICE_RXTX_BYTES_LOW(bytes) ((bytes) & ICE_40_BIT_MASK) +/* Max number of flexible descriptor rxdid */ +#define ICE_FLEX_DESC_RXDID_MAX_NUM 64 + /* DDP package type */ enum ice_pkg_type { ICE_PKG_TYPE_UNKNOWN, @@ -435,6 +438,7 @@ struct ice_pf { bool init_link_up; uint64_t old_rx_bytes; uint64_t old_tx_bytes; + uint64_t supported_rxdid; /* bitmap for supported RXDID */ }; #define ICE_MAX_QUEUE_NUM 2048 diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c index fecb13459f..fef6ad4544 100644 --- a/drivers/net/ice/ice_rxtx.c +++ b/drivers/net/ice/ice_rxtx.c @@ -63,7 +63,7 @@ static inline uint8_t ice_proto_xtr_type_to_rxdid(uint8_t xtr_type) { static uint8_t rxdid_map[] = { - [PROTO_XTR_NONE] = ICE_RXDID_COMMS_GENERIC, + [PROTO_XTR_NONE] = ICE_RXDID_COMMS_OVS, [PROTO_XTR_VLAN] = ICE_RXDID_COMMS_AUX_VLAN, [PROTO_XTR_IPV4] = ICE_RXDID_COMMS_AUX_IPV4, [PROTO_XTR_IPV6] = ICE_RXDID_COMMS_AUX_IPV6, @@ -73,7 +73,7 @@ ice_proto_xtr_type_to_rxdid(uint8_t xtr_type) }; return xtr_type < RTE_DIM(rxdid_map) ? - rxdid_map[xtr_type] : ICE_RXDID_COMMS_GENERIC; + rxdid_map[xtr_type] : ICE_RXDID_COMMS_OVS; } static enum ice_status @@ -81,12 +81,13 @@ ice_program_hw_rx_queue(struct ice_rx_queue *rxq) { struct ice_vsi *vsi = rxq->vsi; struct ice_hw *hw = ICE_VSI_TO_HW(vsi); + struct ice_pf *pf = ICE_VSI_TO_PF(vsi); struct rte_eth_dev *dev = ICE_VSI_TO_ETH_DEV(rxq->vsi); struct ice_rlan_ctx rx_ctx; enum ice_status err; uint16_t buf_size, len; struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode; - uint32_t rxdid = ICE_RXDID_COMMS_GENERIC; + uint32_t rxdid = ICE_RXDID_COMMS_OVS; uint32_t regval; /* Set buffer size as the head split is disabled. */ @@ -151,6 +152,12 @@ ice_program_hw_rx_queue(struct ice_rx_queue *rxq) PMD_DRV_LOG(DEBUG, "Port (%u) - Rx queue (%u) is set with RXDID : %u", rxq->port_id, rxq->queue_id, rxdid); + if (!(pf->supported_rxdid & BIT(rxdid))) { + PMD_DRV_LOG(ERR, "currently package doesn't support RXDID (%u)", + rxdid); + return -EINVAL; + } + /* Enable Flexible Descriptors in the queue context which * allows this driver to select a specific receive descriptor format */ @@ -1338,7 +1345,7 @@ ice_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union ice_rx_flex_desc *rxdp) static void ice_rxd_to_proto_xtr(struct rte_mbuf *mb, - volatile struct ice_32b_rx_flex_desc_comms *desc) + volatile struct ice_32b_rx_flex_desc_comms_ovs *desc) { uint16_t stat_err = rte_le_to_cpu_16(desc->status_error1); uint32_t metadata = 0; @@ -1376,8 +1383,9 @@ static inline void ice_rxd_to_pkt_fields(struct rte_mbuf *mb, volatile union ice_rx_flex_desc *rxdp) { - volatile struct ice_32b_rx_flex_desc_comms *desc = - (volatile struct ice_32b_rx_flex_desc_comms *)rxdp; + volatile struct ice_32b_rx_flex_desc_comms_ovs *desc = + (volatile struct ice_32b_rx_flex_desc_comms_ovs *)rxdp; +#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC uint16_t stat_err; stat_err = rte_le_to_cpu_16(desc->status_error0); @@ -1385,13 +1393,14 @@ ice_rxd_to_pkt_fields(struct rte_mbuf *mb, mb->ol_flags |= PKT_RX_RSS_HASH; mb->hash.rss = rte_le_to_cpu_32(desc->rss_hash); } +#endif -#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC if (desc->flow_id != 0xFFFFFFFF) { mb->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID; mb->hash.fdir.hi = rte_le_to_cpu_32(desc->flow_id); } +#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC if (unlikely(rte_net_ice_dynf_proto_xtr_metadata_avail())) ice_rxd_to_proto_xtr(mb, desc); #endif diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h index 2fdcfb7d04..e21ba152d7 100644 --- a/drivers/net/ice/ice_rxtx.h +++ b/drivers/net/ice/ice_rxtx.h @@ -38,6 +38,8 @@ #define ICE_FDIR_PKT_LEN 512 +#define ICE_RXDID_COMMS_OVS 22 + typedef void (*ice_rx_release_mbufs_t)(struct ice_rx_queue *rxq); typedef void (*ice_tx_release_mbufs_t)(struct ice_tx_queue *txq); @@ -135,6 +137,46 @@ union ice_tx_offload { }; }; +/* Rx Flex Descriptor for Comms Package Profile + * RxDID Profile ID 22 (swap Hash and FlowID) + * Flex-field 0: Flow ID lower 16-bits + * Flex-field 1: Flow ID upper 16-bits + * Flex-field 2: RSS hash lower 16-bits + * Flex-field 3: RSS hash upper 16-bits + * Flex-field 4: AUX0 + * Flex-field 5: AUX1 + */ +struct ice_32b_rx_flex_desc_comms_ovs { + /* Qword 0 */ + u8 rxdid; + u8 mir_id_umb_cast; + __le16 ptype_flexi_flags0; + __le16 pkt_len; + __le16 hdr_len_sph_flex_flags1; + + /* Qword 1 */ + __le16 status_error0; + __le16 l2tag1; + __le32 flow_id; + + /* Qword 2 */ + __le16 status_error1; + u8 flexi_flags2; + u8 ts_low; + __le16 l2tag2_1st; + __le16 l2tag2_2nd; + + /* Qword 3 */ + __le32 rss_hash; + union { + struct { + __le16 aux0; + __le16 aux1; + } flex; + __le32 ts_high; + } flex_ts; +}; + int ice_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, uint16_t nb_desc, diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c index b653805160..96f6f2e10f 100644 --- a/drivers/net/ice/ice_rxtx_vec_avx2.c +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c @@ -191,8 +191,8 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, const __m256i shuf_msk = _mm256_set_epi8 (/* first descriptor */ - 15, 14, - 13, 12, /* octet 12~15, 32 bits rss */ + 0xFF, 0xFF, + 0xFF, 0xFF, /* rss hash parsed separately */ 11, 10, /* octet 10~11, 16 bits vlan_macip */ 5, 4, /* octet 4~5, 16 bits data_len */ 0xFF, 0xFF, /* skip hi 16 bits pkt_len, zero out */ @@ -200,8 +200,8 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, 0xFF, 0xFF, /* pkt_type set as unknown */ 0xFF, 0xFF, /*pkt_type set as unknown */ /* second descriptor */ - 15, 14, - 13, 12, /* octet 12~15, 32 bits rss */ + 0xFF, 0xFF, + 0xFF, 0xFF, /* rss hash parsed separately */ 11, 10, /* octet 10~11, 16 bits vlan_macip */ 5, 4, /* octet 4~5, 16 bits data_len */ 0xFF, 0xFF, /* skip hi 16 bits pkt_len, zero out */ @@ -461,6 +461,96 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* merge flags */ const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags, rss_vlan_flags); + +#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC + /** + * needs to load 2nd 16B of each desc for RSS hash parsing, + * will cause performance drop to get into this context. + */ + if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_RSS_HASH) { + /* load bottom half of every 32B desc */ + const __m128i raw_desc_bh7 = + _mm_load_si128 + ((void *)(&rxdp[7].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh6 = + _mm_load_si128 + ((void *)(&rxdp[6].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh5 = + _mm_load_si128 + ((void *)(&rxdp[5].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh4 = + _mm_load_si128 + ((void *)(&rxdp[4].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh3 = + _mm_load_si128 + ((void *)(&rxdp[3].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh2 = + _mm_load_si128 + ((void *)(&rxdp[2].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh1 = + _mm_load_si128 + ((void *)(&rxdp[1].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh0 = + _mm_load_si128 + ((void *)(&rxdp[0].wb.status_error1)); + + __m256i raw_desc_bh6_7 = + _mm256_inserti128_si256 + (_mm256_castsi128_si256(raw_desc_bh6), + raw_desc_bh7, 1); + __m256i raw_desc_bh4_5 = + _mm256_inserti128_si256 + (_mm256_castsi128_si256(raw_desc_bh4), + raw_desc_bh5, 1); + __m256i raw_desc_bh2_3 = + _mm256_inserti128_si256 + (_mm256_castsi128_si256(raw_desc_bh2), + raw_desc_bh3, 1); + __m256i raw_desc_bh0_1 = + _mm256_inserti128_si256 + (_mm256_castsi128_si256(raw_desc_bh0), + raw_desc_bh1, 1); + + /** + * to shift the 32b RSS hash value to the + * highest 32b of each 128b before mask + */ + __m256i rss_hash6_7 = + _mm256_slli_epi64(raw_desc_bh6_7, 32); + __m256i rss_hash4_5 = + _mm256_slli_epi64(raw_desc_bh4_5, 32); + __m256i rss_hash2_3 = + _mm256_slli_epi64(raw_desc_bh2_3, 32); + __m256i rss_hash0_1 = + _mm256_slli_epi64(raw_desc_bh0_1, 32); + + __m256i rss_hash_msk = + _mm256_set_epi32(0xFFFFFFFF, 0, 0, 0, + 0xFFFFFFFF, 0, 0, 0); + + rss_hash6_7 = _mm256_and_si256 + (rss_hash6_7, rss_hash_msk); + rss_hash4_5 = _mm256_and_si256 + (rss_hash4_5, rss_hash_msk); + rss_hash2_3 = _mm256_and_si256 + (rss_hash2_3, rss_hash_msk); + rss_hash0_1 = _mm256_and_si256 + (rss_hash0_1, rss_hash_msk); + + mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7); + mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5); + mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3); + mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1); + } /* if() on RSS hash parsing */ +#endif /** * At this point, we have the 8 sets of flags in the low 16-bits * of each 32-bit value in vlan0. diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c index 382ef31f3a..fffb27138a 100644 --- a/drivers/net/ice/ice_rxtx_vec_sse.c +++ b/drivers/net/ice/ice_rxtx_vec_sse.c @@ -230,7 +230,8 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, const __m128i zero = _mm_setzero_si128(); /* mask to shuffle from desc. to mbuf */ const __m128i shuf_msk = _mm_set_epi8 - (15, 14, 13, 12, /* octet 12~15, 32 bits rss */ + (0xFF, 0xFF, + 0xFF, 0xFF, /* rss hash parsed separately */ 11, 10, /* octet 10~11, 16 bits vlan_macip */ 5, 4, /* octet 4~5, 16 bits data_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ @@ -321,7 +322,7 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, pos += ICE_DESCS_PER_LOOP, rxdp += ICE_DESCS_PER_LOOP) { __m128i descs[ICE_DESCS_PER_LOOP]; - __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; + __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; __m128i staterr, sterr_tmp1, sterr_tmp2; /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */ __m128i mbp1; @@ -367,8 +368,12 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, rte_compiler_barrier(); /* D.1 pkt 3,4 convert format from desc to pktmbuf */ - pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk); - pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk); + pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk); + pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk); + + /* D.1 pkt 1,2 convert format from desc to pktmbuf */ + pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk); + pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]); @@ -378,12 +383,68 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ - pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust); pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust); + pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); - /* D.1 pkt 1,2 convert format from desc to pktmbuf */ - pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk); - pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk); + /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ + pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust); + pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust); + +#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC + /** + * needs to load 2nd 16B of each desc for RSS hash parsing, + * will cause performance drop to get into this context. + */ + if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads & + DEV_RX_OFFLOAD_RSS_HASH) { + /* load bottom half of every 32B desc */ + const __m128i raw_desc_bh3 = + _mm_load_si128 + ((void *)(&rxdp[3].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh2 = + _mm_load_si128 + ((void *)(&rxdp[2].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh1 = + _mm_load_si128 + ((void *)(&rxdp[1].wb.status_error1)); + rte_compiler_barrier(); + const __m128i raw_desc_bh0 = + _mm_load_si128 + ((void *)(&rxdp[0].wb.status_error1)); + + /** + * to shift the 32b RSS hash value to the + * highest 32b of each 128b before mask + */ + __m128i rss_hash3 = + _mm_slli_epi64(raw_desc_bh3, 32); + __m128i rss_hash2 = + _mm_slli_epi64(raw_desc_bh2, 32); + __m128i rss_hash1 = + _mm_slli_epi64(raw_desc_bh1, 32); + __m128i rss_hash0 = + _mm_slli_epi64(raw_desc_bh0, 32); + + __m128i rss_hash_msk = + _mm_set_epi32(0xFFFFFFFF, 0, 0, 0); + + rss_hash3 = _mm_and_si128 + (rss_hash3, rss_hash_msk); + rss_hash2 = _mm_and_si128 + (rss_hash2, rss_hash_msk); + rss_hash1 = _mm_and_si128 + (rss_hash1, rss_hash_msk); + rss_hash0 = _mm_and_si128 + (rss_hash0, rss_hash_msk); + + pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3); + pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2); + pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1); + pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0); + } /* if() on RSS hash parsing */ +#endif /* C.2 get 4 pkts staterr value */ staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2); @@ -391,14 +452,10 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* D.3 copy final 3,4 data to rx_pkts */ _mm_storeu_si128 ((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, - pkt_mb4); + pkt_mb3); _mm_storeu_si128 ((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, - pkt_mb3); - - /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ - pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); - pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust); + pkt_mb2); /* C* extract and record EOP bit */ if (split_packet) { @@ -422,9 +479,9 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* D.3 copy final 1,2 data to rx_pkts */ _mm_storeu_si128 ((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, - pkt_mb2); + pkt_mb1); _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, - pkt_mb1); + pkt_mb0); ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl); /* C.4 calc avaialbe number of desc */ var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); -- 2.20.1