X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fice%2Fice_rxtx_vec_sse.c;h=653bd28b417c1a67bd2b1ec2aca466c81b0edf55;hb=7fe741821337f3cbeecac768b8ef3a16bf21c938;hp=965cd8b261f75e0ca248fb3e3ea9b2b55adc339a;hpb=dd4245f079dd5edd088369e3aca308c508a9031f;p=dpdk.git diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c index 965cd8b261..653bd28b41 100644 --- a/drivers/net/ice/ice_rxtx_vec_sse.c +++ b/drivers/net/ice/ice_rxtx_vec_sse.c @@ -97,7 +97,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq) (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); /* Update the tail pointer on the NIC */ - ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id); + ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id); } static inline void @@ -114,39 +114,67 @@ ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4], * bit12 for RSS indication. * bit13 for VLAN indication. */ - const __m128i desc_mask = _mm_set_epi32(0x3070, 0x3070, - 0x3070, 0x3070); - + const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0, + 0x30f0, 0x30f0); const __m128i cksum_mask = _mm_set_epi32(PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK | - PKT_RX_EIP_CKSUM_BAD, + PKT_RX_OUTER_L4_CKSUM_MASK | + PKT_RX_OUTER_IP_CKSUM_BAD, PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK | - PKT_RX_EIP_CKSUM_BAD, + PKT_RX_OUTER_L4_CKSUM_MASK | + PKT_RX_OUTER_IP_CKSUM_BAD, PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK | - PKT_RX_EIP_CKSUM_BAD, + PKT_RX_OUTER_L4_CKSUM_MASK | + PKT_RX_OUTER_IP_CKSUM_BAD, PKT_RX_IP_CKSUM_MASK | PKT_RX_L4_CKSUM_MASK | - PKT_RX_EIP_CKSUM_BAD); + PKT_RX_OUTER_L4_CKSUM_MASK | + PKT_RX_OUTER_IP_CKSUM_BAD); /* map the checksum, rss and vlan fields to the checksum, rss * and vlan flag */ - const __m128i cksum_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | - PKT_RX_IP_CKSUM_BAD) >> 1, - (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | - PKT_RX_IP_CKSUM_GOOD) >> 1, - (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | - PKT_RX_IP_CKSUM_BAD) >> 1, - (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | - PKT_RX_IP_CKSUM_GOOD) >> 1, - (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, - (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1, - (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1, - (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1); + const __m128i cksum_flags = + _mm_set_epi8((PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | + PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | + PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_BAD | + PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_BAD | + PKT_RX_IP_CKSUM_GOOD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_BAD >> 20 | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_IP_CKSUM_GOOD) >> 1, + /** + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_OUTER_IP_CKSUM_BAD | + PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_BAD | + PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_BAD | + PKT_RX_IP_CKSUM_GOOD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_IP_CKSUM_BAD) >> 1, + (PKT_RX_OUTER_L4_CKSUM_GOOD >> 20 | PKT_RX_L4_CKSUM_GOOD | + PKT_RX_IP_CKSUM_GOOD) >> 1); const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, @@ -159,13 +187,21 @@ ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4], flags = _mm_unpackhi_epi32(descs[0], descs[1]); tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]); tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc); - tmp_desc = _mm_and_si128(flags, desc_mask); + tmp_desc = _mm_and_si128(tmp_desc, desc_mask); /* checksum flags */ tmp_desc = _mm_srli_epi32(tmp_desc, 4); flags = _mm_shuffle_epi8(cksum_flags, tmp_desc); /* then we shift left 1 bit */ flags = _mm_slli_epi32(flags, 1); + + __m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6); + __m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask); + l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20); + + __m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6); + __m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask); + flags = _mm_or_si128(l3_l4_flags, l4_outer_flags); /* we need to mask out the reduntant bits introduced by RSS or * VLAN fields. */ @@ -217,10 +253,10 @@ ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4], * appropriate flags means that we have to do a shift and blend for * each mbuf before we do the write. */ - rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x10); - rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x10); - rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x10); - rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x10); + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30); /* write the rearm data and the olflags in one write */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != @@ -237,10 +273,10 @@ static inline void ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts, uint32_t *ptype_tbl) { - const __m128i ptype_mask = _mm_set_epi16(0, ICE_RX_FLEX_DESC_PTYPE_M, - 0, ICE_RX_FLEX_DESC_PTYPE_M, - 0, ICE_RX_FLEX_DESC_PTYPE_M, - 0, ICE_RX_FLEX_DESC_PTYPE_M); + const __m128i ptype_mask = _mm_set_epi16(ICE_RX_FLEX_DESC_PTYPE_M, 0, + ICE_RX_FLEX_DESC_PTYPE_M, 0, + ICE_RX_FLEX_DESC_PTYPE_M, 0, + ICE_RX_FLEX_DESC_PTYPE_M, 0); __m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]); __m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]); __m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23); @@ -254,10 +290,11 @@ ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts, } /** + * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP) + * * Notice: * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST - * numbers of DD bits + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two */ static inline uint16_t _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, @@ -314,9 +351,6 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); - /* nb_pkts shall be less equal than ICE_MAX_RX_BURST */ - nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST); - /* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP); @@ -382,7 +416,7 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */ mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]); /* Read desc statuses backwards to avoid race condition */ - /* A.1 load 4 pkts desc */ + /* A.1 load desc[3] */ descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3)); rte_compiler_barrier(); @@ -394,9 +428,9 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]); #endif + /* A.1 load desc[2-0] */ descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2)); rte_compiler_barrier(); - /* B.1 load 2 mbuf point */ descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1)); rte_compiler_barrier(); descs[0] = _mm_loadu_si128((__m128i *)(rxdp)); @@ -444,7 +478,7 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, * needs to load 2nd 16B of each desc for RSS hash parsing, * will cause performance drop to get into this context. */ - if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads & + if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_RSS_HASH) { /* load bottom half of every 32B desc */ const __m128i raw_desc_bh3 = @@ -511,7 +545,7 @@ _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts, /* and with mask to extract bits, flipping 1-0 */ __m128i eop_bits = _mm_andnot_si128(staterr, eop_check); /* the staterr values are not in order, as the count - * count of dd bits doesn't care. However, for end of + * of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. This also * compresses the 32-bit values to 8-bit */ @@ -560,15 +594,15 @@ ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); } -/* vPMD receive routine that reassembles scattered packets +/** + * vPMD receive routine that reassembles single burst of 32 scattered packets + * * Notice: * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet - * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST - * numbers of DD bits */ -uint16_t -ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, - uint16_t nb_pkts) +static uint16_t +ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) { struct ice_rx_queue *rxq = rx_queue; uint8_t split_flags[ICE_VPMD_RX_BURST] = {0}; @@ -602,6 +636,32 @@ ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, &split_flags[i]); } +/** + * vPMD receive routine that reassembles scattered packets. + */ +uint16_t +ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + uint16_t retval = 0; + + while (nb_pkts > ICE_VPMD_RX_BURST) { + uint16_t burst; + + burst = ice_recv_scattered_burst_vec(rx_queue, + rx_pkts + retval, + ICE_VPMD_RX_BURST); + retval += burst; + nb_pkts -= burst; + if (burst < ICE_VPMD_RX_BURST) + return retval; + } + + return retval + ice_recv_scattered_burst_vec(rx_queue, + rx_pkts + retval, + nb_pkts); +} + static inline void ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags) @@ -642,7 +702,7 @@ ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh); if (txq->nb_tx_free < txq->tx_free_thresh) - ice_tx_free_bufs(txq); + ice_tx_free_bufs_vec(txq); nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts); nb_commit = nb_pkts; @@ -689,7 +749,7 @@ ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, txq->tx_tail = tx_id; - ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); + ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail); return nb_pkts; }