X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_rxtx_vec_neon.h;h=aa36df29a099374454a220e4bf9c001774525933;hb=46c6714ffd4326cd9ea884a9812a459a444f464a;hp=58e4556890abd1a9f339da9a682ee61121bb39a5;hpb=1ded26239aa0552fc95d553c0eb2b0888760f279;p=dpdk.git

diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 58e4556890..aa36df29a0 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -111,7 +111,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
 		0, 0
 	};
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+	uint32x4_t ol_flags = {0, 0, 0, 0};
+	uint32x4_t ol_flags_mask = {0, 0, 0, 0};
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	uint32_t rcvd_byte = 0;
 #endif
@@ -198,11 +199,139 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
 #endif
 		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+			if (rxq->mcqe_format !=
+			    MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) {
+				const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+
+				/* E.1 store flow tag (rte_flow mark). */
+				elts[pos]->hash.fdir.hi = flow_tag;
+				elts[pos + 1]->hash.fdir.hi = flow_tag;
+				elts[pos + 2]->hash.fdir.hi = flow_tag;
+				elts[pos + 3]->hash.fdir.hi = flow_tag;
+			} else {
+				const uint32x4_t flow_mark_adj = {
+					-1, -1, -1, -1 };
+				const uint8x16_t flow_mark_shuf = {
+					28, 24, 25, -1,
+					20, 16, 17, -1,
+					12, 8, 9, -1,
+					4, 0, 1, -1};
+				/* Extract flow_tag field. */
+				const uint32x4_t ft_mask =
+					vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
+				const uint32x4_t fdir_flags =
+					vdupq_n_u32(RTE_MBUF_F_RX_FDIR);
+				const uint32x4_t fdir_all_flags =
+					vdupq_n_u32(RTE_MBUF_F_RX_FDIR |
+						    RTE_MBUF_F_RX_FDIR_ID);
+				uint32x4_t fdir_id_flags =
+					vdupq_n_u32(RTE_MBUF_F_RX_FDIR_ID);
+				uint32x4_t invalid_mask, ftag;
+
+				__asm__ volatile
+				/* A.1 load mCQEs into a 128bit register. */
+				("ld1 {v16.16b - v17.16b}, [%[mcq]]\n\t"
+				/* Extract flow_tag. */
+				 "tbl %[ftag].16b, {v16.16b - v17.16b}, %[flow_mark_shuf].16b\n\t"
+				 : [ftag]"=&w"(ftag)
+				 : [mcq]"r"(p),
+				   [flow_mark_shuf]"w"(flow_mark_shuf)
+				 : "memory", "v16", "v17");
+				invalid_mask = vceqzq_u32(ftag);
+				ol_flags_mask = vorrq_u32(ol_flags_mask,
+							  fdir_all_flags);
+				/* Set RTE_MBUF_F_RX_FDIR if flow tag is non-zero. */
+				ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(fdir_flags, invalid_mask));
+				/* Mask out invalid entries. */
+				fdir_id_flags = vbicq_u32(fdir_id_flags,
+							  invalid_mask);
+				/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+				ol_flags = vorrq_u32(ol_flags,
+					vbicq_u32(fdir_id_flags,
+						  vceqq_u32(ftag, ft_mask)));
+				ftag = vaddq_u32(ftag, flow_mark_adj);
+				elts[pos]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 3);
+				elts[pos + 1]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 2);
+				elts[pos + 2]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 1);
+				elts[pos + 3]->hash.fdir.hi =
+					vgetq_lane_u32(ftag, 0);
+			}
+		}
+		if (unlikely(rxq->mcqe_format !=
+			     MLX5_CQE_RESP_FORMAT_HASH)) {
+			if (rxq->mcqe_format ==
+			    MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+				const uint8_t pkt_info =
+					(cq->pkt_info & 0x3) << 6;
+				const uint8_t pkt_hdr0 =
+					mcq[pos % 8].hdr_type;
+				const uint8_t pkt_hdr1 =
+					mcq[pos % 8 + 1].hdr_type;
+				const uint8_t pkt_hdr2 =
+					mcq[pos % 8 + 2].hdr_type;
+				const uint8_t pkt_hdr3 =
+					mcq[pos % 8 + 3].hdr_type;
+				const uint32x4_t vlan_mask =
+					vdupq_n_u32(RTE_MBUF_F_RX_VLAN |
+						    RTE_MBUF_F_RX_VLAN_STRIPPED);
+				const uint32x4_t cv_mask =
+					vdupq_n_u32(MLX5_CQE_VLAN_STRIPPED);
+				const uint32x4_t pkt_cv = {
+					pkt_hdr0 & 0x1, pkt_hdr1 & 0x1,
+					pkt_hdr2 & 0x1, pkt_hdr3 & 0x1};
+
+				ol_flags_mask = vorrq_u32(ol_flags_mask,
+							  vlan_mask);
+				ol_flags = vorrq_u32(ol_flags,
+					vandq_u32(vlan_mask,
+					vceqq_u32(pkt_cv, cv_mask)));
+				elts[pos]->packet_type =
+					mlx5_ptype_table[(pkt_hdr0 >> 2) |
+							 pkt_info];
+				elts[pos + 1]->packet_type =
+					mlx5_ptype_table[(pkt_hdr1 >> 2) |
+							 pkt_info];
+				elts[pos + 2]->packet_type =
+					mlx5_ptype_table[(pkt_hdr2 >> 2) |
+							 pkt_info];
+				elts[pos + 3]->packet_type =
+					mlx5_ptype_table[(pkt_hdr3 >> 2) |
+							 pkt_info];
+				if (rxq->tunnel) {
+					elts[pos]->packet_type |=
+						!!(((pkt_hdr0 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 1]->packet_type |=
+						!!(((pkt_hdr1 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 2]->packet_type |=
+						!!(((pkt_hdr2 >> 2) |
+						pkt_info) & (1 << 6));
+					elts[pos + 3]->packet_type |=
+						!!(((pkt_hdr3 >> 2) |
+						pkt_info) & (1 << 6));
+				}
+			}
+			const uint32x4_t hash_flags =
+				vdupq_n_u32(RTE_MBUF_F_RX_RSS_HASH);
+			const uint32x4_t rearm_flags =
+				vdupq_n_u32((uint32_t)t_pkt->ol_flags);
+
+			ol_flags_mask = vorrq_u32(ol_flags_mask, hash_flags);
+			ol_flags = vorrq_u32(ol_flags,
+				vbicq_u32(rearm_flags, ol_flags_mask));
+			elts[pos]->ol_flags = vgetq_lane_u32(ol_flags, 3);
+			elts[pos + 1]->ol_flags = vgetq_lane_u32(ol_flags, 2);
+			elts[pos + 2]->ol_flags = vgetq_lane_u32(ol_flags, 1);
+			elts[pos + 3]->ol_flags = vgetq_lane_u32(ol_flags, 0);
+			elts[pos]->hash.rss = 0;
+			elts[pos + 1]->hash.rss = 0;
+			elts[pos + 2]->hash.rss = 0;
+			elts[pos + 3]->hash.rss = 0;
 		}
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
@@ -240,7 +369,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	rxq->stats.ipackets += mcqe_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	rxq->cq_ci += mcqe_n;
 	return mcqe_n;
 }
 
@@ -268,22 +396,22 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
 	uint16x4_t ptype;
 	uint32x4_t pinfo, cv_flags;
 	uint32x4_t ol_flags =
-		vdupq_n_u32(rxq->rss_hash * PKT_RX_RSS_HASH |
+		vdupq_n_u32(rxq->rss_hash * RTE_MBUF_F_RX_RSS_HASH |
 			    rxq->hw_timestamp * rxq->timestamp_rx_flag);
 	const uint32x4_t ptype_ol_mask = { 0x106, 0x106, 0x106, 0x106 };
 	const uint8x16_t cv_flag_sel = {
 		0,
-		(uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
-		(uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
+		(uint8_t)(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED),
+		(uint8_t)(RTE_MBUF_F_RX_IP_CKSUM_GOOD >> 1),
 		0,
-		(uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
+		(uint8_t)(RTE_MBUF_F_RX_L4_CKSUM_GOOD >> 1),
 		0,
-		(uint8_t)((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
+		(uint8_t)((RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1),
 		0, 0, 0, 0, 0, 0, 0, 0, 0
 	};
 	const uint32x4_t cv_mask =
-		vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			    PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
+		vdupq_n_u32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+			    RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
 	const uint64x2_t mbuf_init = vld1q_u64
 		((const uint64_t *)&rxq->mbuf_initializer);
 	uint64x2_t rearm0, rearm1, rearm2, rearm3;
@@ -291,11 +419,11 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
 	if (rxq->mark) {
 		const uint32x4_t ft_def =
 			vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT);
-		const uint32x4_t fdir_flags = vdupq_n_u32(PKT_RX_FDIR);
-		uint32x4_t fdir_id_flags = vdupq_n_u32(PKT_RX_FDIR_ID);
+		const uint32x4_t fdir_flags = vdupq_n_u32(RTE_MBUF_F_RX_FDIR);
+		uint32x4_t fdir_id_flags = vdupq_n_u32(RTE_MBUF_F_RX_FDIR_ID);
 		uint32x4_t invalid_mask;
 
-		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
+		/* Check if flow tag is non-zero then set RTE_MBUF_F_RX_FDIR. */
 		invalid_mask = vceqzq_u32(flow_tag);
 		ol_flags = vorrq_u32(ol_flags,
 				     vbicq_u32(fdir_flags, invalid_mask));
@@ -465,7 +593,7 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 * there's no instruction to count trailing zeros. __builtin_clzl() is
 	 * used instead.
 	 *
-	 * A. copy 4 mbuf pointers from elts ring to returing pkts.
+	 * A. copy 4 mbuf pointers from elts ring to returning pkts.
 	 * B. load 64B CQE and extract necessary fields
 	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
 	 *    following structure:
@@ -639,16 +767,15 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
 					  comp_mask), 0)) /
 					  (sizeof(uint16_t) * 8);
-		/* D.6 mask out entries after the compressed CQE. */
-		mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?
-				   -1UL >> (comp_idx * sizeof(uint16_t) * 8) :
-				   0);
-		invalid_mask = vorr_u16(invalid_mask, mask);
+		invalid_mask = vorr_u16(invalid_mask, comp_mask);
 		/* D.7 count non-compressed valid CQEs. */
 		n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
 				   invalid_mask), 0)) / (sizeof(uint16_t) * 8);
 		nocmp_n += n;
-		/* D.2 get the final invalid mask. */
+		/*
+		 * D.2 mask out entries after the compressed CQE.
+		 * get the final invalid mask.
+		 */
 		mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?
 				   -1UL >> (n * sizeof(uint16_t) * 8) : 0);
 		invalid_mask = vorr_u16(invalid_mask, mask);
@@ -704,19 +831,24 @@ rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		if (rxq->dynf_meta) {
 			/* This code is subject for futher optimization. */
 			int32_t offs = rxq->flow_meta_offset;
+			uint32_t mask = rxq->flow_meta_port_mask;
 
 			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
-				container_of(p0, struct mlx5_cqe,
-					     pkt_info)->flow_table_metadata;
+				rte_be_to_cpu_32(container_of
+				(p0, struct mlx5_cqe,
+				pkt_info)->flow_table_metadata) & mask;
 			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
-				container_of(p1, struct mlx5_cqe,
-					     pkt_info)->flow_table_metadata;
+				rte_be_to_cpu_32(container_of
+				(p1, struct mlx5_cqe,
+				pkt_info)->flow_table_metadata) & mask;
 			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
-				container_of(p2, struct mlx5_cqe,
-					     pkt_info)->flow_table_metadata;
+				rte_be_to_cpu_32(container_of
+				(p2, struct mlx5_cqe,
+				pkt_info)->flow_table_metadata) & mask;
 			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
-				container_of(p3, struct mlx5_cqe,
-					     pkt_info)->flow_table_metadata;
+				rte_be_to_cpu_32(container_of
+				(p3, struct mlx5_cqe,
+				pkt_info)->flow_table_metadata) & mask;
 			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
 				elts[pos]->ol_flags |= rxq->flow_meta_mask;
 			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))