X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_rxtx_vec_neon.h;h=4c067d8801f851be646665b60ff4157d3e767ac3;hb=bb5d49c6165b0c6ac32499ddc3e7fee41cfda51d;hp=1c7e3b444a8685f37179fff6fe9f9ec2ac28d207;hpb=957e45fb7bcbcfa1dec7ebca2ffe3a811c5bafca;p=dpdk.git diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index 1c7e3b444a..4c067d8801 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -6,7 +6,6 @@ #ifndef RTE_PMD_MLX5_RXTX_VEC_NEON_H_ #define RTE_PMD_MLX5_RXTX_VEC_NEON_H_ -#include #include #include #include @@ -16,320 +15,30 @@ #include #include +#include + +#include "mlx5_defs.h" #include "mlx5.h" #include "mlx5_utils.h" #include "mlx5_rxtx.h" #include "mlx5_rxtx_vec.h" #include "mlx5_autoconf.h" -#include "mlx5_defs.h" -#include "mlx5_prm.h" #pragma GCC diagnostic ignored "-Wcast-qual" -/** - * Fill in buffer descriptors in a multi-packet send descriptor. - * - * @param txq - * Pointer to TX queue structure. - * @param dseg - * Pointer to buffer descriptor to be written. - * @param pkts - * Pointer to array of packets to be sent. - * @param n - * Number of packets to be filled. - */ -static inline void -txq_wr_dseg_v(struct mlx5_txq_data *txq, uint8_t *dseg, - struct rte_mbuf **pkts, unsigned int n) -{ - unsigned int pos; - uintptr_t addr; - const uint8x16_t dseg_shuf_m = { - 3, 2, 1, 0, /* length, bswap32 */ - 4, 5, 6, 7, /* lkey */ - 15, 14, 13, 12, /* addr, bswap64 */ - 11, 10, 9, 8 - }; -#ifdef MLX5_PMD_SOFT_COUNTERS - uint32_t tx_byte = 0; -#endif - - for (pos = 0; pos < n; ++pos, dseg += MLX5_WQE_DWORD_SIZE) { - uint8x16_t desc; - struct rte_mbuf *pkt = pkts[pos]; - - addr = rte_pktmbuf_mtod(pkt, uintptr_t); - desc = vreinterpretq_u8_u32((uint32x4_t) { - DATA_LEN(pkt), - mlx5_tx_mb2mr(txq, pkt), - addr, - addr >> 32 }); - desc = vqtbl1q_u8(desc, dseg_shuf_m); - vst1q_u8(dseg, desc); -#ifdef MLX5_PMD_SOFT_COUNTERS - tx_byte += DATA_LEN(pkt); -#endif - } -#ifdef MLX5_PMD_SOFT_COUNTERS - txq->stats.obytes += tx_byte; -#endif -} - -/** - * Send multi-segmented packets until it encounters a single segment packet in - * the pkts list. - * - * @param txq - * Pointer to TX queue structure. - * @param pkts - * Pointer to array of packets to be sent. - * @param pkts_n - * Number of packets to be sent. - * - * @return - * Number of packets successfully transmitted (<= pkts_n). - */ -static uint16_t -txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, - uint16_t pkts_n) -{ - uint16_t elts_head = txq->elts_head; - const uint16_t elts_n = 1 << txq->elts_n; - const uint16_t elts_m = elts_n - 1; - const uint16_t wq_n = 1 << txq->wqe_n; - const uint16_t wq_mask = wq_n - 1; - const unsigned int nb_dword_per_wqebb = - MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE; - const unsigned int nb_dword_in_hdr = - sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE; - unsigned int n; - volatile struct mlx5_wqe *wqe = NULL; - bool metadata_ol = - txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false; - - assert(elts_n > pkts_n); - mlx5_tx_complete(txq); - if (unlikely(!pkts_n)) - return 0; - for (n = 0; n < pkts_n; ++n) { - struct rte_mbuf *buf = pkts[n]; - unsigned int segs_n = buf->nb_segs; - unsigned int ds = nb_dword_in_hdr; - unsigned int len = PKT_LEN(buf); - uint16_t wqe_ci = txq->wqe_ci; - const uint8x16_t ctrl_shuf_m = { - 3, 2, 1, 0, /* bswap32 */ - 7, 6, 5, 4, /* bswap32 */ - 11, 10, 9, 8, /* bswap32 */ - 12, 13, 14, 15 - }; - uint8_t cs_flags; - uint16_t max_elts; - uint16_t max_wqe; - uint8x16_t *t_wqe; - uint8_t *dseg; - uint8x16_t ctrl; - rte_be32_t metadata = - metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ? - buf->tx_metadata : 0; - - assert(segs_n); - max_elts = elts_n - (elts_head - txq->elts_tail); - max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi); - /* - * A MPW session consumes 2 WQEs at most to - * include MLX5_MPW_DSEG_MAX pointers. - */ - if (segs_n == 1 || - max_elts < segs_n || max_wqe < 2) - break; - wqe = &((volatile struct mlx5_wqe64 *) - txq->wqes)[wqe_ci & wq_mask].hdr; - cs_flags = txq_ol_cksum_to_cs(buf); - /* Title WQEBB pointer. */ - t_wqe = (uint8x16_t *)wqe; - dseg = (uint8_t *)(wqe + 1); - do { - if (!(ds++ % nb_dword_per_wqebb)) { - dseg = (uint8_t *) - &((volatile struct mlx5_wqe64 *) - txq->wqes)[++wqe_ci & wq_mask]; - } - txq_wr_dseg_v(txq, dseg, &buf, 1); - dseg += MLX5_WQE_DWORD_SIZE; - (*txq->elts)[elts_head++ & elts_m] = buf; - buf = buf->next; - } while (--segs_n); - ++wqe_ci; - /* Fill CTRL in the header. */ - ctrl = vreinterpretq_u8_u32((uint32x4_t) { - MLX5_OPC_MOD_MPW << 24 | - txq->wqe_ci << 8 | MLX5_OPCODE_TSO, - txq->qp_num_8s | ds, 4, 0}); - ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m); - vst1q_u8((void *)t_wqe, ctrl); - /* Fill ESEG in the header. */ - vst1q_u32((void *)(t_wqe + 1), - ((uint32x4_t){ 0, - rte_cpu_to_be_16(len) << 16 | cs_flags, - metadata, 0 })); - txq->wqe_ci = wqe_ci; - } - if (!n) - return 0; - txq->elts_comp += (uint16_t)(elts_head - txq->elts_head); - txq->elts_head = elts_head; - if (txq->elts_comp >= MLX5_TX_COMP_THRESH) { - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); - wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS << - MLX5_COMP_MODE_OFFSET); - wqe->ctrl[3] = txq->elts_head; - txq->elts_comp = 0; - } -#ifdef MLX5_PMD_SOFT_COUNTERS - txq->stats.opackets += n; -#endif - mlx5_tx_dbrec(txq, wqe); - return n; -} - -/** - * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet, - * it returns to make it processed by txq_scatter_v(). All the packets in - * the pkts list should be single segment packets having same offload flags. - * This must be checked by txq_count_contig_single_seg() and txq_calc_offload(). - * - * @param txq - * Pointer to TX queue structure. - * @param pkts - * Pointer to array of packets to be sent. - * @param pkts_n - * Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST). - * @param cs_flags - * Checksum offload flags to be written in the descriptor. - * @param metadata - * Metadata value to be written in the descriptor. - * - * @return - * Number of packets successfully transmitted (<= pkts_n). - */ -static inline uint16_t -txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint8_t cs_flags, rte_be32_t metadata) -{ - struct rte_mbuf **elts; - uint16_t elts_head = txq->elts_head; - const uint16_t elts_n = 1 << txq->elts_n; - const uint16_t elts_m = elts_n - 1; - const unsigned int nb_dword_per_wqebb = - MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE; - const unsigned int nb_dword_in_hdr = - sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE; - unsigned int n = 0; - unsigned int pos; - uint16_t max_elts; - uint16_t max_wqe; - uint32_t comp_req; - const uint16_t wq_n = 1 << txq->wqe_n; - const uint16_t wq_mask = wq_n - 1; - uint16_t wq_idx = txq->wqe_ci & wq_mask; - volatile struct mlx5_wqe64 *wq = - &((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx]; - volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq; - const uint8x16_t ctrl_shuf_m = { - 3, 2, 1, 0, /* bswap32 */ - 7, 6, 5, 4, /* bswap32 */ - 11, 10, 9, 8, /* bswap32 */ - 12, 13, 14, 15 - }; - uint8x16_t *t_wqe; - uint8_t *dseg; - uint8x16_t ctrl; - - /* Make sure all packets can fit into a single WQE. */ - assert(elts_n > pkts_n); - mlx5_tx_complete(txq); - max_elts = (elts_n - (elts_head - txq->elts_tail)); - max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi); - pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts); - if (unlikely(!pkts_n)) - return 0; - elts = &(*txq->elts)[elts_head & elts_m]; - /* Loop for available tailroom first. */ - n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n); - for (pos = 0; pos < (n & -2); pos += 2) - vst1q_u64((void *)&elts[pos], vld1q_u64((void *)&pkts[pos])); - if (n & 1) - elts[pos] = pkts[pos]; - /* Check if it crosses the end of the queue. */ - if (unlikely(n < pkts_n)) { - elts = &(*txq->elts)[0]; - for (pos = 0; pos < pkts_n - n; ++pos) - elts[pos] = pkts[n + pos]; - } - txq->elts_head += pkts_n; - /* Save title WQEBB pointer. */ - t_wqe = (uint8x16_t *)wqe; - dseg = (uint8_t *)(wqe + 1); - /* Calculate the number of entries to the end. */ - n = RTE_MIN( - (wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr, - pkts_n); - /* Fill DSEGs. */ - txq_wr_dseg_v(txq, dseg, pkts, n); - /* Check if it crosses the end of the queue. */ - if (n < pkts_n) { - dseg = (uint8_t *)txq->wqes; - txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n); - } - if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) { - txq->elts_comp += pkts_n; - comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET; - } else { - /* A CQE slot must always be available. */ - assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci)); - /* Request a completion. */ - txq->elts_comp = 0; - comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET; - } - /* Fill CTRL in the header. */ - ctrl = vreinterpretq_u8_u32((uint32x4_t) { - MLX5_OPC_MOD_ENHANCED_MPSW << 24 | - txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW, - txq->qp_num_8s | (pkts_n + 2), - comp_req, - txq->elts_head }); - ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m); - vst1q_u8((void *)t_wqe, ctrl); - /* Fill ESEG in the header. */ - vst1q_u32((void *)(t_wqe + 1), - ((uint32x4_t) { 0, cs_flags, metadata, 0 })); -#ifdef MLX5_PMD_SOFT_COUNTERS - txq->stats.opackets += pkts_n; -#endif - txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) / - nb_dword_per_wqebb; - /* Ring QP doorbell. */ - mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST); - return pkts_n; -} - /** * Store free buffers to RX SW ring. * - * @param rxq - * Pointer to RX queue structure. + * @param elts + * Pointer to SW ring to be filled. * @param pkts * Pointer to array of packets to be stored. * @param pkts_n * Number of packets to be stored. */ static inline void -rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n) { - const uint16_t q_mask = (1 << rxq->elts_n) - 1; - struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; unsigned int pos; uint16_t p = n & -2; @@ -402,7 +111,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0 }; - const uint32_t flow_tag = t_pkt->hash.fdir.hi; + uint32x4_t ol_flags = {0, 0, 0, 0}; + uint32x4_t ol_flags_mask = {0, 0, 0, 0}; #ifdef MLX5_PMD_SOFT_COUNTERS uint32_t rcvd_byte = 0; #endif @@ -435,8 +145,9 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, sizeof(uint16_t) * 8) : 0); #endif - if (!(pos & 0x7) && pos + 8 < mcqe_n) - rte_prefetch0((void *)(cq + pos + 8)); + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + if (likely(pos + i < mcqe_n)) + rte_prefetch0((void *)(cq + pos + i)); __asm__ volatile ( /* A.1 load mCQEs into a 128bit register. */ "ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t" @@ -488,15 +199,164 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0); #endif if (rxq->mark) { - /* E.1 store flow tag (rte_flow mark). */ - elts[pos]->hash.fdir.hi = flow_tag; - elts[pos + 1]->hash.fdir.hi = flow_tag; - elts[pos + 2]->hash.fdir.hi = flow_tag; - elts[pos + 3]->hash.fdir.hi = flow_tag; + if (rxq->mcqe_format != + MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) { + const uint32_t flow_tag = t_pkt->hash.fdir.hi; + + /* E.1 store flow tag (rte_flow mark). */ + elts[pos]->hash.fdir.hi = flow_tag; + elts[pos + 1]->hash.fdir.hi = flow_tag; + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } else { + const uint32x4_t flow_mark_adj = { + -1, -1, -1, -1 }; + const uint8x16_t flow_mark_shuf = { + 28, 24, 25, -1, + 20, 16, 17, -1, + 12, 8, 9, -1, + 4, 0, 1, -1}; + /* Extract flow_tag field. */ + const uint32x4_t ft_mask = + vdupq_n_u32(MLX5_FLOW_MARK_DEFAULT); + const uint32x4_t fdir_flags = + vdupq_n_u32(PKT_RX_FDIR); + const uint32x4_t fdir_all_flags = + vdupq_n_u32(PKT_RX_FDIR | + PKT_RX_FDIR_ID); + uint32x4_t fdir_id_flags = + vdupq_n_u32(PKT_RX_FDIR_ID); + uint32x4_t invalid_mask, ftag; + + __asm__ volatile + /* A.1 load mCQEs into a 128bit register. */ + ("ld1 {v16.16b - v17.16b}, [%[mcq]]\n\t" + /* Extract flow_tag. */ + "tbl %[ftag].16b, {v16.16b - v17.16b}, %[flow_mark_shuf].16b\n\t" + : [ftag]"=&w"(ftag) + : [mcq]"r"(p), + [flow_mark_shuf]"w"(flow_mark_shuf) + : "memory", "v16", "v17"); + invalid_mask = vceqzq_u32(ftag); + ol_flags_mask = vorrq_u32(ol_flags_mask, + fdir_all_flags); + /* Set PKT_RX_FDIR if flow tag is non-zero. */ + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(fdir_flags, invalid_mask)); + /* Mask out invalid entries. */ + fdir_id_flags = vbicq_u32(fdir_id_flags, + invalid_mask); + /* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */ + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(fdir_id_flags, + vceqq_u32(ftag, ft_mask))); + ftag = vaddq_u32(ftag, flow_mark_adj); + elts[pos]->hash.fdir.hi = + vgetq_lane_u32(ftag, 3); + elts[pos + 1]->hash.fdir.hi = + vgetq_lane_u32(ftag, 2); + elts[pos + 2]->hash.fdir.hi = + vgetq_lane_u32(ftag, 1); + elts[pos + 3]->hash.fdir.hi = + vgetq_lane_u32(ftag, 0); + } + } + if (unlikely(rxq->mcqe_format != + MLX5_CQE_RESP_FORMAT_HASH)) { + if (rxq->mcqe_format == + MLX5_CQE_RESP_FORMAT_L34H_STRIDX) { + const uint8_t pkt_info = + (cq->pkt_info & 0x3) << 6; + const uint8_t pkt_hdr0 = + mcq[pos % 8].hdr_type; + const uint8_t pkt_hdr1 = + mcq[pos % 8 + 1].hdr_type; + const uint8_t pkt_hdr2 = + mcq[pos % 8 + 2].hdr_type; + const uint8_t pkt_hdr3 = + mcq[pos % 8 + 3].hdr_type; + const uint32x4_t vlan_mask = + vdupq_n_u32(PKT_RX_VLAN | + PKT_RX_VLAN_STRIPPED); + const uint32x4_t cv_mask = + vdupq_n_u32(MLX5_CQE_VLAN_STRIPPED); + const uint32x4_t pkt_cv = { + pkt_hdr0 & 0x1, pkt_hdr1 & 0x1, + pkt_hdr2 & 0x1, pkt_hdr3 & 0x1}; + + ol_flags_mask = vorrq_u32(ol_flags_mask, + vlan_mask); + ol_flags = vorrq_u32(ol_flags, + vandq_u32(vlan_mask, + vceqq_u32(pkt_cv, cv_mask))); + elts[pos]->packet_type = + mlx5_ptype_table[(pkt_hdr0 >> 2) | + pkt_info]; + elts[pos + 1]->packet_type = + mlx5_ptype_table[(pkt_hdr1 >> 2) | + pkt_info]; + elts[pos + 2]->packet_type = + mlx5_ptype_table[(pkt_hdr2 >> 2) | + pkt_info]; + elts[pos + 3]->packet_type = + mlx5_ptype_table[(pkt_hdr3 >> 2) | + pkt_info]; + if (rxq->tunnel) { + elts[pos]->packet_type |= + !!(((pkt_hdr0 >> 2) | + pkt_info) & (1 << 6)); + elts[pos + 1]->packet_type |= + !!(((pkt_hdr1 >> 2) | + pkt_info) & (1 << 6)); + elts[pos + 2]->packet_type |= + !!(((pkt_hdr2 >> 2) | + pkt_info) & (1 << 6)); + elts[pos + 3]->packet_type |= + !!(((pkt_hdr3 >> 2) | + pkt_info) & (1 << 6)); + } + } + const uint32x4_t hash_flags = + vdupq_n_u32(PKT_RX_RSS_HASH); + const uint32x4_t rearm_flags = + vdupq_n_u32((uint32_t)t_pkt->ol_flags); + + ol_flags_mask = vorrq_u32(ol_flags_mask, hash_flags); + ol_flags = vorrq_u32(ol_flags, + vbicq_u32(rearm_flags, ol_flags_mask)); + elts[pos]->ol_flags = vgetq_lane_u32(ol_flags, 3); + elts[pos + 1]->ol_flags = vgetq_lane_u32(ol_flags, 2); + elts[pos + 2]->ol_flags = vgetq_lane_u32(ol_flags, 1); + elts[pos + 3]->ol_flags = vgetq_lane_u32(ol_flags, 0); + elts[pos]->hash.rss = 0; + elts[pos + 1]->hash.rss = 0; + elts[pos + 2]->hash.rss = 0; + elts[pos + 3]->hash.rss = 0; + } + if (rxq->dynf_meta) { + int32_t offs = rxq->flow_meta_offset; + const uint32_t meta = + *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); + + /* Check if title packet has valid metadata. */ + if (meta) { + MLX5_ASSERT(t_pkt->ol_flags & + rxq->flow_meta_mask); + *RTE_MBUF_DYNFIELD(elts[pos], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, + uint32_t *) = meta; + *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, + uint32_t *) = meta; + } } pos += MLX5_VPMD_DESCS_PER_LOOP; /* Move to next CQE and invalidate consumed CQEs. */ if (!(pos & 0x7) && pos < mcqe_n) { + if (pos + 8 < mcqe_n) + rte_prefetch0((void *)(cq + pos + 8)); mcq = (void *)&(cq + pos)->pkt_info; for (i = 0; i < 8; ++i) cq[inv++].op_own = MLX5_CQE_INVALIDATE; @@ -509,7 +369,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, rxq->stats.ipackets += mcqe_n; rxq->stats.ibytes += rcvd_byte; #endif - rxq->cq_ci += mcqe_n; return mcqe_n; } @@ -538,7 +397,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, uint32x4_t pinfo, cv_flags; uint32x4_t ol_flags = vdupq_n_u32(rxq->rss_hash * PKT_RX_RSS_HASH | - rxq->hw_timestamp * PKT_RX_TIMESTAMP); + rxq->hw_timestamp * rxq->timestamp_rx_flag); const uint32x4_t ptype_ol_mask = { 0x106, 0x106, 0x106, 0x106 }; const uint8x16_t cv_flag_sel = { 0, @@ -553,8 +412,8 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, const uint32x4_t cv_mask = vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED); - const uint64x1_t mbuf_init = vld1_u64(&rxq->mbuf_initializer); - const uint64x1_t r32_mask = vcreate_u64(0xffffffff); + const uint64x2_t mbuf_init = vld1q_u64 + ((const uint64_t *)&rxq->mbuf_initializer); uint64x2_t rearm0, rearm1, rearm2, rearm3; uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3; @@ -615,18 +474,19 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, /* Merge to ol_flags. */ ol_flags = vorrq_u32(ol_flags, cv_flags); /* Merge mbuf_init and ol_flags, and store. */ - rearm0 = vcombine_u64(mbuf_init, - vshr_n_u64(vget_high_u64(vreinterpretq_u64_u32( - ol_flags)), 32)); - rearm1 = vcombine_u64(mbuf_init, - vand_u64(vget_high_u64(vreinterpretq_u64_u32( - ol_flags)), r32_mask)); - rearm2 = vcombine_u64(mbuf_init, - vshr_n_u64(vget_low_u64(vreinterpretq_u64_u32( - ol_flags)), 32)); - rearm3 = vcombine_u64(mbuf_init, - vand_u64(vget_low_u64(vreinterpretq_u64_u32( - ol_flags)), r32_mask)); + rearm0 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 3), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm1 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 2), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm2 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 1), + vreinterpretq_u32_u64(mbuf_init), 2)); + rearm3 = vreinterpretq_u64_u32(vsetq_lane_u32 + (vgetq_lane_u32(ol_flags, 0), + vreinterpretq_u32_u64(mbuf_init), 2)); + vst1q_u64((void *)&pkts[0]->rearm_data, rearm0); vst1q_u64((void *)&pkts[1]->rearm_data, rearm1); vst1q_u64((void *)&pkts[2]->rearm_data, rearm2); @@ -634,12 +494,16 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, } /** - * Receive burst of packets. An errored completion also consumes a mbuf, but the - * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed - * before returning to application. + * Process a non-compressed completion and fill in mbufs in RX SW ring + * with data extracted from the title completion descriptor. * * @param rxq * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a non-compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. * @param[out] pkts * Array to store received packets. * @param pkts_n @@ -647,26 +511,23 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, * @param[out] err * Pointer to a flag. Set non-zero value if pkts array has at least one error * packet to handle. + * @param[out] comp + * Pointer to a index. Set it to the first compressed completion if any. * * @return - * Number of packets received including errors (<= pkts_n). + * Number of CQEs successfully processed. */ static inline uint16_t -rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint64_t *err) +rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, uint64_t *comp) { const uint16_t q_n = 1 << rxq->cqe_n; const uint16_t q_mask = q_n - 1; - volatile struct mlx5_cqe *cq; - struct rte_mbuf **elts; unsigned int pos; - uint64_t n; - uint16_t repl_n; + uint64_t n = 0; uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; uint16_t nocmp_n = 0; - uint16_t rcvd_pkt = 0; - unsigned int cq_idx = rxq->cq_ci & q_mask; - unsigned int elts_idx; const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1))); const uint16x4_t owner_check = vcreate_u16(0x0001000100010001); const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0); @@ -727,37 +588,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, }; const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) }; - assert(rxq->sges_n == 0); - assert(rxq->cqe_n == rxq->elts_n); - cq = &(*rxq->cqes)[cq_idx]; - rte_prefetch_non_temporal(cq); - rte_prefetch_non_temporal(cq + 1); - rte_prefetch_non_temporal(cq + 2); - rte_prefetch_non_temporal(cq + 3); - pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); - repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); - if (repl_n >= rxq->rq_repl_thresh) - mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); - /* See if there're unreturned mbufs from compressed CQE. */ - rcvd_pkt = rxq->decompressed; - if (rcvd_pkt > 0) { - rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); - rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); - rxq->rq_pi += rcvd_pkt; - pkts += rcvd_pkt; - rxq->decompressed -= rcvd_pkt; - } - elts_idx = rxq->rq_pi & q_mask; - elts = &(*rxq->elts)[elts_idx]; - /* Not to overflow pkts array. */ - pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); - /* Not to cross queue end. */ - pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); - pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); - if (!pkts_n) - return rcvd_pkt; - /* At this point, there shouldn't be any remained packets. */ - assert(rxq->decompressed == 0); /* * Note that vectors have reverse order - {v3, v2, v1, v0}, because * there's no instruction to count trailing zeros. __builtin_clzl() is @@ -816,7 +646,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, /* B.0 (CQE 0) load a block having op_own. */ c0 = vld1q_u64((uint64_t *)(p0 + 48)); /* Synchronize for loading the rest of blocks. */ - rte_cio_rmb(); + rte_io_rmb(); /* Prefetch next 4 CQEs. */ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) { unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP; @@ -959,22 +789,70 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag, opcode, &elts[pos]); if (rxq->hw_timestamp) { - elts[pos]->timestamp = - rte_be_to_cpu_64( - container_of(p0, struct mlx5_cqe, - pkt_info)->timestamp); - elts[pos + 1]->timestamp = - rte_be_to_cpu_64( - container_of(p1, struct mlx5_cqe, - pkt_info)->timestamp); - elts[pos + 2]->timestamp = - rte_be_to_cpu_64( - container_of(p2, struct mlx5_cqe, - pkt_info)->timestamp); - elts[pos + 3]->timestamp = - rte_be_to_cpu_64( - container_of(p3, struct mlx5_cqe, - pkt_info)->timestamp); + int offset = rxq->timestamp_offset; + if (rxq->rt_timestamp) { + struct mlx5_dev_ctx_shared *sh = rxq->sh; + uint64_t ts; + + ts = rte_be_to_cpu_64 + (container_of(p0, struct mlx5_cqe, + pkt_info)->timestamp); + mlx5_timestamp_set(elts[pos], offset, + mlx5_txpp_convert_rx_ts(sh, ts)); + ts = rte_be_to_cpu_64 + (container_of(p1, struct mlx5_cqe, + pkt_info)->timestamp); + mlx5_timestamp_set(elts[pos + 1], offset, + mlx5_txpp_convert_rx_ts(sh, ts)); + ts = rte_be_to_cpu_64 + (container_of(p2, struct mlx5_cqe, + pkt_info)->timestamp); + mlx5_timestamp_set(elts[pos + 2], offset, + mlx5_txpp_convert_rx_ts(sh, ts)); + ts = rte_be_to_cpu_64 + (container_of(p3, struct mlx5_cqe, + pkt_info)->timestamp); + mlx5_timestamp_set(elts[pos + 3], offset, + mlx5_txpp_convert_rx_ts(sh, ts)); + } else { + mlx5_timestamp_set(elts[pos], offset, + rte_be_to_cpu_64(container_of(p0, + struct mlx5_cqe, pkt_info)->timestamp)); + mlx5_timestamp_set(elts[pos + 1], offset, + rte_be_to_cpu_64(container_of(p1, + struct mlx5_cqe, pkt_info)->timestamp)); + mlx5_timestamp_set(elts[pos + 2], offset, + rte_be_to_cpu_64(container_of(p2, + struct mlx5_cqe, pkt_info)->timestamp)); + mlx5_timestamp_set(elts[pos + 3], offset, + rte_be_to_cpu_64(container_of(p3, + struct mlx5_cqe, pkt_info)->timestamp)); + } + } + if (rxq->dynf_meta) { + /* This code is subject for futher optimization. */ + int32_t offs = rxq->flow_meta_offset; + + *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + container_of(p0, struct mlx5_cqe, + pkt_info)->flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + container_of(p1, struct mlx5_cqe, + pkt_info)->flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + container_of(p2, struct mlx5_cqe, + pkt_info)->flow_table_metadata; + *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + container_of(p3, struct mlx5_cqe, + pkt_info)->flow_table_metadata; + if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *)) + elts[pos]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *)) + elts[pos + 1]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *)) + elts[pos + 2]->ol_flags |= rxq->flow_meta_mask; + if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *)) + elts[pos + 3]->ol_flags |= rxq->flow_meta_mask; } #ifdef MLX5_PMD_SOFT_COUNTERS /* Add up received bytes count. */ @@ -988,37 +866,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, if (n != MLX5_VPMD_DESCS_PER_LOOP) break; } - /* If no new CQE seen, return without updating cq_db. */ - if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) - return rcvd_pkt; - /* Update the consumer indexes for non-compressed CQEs. */ - assert(nocmp_n <= pkts_n); - rxq->cq_ci += nocmp_n; - rxq->rq_pi += nocmp_n; - rcvd_pkt += nocmp_n; #ifdef MLX5_PMD_SOFT_COUNTERS rxq->stats.ipackets += nocmp_n; rxq->stats.ibytes += rcvd_byte; #endif - /* Decompress the last CQE if compressed. */ - if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { - assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); - rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], - &elts[nocmp_n]); - /* Return more packets if needed. */ - if (nocmp_n < pkts_n) { - uint16_t n = rxq->decompressed; - - n = RTE_MIN(n, pkts_n - nocmp_n); - rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); - rxq->rq_pi += n; - rcvd_pkt += n; - rxq->decompressed -= n; - } - } - rte_compiler_barrier(); - *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); - return rcvd_pkt; + if (comp_idx == n) + *comp = comp_idx; + return nocmp_n; } #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */