X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_rxtx_vec_neon.h;h=1c7e3b444a8685f37179fff6fe9f9ec2ac28d207;hb=04ab83ea5a28dc873a0959ad3cacfa9886eeec87;hp=2673d6b8cba01ba5adf584f574d9aa9dd5342bec;hpb=3cc08bc6dd9327e75abe74622ebf3ffea115e5d6;p=dpdk.git

diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 2673d6b8cb..1c7e3b444a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -104,11 +104,11 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
 	unsigned int n;
 	volatile struct mlx5_wqe *wqe = NULL;
+	bool metadata_ol =
+		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;
 
 	assert(elts_n > pkts_n);
 	mlx5_tx_complete(txq);
-	/* A CQE slot must always be available. */
-	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
 	if (unlikely(!pkts_n))
 		return 0;
 	for (n = 0; n < pkts_n; ++n) {
@@ -129,6 +129,9 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 		uint8x16_t *t_wqe;
 		uint8_t *dseg;
 		uint8x16_t ctrl;
+		rte_be32_t metadata =
+			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
+			buf->tx_metadata : 0;
 
 		assert(segs_n);
 		max_elts = elts_n - (elts_head - txq->elts_tail);
@@ -162,13 +165,14 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 		ctrl = vreinterpretq_u8_u32((uint32x4_t) {
 				MLX5_OPC_MOD_MPW << 24 |
 				txq->wqe_ci << 8 | MLX5_OPCODE_TSO,
-				txq->qp_num_8s | ds, 0, 0});
+				txq->qp_num_8s | ds, 4, 0});
 		ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
 		vst1q_u8((void *)t_wqe, ctrl);
 		/* Fill ESEG in the header. */
-		vst1q_u16((void *)(t_wqe + 1),
-			  (uint16x8_t) { 0, 0, cs_flags, rte_cpu_to_be_16(len),
-					 0, 0, 0, 0 });
+		vst1q_u32((void *)(t_wqe + 1),
+			  ((uint32x4_t){ 0,
+					 rte_cpu_to_be_16(len) << 16 | cs_flags,
+					 metadata, 0 }));
 		txq->wqe_ci = wqe_ci;
 	}
 	if (!n)
@@ -176,12 +180,12 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
 	txq->elts_head = elts_head;
 	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
-		wqe->ctrl[2] = rte_cpu_to_be_32(8);
+		/* A CQE slot must always be available. */
+		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
+		wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
+						MLX5_COMP_MODE_OFFSET);
 		wqe->ctrl[3] = txq->elts_head;
 		txq->elts_comp = 0;
-#ifndef NDEBUG
-		++txq->cq_pi;
-#endif
 	}
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += n;
@@ -204,13 +208,15 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
  *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
  * @param cs_flags
  *   Checksum offload flags to be written in the descriptor.
+ * @param metadata
+ *   Metadata value to be written in the descriptor.
  *
  * @return
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static inline uint16_t
 txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags)
+	    uint8_t cs_flags, rte_be32_t metadata)
 {
 	struct rte_mbuf **elts;
 	uint16_t elts_head = txq->elts_head;
@@ -224,7 +230,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	unsigned int pos;
 	uint16_t max_elts;
 	uint16_t max_wqe;
-	uint32_t comp_req = 0;
+	uint32_t comp_req;
 	const uint16_t wq_n = 1 << txq->wqe_n;
 	const uint16_t wq_mask = wq_n - 1;
 	uint16_t wq_idx = txq->wqe_ci & wq_mask;
@@ -245,8 +251,6 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	assert(elts_n > pkts_n);
 	mlx5_tx_complete(txq);
 	max_elts = (elts_n - (elts_head - txq->elts_tail));
-	/* A CQE slot must always be available. */
-	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
 	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
 	if (unlikely(!pkts_n))
@@ -281,13 +285,13 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	}
 	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
 		txq->elts_comp += pkts_n;
+		comp_req = MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET;
 	} else {
+		/* A CQE slot must always be available. */
+		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
 		/* Request a completion. */
 		txq->elts_comp = 0;
-#ifndef NDEBUG
-		++txq->cq_pi;
-#endif
-		comp_req = 8;
+		comp_req = MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET;
 	}
 	/* Fill CTRL in the header. */
 	ctrl = vreinterpretq_u8_u32((uint32x4_t) {
@@ -299,11 +303,8 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
 	vst1q_u8((void *)t_wqe, ctrl);
 	/* Fill ESEG in the header. */
-	vst1q_u8((void *)(t_wqe + 1),
-		 (uint8x16_t) { 0, 0, 0, 0,
-				cs_flags, 0, 0, 0,
-				0, 0, 0, 0,
-				0, 0, 0, 0 });
+	vst1q_u32((void *)(t_wqe + 1),
+		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += pkts_n;
 #endif
@@ -353,8 +354,11 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
  * @param elts
  *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
  *   the title completion descriptor to be copied to the rest of mbufs.
+ *
+ * @return
+ *   Number of mini-CQEs successfully decompressed.
  */
-static inline void
+static inline uint16_t
 rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		    struct rte_mbuf **elts)
 {
@@ -380,7 +384,7 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	};
 	/* Restore the compressed count. Must be 16 bits. */
 	const uint16_t mcqe_n = t_pkt->data_len +
-				(rxq->crc_present * ETHER_CRC_LEN);
+				(rxq->crc_present * RTE_ETHER_CRC_LEN);
 	const uint64x2_t rearm =
 		vld1q_u64((void *)&t_pkt->rearm_data);
 	const uint32x4_t rxdf_mask = {
@@ -394,8 +398,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		vreinterpretq_u8_u32(rxdf_mask));
 	const uint16x8_t crc_adj = {
 		0, 0,
-		rxq->crc_present * ETHER_CRC_LEN, 0,
-		rxq->crc_present * ETHER_CRC_LEN, 0,
+		rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+		rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
 		0, 0
 	};
 	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
@@ -506,6 +510,7 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	rxq->stats.ibytes += rcvd_byte;
 #endif
 	rxq->cq_ci += mcqe_n;
+	return mcqe_n;
 }
 
 /**
@@ -718,7 +723,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 		12, 13, 14, -1  /* 1st CQE */
 	};
 	const uint16x8_t crc_adj = {
-		0, 0, rxq->crc_present * ETHER_CRC_LEN, 0, 0, 0, 0, 0
+		0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0
 	};
 	const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
 
@@ -730,24 +735,17 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	rte_prefetch_non_temporal(cq + 2);
 	rte_prefetch_non_temporal(cq + 3);
 	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
-	/*
-	 * Order of indexes:
-	 *   rq_ci >= cq_ci >= rq_pi
-	 * Definition of indexes:
-	 *   rq_ci - cq_ci := # of buffers owned by HW (posted).
-	 *   cq_ci - rq_pi := # of buffers not returned to app (decompressed).
-	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
-	 */
 	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
-	if (repl_n >= MLX5_VPMD_RXQ_RPLNSH_THRESH)
+	if (repl_n >= rxq->rq_repl_thresh)
 		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
 	/* See if there're unreturned mbufs from compressed CQE. */
-	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
+	rcvd_pkt = rxq->decompressed;
 	if (rcvd_pkt > 0) {
 		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
 		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
 		rxq->rq_pi += rcvd_pkt;
 		pkts += rcvd_pkt;
+		rxq->decompressed -= rcvd_pkt;
 	}
 	elts_idx = rxq->rq_pi & q_mask;
 	elts = &(*rxq->elts)[elts_idx];
@@ -755,10 +753,11 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
 	/* Not to cross queue end. */
 	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
+	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
 	if (!pkts_n)
 		return rcvd_pkt;
 	/* At this point, there shouldn't be any remained packets. */
-	assert(rxq->rq_pi == rxq->cq_ci);
+	assert(rxq->decompressed == 0);
 	/*
 	 * Note that vectors have reverse order - {v3, v2, v1, v0}, because
 	 * there's no instruction to count trailing zeros. __builtin_clzl() is
@@ -1004,15 +1003,17 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	/* Decompress the last CQE if compressed. */
 	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
 		assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
-		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+							&elts[nocmp_n]);
 		/* Return more packets if needed. */
 		if (nocmp_n < pkts_n) {
-			uint16_t n = rxq->cq_ci - rxq->rq_pi;
+			uint16_t n = rxq->decompressed;
 
 			n = RTE_MIN(n, pkts_n - nocmp_n);
 			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
 			rxq->rq_pi += n;
 			rcvd_pkt += n;
+			rxq->decompressed -= n;
 		}
 	}
 	rte_compiler_barrier();
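
Note (editor's addition, not part of the patch): the completion-request refactor above replaces the magic value 8 with MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET. A minimal standalone sketch of that encoding follows, assuming the DPDK mlx5_prm.h values MLX5_COMP_MODE_OFFSET == 2 and MLX5_COMP_ALWAYS == 0x2 (copied here for illustration); the literal 4 written into the MPW control segment above then presumably corresponds to MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET.

#include <assert.h>
#include <stdio.h>

/* Bit offset of the completion-mode field in the WQE ctrl[2] dword
 * (assumed value, mirroring DPDK's mlx5_prm.h). */
#define MLX5_COMP_MODE_OFFSET 2

/* Completion modes (assumed values, mirroring DPDK's mlx5_prm.h). */
enum mlx5_completion_mode {
	MLX5_COMP_ONLY_ERR = 0x0,	/* CQE only on WQE error. */
	MLX5_COMP_ONLY_FIRST_ERR = 0x1,	/* CQE on first WQE error. */
	MLX5_COMP_ALWAYS = 0x2,		/* CQE for each WQE. */
};

int main(void)
{
	/* The removed literal rte_cpu_to_be_32(8) equals
	 * MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET, so the named
	 * constants change readability, not the encoded value. */
	assert((MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET) == 8);
	assert((MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET) == 4);
	printf("comp_req: always=%d only-first-err=%d\n",
	       MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET,
	       MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET);
	return 0;
}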