From: Alexander Kozyrev Date: Wed, 21 Oct 2020 20:30:29 +0000 (+0000) Subject: net/mlx5: refactor vectorized Rx X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=1ded26239aa0;p=dpdk.git net/mlx5: refactor vectorized Rx Move the main processing cycle into a separate function: rxq_cq_process_v. Put the regular rxq_burst_v function to a non-arch specific file. Having all SIMD instructions in a single reusable block is a first preparatory step to implement vectorized Rx burst for MPRQ feature. Pass a pointer to the storage of mbufs directly to the rxq_copy_mbuf_v instead of calculating the pointer inside this function. This is needed for the future vectorized Rx routing which is going to pass a different pointer here. Calculate the number of packets to replenish inside the mlx5_rx_replenish_bulk_mbuf. Containing this logic in one place allows us to do the same for MPRQ case. Signed-off-by: Alexander Kozyrev Acked-by: Viacheslav Ovsiienko --- diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c index f083038682..aa48775738 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.c +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c @@ -77,6 +77,110 @@ rxq_handle_pending_error(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, return n; } +/** + * Receive burst of packets. An errored completion also consumes a mbuf, but the + * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed + * before returning to application. + * + * @param rxq + * Pointer to RX queue structure. + * @param[out] pkts + * Array to store received packets. + * @param pkts_n + * Maximum number of packets in array. + * @param[out] err + * Pointer to a flag. Set non-zero value if pkts array has at least one error + * packet to handle. + * @param[out] no_cq + * Pointer to a boolean. Set true if no new CQE seen. + * + * @return + * Number of packets received including errors (<= pkts_n). + */ +static inline uint16_t +rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, bool *no_cq) +{ + const uint16_t q_n = 1 << rxq->cqe_n; + const uint16_t q_mask = q_n - 1; + const uint16_t e_n = 1 << rxq->elts_n; + const uint16_t e_mask = e_n - 1; + volatile struct mlx5_cqe *cq; + struct rte_mbuf **elts; + uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; + uint16_t nocmp_n = 0; + uint16_t rcvd_pkt = 0; + unsigned int cq_idx = rxq->cq_ci & q_mask; + unsigned int elts_idx; + + MLX5_ASSERT(rxq->sges_n == 0); + MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); + cq = &(*rxq->cqes)[cq_idx]; + rte_prefetch0(cq); + rte_prefetch0(cq + 1); + rte_prefetch0(cq + 2); + rte_prefetch0(cq + 3); + pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); + mlx5_rx_replenish_bulk_mbuf(rxq); + /* See if there're unreturned mbufs from compressed CQE. */ + rcvd_pkt = rxq->decompressed; + if (rcvd_pkt > 0) { + rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); + rxq_copy_mbuf_v(&(*rxq->elts)[rxq->rq_pi & e_mask], + pkts, rcvd_pkt); + rxq->rq_pi += rcvd_pkt; + rxq->decompressed -= rcvd_pkt; + pkts += rcvd_pkt; + } + elts_idx = rxq->rq_pi & e_mask; + elts = &(*rxq->elts)[elts_idx]; + /* Not to overflow pkts array. */ + pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); + pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); + if (!pkts_n) { + *no_cq = !rcvd_pkt; + return rcvd_pkt; + } + /* At this point, there shouldn't be any remaining packets. 
*/ + MLX5_ASSERT(rxq->decompressed == 0); + /* Process all the CQEs */ + nocmp_n = rxq_cq_process_v(rxq, cq, elts, pkts, pkts_n, err, &comp_idx); + /* If no new CQE seen, return without updating cq_db. */ + if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) { + *no_cq = true; + return rcvd_pkt; + } + /* Update the consumer indexes for non-compressed CQEs. */ + MLX5_ASSERT(nocmp_n <= pkts_n); + rxq->cq_ci += nocmp_n; + rxq->rq_pi += nocmp_n; + rcvd_pkt += nocmp_n; + /* Decompress the last CQE if compressed. */ + if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) { + MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); + rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], + &elts[nocmp_n]); + rxq->cq_ci += rxq->decompressed; + /* Return more packets if needed. */ + if (nocmp_n < pkts_n) { + uint16_t n = rxq->decompressed; + + n = RTE_MIN(n, pkts_n - nocmp_n); + rxq_copy_mbuf_v(&(*rxq->elts)[rxq->rq_pi & e_mask], + &pkts[nocmp_n], n); + rxq->rq_pi += n; + rcvd_pkt += n; + rxq->decompressed -= n; + } + } + rte_io_wmb(); + *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); + *no_cq = !rcvd_pkt; + return rcvd_pkt; +} + /** * DPDK callback for vectorized RX. * diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h index a8d6c4f411..ce27074b08 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h @@ -73,53 +73,54 @@ S_ASSERT_MLX5_CQE(offsetof(struct mlx5_cqe, op_own) == * * @param rxq * Pointer to RX queue structure. - * @param n - * Number of buffers to be replenished. */ static inline void -mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n) +mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq) { const uint16_t q_n = 1 << rxq->elts_n; const uint16_t q_mask = q_n - 1; + uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi); uint16_t elts_idx = rxq->rq_ci & q_mask; struct rte_mbuf **elts = &(*rxq->elts)[elts_idx]; volatile struct mlx5_wqe_data_seg *wq = &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[elts_idx]; unsigned int i; - MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)); - MLX5_ASSERT(n <= (uint16_t)(q_n - (rxq->rq_ci - rxq->rq_pi))); - MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) > - MLX5_VPMD_DESCS_PER_LOOP); - /* Not to cross queue end. */ - n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); - if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { - rxq->stats.rx_nombuf += n; - return; - } - for (i = 0; i < n; ++i) { - void *buf_addr; + if (n >= rxq->rq_repl_thresh) { + MLX5_ASSERT(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n)); + MLX5_ASSERT(MLX5_VPMD_RXQ_RPLNSH_THRESH(q_n) > + MLX5_VPMD_DESCS_PER_LOOP); + /* Not to cross queue end. */ + n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx); + if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) { + rxq->stats.rx_nombuf += n; + return; + } + for (i = 0; i < n; ++i) { + void *buf_addr; - /* - * In order to support the mbufs with external attached - * data buffer we should use the buf_addr pointer instead of - * rte_mbuf_buf_addr(). It touches the mbuf itself and may - * impact the performance. - */ - buf_addr = elts[i]->buf_addr; - wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr + - RTE_PKTMBUF_HEADROOM); - /* If there's only one MR, no need to replace LKey in WQE. */ - if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) - wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); + /* + * In order to support the mbufs with external attached + * data buffer we should use the buf_addr pointer + * instead of rte_mbuf_buf_addr(). 
It touches the mbuf + * itself and may impact the performance. + */ + buf_addr = elts[i]->buf_addr; + wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr + + RTE_PKTMBUF_HEADROOM); + /* If there's a single MR, no need to replace LKey. */ + if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) + > 1)) + wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]); + } + rxq->rq_ci += n; + /* Prevent overflowing into consumed mbufs. */ + elts_idx = rxq->rq_ci & q_mask; + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf; + rte_io_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); } - rxq->rq_ci += n; - /* Prevent overflowing into consumed mbufs. */ - elts_idx = rxq->rq_ci & q_mask; - for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) - (*rxq->elts)[elts_idx + i] = &rxq->fake_mbuf; - rte_io_wmb(); - *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); } #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h index 171d7bb0f8..254fd8e9d2 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h @@ -33,18 +33,16 @@ /** * Store free buffers to RX SW ring. * - * @param rxq - * Pointer to RX queue structure. + * @param elts + * Pointer to SW ring to be filled. * @param pkts * Pointer to array of packets to be stored. * @param pkts_n * Number of packets to be stored. */ static inline void -rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n) { - const uint16_t q_mask = (1 << rxq->elts_n) - 1; - struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; unsigned int pos; uint16_t p = n & -2; @@ -550,14 +548,17 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, (vector unsigned char *)&pkts[3]->rearm_data); } - /** - * Receive burst of packets. An errored completion also consumes a mbuf, but the - * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed - * before returning to application. + * Process a non-compressed completion and fill in mbufs in RX SW ring + * with data extracted from the title completion descriptor. * * @param rxq * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a non-compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. * @param[out] pkts * Array to store received packets. * @param pkts_n @@ -565,28 +566,23 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, * @param[out] err * Pointer to a flag. Set non-zero value if pkts array has at least one error * packet to handle. - * @param[out] no_cq - * Pointer to a boolean. Set true if no new CQE seen. + * @param[out] comp + * Pointer to a index. Set it to the first compressed completion if any. * * @return - * Number of packets received including errors (<= pkts_n). + * Number of CQEs successfully processed. 
*/ static inline uint16_t -rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint64_t *err, bool *no_cq) +rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, uint64_t *comp) { const uint16_t q_n = 1 << rxq->cqe_n; const uint16_t q_mask = q_n - 1; - volatile struct mlx5_cqe *cq; - struct rte_mbuf **elts; unsigned int pos; - uint64_t n; - uint16_t repl_n; + uint64_t n = 0; uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; uint16_t nocmp_n = 0; - uint16_t rcvd_pkt = 0; - unsigned int cq_idx = rxq->cq_ci & q_mask; - unsigned int elts_idx; unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); const vector unsigned char zero = (vector unsigned char){0}; const vector unsigned char ones = vec_splat_u8(-1); @@ -638,41 +634,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, const vector unsigned short cqe_sel_mask2 = (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0}; - MLX5_ASSERT(rxq->sges_n == 0); - MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); - cq = &(*rxq->cqes)[cq_idx]; - rte_prefetch0(cq); - rte_prefetch0(cq + 1); - rte_prefetch0(cq + 2); - rte_prefetch0(cq + 3); - pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); - - repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); - if (repl_n >= rxq->rq_repl_thresh) - mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); - /* See if there're unreturned mbufs from compressed CQE. */ - rcvd_pkt = rxq->decompressed; - if (rcvd_pkt > 0) { - rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); - rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); - rxq->rq_pi += rcvd_pkt; - rxq->decompressed -= rcvd_pkt; - pkts += rcvd_pkt; - } - elts_idx = rxq->rq_pi & q_mask; - elts = &(*rxq->elts)[elts_idx]; - /* Not to overflow pkts array. */ - pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); - /* Not to cross queue end. */ - pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); - pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); - if (!pkts_n) { - *no_cq = !rcvd_pkt; - return rcvd_pkt; - } - /* At this point, there shouldn't be any remaining packets. */ - MLX5_ASSERT(rxq->decompressed == 0); - /* * A. load first Qword (8bytes) in one loop. * B. copy 4 mbuf pointers from elts ring to returing pkts. @@ -1102,40 +1063,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, if (n != MLX5_VPMD_DESCS_PER_LOOP) break; } - /* If no new CQE seen, return without updating cq_db. */ - if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) { - *no_cq = true; - return rcvd_pkt; - } - /* Update the consumer indexes for non-compressed CQEs. */ - MLX5_ASSERT(nocmp_n <= pkts_n); - rxq->cq_ci += nocmp_n; - rxq->rq_pi += nocmp_n; - rcvd_pkt += nocmp_n; #ifdef MLX5_PMD_SOFT_COUNTERS rxq->stats.ipackets += nocmp_n; rxq->stats.ibytes += rcvd_byte; #endif - /* Decompress the last CQE if compressed. */ - if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { - MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); - rxq->decompressed = - rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]); - /* Return more packets if needed. 
*/ - if (nocmp_n < pkts_n) { - uint16_t n = rxq->decompressed; - - n = RTE_MIN(n, pkts_n - nocmp_n); - rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); - rxq->rq_pi += n; - rcvd_pkt += n; - rxq->decompressed -= n; - } - } - rte_compiler_barrier(); - *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); - *no_cq = !rcvd_pkt; - return rcvd_pkt; + if (comp_idx == n) + *comp = comp_idx; + return nocmp_n; } #endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h index 436b247ade..58e4556890 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h @@ -29,18 +29,16 @@ /** * Store free buffers to RX SW ring. * - * @param rxq - * Pointer to RX queue structure. + * @param elts + * Pointer to SW ring to be filled. * @param pkts * Pointer to array of packets to be stored. * @param pkts_n * Number of packets to be stored. */ static inline void -rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n) { - const uint16_t q_mask = (1 << rxq->elts_n) - 1; - struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; unsigned int pos; uint16_t p = n & -2; @@ -368,12 +366,16 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, } /** - * Receive burst of packets. An errored completion also consumes a mbuf, but the - * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed - * before returning to application. + * Process a non-compressed completion and fill in mbufs in RX SW ring + * with data extracted from the title completion descriptor. * * @param rxq * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a non-compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. * @param[out] pkts * Array to store received packets. * @param pkts_n @@ -381,28 +383,23 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, * @param[out] err * Pointer to a flag. Set non-zero value if pkts array has at least one error * packet to handle. - * @param[out] no_cq - * Pointer to a boolean. Set true if no new CQE seen. + * @param[out] comp + * Pointer to a index. Set it to the first compressed completion if any. * * @return - * Number of packets received including errors (<= pkts_n). + * Number of CQEs successfully processed. 
*/ static inline uint16_t -rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint64_t *err, bool *no_cq) +rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, uint64_t *comp) { const uint16_t q_n = 1 << rxq->cqe_n; const uint16_t q_mask = q_n - 1; - volatile struct mlx5_cqe *cq; - struct rte_mbuf **elts; unsigned int pos; - uint64_t n; - uint16_t repl_n; + uint64_t n = 0; uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; uint16_t nocmp_n = 0; - uint16_t rcvd_pkt = 0; - unsigned int cq_idx = rxq->cq_ci & q_mask; - unsigned int elts_idx; const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1))); const uint16x4_t owner_check = vcreate_u16(0x0001000100010001); const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0); @@ -463,39 +460,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, }; const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) }; - MLX5_ASSERT(rxq->sges_n == 0); - MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); - cq = &(*rxq->cqes)[cq_idx]; - rte_prefetch_non_temporal(cq); - rte_prefetch_non_temporal(cq + 1); - rte_prefetch_non_temporal(cq + 2); - rte_prefetch_non_temporal(cq + 3); - pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); - repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); - if (repl_n >= rxq->rq_repl_thresh) - mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); - /* See if there're unreturned mbufs from compressed CQE. */ - rcvd_pkt = rxq->decompressed; - if (rcvd_pkt > 0) { - rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); - rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); - rxq->rq_pi += rcvd_pkt; - pkts += rcvd_pkt; - rxq->decompressed -= rcvd_pkt; - } - elts_idx = rxq->rq_pi & q_mask; - elts = &(*rxq->elts)[elts_idx]; - /* Not to overflow pkts array. */ - pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); - /* Not to cross queue end. */ - pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); - pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); - if (!pkts_n) { - *no_cq = !rcvd_pkt; - return rcvd_pkt; - } - /* At this point, there shouldn't be any remained packets. */ - MLX5_ASSERT(rxq->decompressed == 0); /* * Note that vectors have reverse order - {v3, v2, v1, v0}, because * there's no instruction to count trailing zeros. __builtin_clzl() is @@ -774,40 +738,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, if (n != MLX5_VPMD_DESCS_PER_LOOP) break; } - /* If no new CQE seen, return without updating cq_db. */ - if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) { - *no_cq = true; - return rcvd_pkt; - } - /* Update the consumer indexes for non-compressed CQEs. */ - MLX5_ASSERT(nocmp_n <= pkts_n); - rxq->cq_ci += nocmp_n; - rxq->rq_pi += nocmp_n; - rcvd_pkt += nocmp_n; #ifdef MLX5_PMD_SOFT_COUNTERS rxq->stats.ipackets += nocmp_n; rxq->stats.ibytes += rcvd_byte; #endif - /* Decompress the last CQE if compressed. */ - if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { - MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); - rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], - &elts[nocmp_n]); - /* Return more packets if needed. 
*/ - if (nocmp_n < pkts_n) { - uint16_t n = rxq->decompressed; - - n = RTE_MIN(n, pkts_n - nocmp_n); - rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); - rxq->rq_pi += n; - rcvd_pkt += n; - rxq->decompressed -= n; - } - } - rte_io_wmb(); - *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); - *no_cq = !rcvd_pkt; - return rcvd_pkt; + if (comp_idx == n) + *comp = comp_idx; + return nocmp_n; } #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */ diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h index ae4439efc7..02eb65978e 100644 --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h @@ -31,18 +31,16 @@ /** * Store free buffers to RX SW ring. * - * @param rxq - * Pointer to RX queue structure. + * @param elts + * Pointer to SW ring to be filled. * @param pkts * Pointer to array of packets to be stored. * @param pkts_n * Number of packets to be stored. */ static inline void -rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n) +rxq_copy_mbuf_v(struct rte_mbuf **elts, struct rte_mbuf **pkts, uint16_t n) { - const uint16_t q_mask = (1 << rxq->elts_n) - 1; - struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask]; unsigned int pos; uint16_t p = n & -2; @@ -227,7 +225,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, rxq->stats.ipackets += mcqe_n; rxq->stats.ibytes += rcvd_byte; #endif - rxq->cq_ci += mcqe_n; return mcqe_n; } @@ -293,9 +290,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]); ptype = _mm_unpacklo_epi64(pinfo0, pinfo1); if (rxq->mark) { - const __m128i pinfo_ft_mask = - _mm_set_epi32(0xffffff00, 0xffffff00, - 0xffffff00, 0xffffff00); + const __m128i pinfo_ft_mask = _mm_set1_epi32(0xffffff00); const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR); __m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID); __m128i flow_tag, invalid_mask; @@ -373,12 +368,16 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], } /** - * Receive burst of packets. An errored completion also consumes a mbuf, but the - * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed - * before returning to application. + * Process a non-compressed completion and fill in mbufs in RX SW ring + * with data extracted from the title completion descriptor. * * @param rxq * Pointer to RX queue structure. + * @param cq + * Pointer to completion array having a non-compressed completion at first. + * @param elts + * Pointer to SW ring to be filled. The first mbuf has to be pre-built from + * the title completion descriptor to be copied to the rest of mbufs. * @param[out] pkts * Array to store received packets. * @param pkts_n @@ -386,37 +385,28 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4], * @param[out] err * Pointer to a flag. Set non-zero value if pkts array has at least one error * packet to handle. - * @param[out] no_cq - * Pointer to a boolean. Set true if no new CQE seen. + * @param[out] comp + * Pointer to a index. Set it to the first compressed completion if any. * * @return - * Number of packets received including errors (<= pkts_n). + * Number of CQEs successfully processed. 
*/ static inline uint16_t -rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, - uint64_t *err, bool *no_cq) +rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + struct rte_mbuf **elts, struct rte_mbuf **pkts, + uint16_t pkts_n, uint64_t *err, uint64_t *comp) { const uint16_t q_n = 1 << rxq->cqe_n; const uint16_t q_mask = q_n - 1; - volatile struct mlx5_cqe *cq; - struct rte_mbuf **elts; unsigned int pos; - uint64_t n; - uint16_t repl_n; + uint64_t n = 0; uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP; uint16_t nocmp_n = 0; - uint16_t rcvd_pkt = 0; - unsigned int cq_idx = rxq->cq_ci & q_mask; - unsigned int elts_idx; unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1)); - const __m128i owner_check = - _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL); - const __m128i opcode_check = - _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL); - const __m128i format_check = - _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL); - const __m128i resp_err_check = - _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL); + const __m128i owner_check = _mm_set1_epi64x(0x0100000001000000LL); + const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL); + const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL); + const __m128i resp_err_check = _mm_set1_epi64x(0xe0000000e0000000LL); #ifdef MLX5_PMD_SOFT_COUNTERS uint32_t rcvd_byte = 0; /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */ @@ -448,40 +438,6 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, 0, rxq->crc_present * RTE_ETHER_CRC_LEN); const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0); - - MLX5_ASSERT(rxq->sges_n == 0); - MLX5_ASSERT(rxq->cqe_n == rxq->elts_n); - cq = &(*rxq->cqes)[cq_idx]; - rte_prefetch0(cq); - rte_prefetch0(cq + 1); - rte_prefetch0(cq + 2); - rte_prefetch0(cq + 3); - pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST); - repl_n = q_n - (rxq->rq_ci - rxq->rq_pi); - if (repl_n >= rxq->rq_repl_thresh) - mlx5_rx_replenish_bulk_mbuf(rxq, repl_n); - /* See if there're unreturned mbufs from compressed CQE. */ - rcvd_pkt = rxq->decompressed; - if (rcvd_pkt > 0) { - rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n); - rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt); - rxq->rq_pi += rcvd_pkt; - rxq->decompressed -= rcvd_pkt; - pkts += rcvd_pkt; - } - elts_idx = rxq->rq_pi & q_mask; - elts = &(*rxq->elts)[elts_idx]; - /* Not to overflow pkts array. */ - pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP); - /* Not to cross queue end. */ - pkts_n = RTE_MIN(pkts_n, q_n - elts_idx); - pkts_n = RTE_MIN(pkts_n, q_n - cq_idx); - if (!pkts_n) { - *no_cq = !rcvd_pkt; - return rcvd_pkt; - } - /* At this point, there shouldn't be any remained packets. */ - MLX5_ASSERT(rxq->decompressed == 0); /* * A. load first Qword (8bytes) in one loop. * B. copy 4 mbuf pointers from elts ring to returing pkts. @@ -719,40 +675,13 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, if (n != MLX5_VPMD_DESCS_PER_LOOP) break; } - /* If no new CQE seen, return without updating cq_db. */ - if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) { - *no_cq = true; - return rcvd_pkt; - } - /* Update the consumer indexes for non-compressed CQEs. 
*/ - MLX5_ASSERT(nocmp_n <= pkts_n); - rxq->cq_ci += nocmp_n; - rxq->rq_pi += nocmp_n; - rcvd_pkt += nocmp_n; #ifdef MLX5_PMD_SOFT_COUNTERS rxq->stats.ipackets += nocmp_n; rxq->stats.ibytes += rcvd_byte; #endif - /* Decompress the last CQE if compressed. */ - if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) { - MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP)); - rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n], - &elts[nocmp_n]); - /* Return more packets if needed. */ - if (nocmp_n < pkts_n) { - uint16_t n = rxq->decompressed; - - n = RTE_MIN(n, pkts_n - nocmp_n); - rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n); - rxq->rq_pi += n; - rcvd_pkt += n; - rxq->decompressed -= n; - } - } - rte_compiler_barrier(); - *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); - *no_cq = !rcvd_pkt; - return rcvd_pkt; + if (comp_idx == n) + *comp = comp_idx; + return nocmp_n; } #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
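
A minimal, self-contained sketch of the two mechanics the refactor relies on: the free-slot calculation that mlx5_rx_replenish_bulk_mbuf() now performs internally (q_n - (rq_ci - rq_pi) on free-running 16-bit indexes, checked against rq_repl_thresh and clipped at the queue end), and the new rxq_copy_mbuf_v() convention where the caller resolves the SW-ring position and passes a plain pointer. All names and types below (struct mbuf, struct rxq, Q_N, DESCS_PER_LOOP, replenish_bulk, copy_mbuf_v) are simplified stand-ins chosen for illustration only, not the real mlx5/DPDK definitions; it only demonstrates the index arithmetic, not the driver behavior.

/*
 * Stand-alone illustration with simplified stand-in types; not driver code.
 * Build: cc -std=c99 -Wall sketch.c
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define Q_N 256u                 /* ring size, power of two */
#define DESCS_PER_LOOP 4u        /* stands in for MLX5_VPMD_DESCS_PER_LOOP */

struct mbuf { int id; };         /* stand-in for struct rte_mbuf */

struct rxq {                     /* stand-in for struct mlx5_rxq_data */
	struct mbuf *elts[Q_N];  /* SW ring of mbuf pointers */
	uint16_t rq_ci;          /* producer index, free-running */
	uint16_t rq_pi;          /* consumer index, free-running */
	uint16_t rq_repl_thresh; /* replenish threshold */
};

/* The replenish decision now lives inside the helper, as in the patch. */
static void
replenish_bulk(struct rxq *rxq)
{
	const uint16_t q_mask = Q_N - 1;
	/* Free slots; wraps correctly because both indexes are uint16_t. */
	uint16_t n = Q_N - (uint16_t)(rxq->rq_ci - rxq->rq_pi);
	uint16_t elts_idx = rxq->rq_ci & q_mask;

	if (n < rxq->rq_repl_thresh)
		return;          /* not enough room for a bulk allocation */
	/* Leave room for the fake-mbuf padding and do not cross the ring
	 * end (mirrors the RTE_MIN() in the patch). */
	if (n - DESCS_PER_LOOP < Q_N - elts_idx)
		n -= DESCS_PER_LOOP;
	else
		n = Q_N - elts_idx;
	printf("replenish %u mbufs starting at slot %u\n",
	       (unsigned int)n, (unsigned int)elts_idx);
	rxq->rq_ci += n;         /* free-running increment */
}

/* New-style copy helper: takes a plain pointer into any SW ring. */
static void
copy_mbuf_v(struct mbuf **elts, struct mbuf **pkts, uint16_t n)
{
	/* The real helper does this with vector stores. */
	memcpy(pkts, elts, (size_t)n * sizeof(*pkts));
}

int
main(void)
{
	static struct mbuf pool[Q_N];
	struct rxq q = { .rq_ci = 300, .rq_pi = 200, .rq_repl_thresh = 32 };
	struct mbuf *pkts[8];
	const uint16_t q_mask = Q_N - 1;

	for (unsigned int i = 0; i < Q_N; ++i) {
		pool[i].id = (int)i;
		q.elts[i] = &pool[i];
	}
	replenish_bulk(&q);      /* 156 free slots >= threshold of 32 */
	/* The caller resolves the ring position once and passes it down,
	 * which is what lets the same helper serve the MPRQ ring later. */
	copy_mbuf_v(&q.elts[q.rq_pi & q_mask], pkts, 8);
	q.rq_pi += 8;
	printf("first returned mbuf id: %d\n", pkts[0]->id);
	return 0;
}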