diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 887e283c18..844a1c633d 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -3,22 +3,10 @@ * Copyright 2015-2019 Mellanox Technologies, Ltd */ -#include #include #include #include -/* Verbs header. */ -/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */ -#ifdef PEDANTIC -#pragma GCC diagnostic ignored "-Wpedantic" -#endif -#include -#include -#ifdef PEDANTIC -#pragma GCC diagnostic error "-Wpedantic" -#endif - #include #include #include @@ -26,13 +14,17 @@ #include #include #include +#include + +#include +#include +#include "mlx5_autoconf.h" +#include "mlx5_defs.h" #include "mlx5.h" +#include "mlx5_mr.h" #include "mlx5_utils.h" #include "mlx5_rxtx.h" -#include "mlx5_autoconf.h" -#include "mlx5_defs.h" -#include "mlx5_prm.h" /* TX burst subroutines return codes. */ enum mlx5_txcmp_code { @@ -61,6 +53,8 @@ enum mlx5_txcmp_code { #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/ #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */ #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/ +#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/ +#define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp.*/ /* The most common offloads groups. */ #define MLX5_TXOFF_CONFIG_NONE 0 @@ -86,7 +80,8 @@ static uint16_t mlx5_tx_burst_##func(void *txq, \ #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx}, static __rte_always_inline uint32_t -rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe); +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + volatile struct mlx5_mini_cqe8 *mcqe); static __rte_always_inline int mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, @@ -97,25 +92,23 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe); static __rte_always_inline void rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, - volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res); - -static __rte_always_inline void -mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, - const unsigned int strd_n); + volatile struct mlx5_cqe *cqe, + volatile struct mlx5_mini_cqe8 *mcqe); static int mlx5_queue_state_modify(struct rte_eth_dev *dev, struct mlx5_mp_arg_queue_state_modify *sm); static inline void -mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, - volatile struct mlx5_cqe *restrict cqe, - uint32_t phcsum); +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp, + volatile struct mlx5_cqe *__rte_restrict cqe, + uint32_t phcsum, uint8_t l4_type); static inline void -mlx5_lro_update_hdr(uint8_t *restrict padd, - volatile struct mlx5_cqe *restrict cqe, - uint32_t len); +mlx5_lro_update_hdr(uint8_t *__rte_restrict padd, + volatile struct mlx5_cqe *__rte_restrict cqe, + volatile struct mlx5_mini_cqe8 *mcqe, + struct mlx5_rxq_data *rxq, uint32_t len); uint32_t mlx5_ptype_table[] __rte_cache_aligned = { [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. 
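The MLX5_TXOFF_CONFIG_* bits above select Tx offload support at compile time: one always-inline burst template is expanded by the MLX5_TXOFF_DECL/MLX5_TXOFF_INFO macros with a constant offload mask, so the compiler removes every branch the mask disables. A minimal standalone sketch of the same technique, with hypothetical names (not the mlx5 ones):

#include <stdint.h>
#include <stdio.h>

#define CFG_CSUM (1u << 0) /* hypothetical offload bits */
#define CFG_VLAN (1u << 1)

static inline uint16_t
tx_burst_tmpl(uint16_t n, unsigned int olx)
{
	if (olx & CFG_CSUM)
		n += 1; /* checksum path, compiled in only when the bit is set */
	if (olx & CFG_VLAN)
		n += 2; /* VLAN insertion path */
	return n;
}

#define TXOFF_DECL(name, olx) \
static uint16_t tx_burst_##name(uint16_t n) { return tx_burst_tmpl(n, (olx)); }

TXOFF_DECL(none, 0)
TXOFF_DECL(full, CFG_CSUM | CFG_VLAN)

int main(void)
{
	printf("%u %u\n", tx_burst_none(1), tx_burst_full(1));
	return 0;
}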
*/ @@ -124,6 +117,9 @@ uint32_t mlx5_ptype_table[] __rte_cache_aligned = { uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned; uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned; +uint64_t rte_net_mlx5_dynf_inline_mask; +#define PKT_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask + /** * Build a table to translate Rx completion flags to packet type. * @@ -366,7 +362,7 @@ mlx5_set_swp_types_table(void) * Software Parser flags are set by pointer. */ static __rte_always_inline uint32_t -txq_mbuf_to_swp(struct mlx5_txq_local *restrict loc, +txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc, uint8_t *swp_flags, unsigned int olx) { @@ -466,19 +462,11 @@ rx_queue_count(struct mlx5_rxq_data *rxq) { struct rxq_zip *zip = &rxq->zip; volatile struct mlx5_cqe *cqe; + unsigned int cq_ci = rxq->cq_ci; const unsigned int cqe_n = (1 << rxq->cqe_n); const unsigned int cqe_cnt = cqe_n - 1; - unsigned int cq_ci; - unsigned int used; + unsigned int used = 0; - /* if we are processing a compressed cqe */ - if (zip->ai) { - used = zip->cqe_cnt - zip->ca; - cq_ci = zip->cq_ci; - } else { - used = 0; - cq_ci = rxq->cq_ci; - } cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) { int8_t op_own; @@ -486,14 +474,17 @@ rx_queue_count(struct mlx5_rxq_data *rxq) op_own = cqe->op_own; if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) - n = rte_be_to_cpu_32(cqe->byte_cnt); + if (unlikely(zip->ai)) + n = zip->cqe_cnt - zip->ai; + else + n = rte_be_to_cpu_32(cqe->byte_cnt); else n = 1; cq_ci += n; used += n; cqe = &(*rxq->cqes)[cq_ci & cqe_cnt]; } - used = RTE_MIN(used, (1U << rxq->elts_n) - 1); + used = RTE_MIN(used, cqe_n); return used; } @@ -516,11 +507,12 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) container_of(rxq, struct mlx5_rxq_ctrl, rxq); struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv); - if (dev->rx_pkt_burst != mlx5_rx_burst) { + if (dev->rx_pkt_burst == NULL || + dev->rx_pkt_burst == removed_rx_burst) { rte_errno = ENOTSUP; return -rte_errno; } - if (offset >= (1 << rxq->elts_n)) { + if (offset >= (1 << rxq->cqe_n)) { rte_errno = EINVAL; return -rte_errno; } @@ -529,6 +521,108 @@ mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset) return RTE_ETH_RX_DESC_AVAIL; } +/** + * DPDK callback to get the RX queue information. + * + * @param dev + * Pointer to the device structure. + * + * @param rx_queue_id + * Rx queue identifier. + * + * @param qinfo + * Pointer to the RX queue information structure. + * + * @return + * None. + */ + +void +mlx5_rxq_info_get(struct rte_eth_dev *dev, uint16_t rx_queue_id, + struct rte_eth_rxq_info *qinfo) +{ + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id]; + struct mlx5_rxq_ctrl *rxq_ctrl = + container_of(rxq, struct mlx5_rxq_ctrl, rxq); + + if (!rxq) + return; + qinfo->mp = mlx5_rxq_mprq_enabled(rxq) ? + rxq->mprq_mp : rxq->mp; + qinfo->conf.rx_thresh.pthresh = 0; + qinfo->conf.rx_thresh.hthresh = 0; + qinfo->conf.rx_thresh.wthresh = 0; + qinfo->conf.rx_free_thresh = rxq->rq_repl_thresh; + qinfo->conf.rx_drop_en = 1; + qinfo->conf.rx_deferred_start = rxq_ctrl ? 0 : 1; + qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads; + qinfo->scattered_rx = dev->data->scattered_rx; + qinfo->nb_desc = mlx5_rxq_mprq_enabled(rxq) ? + (1 << rxq->elts_n) * (1 << rxq->strd_num_n) : + (1 << rxq->elts_n); +} + +/** + * DPDK callback to get the RX packet burst mode information. + * + * @param dev + * Pointer to the device structure. 
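The callbacks above back the generic ethdev queries. A caller-side sketch, assuming an already initialized port (error handling abbreviated):

#include <stdio.h>
#include <rte_ethdev.h>

static void
rxq_diag(uint16_t port, uint16_t queue)
{
	struct rte_eth_rxq_info qinfo;
	int used = rte_eth_rx_queue_count(port, queue);

	if (used >= 0)
		printf("rxq %u: %d used descriptors\n", queue, used);
	if (rte_eth_rx_queue_info_get(port, queue, &qinfo) == 0)
		printf("rxq %u: %u descriptors, mempool %s\n",
		       queue, qinfo.nb_desc, qinfo.mp->name);
	if (rte_eth_rx_descriptor_status(port, queue, 0) ==
	    RTE_ETH_RX_DESC_DONE)
		printf("rxq %u: head descriptor holds a received packet\n",
		       queue);
}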
+ * + * @param rx_queue_id + * Rx queue identifier. + * + * @param mode + * Pointer to the burst mode information. + * + * @return + * 0 on success, -EINVAL on failure. + */ + +int +mlx5_rx_burst_mode_get(struct rte_eth_dev *dev, + uint16_t rx_queue_id __rte_unused, + struct rte_eth_burst_mode *mode) +{ + eth_rx_burst_t pkt_burst = dev->rx_pkt_burst; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_rxq_data *rxq; + + rxq = (*priv->rxqs)[rx_queue_id]; + if (!rxq) { + rte_errno = EINVAL; + return -rte_errno; + } + if (pkt_burst == mlx5_rx_burst) { + snprintf(mode->info, sizeof(mode->info), "%s", "Scalar"); + } else if (pkt_burst == mlx5_rx_burst_mprq) { + snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ"); + } else if (pkt_burst == mlx5_rx_burst_vec) { +#if defined RTE_ARCH_X86_64 + snprintf(mode->info, sizeof(mode->info), "%s", "Vector SSE"); +#elif defined RTE_ARCH_ARM64 + snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon"); +#elif defined RTE_ARCH_PPC_64 + snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec"); +#else + return -EINVAL; +#endif + } else if (pkt_burst == mlx5_rx_burst_mprq_vec) { +#if defined RTE_ARCH_X86_64 + snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector SSE"); +#elif defined RTE_ARCH_ARM64 + snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon"); +#elif defined RTE_ARCH_PPC_64 + snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec"); +#else + return -EINVAL; +#endif + } else { + return -EINVAL; + } + return 0; +} + /** * DPDK callback to get the number of used descriptors in a RX queue * @@ -548,7 +642,8 @@ mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id) struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_rxq_data *rxq; - if (dev->rx_pkt_burst != mlx5_rx_burst) { + if (dev->rx_pkt_burst == NULL || + dev->rx_pkt_burst == removed_rx_burst) { rte_errno = ENOTSUP; return -rte_errno; } @@ -652,11 +747,11 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) * Pointer to the error CQE. * * @return - * Negative value if queue recovery failed, - * the last Tx buffer element to free otherwise. + * Negative value if queue recovery failed, zero otherwise + * (the error completion entry was handled successfully). */ -int -mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, +static int +mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq, volatile struct mlx5_err_cqe *err_cqe) { if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { @@ -699,18 +794,14 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, */ txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - new_wqe_pi) & wqe_m; - if (tx_recover_qp(txq_ctrl) == 0) { - txq->cq_ci++; - /* Release all the remaining buffers. */ - return txq->elts_head; + if (tx_recover_qp(txq_ctrl)) { + /* Recovering failed - retry later on the same WQE. */ + return -1; } - /* Recovering failed - try again later on the same WQE. */ - return -1; - } else { - txq->cq_ci++; + /* Release all the remaining buffers. */ + txq_free_elts(txq_ctrl); } - /* Do not release buffers. */ - return txq->elts_tail; + return 0; } /** @@ -727,12 +818,19 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, * Packet type for struct rte_mbuf. 
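On the application side, the strings filled in above are retrieved through rte_eth_rx_burst_mode_get(); a brief usage sketch:

#include <stdio.h>
#include <rte_ethdev.h>

static void
print_rx_burst_mode(uint16_t port, uint16_t queue)
{
	struct rte_eth_burst_mode mode;

	if (rte_eth_rx_burst_mode_get(port, queue, &mode) == 0)
		printf("port %u rxq %u: %s\n", port, queue, mode.info);
}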
*/ static inline uint32_t -rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) +rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, + volatile struct mlx5_mini_cqe8 *mcqe) { uint8_t idx; - uint8_t pinfo = cqe->pkt_info; - uint16_t ptype = cqe->hdr_type_etc; + uint8_t ptype; + uint8_t pinfo = (cqe->pkt_info & 0x3) << 6; + /* Get the l3/l4 header type from the mini-CQE in case of L3/L4 format. */ + if (mcqe == NULL || + rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) + ptype = (cqe->hdr_type_etc & 0xfc00) >> 10; + else + ptype = mcqe->hdr_type >> 2; /* * The index to the array should have: * bit[1:0] = l3_hdr_type @@ -741,7 +839,7 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe) * bit[6] = tunneled * bit[7] = outer_l3_type */ - idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10); + idx = pinfo | ptype; return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6)); } @@ -780,7 +878,7 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) byte_count = DATA_LEN(buf); } /* scat->addr must be able to store a pointer. */ - assert(sizeof(scat->addr) >= sizeof(uintptr_t)); + MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t)); *scat = (struct mlx5_wqe_data_seg){ .addr = rte_cpu_to_be_64(addr), .byte_count = rte_cpu_to_be_32(byte_count), @@ -793,9 +891,11 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq) rxq->zip = (struct rxq_zip){ .ai = 0, }; + rxq->elts_ci = mlx5_rxq_mprq_enabled(rxq) ? + (wqe_n >> rxq->sges_n) * (1 << rxq->strd_num_n) : 0; /* Update doorbell counter. */ rxq->rq_ci = wqe_n >> rxq->sges_n; - rte_cio_wmb(); + rte_io_wmb(); *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); } @@ -823,30 +923,7 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); - if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV) { - struct ibv_wq_attr mod = { - .attr_mask = IBV_WQ_ATTR_STATE, - .wq_state = sm->state, - }; - - ret = mlx5_glue->modify_wq(rxq_ctrl->obj->wq, &mod); - } else { /* rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ. 
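A compact restatement of the index composition used by rxq_cq_to_pkt_type() above, as a standalone illustration:

#include <stdint.h>

static uint8_t
ptype_idx(uint8_t pkt_info, uint16_t hdr_type_etc)
{
	uint8_t pinfo = (pkt_info & 0x3) << 6;         /* bit[6] tunneled, bit[7] outer_l3_type */
	uint8_t ptype = (hdr_type_etc & 0xfc00) >> 10; /* low bits: l3/l4 header types */

	return pinfo | ptype; /* index into mlx5_ptype_table[] */
}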
*/ - struct mlx5_devx_modify_rq_attr rq_attr; - - memset(&rq_attr, 0, sizeof(rq_attr)); - if (sm->state == IBV_WQS_RESET) { - rq_attr.rq_state = MLX5_RQC_STATE_ERR; - rq_attr.state = MLX5_RQC_STATE_RST; - } else if (sm->state == IBV_WQS_RDY) { - rq_attr.rq_state = MLX5_RQC_STATE_RST; - rq_attr.state = MLX5_RQC_STATE_RDY; - } else if (sm->state == IBV_WQS_ERR) { - rq_attr.rq_state = MLX5_RQC_STATE_RDY; - rq_attr.state = MLX5_RQC_STATE_ERR; - } - ret = mlx5_devx_cmd_modify_rq(rxq_ctrl->obj->rq, - &rq_attr); - } + ret = priv->obj_ops.rxq_obj_modify(rxq_ctrl->obj, sm->state); if (ret) { DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", sm->state, strerror(errno)); @@ -857,44 +934,12 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id]; struct mlx5_txq_ctrl *txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq); - struct ibv_qp_attr mod = { - .qp_state = IBV_QPS_RESET, - .port_num = (uint8_t)priv->ibv_port, - }; - struct ibv_qp *qp = txq_ctrl->obj->qp; - ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); - if (ret) { - DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " - "%s", strerror(errno)); - rte_errno = errno; + ret = priv->obj_ops.txq_obj_modify(txq_ctrl->obj, + MLX5_TXQ_MOD_ERR2RDY, + (uint8_t)priv->dev_port); + if (ret) return ret; - } - mod.qp_state = IBV_QPS_INIT; - ret = mlx5_glue->modify_qp(qp, &mod, - (IBV_QP_STATE | IBV_QP_PORT)); - if (ret) { - DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", - strerror(errno)); - rte_errno = errno; - return ret; - } - mod.qp_state = IBV_QPS_RTR; - ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); - if (ret) { - DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", - strerror(errno)); - rte_errno = errno; - return ret; - } - mod.qp_state = IBV_QPS_RTS; - ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); - if (ret) { - DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", - strerror(errno)); - rte_errno = errno; - return ret; - } } return 0; } @@ -914,6 +959,7 @@ static int mlx5_queue_state_modify(struct rte_eth_dev *dev, struct mlx5_mp_arg_queue_state_modify *sm) { + struct mlx5_priv *priv = dev->data->dev_private; int ret = 0; switch (rte_eal_process_type()) { @@ -921,7 +967,7 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev, ret = mlx5_queue_state_modify_primary(dev, sm); break; case RTE_PROC_SECONDARY: - ret = mlx5_mp_req_queue_state_modify(dev, sm); + ret = mlx5_mp_req_queue_state_modify(&priv->mp_id, sm); break; default: break; @@ -950,7 +996,8 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) { const uint16_t cqe_n = 1 << rxq->cqe_n; const uint16_t cqe_mask = cqe_n - 1; - const unsigned int wqe_n = 1 << rxq->elts_n; + const uint16_t wqe_n = 1 << rxq->elts_n; + const uint16_t strd_n = 1 << rxq->strd_num_n; struct mlx5_rxq_ctrl *rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq); union { @@ -998,15 +1045,15 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) case MLX5_RXQ_ERR_STATE_NEED_READY: ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci); if (ret == MLX5_CQE_STATUS_HW_OWN) { - rte_cio_wmb(); + rte_io_wmb(); *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); - rte_cio_wmb(); + rte_io_wmb(); /* * The RQ consumer index must be zeroed while moving * from RESET state to RDY state. 
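The rte_cio_wmb() to rte_io_wmb() conversion throughout this patch keeps the same ordering discipline for doorbell records; a generic sketch of that pattern, assuming a big-endian doorbell record as used by mlx5:

#include <stdint.h>
#include <rte_atomic.h>
#include <rte_byteorder.h>

static inline void
ring_doorbell(volatile uint32_t *db_rec, uint32_t ci)
{
	rte_io_wmb();                   /* descriptor writes become visible first */
	*db_rec = rte_cpu_to_be_32(ci); /* publish the new consumer index */
	rte_io_wmb();                   /* order the record before any later MMIO */
}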
*/ *rxq->rq_db = rte_cpu_to_be_32(0); - rte_cio_wmb(); + rte_io_wmb(); sm.is_wq = 1; sm.queue_id = rxq->idx; sm.state = IBV_WQS_RDY; @@ -1014,21 +1061,27 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) &sm)) return -1; if (vec) { - const uint16_t q_mask = wqe_n - 1; - uint16_t elt_idx; + const uint32_t elts_n = + mlx5_rxq_mprq_enabled(rxq) ? + wqe_n * strd_n : wqe_n; + const uint32_t e_mask = elts_n - 1; + uint32_t elts_ci = + mlx5_rxq_mprq_enabled(rxq) ? + rxq->elts_ci : rxq->rq_ci; + uint32_t elt_idx; struct rte_mbuf **elt; int i; - unsigned int n = wqe_n - (rxq->rq_ci - + unsigned int n = elts_n - (elts_ci - rxq->rq_pi); for (i = 0; i < (int)n; ++i) { - elt_idx = (rxq->rq_ci + i) & q_mask; + elt_idx = (elts_ci + i) & e_mask; elt = &(*rxq->elts)[elt_idx]; *elt = rte_mbuf_raw_alloc(rxq->mp); if (!*elt) { for (i--; i >= 0; --i) { - elt_idx = (rxq->rq_ci + - i) & q_mask; + elt_idx = (elts_ci + + i) & elts_n; elt = &(*rxq->elts) [elt_idx]; rte_pktmbuf_free_seg @@ -1037,7 +1090,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) return -1; } } - for (i = 0; i < (int)wqe_n; ++i) { + for (i = 0; i < (int)elts_n; ++i) { elt = &(*rxq->elts)[i]; DATA_LEN(*elt) = (uint16_t)((*elt)->buf_len - @@ -1045,7 +1098,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) } /* Padding with a fake mbuf for vec Rx. */ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) - (*rxq->elts)[wqe_n + i] = + (*rxq->elts)[elts_n + i] = &rxq->fake_mbuf; } mlx5_rxq_initialize(rxq); @@ -1090,8 +1143,8 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, (volatile struct mlx5_mini_cqe8 (*)[8]) (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info); - - len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt); + len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt & + rxq->byte_mask); *mcqe = &(*mc)[zip->ai & 7]; if ((++zip->ai & 7) == 0) { /* Invalidate consumed CQEs */ @@ -1130,6 +1183,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, } else { int ret; int8_t op_own; + uint32_t cq_ci; ret = check_cqe(cqe, cqe_n, rxq->cq_ci); if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { @@ -1143,14 +1197,19 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, return 0; } } - ++rxq->cq_ci; + /* + * Introduce the local variable to have queue cq_ci + * index in queue structure always consistent with + * actual CQE boundary (not pointing to the middle + * of compressed CQE session). + */ + cq_ci = rxq->cq_ci + 1; op_own = cqe->op_own; if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) { volatile struct mlx5_mini_cqe8 (*mc)[8] = (volatile struct mlx5_mini_cqe8 (*)[8]) (uintptr_t)(&(*rxq->cqes) - [rxq->cq_ci & - cqe_cnt].pkt_info); + [cq_ci & cqe_cnt].pkt_info); /* Fix endianness. */ zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt); @@ -1163,13 +1222,13 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, * 7 CQEs after the initial CQE instead of 8 * for subsequent ones. */ - zip->ca = rxq->cq_ci; + zip->ca = cq_ci; zip->na = zip->ca + 7; /* Compute the next non compressed CQE. */ - --rxq->cq_ci; zip->cq_ci = rxq->cq_ci + zip->cqe_cnt; /* Get packet size to return. 
*/ - len = rte_be_to_cpu_32((*mc)[0].byte_cnt); + len = rte_be_to_cpu_32((*mc)[0].byte_cnt & + rxq->byte_mask); *mcqe = &(*mc)[0]; zip->ai = 1; /* Prefetch all to be invalidated */ @@ -1181,6 +1240,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe, ++idx; } } else { + rxq->cq_ci = cq_ci; len = rte_be_to_cpu_32(cqe->byte_cnt); } } @@ -1233,34 +1293,73 @@ rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe) */ static inline void rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, - volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res) + volatile struct mlx5_cqe *cqe, + volatile struct mlx5_mini_cqe8 *mcqe) { /* Update packet information. */ - pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe); - if (rss_hash_res && rxq->rss_hash) { - pkt->hash.rss = rss_hash_res; - pkt->ol_flags |= PKT_RX_RSS_HASH; + pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe); + + if (rxq->rss_hash) { + uint32_t rss_hash_res = 0; + + /* If compressed, take hash result from mini-CQE. */ + if (mcqe == NULL || + rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH) + rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); + else + rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result); + if (rss_hash_res) { + pkt->hash.rss = rss_hash_res; + pkt->ol_flags |= PKT_RX_RSS_HASH; + } } - if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) { - pkt->ol_flags |= PKT_RX_FDIR; - if (cqe->sop_drop_qpn != - rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) { - uint32_t mark = cqe->sop_drop_qpn; - - pkt->ol_flags |= PKT_RX_FDIR_ID; - pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + if (rxq->mark) { + uint32_t mark = 0; + + /* If compressed, take flow tag from mini-CQE. */ + if (mcqe == NULL || + rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) + mark = cqe->sop_drop_qpn; + else + mark = ((mcqe->byte_cnt_flow & 0xff) << 8) | + (mcqe->flow_tag_high << 16); + if (MLX5_FLOW_MARK_IS_VALID(mark)) { + pkt->ol_flags |= PKT_RX_FDIR; + if (mark != RTE_BE32(MLX5_FLOW_MARK_DEFAULT)) { + pkt->ol_flags |= PKT_RX_FDIR_ID; + pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + } } } + if (rxq->dynf_meta && cqe->flow_table_metadata) { + pkt->ol_flags |= rxq->flow_meta_mask; + *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) = + cqe->flow_table_metadata; + } if (rxq->csum) pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); - if (rxq->vlan_strip && - (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) { - pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; - pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); + if (rxq->vlan_strip) { + bool vlan_strip; + + if (mcqe == NULL || + rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) + vlan_strip = cqe->hdr_type_etc & + RTE_BE16(MLX5_CQE_VLAN_STRIPPED); + else + vlan_strip = mcqe->hdr_type & + RTE_BE16(MLX5_CQE_VLAN_STRIPPED); + if (vlan_strip) { + pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED; + pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info); + } } if (rxq->hw_timestamp) { - pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp); - pkt->ol_flags |= PKT_RX_TIMESTAMP; + uint64_t ts = rte_be_to_cpu_64(cqe->timestamp); + + if (rxq->rt_timestamp) + ts = mlx5_txpp_convert_rx_ts(rxq->sh, ts); + mlx5_timestamp_set(pkt, rxq->timestamp_offset, ts); + pkt->ol_flags |= rxq->timestamp_rx_flag; } } @@ -1298,7 +1397,6 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx]; struct rte_mbuf *rep = (*rxq->elts)[idx]; volatile struct mlx5_mini_cqe8 *mcqe = NULL; - uint32_t rss_hash_res; if (pkt) NEXT(seg) = 
rep; @@ -1306,7 +1404,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) rte_prefetch0(seg); rte_prefetch0(cqe); rte_prefetch0(wqe); - rep = rte_mbuf_raw_alloc(rxq->mp); + /* Allocate the buf from the same pool. */ + rep = rte_mbuf_raw_alloc(seg->pool); if (unlikely(rep == NULL)) { ++rxq->stats.rx_nombuf; if (!pkt) { @@ -1317,7 +1416,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) break; } while (pkt != seg) { - assert(pkt != (*rxq->elts)[idx]); + MLX5_ASSERT(pkt != (*rxq->elts)[idx]); rep = NEXT(pkt); NEXT(pkt) = NULL; NB_SEGS(pkt) = 1; @@ -1334,20 +1433,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) break; } pkt = seg; - assert(len >= (rxq->crc_present << 2)); - pkt->ol_flags = 0; - /* If compressed, take hash result from mini-CQE. */ - rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? - cqe->rx_hash_res : - mcqe->rx_hash_result); - rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); + MLX5_ASSERT(len >= (rxq->crc_present << 2)); + pkt->ol_flags &= EXT_ATTACHED_MBUF; + rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe); if (rxq->crc_present) len -= RTE_ETHER_CRC_LEN; PKT_LEN(pkt) = len; if (cqe->lro_num_seg > 1) { mlx5_lro_update_hdr (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, - len); + mcqe, rxq, len); pkt->ol_flags |= PKT_RX_LRO; pkt->tso_segsz = len / cqe->lro_num_seg; } @@ -1391,9 +1486,9 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) return 0; /* Update the consumer index. */ rxq->rq_ci = rq_ci >> sges_n; - rte_cio_wmb(); + rte_io_wmb(); *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); - rte_cio_wmb(); + rte_io_wmb(); *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment packets counter. */ @@ -1415,12 +1510,10 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) * The L3 pseudo-header checksum. */ static inline void -mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, - volatile struct mlx5_cqe *restrict cqe, - uint32_t phcsum) +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp, + volatile struct mlx5_cqe *__rte_restrict cqe, + uint32_t phcsum, uint8_t l4_type) { - uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & - MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; /* * The HW calculates only the TCP payload checksum, need to complete * the TCP header checksum and the L3 pseudo-header checksum. @@ -1436,7 +1529,7 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK) tcp->tcp_flags |= RTE_TCP_PSH_FLAG; tcp->cksum = 0; - csum += rte_raw_cksum(tcp, (tcp->data_off & 0xF) * 4); + csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4); csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); csum = (~csum) & 0xffff; if (csum == 0) @@ -1457,9 +1550,10 @@ mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, * The packet length. 
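Note the fix above in mlx5_lro_update_tcp_hdr(): the TCP data offset lives in the high nibble of data_off, hence (tcp->data_off >> 4) * 4 rather than masking the low nibble. The accumulator folding follows the usual one's-complement completion; a standalone sketch:

#include <stdint.h>

static uint16_t
fold_csum(uint32_t csum)
{
	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); /* fold carries */
	csum = (~csum) & 0xffff;                              /* one's complement */
	return csum == 0 ? 0xffff : (uint16_t)csum;           /* zero is sent as 0xffff */
}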
*/ static inline void -mlx5_lro_update_hdr(uint8_t *restrict padd, - volatile struct mlx5_cqe *restrict cqe, - uint32_t len) +mlx5_lro_update_hdr(uint8_t *__rte_restrict padd, + volatile struct mlx5_cqe *__rte_restrict cqe, + volatile struct mlx5_mini_cqe8 *mcqe, + struct mlx5_rxq_data *rxq, uint32_t len) { union { struct rte_ether_hdr *eth; @@ -1473,6 +1567,7 @@ mlx5_lro_update_hdr(uint8_t *restrict padd, }; uint16_t proto = h.eth->ether_type; uint32_t phcsum; + uint8_t l4_type; h.eth++; while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) || @@ -1494,7 +1589,14 @@ mlx5_lro_update_hdr(uint8_t *restrict padd, phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0); h.ipv6++; } - mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum); + if (mcqe == NULL || + rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX) + l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) & + MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; + else + l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) & + MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT; + mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type); } void @@ -1502,10 +1604,11 @@ mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) { struct mlx5_mprq_buf *buf = opaque; - if (rte_atomic16_read(&buf->refcnt) == 1) { + if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) { rte_mempool_put(buf->mp, buf); - } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) { - rte_atomic16_set(&buf->refcnt, 1); + } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1, + __ATOMIC_RELAXED) == 0)) { + __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED); rte_mempool_put(buf->mp, buf); } } @@ -1516,31 +1619,6 @@ mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf) mlx5_mprq_buf_free_cb(NULL, buf); } -static inline void -mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx, - const unsigned int strd_n) -{ - struct mlx5_mprq_buf *rep = rxq->mprq_repl; - volatile struct mlx5_wqe_data_seg *wqe = - &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg; - void *addr; - - assert(rep != NULL); - /* Replace MPRQ buf. */ - (*rxq->mprq_bufs)[rq_idx] = rep; - /* Replace WQE. */ - addr = mlx5_mprq_buf_addr(rep, strd_n); - wqe->addr = rte_cpu_to_be_64((uintptr_t)addr); - /* If there's only one MR, no need to replace LKey in WQE. */ - if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1)) - wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr); - /* Stash a mbuf for next replacement. */ - if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep))) - rxq->mprq_repl = rep; - else - rxq->mprq_repl = NULL; -} - /** * DPDK callback for RX with Multi-Packet RQ support. 
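The rte_atomic16 to GCC __atomic conversion in mlx5_mprq_buf_free_cb() above preserves the buffer-recycling logic: the last reference re-arms the counter to 1 before the buffer returns to its mempool. A reduced sketch of the scheme:

#include <stdint.h>

static int
buf_unref(uint16_t *refcnt)
{
	if (__atomic_load_n(refcnt, __ATOMIC_RELAXED) == 1)
		return 1; /* sole owner, free directly */
	if (__atomic_sub_fetch(refcnt, 1, __ATOMIC_RELAXED) == 0) {
		__atomic_store_n(refcnt, 1, __ATOMIC_RELAXED); /* re-arm for reuse */
		return 1; /* last reference dropped, caller frees */
	}
	return 0; /* still referenced elsewhere */
}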
* @@ -1558,51 +1636,29 @@ uint16_t mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct mlx5_rxq_data *rxq = dpdk_rxq; - const unsigned int strd_n = 1 << rxq->strd_num_n; - const unsigned int strd_sz = 1 << rxq->strd_sz_n; - const unsigned int strd_shift = - MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en; - const unsigned int cq_mask = (1 << rxq->cqe_n) - 1; - const unsigned int wq_mask = (1 << rxq->elts_n) - 1; + const uint32_t strd_n = 1 << rxq->strd_num_n; + const uint32_t strd_sz = 1 << rxq->strd_sz_n; + const uint32_t cq_mask = (1 << rxq->cqe_n) - 1; + const uint32_t wq_mask = (1 << rxq->elts_n) - 1; volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask]; unsigned int i = 0; uint32_t rq_ci = rxq->rq_ci; uint16_t consumed_strd = rxq->consumed_strd; - uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; while (i < pkts_n) { struct rte_mbuf *pkt; - void *addr; int ret; - unsigned int len; + uint32_t len; uint16_t strd_cnt; uint16_t strd_idx; - uint32_t offset; uint32_t byte_cnt; volatile struct mlx5_mini_cqe8 *mcqe = NULL; - uint32_t rss_hash_res = 0; - uint8_t lro_num_seg; + enum mlx5_rqx_code rxq_code; if (consumed_strd == strd_n) { - /* Replace WQE only if the buffer is still in use. */ - if (rte_atomic16_read(&buf->refcnt) > 1) { - mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n); - /* Release the old buffer. */ - mlx5_mprq_buf_free(buf); - } else if (unlikely(rxq->mprq_repl == NULL)) { - struct mlx5_mprq_buf *rep; - - /* - * Currently, the MPRQ mempool is out of buffer - * and doing memcpy regardless of the size of Rx - * packet. Retry allocation to get back to - * normal. - */ - if (!rte_mempool_get(rxq->mprq_mp, - (void **)&rep)) - rxq->mprq_repl = rep; - } + /* Replace WQE if the buffer is still in use. */ + mprq_buf_replace(rxq, rq_ci & wq_mask); /* Advance to the next WQE. */ consumed_strd = 0; ++rq_ci; @@ -1613,127 +1669,54 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) if (!ret) break; byte_cnt = ret; - strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> - MLX5_MPRQ_STRIDE_NUM_SHIFT; - assert(strd_cnt); + len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; + MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); + if (rxq->crc_present) + len -= RTE_ETHER_CRC_LEN; + if (mcqe && + rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) + strd_cnt = (len / strd_sz) + !!(len % strd_sz); + else + strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >> + MLX5_MPRQ_STRIDE_NUM_SHIFT; + MLX5_ASSERT(strd_cnt); consumed_strd += strd_cnt; if (byte_cnt & MLX5_MPRQ_FILLER_MASK) continue; - if (mcqe == NULL) { - rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res); - strd_idx = rte_be_to_cpu_16(cqe->wqe_counter); - } else { - /* mini-CQE for MPRQ doesn't have hash result. */ - strd_idx = rte_be_to_cpu_16(mcqe->stride_idx); - } - assert(strd_idx < strd_n); - assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); - lro_num_seg = cqe->lro_num_seg; - /* - * Currently configured to receive a packet per a stride. But if - * MTU is adjusted through kernel interface, device could - * consume multiple strides without raising an error. In this - * case, the packet should be dropped because it is bigger than - * the max_rx_pkt_len. - */ - if (unlikely(!lro_num_seg && strd_cnt > 1)) { - ++rxq->stats.idropped; - continue; - } + strd_idx = rte_be_to_cpu_16(mcqe == NULL ? 
+ cqe->wqe_counter : + mcqe->stride_idx); + MLX5_ASSERT(strd_idx < strd_n); + MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & + wq_mask)); pkt = rte_pktmbuf_alloc(rxq->mp); if (unlikely(pkt == NULL)) { ++rxq->stats.rx_nombuf; break; } len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT; - assert((int)len >= (rxq->crc_present << 2)); + MLX5_ASSERT((int)len >= (rxq->crc_present << 2)); if (rxq->crc_present) len -= RTE_ETHER_CRC_LEN; - offset = strd_idx * strd_sz + strd_shift; - addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); - /* - * Memcpy packets to the target mbuf if: - * - The size of packet is smaller than mprq_max_memcpy_len. - * - Out of buffer in the Mempool for Multi-Packet RQ. - */ - if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { - /* - * When memcpy'ing packet due to out-of-buffer, the - * packet must be smaller than the target mbuf. - */ - if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { - rte_pktmbuf_free_seg(pkt); + rxq_code = mprq_buf_to_pkt(rxq, pkt, len, buf, + strd_idx, strd_cnt); + if (unlikely(rxq_code != MLX5_RXQ_CODE_EXIT)) { + rte_pktmbuf_free_seg(pkt); + if (rxq_code == MLX5_RXQ_CODE_DROPPED) { ++rxq->stats.idropped; continue; } - rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); - DATA_LEN(pkt) = len; - } else { - rte_iova_t buf_iova; - struct rte_mbuf_ext_shared_info *shinfo; - uint16_t buf_len = strd_cnt * strd_sz; - void *buf_addr; - - /* Increment the refcnt of the whole chunk. */ - rte_atomic16_add_return(&buf->refcnt, 1); - assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= - strd_n + 1); - buf_addr = RTE_PTR_SUB(addr, headroom_sz); - /* - * MLX5 device doesn't use iova but it is necessary in a - * case where the Rx packet is transmitted via a - * different PMD. - */ - buf_iova = rte_mempool_virt2iova(buf) + - RTE_PTR_DIFF(buf_addr, buf); - shinfo = &buf->shinfos[strd_idx]; - rte_mbuf_ext_refcnt_set(shinfo, 1); - /* - * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when - * attaching the stride to mbuf and more offload flags - * will be added below by calling rxq_cq_to_mbuf(). - * Other fields will be overwritten. - */ - rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, - buf_len, shinfo); - /* Set mbuf head-room. */ - pkt->data_off = headroom_sz; - assert(pkt->ol_flags == EXT_ATTACHED_MBUF); - /* - * Prevent potential overflow due to MTU change through - * kernel interface. - */ - if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { - rte_pktmbuf_free_seg(pkt); - ++rxq->stats.idropped; - continue; - } - DATA_LEN(pkt) = len; - /* - * LRO packet may consume all the stride memory, in this - * case packet head-room space is not guaranteed so must - * to add an empty mbuf for the head-room. 
- */ - if (!rxq->strd_headroom_en) { - struct rte_mbuf *headroom_mbuf = - rte_pktmbuf_alloc(rxq->mp); - - if (unlikely(headroom_mbuf == NULL)) { - rte_pktmbuf_free_seg(pkt); - ++rxq->stats.rx_nombuf; - break; - } - PORT(pkt) = rxq->port_id; - NEXT(headroom_mbuf) = pkt; - pkt = headroom_mbuf; - NB_SEGS(pkt) = 2; + if (rxq_code == MLX5_RXQ_CODE_NOMBUF) { + ++rxq->stats.rx_nombuf; + break; } } - rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); - if (lro_num_seg > 1) { - mlx5_lro_update_hdr(addr, cqe, len); + rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe); + if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *), + cqe, mcqe, rxq, len); pkt->ol_flags |= PKT_RX_LRO; - pkt->tso_segsz = strd_sz; + pkt->tso_segsz = len / cqe->lro_num_seg; } PKT_LEN(pkt) = len; PORT(pkt) = rxq->port_id; @@ -1747,11 +1730,11 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) } /* Update the consumer indexes. */ rxq->consumed_strd = consumed_strd; - rte_cio_wmb(); + rte_io_wmb(); *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci); if (rq_ci != rxq->rq_ci) { rxq->rq_ci = rq_ci; - rte_cio_wmb(); + rte_io_wmb(); *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); } #ifdef MLX5_PMD_SOFT_COUNTERS @@ -1826,6 +1809,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused, return 0; } +__rte_weak uint16_t +mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused, + struct rte_mbuf **pkts __rte_unused, + uint16_t pkts_n __rte_unused) +{ + return 0; +} + __rte_weak int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused) { @@ -1850,7 +1841,7 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused) * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, +mlx5_tx_free_mbuf(struct rte_mbuf **__rte_restrict pkts, unsigned int pkts_n, unsigned int olx __rte_unused) { @@ -1864,8 +1855,8 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, * copying pointers to temporary array * for rte_mempool_put_bulk() calls. */ - assert(pkts); - assert(pkts_n); + MLX5_ASSERT(pkts); + MLX5_ASSERT(pkts_n); for (;;) { for (;;) { /* @@ -1874,7 +1865,7 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, */ mbuf = rte_pktmbuf_prefree_seg(*pkts); if (likely(mbuf != NULL)) { - assert(mbuf == *pkts); + MLX5_ASSERT(mbuf == *pkts); if (likely(n_free != 0)) { if (unlikely(pool != mbuf->pool)) /* From different pool. */ @@ -1911,9 +1902,9 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, * This loop is implemented to avoid multiple * inlining of rte_mempool_put_bulk(). */ - assert(pool); - assert(p_free); - assert(n_free); + MLX5_ASSERT(pool); + MLX5_ASSERT(p_free); + MLX5_ASSERT(n_free); /* * Free the array of pre-freed mbufs * belonging to the same memory pool. @@ -1955,14 +1946,14 @@ mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts, * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, +mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq, uint16_t tail, unsigned int olx __rte_unused) { uint16_t n_elts = tail - txq->elts_tail; - assert(n_elts); - assert(n_elts <= txq->elts_s); + MLX5_ASSERT(n_elts); + MLX5_ASSERT(n_elts <= txq->elts_s); /* * Implement a loop to support ring buffer wraparound * with single inlining of mlx5_tx_free_mbuf(). 
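mlx5_tx_free_mbuf() above batches mbufs per mempool so that rte_mempool_put_bulk() is invoked once per run of same-pool buffers. A simplified sketch of that grouping, with a fixed-size stash and a hypothetical helper name:

#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

static void
bulk_free(struct rte_mbuf **pkts, unsigned int n)
{
	void *stash[64];
	struct rte_mempool *pool = NULL;
	unsigned int n_free = 0, i;

	for (i = 0; i < n; i++) {
		struct rte_mbuf *m = rte_pktmbuf_prefree_seg(pkts[i]);

		if (m == NULL)
			continue; /* still referenced, cannot be freed yet */
		if (pool != m->pool || n_free == RTE_DIM(stash)) {
			if (n_free != 0)
				rte_mempool_put_bulk(pool, stash, n_free);
			pool = m->pool; /* start a new same-pool run */
			n_free = 0;
		}
		stash[n_free++] = m;
	}
	if (n_free != 0)
		rte_mempool_put_bulk(pool, stash, n_free);
}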
@@ -1972,8 +1963,8 @@ mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, part = txq->elts_s - (txq->elts_tail & txq->elts_m); part = RTE_MIN(part, n_elts); - assert(part); - assert(part <= txq->elts_s); + MLX5_ASSERT(part); + MLX5_ASSERT(part <= txq->elts_s); mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m], part, olx); txq->elts_tail += part; @@ -1996,19 +1987,19 @@ mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq, * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, - struct rte_mbuf **restrict pkts, +mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq, + struct rte_mbuf **__rte_restrict pkts, unsigned int pkts_n, unsigned int olx __rte_unused) { unsigned int part; struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts; - assert(pkts); - assert(pkts_n); + MLX5_ASSERT(pkts); + MLX5_ASSERT(pkts_n); part = txq->elts_s - (txq->elts_head & txq->elts_m); - assert(part); - assert(part <= txq->elts_s); + MLX5_ASSERT(part); + MLX5_ASSERT(part <= txq->elts_s); /* This code is a good candidate for vectorizing with SIMD. */ rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)), (void *)pkts, @@ -2028,34 +2019,24 @@ mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, * Pointer to TX queue structure. * @param last_cqe * valid CQE pointer, if not NULL update txq->wqe_pi and flush the buffers - * @param itail - * if not negative - flush the buffers till this index. * @param olx * Configured Tx offloads mask. It is fully defined at * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, +mlx5_tx_comp_flush(struct mlx5_txq_data *__rte_restrict txq, volatile struct mlx5_cqe *last_cqe, - int itail, unsigned int olx __rte_unused) { - uint16_t tail; - if (likely(last_cqe != NULL)) { + uint16_t tail; + txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); - tail = ((volatile struct mlx5_wqe_cseg *) - (txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc; - } else if (itail >= 0) { - tail = (uint16_t)itail; - } else { - return; - } - rte_compiler_barrier(); - *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); - if (likely(tail != txq->elts_tail)) { - mlx5_tx_free_elts(txq, tail, olx); - assert(tail == txq->elts_tail); + tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; + if (likely(tail != txq->elts_tail)) { + mlx5_tx_free_elts(txq, tail, olx); + MLX5_ASSERT(tail == txq->elts_tail); + } } } /** @@ -2074,11 +2055,12 @@ mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, * routine smaller, simple and faster - from experiments. */ static void -mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, +mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq, unsigned int olx __rte_unused) { unsigned int count = MLX5_TX_COMP_MAX_CQE; volatile struct mlx5_cqe *last_cqe = NULL; + bool ring_doorbell = false; int ret; static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); @@ -2091,7 +2073,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { if (likely(ret != MLX5_CQE_STATUS_ERR)) { /* No new CQEs in completion queue. 
*/ - assert(ret == MLX5_CQE_STATUS_HW_OWN); + MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN); break; } /* @@ -2103,31 +2085,49 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, rte_wmb(); ret = mlx5_tx_error_cqe_handle (txq, (volatile struct mlx5_err_cqe *)cqe); + if (unlikely(ret < 0)) { + /* + * Some error occurred on queue error + * handling, we do not advance the index + * here, allowing to retry on next call. + */ + return; + } /* - * Flush buffers, update consuming index - * if recovery succeeded. Otherwise - * just try to recover later. + * We are going to fetch all entries with + * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. + * The send queue is supposed to be empty. */ + ring_doorbell = true; + ++txq->cq_ci; + txq->cq_pi = txq->cq_ci; last_cqe = NULL; - break; + continue; } /* Normal transmit completion. */ + MLX5_ASSERT(txq->cq_ci != txq->cq_pi); + MLX5_ASSERT((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) == + cqe->wqe_counter); + ring_doorbell = true; ++txq->cq_ci; last_cqe = cqe; -#ifndef NDEBUG - if (txq->cq_pi) - --txq->cq_pi; -#endif - /* - * We have to restrict the amount of processed CQEs - * in one tx_burst routine call. The CQ may be large - * and many CQEs may be updated by the NIC in one - * transaction. Buffers freeing is time consuming, - * multiple iterations may introduce significant - * latency. - */ - } while (--count); - mlx5_tx_comp_flush(txq, last_cqe, ret, olx); + /* + * We have to restrict the amount of processed CQEs + * in one tx_burst routine call. The CQ may be large + * and many CQEs may be updated by the NIC in one + * transaction. Buffers freeing is time consuming, + * multiple iterations may introduce significant + * latency. + */ + if (likely(--count == 0)) + break; + } while (true); + if (likely(ring_doorbell)) { + /* Ring doorbell to notify hardware. */ + rte_compiler_barrier(); + *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); + mlx5_tx_comp_flush(txq, last_cqe, olx); + } } /** @@ -2139,23 +2139,19 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, * Pointer to TX queue structure. * @param loc * Pointer to burst routine local context. - * @param multi, - * Routine is called from multi-segment sending loop, - * do not correct the elts_head according to the pkts_copy. * @param olx * Configured Tx offloads mask. It is fully defined at * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, - bool multi, +mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, unsigned int olx) { uint16_t head = txq->elts_head; unsigned int part; - part = (MLX5_TXOFF_CONFIG(INLINE) || multi) ? + part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc->pkts_sent - loc->pkts_copy; head += part; if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || @@ -2163,21 +2159,22 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { volatile struct mlx5_wqe *last = loc->wqe_last; + MLX5_ASSERT(last); txq->elts_comp = head; if (MLX5_TXOFF_CONFIG(INLINE)) txq->wqe_comp = txq->wqe_ci; /* Request unconditional completion on last WQE. */ last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET); - /* Save elts_head in unused "immediate" field of WQE. */ - last->cseg.misc = head; - /* - * A CQE slot must always be available. 
Count the - * issued CEQ "always" request instead of production - * index due to here can be CQE with errors and - * difference with ci may become inconsistent. - */ - assert(txq->cqe_s > ++txq->cq_pi); + /* Save elts_head in dedicated free on completion queue. */ +#ifdef RTE_LIBRTE_MLX5_DEBUG + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | + (last->cseg.opcode >> 8) << 16; +#else + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; +#endif + /* A CQE slot must always be available. */ + MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); } } @@ -2195,7 +2192,7 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) { - struct mlx5_txq_data *restrict txq = tx_queue; + struct mlx5_txq_data *__rte_restrict txq = tx_queue; uint16_t used; mlx5_tx_handle_completion(txq, 0); @@ -2226,15 +2223,18 @@ mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc __rte_unused, - struct mlx5_wqe *restrict wqe, +mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc __rte_unused, + struct mlx5_wqe *__rte_restrict wqe, unsigned int ds, unsigned int opcode, unsigned int olx __rte_unused) { - struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; + struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg; + /* For legacy MPW replace the EMPW by TSO with modifier. */ + if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) + opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << @@ -2242,6 +2242,37 @@ mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, cs->misc = RTE_BE32(0); } +/** + * Build the Synchronize Queue Segment with specified completion index. + * + * @param txq + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. + * @param wqe + * Pointer to WQE to fill with built Control Segment. + * @param wci + * Completion index in Clock Queue to wait. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc __rte_unused, + struct mlx5_wqe *restrict wqe, + unsigned int wci, + unsigned int olx __rte_unused) +{ + struct mlx5_wqe_qseg *qs; + + qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE); + qs->max_index = rte_cpu_to_be_32(wci); + qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq->id); + qs->reserved0 = RTE_BE32(0); + qs->reserved1 = RTE_BE32(0); +} + /** * Build the Ethernet Segment without inlined data. * Supports Software Parser, Checksums and VLAN @@ -2258,12 +2289,12 @@ mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, * compile time and may be used for optimization. 
*/ static __rte_always_inline void -mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe *restrict wqe, +mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe *__rte_restrict wqe, unsigned int olx) { - struct mlx5_wqe_eseg *restrict es = &wqe->eseg; + struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg; uint32_t csum; /* @@ -2288,7 +2319,7 @@ mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, * We should get here only if device support * this feature correctly. */ - assert(txq->vlan_en); + MLX5_ASSERT(txq->vlan_en); es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT | loc->mbuf->vlan_tci); } else { @@ -2316,13 +2347,13 @@ mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe *restrict wqe, +mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe *__rte_restrict wqe, unsigned int vlan, unsigned int olx) { - struct mlx5_wqe_eseg *restrict es = &wqe->eseg; + struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg; uint32_t csum; uint8_t *psrc, *pdst; @@ -2366,7 +2397,7 @@ mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, loc->mbuf->vlan_tci); pdst += sizeof(struct rte_vlan_hdr); /* Copy the rest two bytes from packet data. */ - assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); + MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; } else { /* Fill the gap in the title WQEBB with inline data. */ @@ -2400,15 +2431,15 @@ mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, * Pointer to the next Data Segment (aligned and wrapped around). */ static __rte_always_inline struct mlx5_wqe_dseg * -mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe *restrict wqe, +mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe *__rte_restrict wqe, unsigned int vlan, unsigned int inlen, unsigned int tso, unsigned int olx) { - struct mlx5_wqe_eseg *restrict es = &wqe->eseg; + struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg; uint32_t csum; uint8_t *psrc, *pdst; unsigned int part; @@ -2459,7 +2490,7 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, loc->mbuf->vlan_tci); pdst += sizeof(struct rte_vlan_hdr); /* Copy the rest two bytes from packet data. 
*/ - assert(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); + MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t))); *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc; psrc += sizeof(uint16_t); } else { @@ -2468,11 +2499,11 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, psrc += sizeof(rte_v128u32_t); } pdst = (uint8_t *)(es + 2); - assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); - assert(pdst < (uint8_t *)txq->wqes_end); + MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); inlen -= MLX5_ESEG_MIN_INLINE_SIZE; if (!inlen) { - assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); + MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); return (struct mlx5_wqe_dseg *)pdst; } /* @@ -2513,21 +2544,30 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, * Pointer to burst routine local context. * @param len * Length of data to be copied. + * @param must + * Length of data to be copied, ignoring the no-inline hint. * @param olx * Configured Tx offloads mask. It is fully defined at * compile time and may be used for optimization. + * + * @return + * Number of actually copied data bytes. This is always greater than or + * equal to the must parameter and may be less than len if the no-inline + * hint flag is encountered. */ -static __rte_always_inline void +static __rte_always_inline unsigned int mlx5_tx_mseg_memcpy(uint8_t *pdst, - struct mlx5_txq_local *restrict loc, + struct mlx5_txq_local *__rte_restrict loc, unsigned int len, + unsigned int must, unsigned int olx __rte_unused) { struct rte_mbuf *mbuf; - unsigned int part, dlen; + unsigned int part, dlen, copy = 0; uint8_t *psrc; - assert(len); + MLX5_ASSERT(len); + MLX5_ASSERT(must <= len); do { /* Allow zero length packets, must check first. */ dlen = rte_pktmbuf_data_len(loc->mbuf); @@ -2537,9 +2577,28 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst, loc->mbuf = mbuf->next; rte_pktmbuf_free_seg(mbuf); loc->mbuf_off = 0; - assert(loc->mbuf_nseg > 1); - assert(loc->mbuf); + MLX5_ASSERT(loc->mbuf_nseg > 1); + MLX5_ASSERT(loc->mbuf); --loc->mbuf_nseg; + if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) { + unsigned int diff; + + if (copy >= must) { + /* + * We already copied the minimal + * requested amount of data. + */ + return copy; + } + diff = must - copy; + if (diff <= rte_pktmbuf_data_len(loc->mbuf)) { + /* + * Copy only the minimal required + * part of the data buffer. + */ + len = diff; + } + } continue; } dlen -= loc->mbuf_off; @@ -2547,6 +2606,7 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst, loc->mbuf_off); part = RTE_MIN(len, dlen); rte_memcpy(pdst, psrc, part); + copy += part; loc->mbuf_off += part; len -= part; if (!len) { @@ -2557,10 +2617,10 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst, loc->mbuf = mbuf->next; rte_pktmbuf_free_seg(mbuf); loc->mbuf_off = 0; - assert(loc->mbuf_nseg >= 1); + MLX5_ASSERT(loc->mbuf_nseg >= 1); --loc->mbuf_nseg; } - return; + return copy; } pdst += part; } while (true); } @@ -2594,18 +2654,18 @@ mlx5_tx_mseg_memcpy(uint8_t *pdst, * wrapping check on its own). 
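The PKT_TX_DYNF_NOINLINE mask consumed above is a dynamic mbuf flag. A sketch of how such a flag is obtained through the rte_mbuf_dyn API, with a hypothetical flag name (the real mlx5 flag is registered by the PMD itself):

#include <rte_mbuf_dyn.h>

static uint64_t
get_noinline_mask(void)
{
	static const struct rte_mbuf_dynflag desc = {
		.name = "example_tx_noinline", /* hypothetical flag name */
	};
	int bit = rte_mbuf_dynflag_register(&desc);

	return bit < 0 ? 0 : (1ULL << bit); /* mask to OR into mbuf ol_flags */
}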
*/ static __rte_always_inline struct mlx5_wqe_dseg * -mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe *restrict wqe, +mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe *__rte_restrict wqe, unsigned int vlan, unsigned int inlen, unsigned int tso, unsigned int olx) { - struct mlx5_wqe_eseg *restrict es = &wqe->eseg; + struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg; uint32_t csum; uint8_t *pdst; - unsigned int part; + unsigned int part, tlen = 0; /* * Calculate and set check sum flags first, uint32_t field @@ -2637,37 +2697,46 @@ mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, sizeof(struct rte_vlan_hdr) + 2 * RTE_ETHER_ADDR_LEN), "invalid Ethernet Segment data size"); - assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); - es->inline_hdr_sz = rte_cpu_to_be_16(inlen); + MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); pdst = (uint8_t *)&es->inline_data; if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { /* Implement VLAN tag insertion as part inline data. */ - mlx5_tx_mseg_memcpy(pdst, loc, 2 * RTE_ETHER_ADDR_LEN, olx); + mlx5_tx_mseg_memcpy(pdst, loc, + 2 * RTE_ETHER_ADDR_LEN, + 2 * RTE_ETHER_ADDR_LEN, olx); pdst += 2 * RTE_ETHER_ADDR_LEN; *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32 ((RTE_ETHER_TYPE_VLAN << 16) | loc->mbuf->vlan_tci); pdst += sizeof(struct rte_vlan_hdr); - inlen -= 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); + tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr); } - assert(pdst < (uint8_t *)txq->wqes_end); + MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end); /* * The WQEBB space availability is checked by caller. * Here we should be aware of WQE ring buffer wraparound only. */ part = (uint8_t *)txq->wqes_end - pdst; - part = RTE_MIN(part, inlen); - assert(part); + part = RTE_MIN(part, inlen - tlen); + MLX5_ASSERT(part); do { - mlx5_tx_mseg_memcpy(pdst, loc, part, olx); - inlen -= part; - if (likely(!inlen)) { - pdst += part; + unsigned int copy; + + /* + * Copying may be interrupted inside the routine + * if it runs into the no-inline hint flag. + */ + copy = tlen >= txq->inlen_mode ? 0 : (txq->inlen_mode - tlen); + copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx); + tlen += copy; + if (likely(inlen <= tlen) || copy < part) { + es->inline_hdr_sz = rte_cpu_to_be_16(tlen); + pdst += copy; pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); return (struct mlx5_wqe_dseg *)pdst; } pdst = (uint8_t *)txq->wqes; - part = inlen; + part = inlen - tlen; } while (true); } /** @@ -2689,15 +2758,15 @@ mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, * compile time and may be used for optimization. */ static __rte_always_inline void -mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe_dseg *restrict dseg, +mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe_dseg *__rte_restrict dseg, uint8_t *buf, unsigned int len, unsigned int olx __rte_unused) { - assert(len); + MLX5_ASSERT(len); dseg->bcount = rte_cpu_to_be_32(len); dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf); @@ -2723,9 +2792,9 @@ mlx5_tx_dseg_ptr(struct mlx5_txq_data *restrict txq, * compile time and may be used for optimization. 
*/ static __rte_always_inline void -mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe_dseg *restrict dseg, +mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe_dseg *__rte_restrict dseg, uint8_t *buf, unsigned int len, unsigned int olx __rte_unused) @@ -2733,7 +2802,7 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, { uintptr_t dst, src; - assert(len); + MLX5_ASSERT(len); if (len > MLX5_DSEG_MIN_INLINE_SIZE) { dseg->bcount = rte_cpu_to_be_32(len); dseg->lkey = mlx5_tx_mb2mr(txq, loc->mbuf); @@ -2747,7 +2816,7 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, src = (uintptr_t)buf; if (len & 0x08) { #ifdef RTE_ARCH_STRICT_ALIGN - assert(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); + MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); *(uint32_t *)dst = *(unaligned_uint32_t *)src; dst += sizeof(uint32_t); src += sizeof(uint32_t); @@ -2799,9 +2868,9 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, * last packet in the eMPW session. */ static __rte_always_inline struct mlx5_wqe_dseg * -mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc __rte_unused, - struct mlx5_wqe_dseg *restrict dseg, +mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc __rte_unused, + struct mlx5_wqe_dseg *__rte_restrict dseg, uint8_t *buf, unsigned int len, unsigned int olx __rte_unused) @@ -2809,8 +2878,14 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, unsigned int part; uint8_t *pdst; - dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); - pdst = &dseg->inline_data[0]; + if (!MLX5_TXOFF_CONFIG(MPW)) { + /* Store the descriptor byte counter for eMPW sessions. */ + dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); + pdst = &dseg->inline_data[0]; + } else { + /* The entire legacy MPW session counter is stored on close. */ + pdst = (uint8_t *)dseg; + } /* * The WQEBB space availability is checked by caller. * Here we should be aware of WQE ring buffer wraparound only. @@ -2822,7 +2897,8 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, len -= part; if (likely(!len)) { pdst += part; - pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + if (!MLX5_TXOFF_CONFIG(MPW)) + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); /* Note: no final wraparound check here. */ return (struct mlx5_wqe_dseg *)pdst; } @@ -2855,9 +2931,9 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, * Ring buffer wraparound check is needed. 
*/ static __rte_always_inline struct mlx5_wqe_dseg * -mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc __rte_unused, - struct mlx5_wqe_dseg *restrict dseg, +mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc __rte_unused, + struct mlx5_wqe_dseg *__rte_restrict dseg, uint8_t *buf, unsigned int len, unsigned int olx __rte_unused) @@ -2866,19 +2942,26 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, unsigned int part; uint8_t *pdst; - assert(len > MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE); static_assert(MLX5_DSEG_MIN_INLINE_SIZE == (2 * RTE_ETHER_ADDR_LEN), "invalid Data Segment data size"); - dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | - MLX5_ETH_WQE_DATA_INLINE); - pdst = &dseg->inline_data[0]; + if (!MLX5_TXOFF_CONFIG(MPW)) { + /* Store the descriptor byte counter for eMPW sessions. */ + dseg->bcount = rte_cpu_to_be_32 + ((len + sizeof(struct rte_vlan_hdr)) | + MLX5_ETH_WQE_DATA_INLINE); + pdst = &dseg->inline_data[0]; + } else { + /* The entire legacy MPW session counter is stored on close. */ + pdst = (uint8_t *)dseg; + } memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); buf += MLX5_DSEG_MIN_INLINE_SIZE; pdst += MLX5_DSEG_MIN_INLINE_SIZE; len -= MLX5_DSEG_MIN_INLINE_SIZE; /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */ - assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); + MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) pdst = (uint8_t *)txq->wqes; *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | @@ -2895,7 +2978,8 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, len -= part; if (likely(!len)) { pdst += part; - pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + if (!MLX5_TXOFF_CONFIG(MPW)) + pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); /* Note: no final wraparound check here. */ return (struct mlx5_wqe_dseg *)pdst; } @@ -2935,18 +3019,18 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, * Actual size of built WQE in segments. */ static __rte_always_inline unsigned int -mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, - struct mlx5_wqe *restrict wqe, +mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, + struct mlx5_wqe *__rte_restrict wqe, unsigned int vlan, unsigned int inlen, unsigned int tso, unsigned int olx __rte_unused) { - struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe_dseg *__rte_restrict dseg; unsigned int ds; - assert((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); + MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen); loc->mbuf_nseg = NB_SEGS(loc->mbuf); loc->mbuf_off = 0; @@ -2967,8 +3051,8 @@ mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, * Non-zero offset means there are some data * remained in the packet. */ - assert(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); - assert(rte_pktmbuf_data_len(loc->mbuf)); + MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf)); + MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf)); dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *, loc->mbuf_off); dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off; @@ -2980,7 +3064,7 @@ mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq, dseg = (struct mlx5_wqe_dseg *)txq->wqes; mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx); /* Store the mbuf to be freed on completion. 
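		 * The mbuf is kept in txq->elts and is actually released
		 * later, when the completion covering this WQE is processed.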
 */
-	assert(loc->elts_free);
+	MLX5_ASSERT(loc->elts_free);
 	txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
 	--loc->elts_free;
 	++dseg;
@@ -3006,7 +3090,7 @@ mlx5_tx_mseg_build(struct mlx5_txq_data *restrict txq,
 				(txq, loc, dseg,
 				 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
 				 rte_pktmbuf_data_len(loc->mbuf), olx);
-		assert(loc->elts_free);
+		MLX5_ASSERT(loc->elts_free);
 		txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
 		--loc->elts_free;
 		++dseg;
@@ -3026,6 +3110,59 @@ dseg_done:
 	return ds;
 }

+/**
+ * The routine checks the timestamp flag in the current packet
+ * and pushes a WAIT WQE into the queue if scheduling is required.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
+ *   MLX5_TXCMP_CODE_MULTI - the WAIT WQE was inserted, continue processing.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
+		      struct mlx5_txq_local *restrict loc,
+		      unsigned int olx)
+{
+	if (MLX5_TXOFF_CONFIG(TXPP) &&
+	    loc->mbuf->ol_flags & txq->ts_mask) {
+		struct mlx5_wqe *wqe;
+		uint64_t ts;
+		int32_t wci;
+
+		/*
+		 * Estimate the required space quickly and roughly.
+		 * We would like to ensure the packet can be pushed
+		 * to the queue and we won't get the orphan WAIT WQE.
+		 */
+		if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
+		    loc->elts_free < NB_SEGS(loc->mbuf))
+			return MLX5_TXCMP_CODE_EXIT;
+		/* Convert the timestamp into completion to wait. */
+		ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
+		wci = mlx5_txpp_convert_tx_ts(txq->sh, ts);
+		if (unlikely(wci < 0))
+			return MLX5_TXCMP_CODE_SINGLE;
+		/* Build the WAIT WQE with specified completion. */
+		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		mlx5_tx_cseg_init(txq, loc, wqe, 2, MLX5_OPCODE_WAIT, olx);
+		mlx5_tx_wseg_init(txq, loc, wqe, wci, olx);
+		++txq->wqe_ci;
+		--loc->wqe_free;
+		return MLX5_TXCMP_CODE_MULTI;
+	}
+	return MLX5_TXCMP_CODE_SINGLE;
+}
+
 /**
 * Tx one packet function for multi-segment TSO. Supports all
 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
@@ -3048,13 +3185,23 @@ dseg_done:
 * Local context variables partially updated.
 */
static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq,
-			 struct mlx5_txq_local *restrict loc,
+mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
+			 struct mlx5_txq_local *__rte_restrict loc,
 			 unsigned int olx)
 {
-	struct mlx5_wqe *restrict wqe;
+	struct mlx5_wqe *__rte_restrict wqe;
 	unsigned int ds, dlen, inlen, ntcp, vlan = 0;

+	if (MLX5_TXOFF_CONFIG(TXPP)) {
+		enum mlx5_txcmp_code wret;
+
+		/* Generate WAIT for scheduling if requested. */
+		wret = mlx5_tx_schedule_send(txq, loc, olx);
+		if (wret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (wret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+	}
 	/*
 	 * Calculate data length to be inlined to estimate
 	 * the required space in WQE ring buffer.
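The WAIT WQE above is only generated for packets carrying the dynamic Tx
timestamp. A minimal application-side sketch of how such a packet could be
prepared is given below; it relies only on the rte_mbuf_dyn.h lookups that
mlx5_select_tx_function() performs as well, and assumes the dynamic field
and flag were registered beforehand (e.g. when send scheduling was enabled
on the device):

#include <rte_mbuf.h>
#include <rte_mbuf_dyn.h>

static int tx_ts_offset;    /* dynamic timestamp field offset */
static uint64_t tx_ts_flag; /* dynamic "schedule on timestamp" flag */

static int
tx_pp_init(void)
{
	int off, bit;

	off = rte_mbuf_dynfield_lookup(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL);
	bit = rte_mbuf_dynflag_lookup(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
	if (off < 0 || bit < 0)
		return -1; /* not registered - scheduling is unavailable */
	tx_ts_offset = off;
	tx_ts_flag = 1ULL << bit;
	return 0;
}

static void
tx_pp_stamp(struct rte_mbuf *m, uint64_t ts)
{
	/* Ask the PMD not to send 'm' earlier than 'ts' (device clock). */
	*RTE_MBUF_DYNFIELD(m, tx_ts_offset, uint64_t *) = ts;
	m->ol_flags |= tx_ts_flag;
}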
@@ -3073,7 +3220,7 @@ mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, inlen <= MLX5_ESEG_MIN_INLINE_SIZE || inlen > (dlen + vlan))) return MLX5_TXCMP_CODE_ERROR; - assert(inlen >= txq->inlen_mode); + MLX5_ASSERT(inlen >= txq->inlen_mode); /* * Check whether there are enough free WQEBBs: * - Control Segment @@ -3111,8 +3258,6 @@ mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); txq->wqe_ci += (ds + 3) / 4; loc->wqe_free -= (ds + 3) / 4; - /* Request CQE generation if limits are reached. */ - mlx5_tx_request_completion(txq, loc, true, olx); return MLX5_TXCMP_CODE_MULTI; } @@ -3139,15 +3284,25 @@ mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, * Local context variables partially updated. */ static __rte_always_inline enum mlx5_txcmp_code -mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, +mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, unsigned int olx) { - struct mlx5_wqe_dseg *restrict dseg; - struct mlx5_wqe *restrict wqe; + struct mlx5_wqe_dseg *__rte_restrict dseg; + struct mlx5_wqe *__rte_restrict wqe; unsigned int ds, nseg; - assert(NB_SEGS(loc->mbuf) > 1); + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + if (MLX5_TXOFF_CONFIG(TXPP)) { + enum mlx5_txcmp_code wret; + + /* Generate WAIT for scheduling if requested. */ + wret = mlx5_tx_schedule_send(txq, loc, olx); + if (wret == MLX5_TXCMP_CODE_EXIT) + return MLX5_TXCMP_CODE_EXIT; + if (wret == MLX5_TXCMP_CODE_ERROR) + return MLX5_TXCMP_CODE_ERROR; + } /* * No inline at all, it means the CPU cycles saving * is prioritized at configuration, we should not @@ -3221,8 +3376,6 @@ mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, } while (true); txq->wqe_ci += (ds + 3) / 4; loc->wqe_free -= (ds + 3) / 4; - /* Request CQE generation if limits are reached. */ - mlx5_tx_request_completion(txq, loc, true, olx); return MLX5_TXCMP_CODE_MULTI; } @@ -3249,15 +3402,25 @@ mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, * Local context variables partially updated. */ static __rte_always_inline enum mlx5_txcmp_code -mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, - struct mlx5_txq_local *restrict loc, +mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq, + struct mlx5_txq_local *__rte_restrict loc, unsigned int olx) { - struct mlx5_wqe *restrict wqe; + struct mlx5_wqe *__rte_restrict wqe; unsigned int ds, inlen, dlen, vlan = 0; - assert(MLX5_TXOFF_CONFIG(INLINE)); - assert(NB_SEGS(loc->mbuf) > 1); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1); + if (MLX5_TXOFF_CONFIG(TXPP)) { + enum mlx5_txcmp_code wret; + + /* Generate WAIT for scheduling if requested. */ + wret = mlx5_tx_schedule_send(txq, loc, olx); + if (wret == MLX5_TXCMP_CODE_EXIT) + return MLX5_TXCMP_CODE_EXIT; + if (wret == MLX5_TXCMP_CODE_ERROR) + return MLX5_TXCMP_CODE_ERROR; + } /* * First calculate data length to be inlined * to estimate the required space for WQE. @@ -3269,8 +3432,9 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, /* Check against minimal length. 
 */
 	if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
 		return MLX5_TXCMP_CODE_ERROR;
-	assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
-	if (inlen > txq->inlen_send) {
+	MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
+	if (inlen > txq->inlen_send ||
+	    loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) {
 		struct rte_mbuf *mbuf;
 		unsigned int nxlen;
 		uintptr_t start;
@@ -3281,11 +3445,13 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
 		 * inlining is required.
 		 */
 		if (txq->inlen_mode) {
-			assert(txq->inlen_mode >= MLX5_ESEG_MIN_INLINE_SIZE);
-			assert(txq->inlen_mode <= txq->inlen_send);
+			MLX5_ASSERT(txq->inlen_mode >=
+				    MLX5_ESEG_MIN_INLINE_SIZE);
+			MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
 			inlen = txq->inlen_mode;
 		} else {
-			if (!vlan || txq->vlan_en) {
+			if (loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE ||
+			    !vlan || txq->vlan_en) {
 				/*
 				 * VLAN insertion will be done inside by HW.
 				 * It is not utmost effective - VLAN flag is
@@ -3314,7 +3480,7 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
 			do {
 				smlen = nxlen;
 				mbuf = NEXT(mbuf);
-				assert(mbuf);
+				MLX5_ASSERT(mbuf);
 				nxlen = rte_pktmbuf_data_len(mbuf);
 				nxlen += smlen;
 			} while (unlikely(nxlen < inlen));
@@ -3330,7 +3496,7 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq,
 				inlen = nxlen;
 				mbuf = NEXT(mbuf);
 				/* The packet must not end here. */
-				assert(mbuf);
+				MLX5_ASSERT(mbuf);
 				nxlen = inlen + rte_pktmbuf_data_len(mbuf);
 			} while (unlikely(nxlen < txq->inlen_send));
 		}
@@ -3358,7 +3524,7 @@ do_align:
 	 * Estimate the number of Data Segments conservatively,
 	 * assuming no mbufs are freed during inlining.
 	 */
-	assert(inlen <= txq->inlen_send);
+	MLX5_ASSERT(inlen <= txq->inlen_send);
 	ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
 				       MLX5_ESEG_MIN_INLINE_SIZE +
 				       MLX5_WSEG_SIZE +
@@ -3379,8 +3545,6 @@ do_align:
 	wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
 	txq->wqe_ci += (ds + 3) / 4;
 	loc->wqe_free -= (ds + 3) / 4;
-	/* Request CQE generation if limits are reached. */
-	mlx5_tx_request_completion(txq, loc, true, olx);
 	return MLX5_TXCMP_CODE_MULTI;
 }

@@ -3413,20 +3577,20 @@ do_align:
 * Local context variables updated.
 */
static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
-		   struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
+		   struct rte_mbuf **__rte_restrict pkts,
 		   unsigned int pkts_n,
-		   struct mlx5_txq_local *restrict loc,
+		   struct mlx5_txq_local *__rte_restrict loc,
 		   unsigned int olx)
{
-	assert(loc->elts_free && loc->wqe_free);
-	assert(pkts_n > loc->pkts_sent);
+	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+	MLX5_ASSERT(pkts_n > loc->pkts_sent);
 	pkts += loc->pkts_sent + 1;
 	pkts_n -= loc->pkts_sent;
 	for (;;) {
 		enum mlx5_txcmp_code ret;

-		assert(NB_SEGS(loc->mbuf) > 1);
+		MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
 		/*
 		 * Estimate the number of free elts quickly but
 		 * conservatively. Some segment may be fully inlined
@@ -3466,7 +3630,7 @@ mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
 				return MLX5_TXCMP_CODE_TSO;
 			return MLX5_TXCMP_CODE_SINGLE;
 		}
-	assert(false);
+	MLX5_ASSERT(false);
}

/**
@@ -3502,23 +3666,33 @@ mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq,
 * Local context variables updated.
 */
static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
-		  struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
+		  struct rte_mbuf **__rte_restrict pkts,
 		  unsigned int pkts_n,
-		  struct mlx5_txq_local *restrict loc,
+		  struct mlx5_txq_local *__rte_restrict loc,
 		  unsigned int olx)
{
-	assert(loc->elts_free && loc->wqe_free);
-	assert(pkts_n > loc->pkts_sent);
+	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+	MLX5_ASSERT(pkts_n > loc->pkts_sent);
 	pkts += loc->pkts_sent + 1;
 	pkts_n -= loc->pkts_sent;
 	for (;;) {
-		struct mlx5_wqe_dseg *restrict dseg;
-		struct mlx5_wqe *restrict wqe;
+		struct mlx5_wqe_dseg *__rte_restrict dseg;
+		struct mlx5_wqe *__rte_restrict wqe;
 		unsigned int ds, dlen, hlen, ntcp, vlan = 0;
 		uint8_t *dptr;

-		assert(NB_SEGS(loc->mbuf) == 1);
+		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(TXPP)) {
+			enum mlx5_txcmp_code wret;
+
+			/* Generate WAIT for scheduling if requested. */
+			wret = mlx5_tx_schedule_send(txq, loc, olx);
+			if (wret == MLX5_TXCMP_CODE_EXIT)
+				return MLX5_TXCMP_CODE_EXIT;
+			if (wret == MLX5_TXCMP_CODE_ERROR)
+				return MLX5_TXCMP_CODE_ERROR;
+		}
 		dlen = rte_pktmbuf_data_len(loc->mbuf);
 		if (MLX5_TXOFF_CONFIG(VLAN) &&
 		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
@@ -3590,8 +3764,6 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
 		--loc->elts_free;
 		++loc->pkts_sent;
 		--pkts_n;
-		/* Request CQE generation if limits are reached. */
-		mlx5_tx_request_completion(txq, loc, false, olx);
 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
 			return MLX5_TXCMP_CODE_EXIT;
 		loc->mbuf = *pkts++;
@@ -3604,7 +3776,7 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
 			return MLX5_TXCMP_CODE_SINGLE;
 		/* Continue with the next TSO packet. */
 	}
-	assert(false);
+	MLX5_ASSERT(false);
}

/**
@@ -3628,8 +3800,8 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq,
 *   MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
 */
static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
-		     struct mlx5_txq_local *restrict loc,
+mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
+		     struct mlx5_txq_local *__rte_restrict loc,
 		     unsigned int olx,
 		     bool newp)
{
@@ -3664,6 +3836,7 @@ mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,

/**
 * Check the next packet attributes to match with the eMPW batch ones.
+ * In addition, for legacy MPW the packet length is checked as well.
 *
 * @param txq
 *   Pointer to TX queue structure.
@@ -3671,6 +3844,8 @@ mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
 *   Pointer to Ethernet Segment of eMPW batch.
 * @param loc
 *   Pointer to burst routine local context.
+ * @param dlen
+ *   Length of previous packet in MPW descriptor.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
@@ -3680,9 +3855,10 @@ mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq,
 *   false - no match, eMPW should be restarted.
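 *
 * For illustration: two equally sized frames with identical checksum
 * flags, zero metadata and no scheduling request can share one eMPW
 * descriptor, while under legacy MPW a frame of a different length
 * always closes the current session.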
 */
static __rte_always_inline bool
-mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
-		   struct mlx5_wqe_eseg *restrict es,
-		   struct mlx5_txq_local *restrict loc,
+mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
+		   struct mlx5_wqe_eseg *__rte_restrict es,
+		   struct mlx5_txq_local *__rte_restrict loc,
+		   uint32_t dlen,
 		   unsigned int olx)
{
 	uint8_t swp_flags = 0;
@@ -3701,9 +3877,17 @@ mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
 	    es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ?
 			     *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0))
 		return false;
+	/* Legacy MPW can only send packets with the same length. */
+	if (MLX5_TXOFF_CONFIG(MPW) &&
+	    dlen != rte_pktmbuf_data_len(loc->mbuf))
+		return false;
 	/* There must be no VLAN packets in eMPW loop. */
 	if (MLX5_TXOFF_CONFIG(VLAN))
-		assert(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+		MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	/* Check if the scheduling is requested. */
+	if (MLX5_TXOFF_CONFIG(TXPP) &&
+	    loc->mbuf->ol_flags & txq->ts_mask)
+		return false;
 	return true;
}

@@ -3729,13 +3913,13 @@ mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
 *   false - no match, eMPW should be restarted.
 */
static __rte_always_inline void
-mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
-		   struct mlx5_txq_local *restrict loc,
+mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
+		   struct mlx5_txq_local *__rte_restrict loc,
 		   unsigned int ds,
 		   unsigned int slen,
-		   unsigned int olx)
+		   unsigned int olx __rte_unused)
{
-	assert(!MLX5_TXOFF_CONFIG(INLINE));
+	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
#ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Update sent data bytes counter. */
 	txq->stats.obytes += slen;
@@ -3748,8 +3932,6 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
 	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
 	txq->wqe_ci += (ds + 3) / 4;
 	loc->wqe_free -= (ds + 3) / 4;
-	/* Request CQE generation if limits are reached. */
-	mlx5_tx_request_completion(txq, loc, false, olx);
}

/*
@@ -3765,6 +3947,8 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
 *   Total size of descriptor/data in bytes.
 * @param slen
 *   Accumulated statistics, data bytes sent.
+ * @param wqem
+ *   The base WQE for the eMPW/MPW descriptor.
 * @param olx
 *   Configured Tx offloads mask. It is fully defined at
 *   compile time and may be used for optimization.
@@ -3774,26 +3958,44 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq,
 *   false - no match, eMPW should be restarted.
 */
static __rte_always_inline void
-mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
-		   struct mlx5_txq_local *restrict loc,
+mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
+		   struct mlx5_txq_local *__rte_restrict loc,
 		   unsigned int len,
 		   unsigned int slen,
+		   struct mlx5_wqe *__rte_restrict wqem,
 		   unsigned int olx __rte_unused)
{
-	assert(MLX5_TXOFF_CONFIG(INLINE));
-	assert((len % MLX5_WSEG_SIZE) == 0);
+	struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
+
+	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
#ifdef MLX5_PMD_SOFT_COUNTERS
 	/* Update sent data bytes counter. */
 	txq->stats.obytes += slen;
#else
 	(void)slen;
#endif
-	len = len / MLX5_WSEG_SIZE + 2;
-	loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
+	if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
+		/*
+		 * If the legacy MPW session contains inline packets,
+		 * we should set the single inline data segment length
+		 * and align the total length to the segment size.
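+		 * For illustration, three inlined 60-byte packets give
+		 * len = 4 + 3 * 60 = 184, so bcount becomes
+		 * 180 | MLX5_ETH_WQE_DATA_INLINE and the session closes
+		 * with (184 + 15) / 16 + 2 = 14 data segments.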
+		 */
+		MLX5_ASSERT(len > sizeof(dseg->bcount));
+		dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
+						MLX5_ETH_WQE_DATA_INLINE);
+		len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
+	} else {
+		/*
+		 * The session is not a legacy MPW one, or it contains
+		 * data buffer pointer segments.
+		 */
+		MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
+		len = len / MLX5_WSEG_SIZE + 2;
+	}
+	wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
 	txq->wqe_ci += (len + 3) / 4;
 	loc->wqe_free -= (len + 3) / 4;
-	/* Request CQE generation if limits are reached. */
-	mlx5_tx_request_completion(txq, loc, false, olx);
+	loc->wqe_last = wqem;
}

/**
@@ -3844,10 +4046,10 @@ mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq,
 *   No VLAN insertion is supported.
 */
static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
-			  struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
+			  struct rte_mbuf **__rte_restrict pkts,
 			  unsigned int pkts_n,
-			  struct mlx5_txq_local *restrict loc,
+			  struct mlx5_txq_local *__rte_restrict loc,
 			  unsigned int olx)
{
 	/*
@@ -3855,22 +4057,35 @@ mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq,
 	 * and sends single-segment packet with eMPW opcode
 	 * without data inlining.
 	 */
-	assert(!MLX5_TXOFF_CONFIG(INLINE));
-	assert(MLX5_TXOFF_CONFIG(EMPW));
-	assert(loc->elts_free && loc->wqe_free);
-	assert(pkts_n > loc->pkts_sent);
+	MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
+	MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
+	MLX5_ASSERT(loc->elts_free && loc->wqe_free);
+	MLX5_ASSERT(pkts_n > loc->pkts_sent);
 	static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size");
 	pkts += loc->pkts_sent + 1;
 	pkts_n -= loc->pkts_sent;
 	for (;;) {
-		struct mlx5_wqe_dseg *restrict dseg;
-		struct mlx5_wqe_eseg *restrict eseg;
+		struct mlx5_wqe_dseg *__rte_restrict dseg;
+		struct mlx5_wqe_eseg *__rte_restrict eseg;
 		enum mlx5_txcmp_code ret;
 		unsigned int part, loop;
 		unsigned int slen = 0;

next_empw:
-		part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS);
+		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(TXPP)) {
+			enum mlx5_txcmp_code wret;
+
+			/* Generate WAIT for scheduling if requested. */
+			wret = mlx5_tx_schedule_send(txq, loc, olx);
+			if (wret == MLX5_TXCMP_CODE_EXIT)
+				return MLX5_TXCMP_CODE_EXIT;
+			if (wret == MLX5_TXCMP_CODE_ERROR)
+				return MLX5_TXCMP_CODE_ERROR;
+		}
+		part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
+				       MLX5_MPW_MAX_PACKETS :
+				       MLX5_EMPW_MAX_PACKETS);
 		if (unlikely(loc->elts_free < part)) {
 			/* We do not have enough elts to store all mbufs. */
 			if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
@@ -3900,6 +4115,10 @@ next_empw:
 		eseg = &loc->wqe_last->eseg;
 		dseg = &loc->wqe_last->dseg[0];
 		loop = part;
+		/* Store the packet length for legacy MPW.
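+		 * Every frame in a legacy MPW session must have exactly
+		 * this length, e.g. mss = 60 for a burst of 60-byte
+		 * frames; a different length fails mlx5_tx_match_empw()
+		 * and closes the session.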
*/ + if (MLX5_TXOFF_CONFIG(MPW)) + eseg->mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); for (;;) { uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); #ifdef MLX5_PMD_SOFT_COUNTERS @@ -3929,6 +4148,7 @@ next_empw: return MLX5_TXCMP_CODE_EXIT; return MLX5_TXCMP_CODE_MULTI; } + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); if (ret == MLX5_TXCMP_CODE_TSO) { part -= loop; mlx5_tx_sdone_empw(txq, loc, part, slen, olx); @@ -3946,7 +4166,7 @@ next_empw: return MLX5_TXCMP_CODE_SINGLE; } if (ret != MLX5_TXCMP_CODE_EMPW) { - assert(false); + MLX5_ASSERT(false); part -= loop; mlx5_tx_sdone_empw(txq, loc, part, slen, olx); return MLX5_TXCMP_CODE_ERROR; @@ -3957,9 +4177,11 @@ next_empw: * - check sum settings * - metadata value * - software parser settings + * - packets length (legacy MPW only) + * - scheduling is not required */ - if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) { - assert(loop); + if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { + MLX5_ASSERT(loop); part -= loop; mlx5_tx_sdone_empw(txq, loc, part, slen, olx); if (unlikely(!loc->elts_free || @@ -3974,8 +4196,8 @@ next_empw: dseg = (struct mlx5_wqe_dseg *)txq->wqes; } /* eMPW is built successfully, update loop parameters. */ - assert(!loop); - assert(pkts_n >= part); + MLX5_ASSERT(!loop); + MLX5_ASSERT(pkts_n >= part); #ifdef MLX5_PMD_SOFT_COUNTERS /* Update sent data bytes counter. */ txq->stats.obytes += slen; @@ -3985,8 +4207,6 @@ next_empw: txq->wqe_ci += (2 + part + 3) / 4; loc->wqe_free -= (2 + part + 3) / 4; pkts_n -= part; - /* Request CQE generation if limits are reached. */ - mlx5_tx_request_completion(txq, loc, false, olx); if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) return MLX5_TXCMP_CODE_EXIT; loc->mbuf = *pkts++; @@ -3995,7 +4215,7 @@ next_empw: return ret; /* Continue sending eMPW batches. */ } - assert(false); + MLX5_ASSERT(false); } /** @@ -4003,10 +4223,10 @@ next_empw: * with inlining, optionally supports VLAN insertion. */ static __rte_always_inline enum mlx5_txcmp_code -mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, - struct rte_mbuf **restrict pkts, +mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq, + struct rte_mbuf **__rte_restrict pkts, unsigned int pkts_n, - struct mlx5_txq_local *restrict loc, + struct mlx5_txq_local *__rte_restrict loc, unsigned int olx) { /* @@ -4014,47 +4234,72 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, * and sends single-segment packet with eMPW opcode * with data inlining. */ - assert(MLX5_TXOFF_CONFIG(INLINE)); - assert(MLX5_TXOFF_CONFIG(EMPW)); - assert(loc->elts_free && loc->wqe_free); - assert(pkts_n > loc->pkts_sent); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW)); + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); static_assert(MLX5_EMPW_MIN_PACKETS >= 2, "invalid min size"); pkts += loc->pkts_sent + 1; pkts_n -= loc->pkts_sent; for (;;) { - struct mlx5_wqe_dseg *restrict dseg; - struct mlx5_wqe_eseg *restrict eseg; + struct mlx5_wqe_dseg *__rte_restrict dseg; + struct mlx5_wqe *__rte_restrict wqem; enum mlx5_txcmp_code ret; unsigned int room, part, nlim; unsigned int slen = 0; + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (MLX5_TXOFF_CONFIG(TXPP)) { + enum mlx5_txcmp_code wret; + + /* Generate WAIT for scheduling if requested. 
*/ + wret = mlx5_tx_schedule_send(txq, loc, olx); + if (wret == MLX5_TXCMP_CODE_EXIT) + return MLX5_TXCMP_CODE_EXIT; + if (wret == MLX5_TXCMP_CODE_ERROR) + return MLX5_TXCMP_CODE_ERROR; + } /* * Limits the amount of packets in one WQE * to improve CQE latency generation. */ - nlim = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); + nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_INLINE_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); /* Check whether we have minimal amount WQEs */ if (unlikely(loc->wqe_free < ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) return MLX5_TXCMP_CODE_EXIT; if (likely(pkts_n > 1)) rte_prefetch0(*pkts); - loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m); + wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m); /* * Build eMPW title WQEBB: * - Control Segment, eMPW opcode, zero DS * - Ethernet Segment, no inline */ - mlx5_tx_cseg_init(txq, loc, loc->wqe_last, 0, + mlx5_tx_cseg_init(txq, loc, wqem, 0, MLX5_OPCODE_ENHANCED_MPSW, olx); - mlx5_tx_eseg_none(txq, loc, loc->wqe_last, + mlx5_tx_eseg_none(txq, loc, wqem, olx & ~MLX5_TXOFF_CONFIG_VLAN); - eseg = &loc->wqe_last->eseg; - dseg = &loc->wqe_last->dseg[0]; + dseg = &wqem->dseg[0]; + /* Store the packet length for legacy MPW. */ + if (MLX5_TXOFF_CONFIG(MPW)) + wqem->eseg.mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, loc->wqe_free) * MLX5_WQE_SIZE - MLX5_WQE_CSEG_SIZE - MLX5_WQE_ESEG_SIZE; + /* Limit the room for legacy MPW sessions for performance. */ + if (MLX5_TXOFF_CONFIG(MPW)) + room = RTE_MIN(room, + RTE_MAX(txq->inlen_empw + + sizeof(dseg->bcount) + + (MLX5_TXOFF_CONFIG(VLAN) ? + sizeof(struct rte_vlan_hdr) : 0), + MLX5_MPW_INLINE_MAX_PACKETS * + MLX5_WQE_DSEG_SIZE)); /* Build WQE till we have space, packets and resources. */ part = room; for (;;) { @@ -4062,9 +4307,9 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *); unsigned int tlen; - assert(room >= MLX5_WQE_DSEG_SIZE); - assert((room % MLX5_WQE_DSEG_SIZE) == 0); - assert((uintptr_t)dseg < (uintptr_t)txq->wqes_end); + MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE); + MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0); + MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end); /* * Some Tx offloads may cause an error if * packet is not long enough, check against @@ -4078,14 +4323,36 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, * We have some successfully built * packet Data Segments to send. */ - mlx5_tx_idone_empw(txq, loc, part, slen, olx); + mlx5_tx_idone_empw(txq, loc, part, + slen, wqem, olx); return MLX5_TXCMP_CODE_ERROR; } /* Inline or not inline - that's the Question. */ - if (dlen > txq->inlen_empw) + if (dlen > txq->inlen_empw || + loc->mbuf->ol_flags & PKT_TX_DYNF_NOINLINE) goto pointer_empw; + if (MLX5_TXOFF_CONFIG(MPW)) { + if (dlen > txq->inlen_send) + goto pointer_empw; + tlen = dlen; + if (part == room) { + /* Open new inline MPW session. */ + tlen += sizeof(dseg->bcount); + dseg->bcount = RTE_BE32(0); + dseg = RTE_PTR_ADD + (dseg, sizeof(dseg->bcount)); + } else { + /* + * No pointer and inline descriptor + * intermix for legacy MPW sessions. + */ + if (wqem->dseg[0].bcount) + break; + } + } else { + tlen = sizeof(dseg->bcount) + dlen; + } /* Inline entire packet, optional VLAN insertion. 
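			 * In the eMPW case a 60-byte frame takes, for
			 * instance, tlen = 4 + 60 = 64 bytes (68 with a
			 * software-built VLAN tag), aligned below to a
			 * multiple of MLX5_WSEG_SIZE.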
 */
-			tlen = sizeof(dseg->bcount) + dlen;
 			if (MLX5_TXOFF_CONFIG(VLAN) &&
 			    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
 				/*
@@ -4093,8 +4360,9 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 				 * mlx5_tx_able_to_empw() and packet
 				 * fits into inline length guaranteed.
 				 */
-				assert((dlen + sizeof(struct rte_vlan_hdr)) <=
-					txq->inlen_empw);
+				MLX5_ASSERT((dlen +
+					     sizeof(struct rte_vlan_hdr)) <=
+					    txq->inlen_empw);
 				tlen += sizeof(struct rte_vlan_hdr);
 				if (room < tlen)
 					break;
@@ -4110,8 +4378,9 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 				dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
 							 dptr, dlen, olx);
 			}
-			tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
-			assert(room >= tlen);
+			if (!MLX5_TXOFF_CONFIG(MPW))
+				tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
+			MLX5_ASSERT(room >= tlen);
 			room -= tlen;
 			/*
 			 * Packet data are completely inlined,
@@ -4120,14 +4389,22 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq,
 			rte_pktmbuf_free_seg(loc->mbuf);
 			goto next_mbuf;
pointer_empw:
+			/*
+			 * No pointer and inline descriptor
+			 * intermix for legacy MPW sessions.
+			 */
+			if (MLX5_TXOFF_CONFIG(MPW) &&
+			    part != room &&
+			    wqem->dseg[0].bcount == RTE_BE32(0))
+				break;
 			/*
 			 * Not inlinable VLAN packets are
 			 * processed outside of this routine.
 			 */
-			assert(room >= MLX5_WQE_DSEG_SIZE);
+			MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
 			if (MLX5_TXOFF_CONFIG(VLAN))
-				assert(!(loc->mbuf->ol_flags &
-					 PKT_TX_VLAN_PKT));
+				MLX5_ASSERT(!(loc->mbuf->ol_flags &
+					      PKT_TX_VLAN_PKT));
 			mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
 			/* We have to store mbuf in elts.*/
 			txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
@@ -4148,7 +4425,8 @@ next_mbuf:
 				 * continue build descriptors.
 				 */
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				return MLX5_TXCMP_CODE_EXIT;
 			}
 			loc->mbuf = *pkts++;
@@ -4162,15 +4440,18 @@ next_mbuf:
 			 */
 			if (ret == MLX5_TXCMP_CODE_MULTI) {
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				if (unlikely(!loc->elts_free ||
 					     !loc->wqe_free))
 					return MLX5_TXCMP_CODE_EXIT;
 				return MLX5_TXCMP_CODE_MULTI;
 			}
+			MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
 			if (ret == MLX5_TXCMP_CODE_TSO) {
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				if (unlikely(!loc->elts_free ||
 					     !loc->wqe_free))
 					return MLX5_TXCMP_CODE_EXIT;
@@ -4178,16 +4459,18 @@ next_mbuf:
 			}
 			if (ret == MLX5_TXCMP_CODE_SINGLE) {
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				if (unlikely(!loc->elts_free ||
 					     !loc->wqe_free))
 					return MLX5_TXCMP_CODE_EXIT;
 				return MLX5_TXCMP_CODE_SINGLE;
 			}
 			if (ret != MLX5_TXCMP_CODE_EMPW) {
-				assert(false);
+				MLX5_ASSERT(false);
 				part -= room;
-				mlx5_tx_idone_empw(txq, loc, part, slen, olx);
+				mlx5_tx_idone_empw(txq, loc, part,
+						   slen, wqem, olx);
 				return MLX5_TXCMP_CODE_ERROR;
 			}
 			/* Check if we have minimal room left. */
@@ -4200,8 +4483,11 @@ next_mbuf:
 			 * - check sum settings
 			 * - metadata value
 			 * - software parser settings
+			 * - packets length (legacy MPW only)
+			 * - scheduling is not required
 			 */
-			if (!mlx5_tx_match_empw(txq, eseg, loc, olx))
+			if (!mlx5_tx_match_empw(txq, &wqem->eseg,
+						loc, dlen, olx))
 				break;
 			/* Packet attributes match, continue the same eMPW. */
 			if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
@@ -4211,17 +4497,17 @@ next_mbuf:
 			 * We get here to close an existing eMPW
 			 * session and start the new one.
*/ - assert(pkts_n); + MLX5_ASSERT(pkts_n); part -= room; if (unlikely(!part)) return MLX5_TXCMP_CODE_EXIT; - mlx5_tx_idone_empw(txq, loc, part, slen, olx); + mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx); if (unlikely(!loc->elts_free || !loc->wqe_free)) return MLX5_TXCMP_CODE_EXIT; /* Continue the loop with new eMPW session. */ } - assert(false); + MLX5_ASSERT(false); } /** @@ -4229,25 +4515,35 @@ next_mbuf: * Data inlining and VLAN insertion are supported. */ static __rte_always_inline enum mlx5_txcmp_code -mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, - struct rte_mbuf **restrict pkts, +mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq, + struct rte_mbuf **__rte_restrict pkts, unsigned int pkts_n, - struct mlx5_txq_local *restrict loc, + struct mlx5_txq_local *__rte_restrict loc, unsigned int olx) { /* * Subroutine is the part of mlx5_tx_burst_single() * and sends single-segment packet with SEND opcode. */ - assert(loc->elts_free && loc->wqe_free); - assert(pkts_n > loc->pkts_sent); + MLX5_ASSERT(loc->elts_free && loc->wqe_free); + MLX5_ASSERT(pkts_n > loc->pkts_sent); pkts += loc->pkts_sent + 1; pkts_n -= loc->pkts_sent; for (;;) { - struct mlx5_wqe *restrict wqe; + struct mlx5_wqe *__rte_restrict wqe; enum mlx5_txcmp_code ret; - assert(NB_SEGS(loc->mbuf) == 1); + MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1); + if (MLX5_TXOFF_CONFIG(TXPP)) { + enum mlx5_txcmp_code wret; + + /* Generate WAIT for scheduling if requested. */ + wret = mlx5_tx_schedule_send(txq, loc, olx); + if (wret == MLX5_TXCMP_CODE_EXIT) + return MLX5_TXCMP_CODE_EXIT; + if (wret == MLX5_TXCMP_CODE_ERROR) + return MLX5_TXCMP_CODE_ERROR; + } if (MLX5_TXOFF_CONFIG(INLINE)) { unsigned int inlen, vlan = 0; @@ -4267,7 +4563,8 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * Otherwise we would do extra check for data * size to avoid crashes due to length overflow. */ - assert(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(txq->inlen_send >= + MLX5_ESEG_MIN_INLINE_SIZE); if (inlen <= txq->inlen_send) { unsigned int seg_n, wqe_n; @@ -4276,6 +4573,36 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, /* Check against minimal length. */ if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE) return MLX5_TXCMP_CODE_ERROR; + if (loc->mbuf->ol_flags & + PKT_TX_DYNF_NOINLINE) { + /* + * The hint flag not to inline packet + * data is set. Check whether we can + * follow the hint. + */ + if ((!MLX5_TXOFF_CONFIG(EMPW) && + txq->inlen_mode) || + (MLX5_TXOFF_CONFIG(MPW) && + txq->inlen_mode)) { + if (inlen <= txq->inlen_send) + goto single_inline; + /* + * The hardware requires the + * minimal inline data header. + */ + goto single_min_inline; + } + if (MLX5_TXOFF_CONFIG(VLAN) && + vlan && !txq->vlan_en) { + /* + * We must insert VLAN tag + * by software means. + */ + goto single_part_inline; + } + goto single_no_inline; + } +single_inline: /* * Completely inlined packet data WQE: * - Control Segment, SEND opcode @@ -4304,8 +4631,9 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * free the packet immediately. */ rte_pktmbuf_free_seg(loc->mbuf); - } else if (!MLX5_TXOFF_CONFIG(EMPW) && - txq->inlen_mode) { + } else if ((!MLX5_TXOFF_CONFIG(EMPW) || + MLX5_TXOFF_CONFIG(MPW)) && + txq->inlen_mode) { /* * If minimal inlining is requested the eMPW * feature should be disabled due to data is @@ -4313,7 +4641,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * not contain inlined data for eMPW due to * segment shared for all packets. 
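			 * For instance, with txq->inlen_mode = 18 a
			 * 1500-byte packet gets its first 18 bytes
			 * inlined into the Ethernet Segment while the
			 * remaining 1482 bytes are sent by pointer.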
*/ - struct mlx5_wqe_dseg *restrict dseg; + struct mlx5_wqe_dseg *__rte_restrict dseg; unsigned int ds; uint8_t *dptr; @@ -4324,10 +4652,11 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * We should check the free space in * WQE ring buffer to inline partially. */ - assert(txq->inlen_send >= txq->inlen_mode); - assert(inlen > txq->inlen_mode); - assert(txq->inlen_mode >= - MLX5_ESEG_MIN_INLINE_SIZE); +single_min_inline: + MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode); + MLX5_ASSERT(inlen > txq->inlen_mode); + MLX5_ASSERT(txq->inlen_mode >= + MLX5_ESEG_MIN_INLINE_SIZE); /* * Check whether there are enough free WQEBBs: * - Control Segment @@ -4370,7 +4699,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, txq->wqe_ci += (ds + 3) / 4; loc->wqe_free -= (ds + 3) / 4; /* We have to store mbuf in elts.*/ - assert(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; --loc->elts_free; @@ -4391,6 +4720,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * We also get here if VLAN insertion is not * supported by HW, the inline is enabled. */ +single_part_inline: wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); loc->wqe_last = wqe; mlx5_tx_cseg_init(txq, loc, wqe, 4, @@ -4403,14 +4733,14 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * comparing with txq->inlen_send. We should * not get overflow here. */ - assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE); + MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE); dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE; mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1], dptr, dlen, olx); ++txq->wqe_ci; --loc->wqe_free; /* We have to store mbuf in elts.*/ - assert(MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE)); txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf; --loc->elts_free; @@ -4431,6 +4761,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * - Ethernet Segment, optional VLAN, no inline * - Data Segment, pointer type */ +single_no_inline: wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m); loc->wqe_last = wqe; mlx5_tx_cseg_init(txq, loc, wqe, 3, @@ -4447,7 +4778,7 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * if no inlining is configured, this is done * by calling routine in a batch copy. */ - assert(!MLX5_TXOFF_CONFIG(INLINE)); + MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE)); --loc->elts_free; #ifdef MLX5_PMD_SOFT_COUNTERS /* Update sent data bytes counter. */ @@ -4460,8 +4791,6 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, } ++loc->pkts_sent; --pkts_n; - /* Request CQE generation if limits are reached. 
 */
-		mlx5_tx_request_completion(txq, loc, false, olx);
 		if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
 			return MLX5_TXCMP_CODE_EXIT;
 		loc->mbuf = *pkts++;
@@ -4471,14 +4800,14 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq,
 		if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
 			return ret;
 	}
-	assert(false);
+	MLX5_ASSERT(false);
}

static __rte_always_inline enum mlx5_txcmp_code
-mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
-		     struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
+		     struct rte_mbuf **__rte_restrict pkts,
 		     unsigned int pkts_n,
-		     struct mlx5_txq_local *restrict loc,
+		     struct mlx5_txq_local *__rte_restrict loc,
 		     unsigned int olx)
{
 	enum mlx5_txcmp_code ret;
@@ -4486,7 +4815,7 @@ mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
 	ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
 	if (ret == MLX5_TXCMP_CODE_SINGLE)
 		goto ordinary_send;
-	assert(ret == MLX5_TXCMP_CODE_EMPW);
+	MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
 	for (;;) {
 		/* Optimize for inline/no inline eMPW send. */
 		ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
@@ -4497,14 +4826,14 @@ mlx5_tx_burst_single(struct mlx5_txq_data *restrict txq,
 		if (ret != MLX5_TXCMP_CODE_SINGLE)
 			return ret;
 		/* The resources to send one packet should remain. */
-		assert(loc->elts_free && loc->wqe_free);
+		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
ordinary_send:
 		ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
-		assert(ret != MLX5_TXCMP_CODE_SINGLE);
+		MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
 		if (ret != MLX5_TXCMP_CODE_EMPW)
 			return ret;
 		/* The resources to send one packet should remain. */
-		assert(loc->elts_free && loc->wqe_free);
+		MLX5_ASSERT(loc->elts_free && loc->wqe_free);
 	}
}

@@ -4529,8 +4858,8 @@ ordinary_send:
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static __rte_always_inline uint16_t
-mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
-		   struct rte_mbuf **restrict pkts,
+mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
+		   struct rte_mbuf **__rte_restrict pkts,
 		   uint16_t pkts_n,
 		   unsigned int olx)
{
@@ -4538,8 +4867,8 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq,
 	enum mlx5_txcmp_code ret;
 	unsigned int part;

-	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
-	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
 	if (unlikely(!pkts_n))
 		return 0;
 	loc.pkts_sent = 0;
@@ -4560,15 +4889,15 @@ send_loop:
 	/*
 	 * Calculate the number of available resources - elts and WQEs.
 	 * There are two possible different scenarios:
-	 * - no data inlining into WQEs, one WQEBB may contains upto
+	 * - no data inlining into WQEs, one WQEBB may contain up to
 	 *   four packets, in this case elts become scarce resource
 	 * - data inlining into WQEs, one packet may require multiple
 	 *   WQEBBs, the WQEs become the limiting factor.
 	 */
-	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
 	loc.elts_free = txq->elts_s -
 				(uint16_t)(txq->elts_head - txq->elts_tail);
-	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
 	loc.wqe_free = txq->wqe_s -
 				(uint16_t)(txq->wqe_ci - txq->wqe_pi);
 	if (unlikely(!loc.elts_free || !loc.wqe_free))
@@ -4590,7 +4919,7 @@ send_loop:
 	 * per WQE, do it in dedicated routine.
 */
enter_send_multi:
-	assert(loc.pkts_sent >= loc.pkts_copy);
+	MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
 	part = loc.pkts_sent - loc.pkts_copy;
 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
 		/*
@@ -4604,7 +4933,7 @@ enter_send_multi:
 				  part, olx);
 		loc.pkts_copy = loc.pkts_sent;
 	}
-	assert(pkts_n > loc.pkts_sent);
+	MLX5_ASSERT(pkts_n > loc.pkts_sent);
 	ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
 	if (!MLX5_TXOFF_CONFIG(INLINE))
 		loc.pkts_copy = loc.pkts_sent;
@@ -4646,7 +4975,7 @@ enter_send_multi:
 			goto enter_send_tso;
 		}
 		/* We must not get here. Something is going wrong. */
-		assert(false);
+		MLX5_ASSERT(false);
 		txq->stats.oerrors++;
 		break;
 	}
@@ -4660,8 +4989,8 @@ enter_send_multi:
 	 * in dedicated branch.
 	 */
enter_send_tso:
-	assert(NB_SEGS(loc.mbuf) == 1);
-	assert(pkts_n > loc.pkts_sent);
+	MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
+	MLX5_ASSERT(pkts_n > loc.pkts_sent);
 	ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
 	/*
 	 * These returned code checks are supposed
@@ -4684,7 +5013,7 @@ enter_send_tso:
 		goto enter_send_multi;
 	}
 	/* We must not get here. Something is going wrong. */
-	assert(false);
+	MLX5_ASSERT(false);
 	txq->stats.oerrors++;
 	break;
 	}
@@ -4697,7 +5026,7 @@ enter_send_tso:
 	 * offloads are requested at SQ configuration time).
 	 */
enter_send_single:
-	assert(pkts_n > loc.pkts_sent);
+	MLX5_ASSERT(pkts_n > loc.pkts_sent);
 	ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
 	/*
 	 * These returned code checks are supposed
@@ -4726,7 +5055,7 @@ enter_send_single:
 		goto enter_send_tso;
 	}
 	/* We must not get here. Something is going wrong. */
-	assert(false);
+	MLX5_ASSERT(false);
 	txq->stats.oerrors++;
 	break;
 	}
@@ -4736,17 +5065,45 @@ enter_send_single:
 	 * - doorbell the hardware
 	 * - copy the rest of mbufs to elts (if any)
 	 */
-	assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy);
+	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
+		    loc.pkts_sent >= loc.pkts_copy);
 	/* Take a shortcut if nothing is sent. */
 	if (unlikely(loc.pkts_sent == loc.pkts_loop))
 		goto burst_exit;
+	/* Request CQE generation if limits are reached. */
+	mlx5_tx_request_completion(txq, &loc, olx);
 	/*
 	 * Ring QP doorbell immediately after WQE building completion
 	 * to improve latencies. The pure software related data treatment
 	 * can be completed after doorbell. Tx CQEs for this SQ are
 	 * processed in this thread only by the polling.
+	 *
+	 * The rdma core library can map the doorbell register in two
+	 * ways, depending on the environment variable "MLX5_SHUT_UP_BF":
+	 *
+	 * - as regular cached memory, the variable is either missing or
+	 *   set to zero. This type of mapping may cause significant
+	 *   doorbell register write latency and requires an explicit
+	 *   memory write barrier to mitigate this issue and prevent
+	 *   write combining.
+	 *
+	 * - as non-cached memory, the variable is present and set to a
+	 *   non-zero value. This type of mapping may cause some
+	 *   performance impact under heavy load conditions, but the
+	 *   explicit write memory barrier is not required, which may
+	 *   improve core performance.
+	 *
+	 * - the legacy behaviour (prior to the 19.08 release) was to use
+	 *   some heuristics to decide whether a write memory barrier
+	 *   should be performed. This behavior is selected by specifying
+	 *   tx_db_nc=2; the write barrier is skipped if the application
+	 *   provides the full recommended burst of packets, as it is
+	 *   assumed that more packets are coming and the write barrier
+	 *   will be issued on the next burst (after descriptor writing,
+	 *   at least).
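+	 *
+	 * An illustrative summary of the condition used below (a
+	 * sketch, not normative): with a non-cached mapping (db_nc
+	 * set) the barrier is always skipped; with a cached mapping
+	 * it is always issued, unless the heuristics mode (db_heu
+	 * set) is active and the application provided a full burst,
+	 * i.e. pkts_n % MLX5_TX_DEFAULT_BURST == 0.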
 */
-	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
+	mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc &&
+			(!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
 	/* Not all of the mbufs may be stored into elts yet. */
 	part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
 	if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
@@ -4762,8 +5119,8 @@ enter_send_single:
 		mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
 		loc.pkts_copy = loc.pkts_sent;
 	}
-	assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
-	assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
+	MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
+	MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
 	if (pkts_n > loc.pkts_sent) {
 		/*
 		 * If burst size is large there might be not enough CQEs
@@ -4914,6 +5271,73 @@ MLX5_TXOFF_DECL(iv,
 	     MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
 	     MLX5_TXOFF_CONFIG_METADATA)

+/* Generate routines with timestamp scheduling. */
+MLX5_TXOFF_DECL(full_ts_nompw,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_DECL(full_ts_nompwi,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_DECL(full_ts,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(full_ts_noi,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_ts,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mdi_ts,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_ts,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_ts,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+/*
+ * Generate routines with Legacy Multi-Packet Write support.
+ * This mode is supported by ConnectX-4 Lx only and imposes
+ * offload limitations; the following features are not supported:
+ * - ACL/Flows (metadata are becoming meaningless)
+ * - WQE Inline headers
+ * - SRIOV (E-Switch offloads)
+ * - VLAN insertion
+ * - tunnel encapsulation/decapsulation
+ * - TSO
+ */
+MLX5_TXOFF_DECL(none_mpw,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW |
+		MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_DECL(mci_mpw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
+		MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_DECL(mc_mpw,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM |
+		MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW)
+
+MLX5_TXOFF_DECL(i_mpw,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW |
+		MLX5_TXOFF_CONFIG_MPW)
+
 /*
 * Array of declared and compiled Tx burst function and corresponding
 * supported offloads set.
The array is used to select the Tx burst @@ -4990,6 +5414,44 @@ MLX5_TXOFF_INFO(iv_empw, MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW) +MLX5_TXOFF_INFO(full_ts_nompw, + MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP) + +MLX5_TXOFF_INFO(full_ts_nompwi, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA | + MLX5_TXOFF_CONFIG_TXPP) + +MLX5_TXOFF_INFO(full_ts, + MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP | + MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(full_ts_noi, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA | + MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(none_ts, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP | + MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mdi_ts, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA | + MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mti_ts, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA | + MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW) + +MLX5_TXOFF_INFO(mtiv_ts, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP | + MLX5_TXOFF_CONFIG_EMPW) + MLX5_TXOFF_INFO(full, MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | MLX5_TXOFF_CONFIG_SWP | MLX5_TXOFF_CONFIG_CSUM | @@ -5016,7 +5478,6 @@ MLX5_TXOFF_INFO(mti, MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA) - MLX5_TXOFF_INFO(mtv, MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | MLX5_TXOFF_CONFIG_VLAN | @@ -5057,6 +5518,23 @@ MLX5_TXOFF_INFO(v, MLX5_TXOFF_INFO(iv, MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) }; /** @@ -5090,7 +5568,7 @@ mlx5_select_tx_function(struct rte_eth_dev *dev) "invalid WQE Data Segment size"); static_assert(MLX5_WQE_SIZE == 4 * MLX5_WSEG_SIZE, "invalid WQE size"); - assert(priv); + MLX5_ASSERT(priv); if (tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS) { /* We should support Multi-Segment Packets. */ olx |= MLX5_TXOFF_CONFIG_MULTI; @@ -5120,6 +5598,14 @@ mlx5_select_tx_function(struct rte_eth_dev *dev) /* We should support VLAN insertion. */ olx |= MLX5_TXOFF_CONFIG_VLAN; } + if (tx_offloads & DEV_TX_OFFLOAD_SEND_ON_TIMESTAMP && + rte_mbuf_dynflag_lookup + (RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL) >= 0 && + rte_mbuf_dynfield_lookup + (RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL) >= 0) { + /* Offload configured, dynamic entities registered. */ + olx |= MLX5_TXOFF_CONFIG_TXPP; + } if (priv->txqs_n && (*priv->txqs)[0]) { struct mlx5_txq_data *txd = (*priv->txqs)[0]; @@ -5139,11 +5625,8 @@ mlx5_select_tx_function(struct rte_eth_dev *dev) if (config->mps == MLX5_MPW_ENHANCED && config->txq_inline_min <= 0) { /* - * The NIC supports Enhanced Multi-Packet Write. 
-	 * We do not support legacy MPW due to its
-	 * hardware related problems, so we just ignore
-	 * legacy MLX5_MPW settings. There should be no
-	 * minimal required inline data.
+	 * The NIC supports Enhanced Multi-Packet Write
+	 * and does not require minimal inline data.
 	 */
 	olx |= MLX5_TXOFF_CONFIG_EMPW;
 	}
@@ -5151,6 +5634,20 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
 		/* We should support Flow metadata. */
 		olx |= MLX5_TXOFF_CONFIG_METADATA;
 	}
+	if (config->mps == MLX5_MPW) {
+		/*
+		 * The NIC supports Legacy Multi-Packet Write.
+		 * The MLX5_TXOFF_CONFIG_MPW controls the
+		 * descriptor building method in combination
+		 * with MLX5_TXOFF_CONFIG_EMPW.
+		 */
+		if (!(olx & (MLX5_TXOFF_CONFIG_TSO |
+			     MLX5_TXOFF_CONFIG_SWP |
+			     MLX5_TXOFF_CONFIG_VLAN |
+			     MLX5_TXOFF_CONFIG_METADATA)))
+			olx |= MLX5_TXOFF_CONFIG_EMPW |
+			       MLX5_TXOFF_CONFIG_MPW;
+	}
 	/*
 	 * Scan the routines table to find the minimal
 	 * satisfying routine with requested offloads.
@@ -5169,12 +5666,18 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
 			/* Does not meet requested offloads at all. */
 			continue;
 		}
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_MPW)
+			/* Do not enable legacy MPW if not configured. */
+			continue;
 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_EMPW)
 			/* Do not enable eMPW if not configured. */
 			continue;
 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
 			/* Do not enable inlining if not configured. */
 			continue;
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_TXPP)
+			/* Do not enable scheduling if not configured. */
+			continue;
 		/*
 		 * Some routine meets the requirements.
 		 * Check whether it has minimal amount
@@ -5219,7 +5722,104 @@ mlx5_select_tx_function(struct rte_eth_dev *dev)
 		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
 		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
-	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW)
-		DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TXPP)
+		DRV_LOG(DEBUG, "\tTSTMP (tx Scheduling)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
+		if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
+			DRV_LOG(DEBUG, "\tMPW (Legacy MPW)");
+		else
+			DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)");
+	}
 	return txoff_func[m].func;
}
+
+/**
+ * DPDK callback to get the TX queue information.
+ *
+ * @param dev
+ *   Pointer to the device structure.
+ *
+ * @param tx_queue_id
+ *   Tx queue identifier.
+ *
+ * @param qinfo
+ *   Pointer to the TX queue information structure.
+ *
+ * @return
+ *   None.
+ */
+
+void
+mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+		  struct rte_eth_txq_info *qinfo)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_txq_data *txq = (*priv->txqs)[tx_queue_id];
+	struct mlx5_txq_ctrl *txq_ctrl =
+			container_of(txq, struct mlx5_txq_ctrl, txq);
+
+	if (!txq)
+		return;
+	qinfo->nb_desc = txq->elts_s;
+	qinfo->conf.tx_thresh.pthresh = 0;
+	qinfo->conf.tx_thresh.hthresh = 0;
+	qinfo->conf.tx_thresh.wthresh = 0;
+	qinfo->conf.tx_rs_thresh = 0;
+	qinfo->conf.tx_free_thresh = 0;
+	qinfo->conf.tx_deferred_start = txq_ctrl ? 0 : 1;
+	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+}
+
+/**
+ * DPDK callback to get the TX packet burst mode information.
+ *
+ * @param dev
+ *   Pointer to the device structure.
+ *
+ * @param tx_queue_id
+ *   Tx queue identifier.
+ *
+ * @param mode
+ *   Pointer to the burst mode information.
+ *
+ * @return
+ *   0 on success, -EINVAL on failure.
+ */ + +int +mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, + uint16_t tx_queue_id __rte_unused, + struct rte_eth_burst_mode *mode) +{ + eth_tx_burst_t pkt_burst = dev->tx_pkt_burst; + unsigned int i, olx; + + for (i = 0; i < RTE_DIM(txoff_func); i++) { + if (pkt_burst == txoff_func[i].func) { + olx = txoff_func[i].olx; + snprintf(mode->info, sizeof(mode->info), + "%s%s%s%s%s%s%s%s%s", + (olx & MLX5_TXOFF_CONFIG_EMPW) ? + ((olx & MLX5_TXOFF_CONFIG_MPW) ? + "Legacy MPW" : "Enhanced MPW") : "No MPW", + (olx & MLX5_TXOFF_CONFIG_MULTI) ? + " + MULTI" : "", + (olx & MLX5_TXOFF_CONFIG_TSO) ? + " + TSO" : "", + (olx & MLX5_TXOFF_CONFIG_SWP) ? + " + SWP" : "", + (olx & MLX5_TXOFF_CONFIG_CSUM) ? + " + CSUM" : "", + (olx & MLX5_TXOFF_CONFIG_INLINE) ? + " + INLINE" : "", + (olx & MLX5_TXOFF_CONFIG_VLAN) ? + " + VLAN" : "", + (olx & MLX5_TXOFF_CONFIG_METADATA) ? + " + METADATA" : "", + (olx & MLX5_TXOFF_CONFIG_TXPP) ? + " + TXPP" : ""); + return 0; + } + } + return -EINVAL; +}
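For reference, a possible caller-side sketch (port and queue numbers are
illustrative): the generic rte_eth_tx_burst_mode_get() ethdev call
dispatches to the callback above and returns the human-readable offload
summary assembled from txoff_func[].olx:

#include <stdio.h>
#include <rte_ethdev.h>

static void
print_tx_burst_mode(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_burst_mode mode;

	/* Resolves to mlx5_tx_burst_mode_get() for mlx5 ports. */
	if (rte_eth_tx_burst_mode_get(port_id, queue_id, &mode) == 0)
		printf("port %u txq %u: %s\n", port_id, queue_id, mode.info);
	else
		printf("port %u txq %u: burst mode not reported\n",
		       port_id, queue_id);
}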