X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx5%2Fmlx5_rxtx.c;h=d8f6671e274f17bf8e9a6c2df8dc88063621995f;hb=7b4f1e6bd367;hp=007df8f062912c81549bd1717861efc72a3ad141;hpb=a496e093177781fcf449b5af36952e922cf0f2b5;p=dpdk.git diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 007df8f062..d8f6671e27 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -26,13 +26,16 @@ #include #include #include +#include +#include +#include + +#include "mlx5_defs.h" #include "mlx5.h" #include "mlx5_utils.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" -#include "mlx5_defs.h" -#include "mlx5_prm.h" /* TX burst subroutines return codes. */ enum mlx5_txcmp_code { @@ -61,6 +64,7 @@ enum mlx5_txcmp_code { #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported.*/ #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */ #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/ +#define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/ /* The most common offloads groups. */ #define MLX5_TXOFF_CONFIG_NONE 0 @@ -107,6 +111,16 @@ static int mlx5_queue_state_modify(struct rte_eth_dev *dev, struct mlx5_mp_arg_queue_state_modify *sm); +static inline void +mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp, + volatile struct mlx5_cqe *restrict cqe, + uint32_t phcsum); + +static inline void +mlx5_lro_update_hdr(uint8_t *restrict padd, + volatile struct mlx5_cqe *restrict cqe, + uint32_t len); + uint32_t mlx5_ptype_table[] __rte_cache_aligned = { [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. 
*/ }; @@ -573,18 +587,16 @@ mlx5_dump_debug_information(const char *fname, const char *hex_title, MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname); fd = fopen(path, "a+"); if (!fd) { - DRV_LOG(WARNING, "cannot open %s for debug dump\n", - path); + DRV_LOG(WARNING, "cannot open %s for debug dump", path); MKSTR(path2, "./%s", fname); fd = fopen(path2, "a+"); if (!fd) { - DRV_LOG(ERR, "cannot open %s for debug dump\n", - path2); + DRV_LOG(ERR, "cannot open %s for debug dump", path2); return; } - DRV_LOG(INFO, "New debug dump in file %s\n", path2); + DRV_LOG(INFO, "New debug dump in file %s", path2); } else { - DRV_LOG(INFO, "New debug dump in file %s\n", path); + DRV_LOG(INFO, "New debug dump in file %s", path); } if (hex_title) rte_hexdump(fd, hex_title, buf, hex_len); @@ -644,9 +656,10 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) * Pointer to the error CQE. * * @return - * The last Tx buffer element to free. + * Negative value if queue recovery failed, otherwise + * the error completion entry is handled successfully. */ -uint16_t +static int mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, volatile struct mlx5_err_cqe *err_cqe) { @@ -690,17 +703,14 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, */ txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - new_wqe_pi) & wqe_m; - if (tx_recover_qp(txq_ctrl) == 0) { - txq->cq_ci++; - /* Release all the remaining buffers. */ - return txq->elts_head; + if (tx_recover_qp(txq_ctrl)) { + /* Recovering failed - retry later on the same WQE. */ + return -1; } - /* Recovering failed - try again later on the same WQE. */ - } else { - txq->cq_ci++; + /* Release all the remaining buffers. */ + txq_free_elts(txq_ctrl); } - /* Do not release buffers. 
*/ - return txq->elts_tail; + return 0; } /** @@ -838,7 +848,7 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, &rq_attr); } if (ret) { - DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s\n", + DRV_LOG(ERR, "Cannot change Rx WQ state to %u - %s", sm->state, strerror(errno)); rte_errno = errno; return ret; @@ -851,12 +861,12 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, .qp_state = IBV_QPS_RESET, .port_num = (uint8_t)priv->ibv_port, }; - struct ibv_qp *qp = txq_ctrl->ibv->qp; + struct ibv_qp *qp = txq_ctrl->obj->qp; ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); if (ret) { DRV_LOG(ERR, "Cannot change the Tx QP state to RESET " - "%s\n", strerror(errno)); + "%s", strerror(errno)); rte_errno = errno; return ret; } @@ -864,7 +874,7 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, ret = mlx5_glue->modify_qp(qp, &mod, (IBV_QP_STATE | IBV_QP_PORT)); if (ret) { - DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s\n", + DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s", strerror(errno)); rte_errno = errno; return ret; @@ -872,7 +882,7 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, mod.qp_state = IBV_QPS_RTR; ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); if (ret) { - DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s\n", + DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s", strerror(errno)); rte_errno = errno; return ret; @@ -880,7 +890,7 @@ mlx5_queue_state_modify_primary(struct rte_eth_dev *dev, mod.qp_state = IBV_QPS_RTS; ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE); if (ret) { - DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s\n", + DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s", strerror(errno)); rte_errno = errno; return ret; @@ -928,14 +938,15 @@ mlx5_queue_state_modify(struct rte_eth_dev *dev, * * @param[in] rxq * Pointer to RX queue structure. - * @param[in] mbuf_prepare - * Whether to prepare mbufs for the RQ. 
+ * @param[in] vec + * 1 when called from vectorized Rx burst, need to prepare mbufs for the RQ. + * 0 when called from non-vectorized Rx burst. * * @return * -1 in case of recovery error, otherwise the CQE status. */ int -mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare) +mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec) { const uint16_t cqe_n = 1 << rxq->cqe_n; const uint16_t cqe_mask = cqe_n - 1; @@ -1002,7 +1013,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare) if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm)) return -1; - if (mbuf_prepare) { + if (vec) { const uint16_t q_mask = wqe_n - 1; uint16_t elt_idx; struct rte_mbuf **elt; @@ -1026,6 +1037,16 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare) return -1; } } + for (i = 0; i < (int)wqe_n; ++i) { + elt = &(*rxq->elts)[i]; + DATA_LEN(*elt) = + (uint16_t)((*elt)->buf_len - + rte_pktmbuf_headroom(*elt)); + } + /* Padding with a fake mbuf for vec Rx. */ + for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i) + (*rxq->elts)[wqe_n + i] = + &rxq->fake_mbuf; } mlx5_rxq_initialize(rxq); rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR; @@ -1230,6 +1251,10 @@ rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); } } + if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { + pkt->ol_flags |= PKT_RX_DYNF_METADATA; + *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; + } if (rxq->csum) pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); if (rxq->vlan_strip && @@ -1314,7 +1339,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) } pkt = seg; assert(len >= (rxq->crc_present << 2)); - pkt->ol_flags = 0; + pkt->ol_flags &= EXT_ATTACHED_MBUF; /* If compressed, take hash result from mini-CQE. */ rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ? 
cqe->rx_hash_res : @@ -1323,6 +1348,13 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) if (rxq->crc_present) len -= RTE_ETHER_CRC_LEN; PKT_LEN(pkt) = len; + if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr + (rte_pktmbuf_mtod(pkt, uint8_t *), cqe, + len); + pkt->ol_flags |= PKT_RX_LRO; + pkt->tso_segsz = len / cqe->lro_num_seg; + } } DATA_LEN(rep) = DATA_LEN(seg); PKT_LEN(rep) = PKT_LEN(seg); @@ -1623,8 +1655,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) len -= RTE_ETHER_CRC_LEN; offset = strd_idx * strd_sz + strd_shift; addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); - /* Initialize the offload flag. */ - pkt->ol_flags = 0; /* * Memcpy packets to the target mbuf if: * - The size of packet is smaller than mprq_max_memcpy_len. @@ -1641,6 +1671,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) continue; } rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); + DATA_LEN(pkt) = len; } else { rte_iova_t buf_iova; struct rte_mbuf_ext_shared_info *shinfo; @@ -1681,6 +1712,26 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) ++rxq->stats.idropped; continue; } + DATA_LEN(pkt) = len; + /* + * LRO packet may consume all the stride memory, in this + * case packet head-room space is not guaranteed so we must + * add an empty mbuf for the head-room. 
+ */ + if (!rxq->strd_headroom_en) { + struct rte_mbuf *headroom_mbuf = + rte_pktmbuf_alloc(rxq->mp); + + if (unlikely(headroom_mbuf == NULL)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.rx_nombuf; + break; + } + PORT(pkt) = rxq->port_id; + NEXT(headroom_mbuf) = pkt; + pkt = headroom_mbuf; + NB_SEGS(pkt) = 2; + } } rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); if (lro_num_seg > 1) { @@ -1689,7 +1740,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) pkt->tso_segsz = strd_sz; } PKT_LEN(pkt) = len; - DATA_LEN(pkt) = len; PORT(pkt) = rxq->port_id; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment bytes counter. */ @@ -1974,6 +2024,35 @@ mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, (pkts_n - part) * sizeof(struct rte_mbuf *)); } +/** + * Update completion queue consuming index via doorbell + * and flush the completed data buffers. + * + * @param txq + * Pointer to TX queue structure. + * @param last_cqe + * Valid CQE pointer, if not NULL update txq->wqe_pi and flush the buffers. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. + */ +static __rte_always_inline void +mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_cqe *last_cqe, + unsigned int olx __rte_unused) +{ + if (likely(last_cqe != NULL)) { + uint16_t tail; + + txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); + tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; + if (likely(tail != txq->elts_tail)) { + mlx5_tx_free_elts(txq, tail, olx); + assert(tail == txq->elts_tail); + } + } +} + /** * Manage TX completions. 
This routine checks the CQ for * arrived CQEs, deduces the last accomplished WQE in SQ, @@ -1992,52 +2071,78 @@ static void mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, unsigned int olx __rte_unused) { - bool update = false; + unsigned int count = MLX5_TX_COMP_MAX_CQE; + volatile struct mlx5_cqe *last_cqe = NULL; + uint16_t ci = txq->cq_ci; int ret; + static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); + static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value"); do { - volatile struct mlx5_wqe_cseg *cseg; volatile struct mlx5_cqe *cqe; - uint16_t tail; - cqe = &txq->cqes[txq->cq_ci & txq->cqe_m]; - ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci); + cqe = &txq->cqes[ci & txq->cqe_m]; + ret = check_cqe(cqe, txq->cqe_s, ci); if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) { if (likely(ret != MLX5_CQE_STATUS_ERR)) { /* No new CQEs in completion queue. */ assert(ret == MLX5_CQE_STATUS_HW_OWN); - if (likely(update)) { - /* Update the consumer index. */ - rte_compiler_barrier(); - *txq->cq_db = - rte_cpu_to_be_32(txq->cq_ci); - } - return; + break; } - /* Some error occurred, try to restart. */ + /* + * Some error occurred, try to restart. + * We have no barrier after WQE related Doorbell + * written, make sure all writes are completed + * here, before we might perform SQ reset. + */ rte_wmb(); - tail = mlx5_tx_error_cqe_handle + txq->cq_ci = ci; + ret = mlx5_tx_error_cqe_handle (txq, (volatile struct mlx5_err_cqe *)cqe); - } else { - /* Normal transmit completion. */ - ++txq->cq_ci; - rte_cio_rmb(); - txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter); - cseg = (volatile struct mlx5_wqe_cseg *) - (txq->wqes + (txq->wqe_pi & txq->wqe_m)); - tail = cseg->misc; - } -#ifndef NDEBUG - if (txq->cq_pi) - --txq->cq_pi; -#endif - if (likely(tail != txq->elts_tail)) { - /* Free data buffers from elts. 
*/ - mlx5_tx_free_elts(txq, tail, olx); - assert(tail == txq->elts_tail); + if (unlikely(ret < 0)) { + /* + * Some error occurred on queue error + * handling, we do not advance the index + * here, allowing to retry on next call. + */ + return; + } + /* + * We are going to fetch all entries with + * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. + * The send queue is supposed to be empty. + */ + ++ci; + txq->cq_pi = ci; + last_cqe = NULL; + continue; } - update = true; + /* Normal transmit completion. */ + assert(ci != txq->cq_pi); + assert((txq->fcqs[ci & txq->cqe_m] >> 16) == cqe->wqe_counter); + ++ci; + last_cqe = cqe; + /* + * We have to restrict the amount of processed CQEs + * in one tx_burst routine call. The CQ may be large + * and many CQEs may be updated by the NIC in one + * transaction. Buffers freeing is time consuming, + * multiple iterations may introduce significant + * latency. + */ + if (likely(--count == 0)) + break; } while (true); + if (likely(ci != txq->cq_ci)) { + /* + * Update completion queue consuming index + * and ring doorbell to notify hardware. + */ + rte_compiler_barrier(); + txq->cq_ci = ci; + *txq->cq_db = rte_cpu_to_be_32(ci); + mlx5_tx_comp_flush(txq, last_cqe, olx); + } } /** @@ -2047,8 +2152,6 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, * * @param txq * Pointer to TX queue structure. - * @param n_mbuf - * Number of mbuf not stored yet in elts array. * @param loc * Pointer to burst routine local context. * @param olx @@ -2057,30 +2160,35 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, */ static __rte_always_inline void mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, - unsigned int n_mbuf, struct mlx5_txq_local *restrict loc, - unsigned int olx __rte_unused) + unsigned int olx) { - uint16_t head = txq->elts_head + n_mbuf; + uint16_t head = txq->elts_head; + unsigned int part; + part = MLX5_TXOFF_CONFIG(INLINE) ? 
+ 0 : loc->pkts_sent - loc->pkts_copy; + head += part; if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || - (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres) { + (MLX5_TXOFF_CONFIG(INLINE) && + (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) { volatile struct mlx5_wqe *last = loc->wqe_last; txq->elts_comp = head; - txq->wqe_comp = txq->wqe_ci; + if (MLX5_TXOFF_CONFIG(INLINE)) + txq->wqe_comp = txq->wqe_ci; /* Request unconditional completion on last WQE. */ last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET); - /* Save elts_head in unused "immediate" field of WQE. */ - last->cseg.misc = head; - /* - * A CQE slot must always be available. Count the - * issued CEQ "always" request instead of production - * index due to here can be CQE with errors and - * difference with ci may become inconsistent. - */ - assert(txq->cqe_s > ++txq->cq_pi); + /* Save elts_head in dedicated free on completion queue. */ +#ifdef NDEBUG + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; +#else + txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | + (last->cseg.opcode >> 8) << 16; +#endif + /* A CQE slot must always be available. */ + assert((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); } } @@ -2138,6 +2246,9 @@ mlx5_tx_cseg_init(struct mlx5_txq_data *restrict txq, { struct mlx5_wqe_cseg *restrict cs = &wqe->cseg; + /* For legacy MPW replace the EMPW by TSO with modifier. */ + if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW) + opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24; cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode); cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << @@ -2182,8 +2293,8 @@ mlx5_tx_eseg_none(struct mlx5_txq_data *restrict txq __rte_unused, es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); /* Fill metadata field if needed. */ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? - loc->mbuf->ol_flags & PKT_TX_METADATA ? 
- loc->mbuf->tx_metadata : 0 : 0; + loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? + *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; /* Engage VLAN tag insertion feature if requested. */ if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { @@ -2242,8 +2353,8 @@ mlx5_tx_eseg_dmin(struct mlx5_txq_data *restrict txq __rte_unused, es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); /* Fill metadata field if needed. */ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? - loc->mbuf->ol_flags & PKT_TX_METADATA ? - loc->mbuf->tx_metadata : 0 : 0; + loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? + *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; static_assert(MLX5_ESEG_MIN_INLINE_SIZE == (sizeof(uint16_t) + sizeof(rte_v128u32_t)), @@ -2335,8 +2446,8 @@ mlx5_tx_eseg_data(struct mlx5_txq_data *restrict txq, es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); /* Fill metadata field if needed. */ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? - loc->mbuf->ol_flags & PKT_TX_METADATA ? - loc->mbuf->tx_metadata : 0 : 0; + loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? + *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; static_assert(MLX5_ESEG_MIN_INLINE_SIZE == (sizeof(uint16_t) + sizeof(rte_v128u32_t)), @@ -2529,8 +2640,8 @@ mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx); /* Fill metadata field if needed. */ es->metadata = MLX5_TXOFF_CONFIG(METADATA) ? - loc->mbuf->ol_flags & PKT_TX_METADATA ? - loc->mbuf->tx_metadata : 0 : 0; + loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? 
+ *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0 : 0; static_assert(MLX5_ESEG_MIN_INLINE_SIZE == (sizeof(uint16_t) + sizeof(rte_v128u32_t)), @@ -2540,7 +2651,7 @@ mlx5_tx_eseg_mdat(struct mlx5_txq_data *restrict txq, sizeof(struct rte_vlan_hdr) + 2 * RTE_ETHER_ADDR_LEN), "invalid Ethernet Segment data size"); - assert(inlen > MLX5_ESEG_MIN_INLINE_SIZE); + assert(inlen >= MLX5_ESEG_MIN_INLINE_SIZE); es->inline_hdr_sz = rte_cpu_to_be_16(inlen); pdst = (uint8_t *)&es->inline_data; if (MLX5_TXOFF_CONFIG(VLAN) && vlan) { @@ -2648,27 +2759,33 @@ mlx5_tx_dseg_iptr(struct mlx5_txq_data *restrict txq, /* Unrolled implementation of generic rte_memcpy. */ dst = (uintptr_t)&dseg->inline_data[0]; src = (uintptr_t)buf; + if (len & 0x08) { #ifdef RTE_ARCH_STRICT_ALIGN - memcpy(dst, src, len); + assert(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t))); + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); + *(uint32_t *)dst = *(unaligned_uint32_t *)src; + dst += sizeof(uint32_t); + src += sizeof(uint32_t); #else - if (len & 0x08) { - *(uint64_t *)dst = *(uint64_t *)src; + *(uint64_t *)dst = *(unaligned_uint64_t *)src; dst += sizeof(uint64_t); src += sizeof(uint64_t); +#endif } if (len & 0x04) { - *(uint32_t *)dst = *(uint32_t *)src; + *(uint32_t *)dst = *(unaligned_uint32_t *)src; dst += sizeof(uint32_t); src += sizeof(uint32_t); } if (len & 0x02) { - *(uint16_t *)dst = *(uint16_t *)src; + *(uint16_t *)dst = *(unaligned_uint16_t *)src; dst += sizeof(uint16_t); src += sizeof(uint16_t); } if (len & 0x01) *(uint8_t *)dst = *(uint8_t *)src; -#endif } /** @@ -2773,13 +2890,14 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); buf += MLX5_DSEG_MIN_INLINE_SIZE; pdst += MLX5_DSEG_MIN_INLINE_SIZE; + len -= MLX5_DSEG_MIN_INLINE_SIZE; /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. 
*/ assert(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE)); + if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) + pdst = (uint8_t *)txq->wqes; *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) | loc->mbuf->vlan_tci); pdst += sizeof(struct rte_vlan_hdr); - if (unlikely(pdst >= (uint8_t *)txq->wqes_end)) - pdst = (uint8_t *)txq->wqes; /* * The WQEBB space availability is checked by caller. * Here we should be aware of WQE ring buffer wraparound only. @@ -3352,7 +3470,7 @@ mlx5_tx_burst_mseg(struct mlx5_txq_data *restrict txq, continue; /* Here ends the series of multi-segment packets. */ if (MLX5_TXOFF_CONFIG(TSO) && - unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) + unlikely(loc->mbuf->ol_flags & PKT_TX_TCP_SEG)) return MLX5_TXCMP_CODE_TSO; return MLX5_TXCMP_CODE_SINGLE; } @@ -3488,7 +3606,7 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, if (MLX5_TXOFF_CONFIG(MULTI) && unlikely(NB_SEGS(loc->mbuf) > 1)) return MLX5_TXCMP_CODE_MULTI; - if (unlikely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) + if (likely(!(loc->mbuf->ol_flags & PKT_TX_TCP_SEG))) return MLX5_TXCMP_CODE_SINGLE; /* Continue with the next TSO packet. */ } @@ -3552,6 +3670,7 @@ mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, /** * Check the next packet attributes to match with the eMPW batch ones. + * In addition, for legacy MPW the packet length is checked as well. * * @param txq * Pointer to TX queue structure. @@ -3559,6 +3678,8 @@ mlx5_tx_able_to_empw(struct mlx5_txq_data *restrict txq, * Pointer to Ethernet Segment of eMPW batch. * @param loc * Pointer to burst routine local context. + * @param dlen + * Length of previous packet in MPW descriptor. * @param olx * Configured Tx offloads mask. It is fully defined at * compile time and may be used for optimization. 
@@ -3571,6 +3692,7 @@ static __rte_always_inline bool mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, struct mlx5_wqe_eseg *restrict es, struct mlx5_txq_local *restrict loc, + uint32_t dlen, unsigned int olx) { uint8_t swp_flags = 0; @@ -3586,8 +3708,12 @@ mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused, return false; /* Fill metadata field if needed. */ if (MLX5_TXOFF_CONFIG(METADATA) && - es->metadata != (loc->mbuf->ol_flags & PKT_TX_METADATA ? - loc->mbuf->tx_metadata : 0)) + es->metadata != (loc->mbuf->ol_flags & PKT_TX_DYNF_METADATA ? + *RTE_FLOW_DYNF_METADATA(loc->mbuf) : 0)) + return false; + /* Legacy MPW can send packets with the same length only. */ + if (MLX5_TXOFF_CONFIG(MPW) && + dlen != rte_pktmbuf_data_len(loc->mbuf)) return false; /* There must be no VLAN packets in eMPW loop. */ if (MLX5_TXOFF_CONFIG(VLAN)) @@ -3754,7 +3880,10 @@ mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, unsigned int slen = 0; next_empw: - part = RTE_MIN(pkts_n, MLX5_EMPW_MAX_PACKETS); + assert(NB_SEGS(loc->mbuf) == 1); + part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); if (unlikely(loc->elts_free < part)) { /* We have no enough elts to save all mbufs. */ if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS)) @@ -3784,6 +3913,10 @@ next_empw: eseg = &loc->wqe_last->eseg; dseg = &loc->wqe_last->dseg[0]; loop = part; + /* Store the packet length for legacy MPW. 
*/ + if (MLX5_TXOFF_CONFIG(MPW)) + eseg->mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); for (;;) { uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf); #ifdef MLX5_PMD_SOFT_COUNTERS @@ -3813,6 +3946,7 @@ next_empw: return MLX5_TXCMP_CODE_EXIT; return MLX5_TXCMP_CODE_MULTI; } + assert(NB_SEGS(loc->mbuf) == 1); if (ret == MLX5_TXCMP_CODE_TSO) { part -= loop; mlx5_tx_sdone_empw(txq, loc, part, slen, olx); @@ -3841,14 +3975,16 @@ next_empw: * - check sum settings * - metadata value * - software parser settings + * - packets length (legacy MPW only) */ - if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) { + if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) { assert(loop); part -= loop; mlx5_tx_sdone_empw(txq, loc, part, slen, olx); if (unlikely(!loc->elts_free || !loc->wqe_free)) return MLX5_TXCMP_CODE_EXIT; + pkts_n -= part; goto next_empw; } /* Packet attributes match, continue the same eMPW. */ @@ -3906,10 +4042,17 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, struct mlx5_wqe_dseg *restrict dseg; struct mlx5_wqe_eseg *restrict eseg; enum mlx5_txcmp_code ret; - unsigned int room, part; + unsigned int room, part, nlim; unsigned int slen = 0; -next_empw: + assert(NB_SEGS(loc->mbuf) == 1); + /* + * Limits the amount of packets in one WQE + * to improve CQE latency generation. + */ + nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ? + MLX5_MPW_INLINE_MAX_PACKETS : + MLX5_EMPW_MAX_PACKETS); /* Check whether we have minimal amount WQEs */ if (unlikely(loc->wqe_free < ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4))) @@ -3928,6 +4071,10 @@ next_empw: olx & ~MLX5_TXOFF_CONFIG_VLAN); eseg = &loc->wqe_last->eseg; dseg = &loc->wqe_last->dseg[0]; + /* Store the packet length for legacy MPW. 
*/ + if (MLX5_TXOFF_CONFIG(MPW)) + eseg->mss = rte_cpu_to_be_16 + (rte_pktmbuf_data_len(loc->mbuf)); room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE, loc->wqe_free) * MLX5_WQE_SIZE - MLX5_WQE_CSEG_SIZE - @@ -4028,12 +4175,6 @@ next_mbuf: mlx5_tx_idone_empw(txq, loc, part, slen, olx); return MLX5_TXCMP_CODE_EXIT; } - /* Check if we have minimal room left. */ - if (room < MLX5_WQE_DSEG_SIZE) { - part -= room; - mlx5_tx_idone_empw(txq, loc, part, slen, olx); - goto next_empw; - } loc->mbuf = *pkts++; if (likely(pkts_n > 1)) rte_prefetch0(*pkts); @@ -4051,6 +4192,7 @@ next_mbuf: return MLX5_TXCMP_CODE_EXIT; return MLX5_TXCMP_CODE_MULTI; } + assert(NB_SEGS(loc->mbuf) == 1); if (ret == MLX5_TXCMP_CODE_TSO) { part -= room; mlx5_tx_idone_empw(txq, loc, part, slen, olx); @@ -4073,14 +4215,19 @@ next_mbuf: mlx5_tx_idone_empw(txq, loc, part, slen, olx); return MLX5_TXCMP_CODE_ERROR; } + /* Check if we have minimal room left. */ + nlim--; + if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE)) + break; /* * Check whether packet parameters coincide * within assumed eMPW batch: * - check sum settings * - metadata value * - software parser settings + * - packets length (legacy MPW only) */ - if (!mlx5_tx_match_empw(txq, eseg, loc, olx)) + if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) break; /* Packet attributes match, continue the same eMPW. */ if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end) @@ -4098,7 +4245,7 @@ next_mbuf: if (unlikely(!loc->elts_free || !loc->wqe_free)) return MLX5_TXCMP_CODE_EXIT; - goto next_empw; + /* Continue the loop with new eMPW session. */ } assert(false); } @@ -4183,8 +4330,9 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, * free the packet immediately. 
*/ rte_pktmbuf_free_seg(loc->mbuf); - } else if (!MLX5_TXOFF_CONFIG(EMPW) && - txq->inlen_mode) { + } else if ((!MLX5_TXOFF_CONFIG(EMPW) || + MLX5_TXOFF_CONFIG(MPW)) && + txq->inlen_mode) { /* * If minimal inlining is requested the eMPW * feature should be disabled due to data is @@ -4417,6 +4565,14 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + if (unlikely(!pkts_n)) + return 0; + loc.pkts_sent = 0; + loc.pkts_copy = 0; + loc.wqe_last = NULL; + +send_loop: + loc.pkts_loop = loc.pkts_sent; /* * Check if there are some CQEs, if any: * - process an encountered errors @@ -4424,9 +4580,7 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, * - free related mbufs * - doorbell the NIC about processed CQEs */ - if (unlikely(!pkts_n)) - return 0; - rte_prefetch0(*pkts); + rte_prefetch0(*(pkts + loc.pkts_sent)); mlx5_tx_handle_completion(txq, olx); /* * Calculate the number of available resources - elts and WQEs. @@ -4443,10 +4597,7 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, loc.wqe_free = txq->wqe_s - (uint16_t)(txq->wqe_ci - txq->wqe_pi); if (unlikely(!loc.elts_free || !loc.wqe_free)) - return 0; - loc.pkts_sent = 0; - loc.pkts_copy = 0; - loc.wqe_last = NULL; + goto burst_exit; for (;;) { /* * Fetch the packet from array. Usually this is @@ -4612,22 +4763,48 @@ enter_send_single: */ assert(MLX5_TXOFF_CONFIG(INLINE) || loc.pkts_sent >= loc.pkts_copy); /* Take a shortcut if nothing is sent. */ - if (unlikely(loc.pkts_sent == 0)) - return 0; - /* Not all of the mbufs may be stored into elts yet. */ - part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; - mlx5_tx_request_completion(txq, part, &loc, olx); + if (unlikely(loc.pkts_sent == loc.pkts_loop)) + goto burst_exit; + /* Request CQE generation if limits are reached. 
*/ + mlx5_tx_request_completion(txq, &loc, olx); /* * Ring QP doorbell immediately after WQE building completion * to improve latencies. The pure software related data treatment * can be completed after doorbell. Tx CQEs for this SQ are * processed in this thread only by the polling. + * + * The rdma core library can map doorbell register in two ways, + * depending on the environment variable "MLX5_SHUT_UP_BF": + * + * - as regular cached memory, the variable is either missing or + * set to zero. This type of mapping may cause the significant + * doorbell register writing latency and requires explicit + * memory write barrier to mitigate this issue and prevent + * write combining. + * + * - as non-cached memory, the variable is present and set to + * not "0" value. This type of mapping may cause performance + * impact under heavy loading conditions but the explicit write + * memory barrier is not required and it may improve core + * performance. + * + * - the legacy behaviour (prior 19.08 release) was to use some + * heuristics to decide whether write memory barrier should + * be performed. This behavior is supported with specifying + * tx_db_nc=2, write barrier is skipped if application + * provides the full recommended burst of packets, it + * supposes the next packets are coming and the write barrier + * will be issued on the next burst (after descriptor writing, + * at least). */ - mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0); + mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc && + (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST)); + /* Not all of the mbufs may be stored into elts yet. */ + part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy; if (!MLX5_TXOFF_CONFIG(INLINE) && part) { /* * There are some single-segment mbufs not stored in elts. - * It can be only if last packet was single-segment. + * It can be only if the last packet was single-segment. 
* The copying is gathered into one place due to it is * a good opportunity to optimize that with SIMD. * Unfortunately if inlining is enabled the gaps in @@ -4635,13 +4812,23 @@ enter_send_single: * inlined mbufs. */ mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx); + loc.pkts_copy = loc.pkts_sent; } + assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); + assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); + if (pkts_n > loc.pkts_sent) { + /* + * If burst size is large there might be not enough CQEs + * fetched from completion queue and not enough resources + * freed to send all the packets. + */ + goto send_loop; + } +burst_exit: #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent packets counter. */ txq->stats.opackets += loc.pkts_sent; #endif - assert(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail)); - assert(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi)); return loc.pkts_sent; } @@ -4779,6 +4966,34 @@ MLX5_TXOFF_DECL(iv, MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA) +/* + * Generate routines with Legacy Multi-Packet Write support. 
+ * This mode is supported by ConnectX-4LX only and imposes + * offload limitations, not supported: + * - ACL/Flows (metadata are becoming meaningless) + * - WQE Inline headers + * - SRIOV (E-Switch offloads) + * - VLAN insertion + * - tunnel encapsulation/decapsulation + * - TSO + */ +MLX5_TXOFF_DECL(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_DECL(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + /* * Array of declared and compiled Tx burst function and corresponding * supported offloads set. The array is used to select the Tx burst @@ -4881,7 +5096,6 @@ MLX5_TXOFF_INFO(mti, MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA) - MLX5_TXOFF_INFO(mtv, MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO | MLX5_TXOFF_CONFIG_VLAN | @@ -4922,6 +5136,23 @@ MLX5_TXOFF_INFO(v, MLX5_TXOFF_INFO(iv, MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN | MLX5_TXOFF_CONFIG_METADATA) + +MLX5_TXOFF_INFO(none_mpw, + MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mci_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(mc_mpw, + MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | + MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) + +MLX5_TXOFF_INFO(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) }; /** @@ -5004,18 +5235,29 @@ mlx5_select_tx_function(struct rte_eth_dev *dev) if (config->mps == MLX5_MPW_ENHANCED && config->txq_inline_min <= 0) { /* - * The NIC supports Enhanced Multi-Packet Write. 
- * We do not support legacy MPW due to its - * hardware related problems, so we just ignore - * legacy MLX5_MPW settings. There should be no - * minimal required inline data. + * The NIC supports Enhanced Multi-Packet Write + * and does not require minimal inline data. */ olx |= MLX5_TXOFF_CONFIG_EMPW; } - if (tx_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) { + if (rte_flow_dynf_metadata_avail()) { /* We should support Flow metadata. */ olx |= MLX5_TXOFF_CONFIG_METADATA; } + if (config->mps == MLX5_MPW) { + /* + * The NIC supports Legacy Multi-Packet Write. + * The MLX5_TXOFF_CONFIG_MPW controls the + * descriptor building method in combination + * with MLX5_TXOFF_CONFIG_EMPW. + */ + if (!(olx & (MLX5_TXOFF_CONFIG_TSO | + MLX5_TXOFF_CONFIG_SWP | + MLX5_TXOFF_CONFIG_VLAN | + MLX5_TXOFF_CONFIG_METADATA))) + olx |= MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW; + } /* * Scan the routines table to find the minimal * satisfying routine with requested offloads. @@ -5084,9 +5326,11 @@ mlx5_select_tx_function(struct rte_eth_dev *dev) DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)"); if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA) DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)"); - if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) - DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) { + if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW) + DRV_LOG(DEBUG, "\tMPW (Legacy MPW)"); + else + DRV_LOG(DEBUG, "\tEMPW (Enhanced MPW)"); + } return txoff_func[m].func; } - -