From: NĂ©lio Laranjeiro Date: Fri, 24 Jun 2016 13:17:53 +0000 (+0200) Subject: net/mlx5: refactor Tx data path X-Git-Tag: spdx-start~6394 X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=1d88ba1719429c1e41bd61ab3add65ad9cc43551;p=dpdk.git net/mlx5: refactor Tx data path Bypass Verbs to improve Tx performance. Signed-off-by: Nelio Laranjeiro Signed-off-by: Yaacov Hazan Signed-off-by: Adrien Mazarguil --- diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index aaedd78a4a..f6d39388e8 100644 --- a/drivers/net/mlx5/Makefile +++ b/drivers/net/mlx5/Makefile @@ -106,11 +106,6 @@ mlx5_autoconf.h.new: FORCE mlx5_autoconf.h.new: $(RTE_SDK)/scripts/auto-config-h.sh $Q $(RM) -f -- '$@' - $Q sh -- '$<' '$@' \ - HAVE_VERBS_VLAN_INSERTION \ - infiniband/verbs.h \ - enum IBV_EXP_RECEIVE_WQ_CVLAN_INSERTION \ - $(AUTOCONF_OUTPUT) $Q sh -- '$<' '$@' \ HAVE_VERBS_IBV_EXP_CQ_COMPRESSED_CQE \ infiniband/verbs_exp.h \ diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c index 16b05d3bad..47e64b2a7e 100644 --- a/drivers/net/mlx5/mlx5_ethdev.c +++ b/drivers/net/mlx5/mlx5_ethdev.c @@ -1242,11 +1242,11 @@ mlx5_secondary_data_setup(struct priv *priv) txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), 0, primary_txq_ctrl->socket); if (txq_ctrl != NULL) { - if (txq_setup(priv->dev, - primary_txq_ctrl, - primary_txq->elts_n, - primary_txq_ctrl->socket, - NULL) == 0) { + if (txq_ctrl_setup(priv->dev, + primary_txq_ctrl, + primary_txq->elts_n, + primary_txq_ctrl->socket, + NULL) == 0) { txq_ctrl->txq.stats.idx = primary_txq->stats.idx; tx_queues[i] = &txq_ctrl->txq; diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index 1d8bf725c9..67dfefa8da 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -190,7 +190,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx) /* Add a new entry, register MR first. */ DEBUG("%p: discovered new memory pool \"%s\" (%p)", (void *)txq_ctrl, mp->name, (void *)mp); - mr = mlx5_mp2mr(txq_ctrl->txq.priv->pd, mp); + mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp); if (unlikely(mr == NULL)) { DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.", (void *)txq_ctrl); @@ -209,7 +209,7 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx) /* Store the new entry. */ txq_ctrl->txq.mp2mr[idx].mp = mp; txq_ctrl->txq.mp2mr[idx].mr = mr; - txq_ctrl->txq.mp2mr[idx].lkey = mr->lkey; + txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey); DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32, (void *)txq_ctrl, mp->name, (void *)mp, txq_ctrl->txq.mp2mr[idx].lkey); diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index 87d09e9b8c..5060d1a56e 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -119,68 +119,52 @@ get_cqe64(volatile struct mlx5_cqe cqes[], * * @param txq * Pointer to TX queue structure. - * - * @return - * 0 on success, -1 on failure. 
*/ -static int +static void txq_complete(struct txq *txq) { - unsigned int elts_comp = txq->elts_comp; - unsigned int elts_tail = txq->elts_tail; - unsigned int elts_free = txq->elts_tail; const unsigned int elts_n = txq->elts_n; - int wcs_n; - - if (unlikely(elts_comp == 0)) - return 0; -#ifdef DEBUG_SEND - DEBUG("%p: processing %u work requests completions", - (void *)txq, elts_comp); -#endif - wcs_n = txq->poll_cnt(txq->cq, elts_comp); - if (unlikely(wcs_n == 0)) - return 0; - if (unlikely(wcs_n < 0)) { - DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)", - (void *)txq, wcs_n); - return -1; + const unsigned int cqe_n = txq->cqe_n; + uint16_t elts_free = txq->elts_tail; + uint16_t elts_tail; + uint16_t cq_ci = txq->cq_ci; + unsigned int wqe_ci = (unsigned int)-1; + int ret = 0; + + while (ret == 0) { + volatile struct mlx5_cqe64 *cqe; + + cqe = get_cqe64(*txq->cqes, cqe_n, &cq_ci); + if (cqe == NULL) + break; + wqe_ci = ntohs(cqe->wqe_counter); } - elts_comp -= wcs_n; - assert(elts_comp <= txq->elts_comp); - /* - * Assume WC status is successful as nothing can be done about it - * anyway. - */ - elts_tail += wcs_n * txq->elts_comp_cd_init; - if (elts_tail >= elts_n) - elts_tail -= elts_n; - - while (elts_free != elts_tail) { - struct txq_elt *elt = &(*txq->elts)[elts_free]; + if (unlikely(wqe_ci == (unsigned int)-1)) + return; + /* Free buffers. */ + elts_tail = (wqe_ci + 1) & (elts_n - 1); + do { + struct rte_mbuf *elt = (*txq->elts)[elts_free]; unsigned int elts_free_next = - (((elts_free + 1) == elts_n) ? 0 : elts_free + 1); - struct rte_mbuf *tmp = elt->buf; - struct txq_elt *elt_next = &(*txq->elts)[elts_free_next]; + (elts_free + 1) & (elts_n - 1); + struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next]; #ifndef NDEBUG /* Poisoning. */ - memset(elt, 0x66, sizeof(*elt)); + memset(&(*txq->elts)[elts_free], + 0x66, + sizeof((*txq->elts)[elts_free])); #endif - RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); - /* Faster than rte_pktmbuf_free(). */ - do { - struct rte_mbuf *next = NEXT(tmp); - - rte_pktmbuf_free_seg(tmp); - tmp = next; - } while (tmp != NULL); + RTE_MBUF_PREFETCH_TO_FREE(elt_next); + /* Only one segment needs to be freed. */ + rte_pktmbuf_free_seg(elt); elts_free = elts_free_next; - } - + } while (elts_free != elts_tail); + txq->cq_ci = cq_ci; txq->elts_tail = elts_tail; - txq->elts_comp = elts_comp; - return 0; + /* Update the consumer index. */ + rte_wmb(); + *txq->cq_db = htonl(cq_ci); } /** @@ -231,7 +215,8 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp) } if (txq->mp2mr[i].mp == mp) { assert(txq->mp2mr[i].lkey != (uint32_t)-1); - assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey); + assert(htonl(txq->mp2mr[i].mr->lkey) == + txq->mp2mr[i].lkey); lkey = txq->mp2mr[i].lkey; break; } @@ -242,33 +227,136 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp) } /** - * Insert VLAN using mbuf headroom space. - * - * @param buf - * Buffer for VLAN insertion. + * Write a regular WQE. * - * @return - * 0 on success, errno value on failure. + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the WQE to fill. + * @param addr + * Buffer data address. + * @param length + * Packet length. + * @param lkey + * Memory region lkey. 
*/ -static inline int -insert_vlan_sw(struct rte_mbuf *buf) +static inline void +mlx5_wqe_write(struct txq *txq, volatile union mlx5_wqe *wqe, + uintptr_t addr, uint32_t length, uint32_t lkey) { - uintptr_t addr; - uint32_t vlan; - uint16_t head_room_len = rte_pktmbuf_headroom(buf); + wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND); + wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4); + wqe->wqe.ctrl.data[3] = 0; + wqe->inl.eseg.rsvd0 = 0; + wqe->inl.eseg.rsvd1 = 0; + wqe->inl.eseg.mss = 0; + wqe->inl.eseg.rsvd2 = 0; + wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + /* Copy the first 16 bytes into inline header. */ + rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start, + (uint8_t *)(uintptr_t)addr, + MLX5_ETH_INLINE_HEADER_SIZE); + addr += MLX5_ETH_INLINE_HEADER_SIZE; + length -= MLX5_ETH_INLINE_HEADER_SIZE; + /* Store remaining data in data segment. */ + wqe->wqe.dseg.byte_count = htonl(length); + wqe->wqe.dseg.lkey = lkey; + wqe->wqe.dseg.addr = htonll(addr); + /* Increment consumer index. */ + ++txq->wqe_ci; +} - if (head_room_len < 4) - return EINVAL; +/** + * Write a regular WQE with VLAN. + * + * @param txq + * Pointer to TX queue structure. + * @param wqe + * Pointer to the WQE to fill. + * @param addr + * Buffer data address. + * @param length + * Packet length. + * @param lkey + * Memory region lkey. + * @param vlan_tci + * VLAN field to insert in packet. + */ +static inline void +mlx5_wqe_write_vlan(struct txq *txq, volatile union mlx5_wqe *wqe, + uintptr_t addr, uint32_t length, uint32_t lkey, + uint16_t vlan_tci) +{ + uint32_t vlan = htonl(0x81000000 | vlan_tci); + + wqe->wqe.ctrl.data[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND); + wqe->wqe.ctrl.data[1] = htonl((txq->qp_num_8s) | 4); + wqe->wqe.ctrl.data[3] = 0; + wqe->inl.eseg.rsvd0 = 0; + wqe->inl.eseg.rsvd1 = 0; + wqe->inl.eseg.mss = 0; + wqe->inl.eseg.rsvd2 = 0; + wqe->wqe.eseg.inline_hdr_sz = htons(MLX5_ETH_VLAN_INLINE_HEADER_SIZE); + /* + * Copy 12 bytes of source & destination MAC address. + * Copy 4 bytes of VLAN. + * Copy 2 bytes of Ether type. + */ + rte_memcpy((uint8_t *)(uintptr_t)wqe->wqe.eseg.inline_hdr_start, + (uint8_t *)(uintptr_t)addr, 12); + rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 12), + &vlan, sizeof(vlan)); + rte_memcpy((uint8_t *)((uintptr_t)wqe->wqe.eseg.inline_hdr_start + 16), + (uint8_t *)((uintptr_t)addr + 12), 2); + addr += MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan); + length -= MLX5_ETH_VLAN_INLINE_HEADER_SIZE - sizeof(vlan); + /* Store remaining data in data segment. */ + wqe->wqe.dseg.byte_count = htonl(length); + wqe->wqe.dseg.lkey = lkey; + wqe->wqe.dseg.addr = htonll(addr); + /* Increment consumer index. */ + ++txq->wqe_ci; +} - addr = rte_pktmbuf_mtod(buf, uintptr_t); - vlan = htonl(0x81000000 | buf->vlan_tci); - memmove((void *)(addr - 4), (void *)addr, 12); - memcpy((void *)(addr + 8), &vlan, sizeof(vlan)); +/** + * Ring TX queue doorbell. + * + * @param txq + * Pointer to TX queue structure. + */ +static inline void +mlx5_tx_dbrec(struct txq *txq) +{ + uint8_t *dst = (uint8_t *)((uintptr_t)txq->bf_reg + txq->bf_offset); + uint32_t data[4] = { + htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND), + htonl(txq->qp_num_8s), + 0, + 0, + }; + rte_wmb(); + *txq->qp_db = htonl(txq->wqe_ci); + /* Ensure ordering between DB record and BF copy. 
*/ + rte_wmb(); + rte_mov16(dst, (uint8_t *)data); + txq->bf_offset ^= txq->bf_buf_size; +} - SET_DATA_OFF(buf, head_room_len - 4); - DATA_LEN(buf) += 4; +/** + * Prefetch a CQE. + * + * @param txq + * Pointer to TX queue structure. + * @param cqe_ci + * CQE consumer index. + */ +static inline void +tx_prefetch_cqe(struct txq *txq, uint16_t ci) +{ + volatile struct mlx5_cqe64 *cqe; - return 0; + cqe = &(*txq->cqes)[ci & (txq->cqe_n - 1)].cqe64; + rte_prefetch0(cqe); } /** @@ -288,18 +376,21 @@ uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; - unsigned int elts_head = txq->elts_head; + uint16_t elts_head = txq->elts_head; const unsigned int elts_n = txq->elts_n; - unsigned int elts_comp_cd = txq->elts_comp_cd; - unsigned int elts_comp = 0; unsigned int i; unsigned int max; - int err; - struct rte_mbuf *buf = pkts[0]; + volatile union mlx5_wqe *wqe; + struct rte_mbuf *buf; - assert(elts_comp_cd != 0); + if (unlikely(!pkts_n)) + return 0; + buf = pkts[0]; /* Prefetch first packet cacheline. */ + tx_prefetch_cqe(txq, txq->cq_ci); + tx_prefetch_cqe(txq, txq->cq_ci + 1); rte_prefetch0(buf); + /* Start processing. */ txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); if (max > elts_n) @@ -313,101 +404,53 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) if (max > pkts_n) max = pkts_n; for (i = 0; (i != max); ++i) { - struct rte_mbuf *buf_next = pkts[i + 1]; - unsigned int elts_head_next = - (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); - struct txq_elt *elt = &(*txq->elts)[elts_head]; - uint32_t send_flags = 0; -#ifdef HAVE_VERBS_VLAN_INSERTION - int insert_vlan = 0; -#endif /* HAVE_VERBS_VLAN_INSERTION */ + unsigned int elts_head_next = (elts_head + 1) & (elts_n - 1); uintptr_t addr; uint32_t length; uint32_t lkey; - uintptr_t buf_next_addr; + wqe = &(*txq->wqes)[txq->wqe_ci & (txq->wqe_n - 1)]; + rte_prefetch0(wqe); if (i + 1 < max) - rte_prefetch0(buf_next); - /* Request TX completion. */ - if (unlikely(--elts_comp_cd == 0)) { - elts_comp_cd = txq->elts_comp_cd_init; - ++elts_comp; - send_flags |= IBV_EXP_QP_BURST_SIGNALED; - } - /* Should we enable HW CKSUM offload */ - if (buf->ol_flags & - (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { - send_flags |= IBV_EXP_QP_BURST_IP_CSUM; - /* HW does not support checksum offloads at arbitrary - * offsets but automatically recognizes the packet - * type. For inner L3/L4 checksums, only VXLAN (UDP) - * tunnels are currently supported. */ - if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) - send_flags |= IBV_EXP_QP_BURST_TUNNEL; - } - if (buf->ol_flags & PKT_TX_VLAN_PKT) { -#ifdef HAVE_VERBS_VLAN_INSERTION - if (!txq->priv->mps) - insert_vlan = 1; - else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - { - err = insert_vlan_sw(buf); - if (unlikely(err)) - goto stop; - } - } + rte_prefetch0(pkts[i + 1]); /* Retrieve buffer information. */ addr = rte_pktmbuf_mtod(buf, uintptr_t); length = DATA_LEN(buf); /* Update element. */ - elt->buf = buf; - if (txq->priv->sriov) - rte_prefetch0((volatile void *) - (uintptr_t)addr); + (*txq->elts)[elts_head] = buf; /* Prefetch next buffer data. */ - if (i + 1 < max) { - buf_next_addr = - rte_pktmbuf_mtod(buf_next, uintptr_t); - rte_prefetch0((volatile void *) - (uintptr_t)buf_next_addr); - } + if (i + 1 < max) + rte_prefetch0(rte_pktmbuf_mtod(pkts[i + 1], + volatile void *)); /* Retrieve Memory Region key for this memory pool. 
*/ lkey = txq_mp2mr(txq, txq_mb2mp(buf)); - if (unlikely(lkey == (uint32_t)-1)) { - /* MR does not exist. */ - DEBUG("%p: unable to get MP <-> MR" - " association", (void *)txq); - /* Clean up TX element. */ - elt->buf = NULL; - goto stop; - } -#ifdef HAVE_VERBS_VLAN_INSERTION - if (insert_vlan) - err = txq->send_pending_vlan - (txq->qp, - addr, - length, - lkey, - send_flags, - &buf->vlan_tci); + if (buf->ol_flags & PKT_TX_VLAN_PKT) + mlx5_wqe_write_vlan(txq, wqe, addr, length, lkey, + buf->vlan_tci); else -#endif /* HAVE_VERBS_VLAN_INSERTION */ - err = txq->send_pending - (txq->qp, - addr, - length, - lkey, - send_flags); - if (unlikely(err)) - goto stop; + mlx5_wqe_write(txq, wqe, addr, length, lkey); + /* Request completion if needed. */ + if (unlikely(--txq->elts_comp == 0)) { + wqe->wqe.ctrl.data[2] = htonl(8); + txq->elts_comp = txq->elts_comp_cd_init; + } else { + wqe->wqe.ctrl.data[2] = 0; + } + /* Should we enable HW CKSUM offload */ + if (buf->ol_flags & + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { + wqe->wqe.eseg.cs_flags = + MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + } else { + wqe->wqe.eseg.cs_flags = 0; + } #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ txq->stats.obytes += length; #endif -stop: elts_head = elts_head_next; - buf = buf_next; + buf = pkts[i + 1]; } /* Take a shortcut if nothing must be sent. */ if (unlikely(i == 0)) @@ -417,16 +460,8 @@ stop: txq->stats.opackets += i; #endif /* Ring QP doorbell. */ - err = txq->send_flush(txq->qp); - if (unlikely(err)) { - /* A nonzero value is not supposed to be returned. - * Nothing can be done about it. */ - DEBUG("%p: send_flush() failed with error %d", - (void *)txq, err); - } + mlx5_tx_dbrec(txq); txq->elts_head = elts_head; - txq->elts_comp += elts_comp; - txq->elts_comp_cd = elts_comp_cd; return i; } diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index 1827123760..6b3bb2df4f 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -223,44 +223,40 @@ struct hash_rxq { [MLX5_MAX_SPECIAL_FLOWS][MLX5_MAX_VLAN_IDS]; }; -/* TX element. */ -struct txq_elt { - struct rte_mbuf *buf; -}; - /* TX queue descriptor. */ struct txq { - struct priv *priv; /* Back pointer to private data. */ - int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max); - int (*send_pending)(); -#ifdef HAVE_VERBS_VLAN_INSERTION - int (*send_pending_vlan)(); -#endif - int (*send_flush)(struct ibv_qp *qp); - struct ibv_cq *cq; /* Completion Queue. */ - struct ibv_qp *qp; /* Queue Pair. */ - struct txq_elt (*elts)[]; /* TX elements. */ - unsigned int elts_n; /* (*elts)[] length. */ - unsigned int elts_head; /* Current index in (*elts)[]. */ - unsigned int elts_tail; /* First element awaiting completion. */ - unsigned int elts_comp; /* Number of completion requests. */ - unsigned int elts_comp_cd; /* Countdown for next completion request. */ - unsigned int elts_comp_cd_init; /* Initial value for countdown. */ + uint16_t elts_head; /* Current index in (*elts)[]. */ + uint16_t elts_tail; /* First element awaiting completion. */ + uint16_t elts_comp_cd_init; /* Initial value for countdown. */ + uint16_t elts_comp; /* Elements before asking a completion. */ + uint16_t elts_n; /* (*elts)[] length. */ + uint16_t cq_ci; /* Consumer index for completion queue. */ + uint16_t cqe_n; /* Number of CQ elements. */ + uint16_t wqe_ci; /* Consumer index for work queue. */ + uint16_t wqe_n; /* Number of WQ elements. */ + uint16_t bf_offset; /* Blueflame offset. 
*/ + uint16_t bf_buf_size; /* Blueflame size. */ + volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */ + volatile union mlx5_wqe (*wqes)[]; /* Work queue. */ + volatile uint32_t *qp_db; /* Work queue doorbell. */ + volatile uint32_t *cq_db; /* Completion queue doorbell. */ + volatile void *bf_reg; /* Blueflame register. */ struct { const struct rte_mempool *mp; /* Cached Memory Pool. */ struct ibv_mr *mr; /* Memory Region (for mp). */ - uint32_t lkey; /* mr->lkey */ + uint32_t lkey; /* htonl(mr->lkey) */ } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ + struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ + uint32_t qp_num_8s; /* QP number shifted by 8. */ } __rte_cache_aligned; /* TX queue control descriptor. */ struct txq_ctrl { -#ifdef HAVE_VERBS_VLAN_INSERTION - struct ibv_exp_qp_burst_family_v1 *if_qp; /* QP burst interface. */ -#else + struct priv *priv; /* Back pointer to private data. */ + struct ibv_cq *cq; /* Completion Queue. */ + struct ibv_qp *qp; /* Queue Pair. */ struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */ -#endif struct ibv_exp_cq_family *if_cq; /* CQ interface. */ struct ibv_exp_res_domain *rd; /* Resource Domain. */ unsigned int socket; /* CPU socket ID for allocations. */ @@ -294,8 +290,8 @@ uint16_t mlx5_rx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t); /* mlx5_txq.c */ void txq_cleanup(struct txq_ctrl *); -int txq_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, unsigned int, - const struct rte_eth_txconf *); +int txq_ctrl_setup(struct rte_eth_dev *, struct txq_ctrl *, uint16_t, + unsigned int, const struct rte_eth_txconf *); int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int, const struct rte_eth_txconf *); void mlx5_tx_queue_release(void *); diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index ec4488aa10..26d6168543 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -60,6 +60,7 @@ #endif #include "mlx5_utils.h" +#include "mlx5_defs.h" #include "mlx5.h" #include "mlx5_rxtx.h" #include "mlx5_autoconf.h" @@ -72,48 +73,22 @@ * Pointer to TX queue structure. * @param elts_n * Number of elements to allocate. - * - * @return - * 0 on success, errno value on failure. */ -static int +static void txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n) { unsigned int i; - struct txq_elt (*elts)[elts_n] = - rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq_ctrl->socket); - int ret = 0; - if (elts == NULL) { - ERROR("%p: can't allocate packets array", (void *)txq_ctrl); - ret = ENOMEM; - goto error; - } - for (i = 0; (i != elts_n); ++i) { - struct txq_elt *elt = &(*elts)[i]; + for (i = 0; (i != elts_n); ++i) + (*txq_ctrl->txq.elts)[i] = NULL; + for (i = 0; (i != txq_ctrl->txq.wqe_n); ++i) { + volatile union mlx5_wqe *wqe = &(*txq_ctrl->txq.wqes)[i]; - elt->buf = NULL; + memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe)); } DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n); - txq_ctrl->txq.elts_n = elts_n; - txq_ctrl->txq.elts = elts; txq_ctrl->txq.elts_head = 0; txq_ctrl->txq.elts_tail = 0; - txq_ctrl->txq.elts_comp = 0; - /* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or - * at least 4 times per ring. */ - txq_ctrl->txq.elts_comp_cd_init = - ((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ? 
- MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4)); - txq_ctrl->txq.elts_comp_cd = txq_ctrl->txq.elts_comp_cd_init; - assert(ret == 0); - return 0; -error: - rte_free(elts); - - DEBUG("%p: failed, freed everything", (void *)txq_ctrl); - assert(ret > 0); - return ret; } /** @@ -128,32 +103,26 @@ txq_free_elts(struct txq_ctrl *txq_ctrl) unsigned int elts_n = txq_ctrl->txq.elts_n; unsigned int elts_head = txq_ctrl->txq.elts_head; unsigned int elts_tail = txq_ctrl->txq.elts_tail; - struct txq_elt (*elts)[elts_n] = txq_ctrl->txq.elts; + struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts; DEBUG("%p: freeing WRs", (void *)txq_ctrl); - txq_ctrl->txq.elts_n = 0; txq_ctrl->txq.elts_head = 0; txq_ctrl->txq.elts_tail = 0; - txq_ctrl->txq.elts_comp = 0; - txq_ctrl->txq.elts_comp_cd = 0; - txq_ctrl->txq.elts_comp_cd_init = 0; - txq_ctrl->txq.elts = NULL; - if (elts == NULL) - return; while (elts_tail != elts_head) { - struct txq_elt *elt = &(*elts)[elts_tail]; + struct rte_mbuf *elt = (*elts)[elts_tail]; - assert(elt->buf != NULL); - rte_pktmbuf_free(elt->buf); + assert(elt != NULL); + rte_pktmbuf_free(elt); #ifndef NDEBUG /* Poisoning. */ - memset(elt, 0x77, sizeof(*elt)); + memset(&(*elts)[elts_tail], + 0x77, + sizeof((*elts)[elts_tail])); #endif if (++elts_tail == elts_n) elts_tail = 0; } - rte_free(elts); } /** @@ -172,42 +141,40 @@ txq_cleanup(struct txq_ctrl *txq_ctrl) DEBUG("cleaning up %p", (void *)txq_ctrl); txq_free_elts(txq_ctrl); - txq_ctrl->txq.poll_cnt = NULL; - txq_ctrl->txq.send_flush = NULL; if (txq_ctrl->if_qp != NULL) { - assert(txq_ctrl->txq.priv != NULL); - assert(txq_ctrl->txq.priv->ctx != NULL); - assert(txq_ctrl->txq.qp != NULL); + assert(txq_ctrl->priv != NULL); + assert(txq_ctrl->priv->ctx != NULL); + assert(txq_ctrl->qp != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx, + claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx, txq_ctrl->if_qp, ¶ms)); } if (txq_ctrl->if_cq != NULL) { - assert(txq_ctrl->txq.priv != NULL); - assert(txq_ctrl->txq.priv->ctx != NULL); - assert(txq_ctrl->txq.cq != NULL); + assert(txq_ctrl->priv != NULL); + assert(txq_ctrl->priv->ctx != NULL); + assert(txq_ctrl->cq != NULL); params = (struct ibv_exp_release_intf_params){ .comp_mask = 0, }; - claim_zero(ibv_exp_release_intf(txq_ctrl->txq.priv->ctx, + claim_zero(ibv_exp_release_intf(txq_ctrl->priv->ctx, txq_ctrl->if_cq, ¶ms)); } - if (txq_ctrl->txq.qp != NULL) - claim_zero(ibv_destroy_qp(txq_ctrl->txq.qp)); - if (txq_ctrl->txq.cq != NULL) - claim_zero(ibv_destroy_cq(txq_ctrl->txq.cq)); + if (txq_ctrl->qp != NULL) + claim_zero(ibv_destroy_qp(txq_ctrl->qp)); + if (txq_ctrl->cq != NULL) + claim_zero(ibv_destroy_cq(txq_ctrl->cq)); if (txq_ctrl->rd != NULL) { struct ibv_exp_destroy_res_domain_attr attr = { .comp_mask = 0, }; - assert(txq_ctrl->txq.priv != NULL); - assert(txq_ctrl->txq.priv->ctx != NULL); - claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->txq.priv->ctx, + assert(txq_ctrl->priv != NULL); + assert(txq_ctrl->priv->ctx != NULL); + claim_zero(ibv_exp_destroy_res_domain(txq_ctrl->priv->ctx, txq_ctrl->rd, &attr)); } @@ -220,6 +187,49 @@ txq_cleanup(struct txq_ctrl *txq_ctrl) memset(txq_ctrl, 0, sizeof(*txq_ctrl)); } +/** + * Initialize TX queue. + * + * @param tmpl + * Pointer to TX queue control template. + * @param txq_ctrl + * Pointer to TX queue control. + * + * @return + * 0 on success, errno value on failure. 
+ */ +static inline int +txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl) +{ + struct mlx5_qp *qp = to_mqp(tmpl->qp); + struct ibv_cq *ibcq = tmpl->cq; + struct mlx5_cq *cq = to_mxxx(cq, cq); + + if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) { + ERROR("Wrong MLX5_CQE_SIZE environment variable value: " + "it should be set to %u", RTE_CACHE_LINE_SIZE); + return EINVAL; + } + tmpl->txq.cqe_n = ibcq->cqe + 1; + tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8; + tmpl->txq.wqes = + (volatile union mlx5_wqe (*)[]) + (uintptr_t)qp->gen_data.sqstart; + tmpl->txq.wqe_n = qp->sq.wqe_cnt; + tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR]; + tmpl->txq.bf_reg = qp->gen_data.bf->reg; + tmpl->txq.bf_offset = qp->gen_data.bf->offset; + tmpl->txq.bf_buf_size = qp->gen_data.bf->buf_size; + tmpl->txq.cq_db = cq->dbrec; + tmpl->txq.cqes = + (volatile struct mlx5_cqe (*)[]) + (uintptr_t)cq->active_buf->buf; + tmpl->txq.elts = + (struct rte_mbuf *(*)[tmpl->txq.elts_n]) + ((uintptr_t)txq_ctrl + sizeof(*txq_ctrl)); + return 0; +} + /** * Configure a TX queue. * @@ -238,15 +248,14 @@ txq_cleanup(struct txq_ctrl *txq_ctrl) * 0 on success, errno value on failure. */ int -txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, - unsigned int socket, const struct rte_eth_txconf *conf) +txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, + uint16_t desc, unsigned int socket, + const struct rte_eth_txconf *conf) { struct priv *priv = mlx5_get_priv(dev); struct txq_ctrl tmpl = { + .priv = priv, .socket = socket, - .txq = { - .priv = priv, - }, }; union { struct ibv_exp_query_intf_params params; @@ -254,15 +263,21 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, struct ibv_exp_res_domain_init_attr rd; struct ibv_exp_cq_init_attr cq; struct ibv_exp_qp_attr mod; + struct ibv_exp_cq_attr cq_attr; } attr; enum ibv_exp_query_intf_status status; int ret = 0; (void)conf; /* Thresholds configuration (ignored). */ - if (desc == 0) { - ERROR("%p: invalid number of TX descriptors", (void *)dev); - return EINVAL; - } + tmpl.txq.elts_n = desc; + /* + * Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or + * at least 4 times per ring. + */ + tmpl.txq.elts_comp_cd_init = + ((MLX5_PMD_TX_PER_COMP_REQ < (desc / 4)) ? + MLX5_PMD_TX_PER_COMP_REQ : (desc / 4)); + tmpl.txq.elts_comp = tmpl.txq.elts_comp_cd_init; /* MRs will be registered in mp2mr[] later. */ attr.rd = (struct ibv_exp_res_domain_init_attr){ .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | @@ -281,9 +296,10 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, .res_domain = tmpl.rd, }; - tmpl.txq.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, - &attr.cq); - if (tmpl.txq.cq == NULL) { + tmpl.cq = ibv_exp_create_cq(priv->ctx, + (desc / tmpl.txq.elts_comp_cd_init) - 1, + NULL, NULL, 0, &attr.cq); + if (tmpl.cq == NULL) { ret = ENOMEM; ERROR("%p: CQ creation failure: %s", (void *)dev, strerror(ret)); @@ -295,9 +311,9 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, priv->device_attr.max_sge); attr.init = (struct ibv_exp_qp_init_attr){ /* CQ to be associated with the send queue. */ - .send_cq = tmpl.txq.cq, + .send_cq = tmpl.cq, /* CQ to be associated with the receive queue. */ - .recv_cq = tmpl.txq.cq, + .recv_cq = tmpl.cq, .cap = { /* Max number of outstanding WRs. */ .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ? 
@@ -315,8 +331,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), }; - tmpl.txq.qp = ibv_exp_create_qp(priv->ctx, &attr.init); - if (tmpl.txq.qp == NULL) { + tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); + if (tmpl.qp == NULL) { ret = (errno ? errno : EINVAL); ERROR("%p: QP creation failure: %s", (void *)dev, strerror(ret)); @@ -328,30 +344,31 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, /* Primary port number. */ .port_num = priv->port }; - ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, + ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT)); if (ret) { ERROR("%p: QP state to IBV_QPS_INIT failed: %s", (void *)dev, strerror(ret)); goto error; } - ret = txq_alloc_elts(&tmpl, desc); + ret = txq_setup(&tmpl, txq_ctrl); if (ret) { - ERROR("%p: TXQ allocation failed: %s", + ERROR("%p: cannot initialize TX queue structure: %s", (void *)dev, strerror(ret)); goto error; } + txq_alloc_elts(&tmpl, desc); attr.mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RTR }; - ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE); + ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); if (ret) { ERROR("%p: QP state to IBV_QPS_RTR failed: %s", (void *)dev, strerror(ret)); goto error; } attr.mod.qp_state = IBV_QPS_RTS; - ret = ibv_exp_modify_qp(tmpl.txq.qp, &attr.mod, IBV_EXP_QP_STATE); + ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); if (ret) { ERROR("%p: QP state to IBV_QPS_RTS failed: %s", (void *)dev, strerror(ret)); @@ -360,7 +377,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, attr.params = (struct ibv_exp_query_intf_params){ .intf_scope = IBV_EXP_INTF_GLOBAL, .intf = IBV_EXP_INTF_CQ, - .obj = tmpl.txq.cq, + .obj = tmpl.cq, }; tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); if (tmpl.if_cq == NULL) { @@ -372,10 +389,8 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, attr.params = (struct ibv_exp_query_intf_params){ .intf_scope = IBV_EXP_INTF_GLOBAL, .intf = IBV_EXP_INTF_QP_BURST, - .obj = tmpl.txq.qp, -#ifdef HAVE_VERBS_VLAN_INSERTION .intf_version = 1, -#endif + .obj = tmpl.qp, /* Enable multi-packet send if supported. */ .family_flags = (priv->mps ? @@ -393,12 +408,6 @@ txq_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl, uint16_t desc, DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl); txq_cleanup(txq_ctrl); *txq_ctrl = tmpl; - txq_ctrl->txq.poll_cnt = txq_ctrl->if_cq->poll_cnt; - txq_ctrl->txq.send_pending = txq_ctrl->if_qp->send_pending; -#ifdef HAVE_VERBS_VLAN_INSERTION - txq_ctrl->txq.send_pending_vlan = txq_ctrl->if_qp->send_pending_vlan; -#endif - txq_ctrl->txq.send_flush = txq_ctrl->if_qp->send_flush; DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl); /* Pre-register known mempools. 
*/ rte_mempool_walk(txq_mp2mr_iter, txq_ctrl); @@ -433,15 +442,19 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, { struct priv *priv = dev->data->dev_private; struct txq *txq = (*priv->txqs)[idx]; - struct txq_ctrl *txq_ctrl; + struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq); int ret; if (mlx5_is_secondary()) return -E_RTE_SECONDARY; priv_lock(priv); - if (txq) - txq_ctrl = container_of(txq, struct txq_ctrl, txq); + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + WARN("%p: increased number of descriptors in TX queue %u" + " to the next power of two (%d)", + (void *)dev, idx, desc); + } DEBUG("%p: configuring queue %u for %u descriptors", (void *)dev, idx, desc); if (idx >= priv->txqs_n) { @@ -460,8 +473,11 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, (*priv->txqs)[idx] = NULL; txq_cleanup(txq_ctrl); } else { - txq_ctrl = rte_calloc_socket("TXQ", 1, sizeof(*txq_ctrl), - 0, socket); + txq_ctrl = + rte_calloc_socket("TXQ", 1, + sizeof(*txq_ctrl) + + desc * sizeof(struct rte_mbuf *), + 0, socket); if (txq_ctrl == NULL) { ERROR("%p: unable to allocate queue index %u", (void *)dev, idx); @@ -469,7 +485,7 @@ mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, return -ENOMEM; } } - ret = txq_setup(dev, txq_ctrl, desc, socket, conf); + ret = txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf); if (ret) rte_free(txq_ctrl); else { @@ -504,7 +520,7 @@ mlx5_tx_queue_release(void *dpdk_txq) if (txq == NULL) return; txq_ctrl = container_of(txq, struct txq_ctrl, txq); - priv = txq->priv; + priv = txq_ctrl->priv; priv_lock(priv); for (i = 0; (i != priv->txqs_n); ++i) if ((*priv->txqs)[i] == txq) { @@ -539,7 +555,8 @@ mlx5_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = dpdk_txq; - struct priv *priv = mlx5_secondary_data_setup(txq->priv); + struct txq_ctrl *txq_ctrl = container_of(txq, struct txq_ctrl, txq); + struct priv *priv = mlx5_secondary_data_setup(txq_ctrl->priv); struct priv *primary_priv; unsigned int index;
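
The reworked data path above keeps wqe_ci, cq_ci and the element indices as free-running 16-bit counters and reduces them with "& (n - 1)" on every access (see mlx5_tx_burst() and txq_complete()), which is only correct when the ring sizes are powers of two -- the reason mlx5_tx_queue_setup() now rounds desc up with log2above() and warns about it. The following standalone sketch is not part of the patch; the toy_ring type and its helpers are invented purely to show that indexing scheme in isolation.

/*
 * Sketch only: free-running 16-bit indices masked on access, as wqe_ci and
 * cq_ci are used in the patch above.  toy_ring and its functions do not
 * exist in the driver.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RING_N 256u                     /* power of two, like wqe_n/elts_n */

struct toy_ring {
	uint16_t ci;                    /* free-running producer counter */
	uint16_t tail;                  /* free-running consumer counter */
	int slots[RING_N];
};

/* Entries currently in flight; valid across uint16_t wraparound. */
static uint16_t
toy_used(const struct toy_ring *r)
{
	return (uint16_t)(r->ci - r->tail);
}

static void
toy_post(struct toy_ring *r, int v)
{
	assert(toy_used(r) < RING_N);
	r->slots[r->ci & (RING_N - 1)] = v; /* mask only when indexing */
	++r->ci;                            /* the counter itself keeps running */
}

static int
toy_consume(struct toy_ring *r)
{
	assert(toy_used(r) > 0);
	return r->slots[r->tail++ & (RING_N - 1)];
}

int
main(void)
{
	struct toy_ring r = { .ci = 65500, .tail = 65500 }; /* near wraparound */
	unsigned int i;

	for (i = 0; i < 100; ++i) {
		toy_post(&r, (int)i);
		assert(toy_consume(&r) == (int)i);
	}
	printf("ci=%u tail=%u used=%u\n",
	       (unsigned int)r.ci, (unsigned int)r.tail,
	       (unsigned int)toy_used(&r));
	return 0;
}

Because RING_N divides 65536, the masked slot index stays consistent when the 16-bit counter wraps, so the hot path never needs a wrap branch when advancing elts_head, wqe_ci or cq_ci; a non-power-of-two descriptor count would break that invariant, which is why the setup path enforces it up front.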