/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#pragma GCC diagnostic ignored "-Wpedantic"
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#pragma GCC diagnostic error "-Wpedantic"

#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>

#include "mlx5_utils.h"
#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
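
/*
 * Tx queue elements are a software ring of mbuf pointers indexed by the
 * elts_head/elts_tail counters (see struct mlx5_txq_data in mlx5_rxtx.h).
 * txq_alloc_elts() resets the ring to an empty state and txq_free_elts()
 * releases any mbufs still referenced by it.
 */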

/**
 * Allocate TX queue elements.
 *
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 */
static void
txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl, unsigned int elts_n)
{
	unsigned int i;

	for (i = 0; (i != elts_n); ++i)
		(*txq_ctrl->txq.elts)[i] = NULL;
	DEBUG("%p: allocated and configured %u WRs", (void *)txq_ctrl, elts_n);
	txq_ctrl->txq.elts_head = 0;
	txq_ctrl->txq.elts_tail = 0;
	txq_ctrl->txq.elts_comp = 0;
}

/**
 * Free TX queue elements.
 *
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 */
static void
txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl)
{
	const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n;
	const uint16_t elts_m = elts_n - 1;
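	/* elts_n is a power of two, so elts_m can mask an index into the ring. */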
	uint16_t elts_head = txq_ctrl->txq.elts_head;
	uint16_t elts_tail = txq_ctrl->txq.elts_tail;
	struct rte_mbuf *(*elts)[elts_n] = txq_ctrl->txq.elts;

	DEBUG("%p: freeing WRs", (void *)txq_ctrl);
	txq_ctrl->txq.elts_head = 0;
	txq_ctrl->txq.elts_tail = 0;
	txq_ctrl->txq.elts_comp = 0;

	while (elts_tail != elts_head) {
		struct rte_mbuf *elt = (*elts)[elts_tail & elts_m];

		assert(elt != NULL);
		rte_pktmbuf_free_seg(elt);
		/* Poison the freed entry to catch use-after-free. */
		memset(&(*elts)[elts_tail & elts_m],
		       0x77,
		       sizeof((*elts)[elts_tail & elts_m]));
		++elts_tail;
	}
}

/**
 * Clean up a TX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 */
void
mlx5_txq_cleanup(struct mlx5_txq_ctrl *txq_ctrl)
{
	size_t i;

	DEBUG("cleaning up %p", (void *)txq_ctrl);
	txq_free_elts(txq_ctrl);
	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i)
		if (txq_ctrl->txq.mp2mr[i])
			priv_mr_release(txq_ctrl->priv, txq_ctrl->txq.mp2mr[i]);
	if (txq_ctrl->ibv)
		mlx5_priv_txq_ibv_release(txq_ctrl->priv, txq_ctrl->ibv);
	memset(txq_ctrl, 0, sizeof(*txq_ctrl));
}

/**
 * Configure a TX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param txq_ctrl
 *   Pointer to TX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
mlx5_txq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_txq_ctrl *txq_ctrl,
		    uint16_t desc, unsigned int socket,
		    const struct rte_eth_txconf *conf)
{
	struct priv *priv = mlx5_get_priv(dev);
	struct mlx5_txq_ctrl tmpl = {
		.priv = priv,
		.socket = socket,
	};
	const unsigned int max_tso_inline = ((MLX5_MAX_TSO_HEADER +
					     (RTE_CACHE_LINE_SIZE - 1)) /
					     RTE_CACHE_LINE_SIZE);
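	/* max_tso_inline is MLX5_MAX_TSO_HEADER rounded up to a whole number
	 * of cache lines, expressed in cache-line units. */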

	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
		ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
		return EINVAL;
	}
	tmpl.txq.flags = conf->txq_flags;
	assert(desc > MLX5_TX_COMP_THRESH);
	tmpl.txq.elts_n = log2above(desc);
	if (priv->mps == MLX5_MPW_ENHANCED)
		tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
	/* MRs will be registered in mp2mr[] later. */
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.orig_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.orig_attr.max_sge);
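	/* Data inlining is only enabled when explicitly requested
	 * (priv->txq_inline) and when the number of Tx queues reaches the
	 * txqs_inline threshold; inlined data is copied into the WQE so the
	 * NIC does not have to fetch it with an extra DMA read. */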
	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
		unsigned int ds_cnt;

		tmpl.txq.max_inline =
			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
			 RTE_CACHE_LINE_SIZE);
		tmpl.txq.inline_en = 1;
		/* TSO and MPS can't be enabled concurrently. */
		assert(!priv->tso || !priv->mps);
		if (priv->mps == MLX5_MPW_ENHANCED) {
			tmpl.txq.inline_max_packet_sz =
				priv->inline_max_packet_sz;
			/* To minimize the size of the data set, avoid
			 * requesting too large a WQ. */
			tmpl.max_inline_data =
				((RTE_MIN(priv->txq_inline,
					  priv->inline_max_packet_sz) +
				  (RTE_CACHE_LINE_SIZE - 1)) /
				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
		} else if (priv->tso) {
			int inline_diff = tmpl.txq.max_inline - max_tso_inline;

			/*
			 * Adjust inline value as Verbs aggregates
			 * tso_inline and txq_inline fields.
			 */
			tmpl.max_inline_data = inline_diff > 0 ?
					       inline_diff *
					       RTE_CACHE_LINE_SIZE :
					       0;
		} else {
			tmpl.max_inline_data =
				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
		}
		/*
		 * Check if the inline size is too large in a way which
		 * can make the WQE DS to overflow: the WQE carries
		 * CTRL (1 DS) + ETH (1 DS) + the inline part (N DS).
		 */
		ds_cnt = 2 + (tmpl.max_inline_data / MLX5_WQE_DWORD_SIZE);
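		/* Worked example, assuming 64 B cache lines and 16 B WQE
		 * dwords: txq_inline = 256 gives max_inline_data = 256 and
		 * ds_cnt = 2 + 256 / 16 = 18. */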
		if (ds_cnt > MLX5_DSEG_MAX) {
			unsigned int max_inline = (MLX5_DSEG_MAX - 2) *
						  MLX5_WQE_DWORD_SIZE;

			max_inline = max_inline - (max_inline %
						   RTE_CACHE_LINE_SIZE);
			WARN("txq inline is too large (%d) setting it to "
			     "the maximum possible: %d\n",
			     priv->txq_inline, max_inline);
			tmpl.txq.max_inline = max_inline / RTE_CACHE_LINE_SIZE;
		}
	}
	if (priv->tso) {
		tmpl.max_tso_header = max_tso_inline * RTE_CACHE_LINE_SIZE;
		tmpl.txq.max_inline = RTE_MAX(tmpl.txq.max_inline,
					      max_tso_inline);
		tmpl.txq.tso_en = 1;
	}
	if (priv->tunnel_en)
		tmpl.txq.tunnel_en = 1;
	tmpl.txq.elts =
		(struct rte_mbuf *(*)[1 << tmpl.txq.elts_n])
		((uintptr_t)txq_ctrl + sizeof(*txq_ctrl));
	txq_alloc_elts(&tmpl, desc);
	/* Clean up txq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old txq just in case", (void *)txq_ctrl);
	mlx5_txq_cleanup(txq_ctrl);
	*txq_ctrl = tmpl;
	DEBUG("%p: txq updated with %p", (void *)txq_ctrl, (void *)&tmpl);
	/* Pre-register known mempools. */
	rte_mempool_walk(mlx5_txq_mp2mr_iter, txq_ctrl);
	return 0;
}

/**
 * DPDK callback to configure a TX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   TX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_txconf *conf)
{
	struct priv *priv = dev->data->dev_private;
	struct mlx5_txq_data *txq = (*priv->txqs)[idx];
	struct mlx5_txq_ctrl *txq_ctrl =
		container_of(txq, struct mlx5_txq_ctrl, txq);
	int ret = 0;

	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;
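	/* The queue needs more descriptors than the completion threshold,
	 * otherwise it could fill up before a completion is ever requested
	 * and no mbuf would ever be released. */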
	if (desc <= MLX5_TX_COMP_THRESH) {
		WARN("%p: number of descriptors requested for TX queue %u"
		     " must be higher than MLX5_TX_COMP_THRESH, using"
		     " %u instead of %u",
		     (void *)dev, idx, MLX5_TX_COMP_THRESH + 1, desc);
		desc = MLX5_TX_COMP_THRESH + 1;
	}
	if (!rte_is_power_of_2(desc)) {
		desc = 1 << log2above(desc);
		WARN("%p: increased number of descriptors in TX queue %u"
		     " to the next power of two (%d)",
		     (void *)dev, idx, desc);
	}
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->txqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->txqs_n);
		return -EINVAL;
	}
	if (txq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)txq);
		if (dev->data->dev_started)
			return -EEXIST;
		(*priv->txqs)[idx] = NULL;
		mlx5_txq_cleanup(txq_ctrl);
		/* Resize if txq size is changed. */
		if (txq_ctrl->txq.elts_n != log2above(desc)) {
			txq_ctrl = rte_realloc(txq_ctrl,
					       sizeof(*txq_ctrl) +
					       desc * sizeof(struct rte_mbuf *),
					       RTE_CACHE_LINE_SIZE);
			if (txq_ctrl == NULL) {
				ERROR("%p: unable to reallocate queue index %u",
				      (void *)dev, idx);
				return -ENOMEM;
			}
		}
	} else {
		txq_ctrl =
			rte_calloc_socket("TXQ", 1,
					  sizeof(*txq_ctrl) +
					  desc * sizeof(struct rte_mbuf *),
					  0, socket);
		if (txq_ctrl == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			return -ENOMEM;
		}
	}
	ret = mlx5_txq_ctrl_setup(dev, txq_ctrl, desc, socket, conf);
	if (ret) {
		rte_free(txq_ctrl);
		return -ret;
	}
	txq_ctrl->txq.stats.idx = idx;
	DEBUG("%p: adding TX queue %p to list",
	      (void *)dev, (void *)txq_ctrl);
	(*priv->txqs)[idx] = &txq_ctrl->txq;
	txq_ctrl->ibv = mlx5_priv_txq_ibv_new(priv, idx);
	if (!txq_ctrl->ibv) {
		ret = EAGAIN;
		return -ret;
	}
	/* Update send callback. */
	priv_dev_select_tx_function(priv, priv->dev);
	return 0;
}

/**
 * DPDK callback to release a TX queue.
 *
 * @param dpdk_txq
 *   Generic TX queue pointer.
 */
void
mlx5_tx_queue_release(void *dpdk_txq)
{
	struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
	struct mlx5_txq_ctrl *txq_ctrl;
	struct priv *priv;
	unsigned int i;

	if (mlx5_is_secondary())
		return;
	if (txq == NULL)
		return;
	txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
	priv = txq_ctrl->priv;
	for (i = 0; (i != priv->txqs_n); ++i)
		if ((*priv->txqs)[i] == txq) {
			DEBUG("%p: removing TX queue %p from list",
			      (void *)priv->dev, (void *)txq_ctrl);
			(*priv->txqs)[i] = NULL;
			break;
		}
	mlx5_txq_cleanup(txq_ctrl);
	rte_free(txq_ctrl);
}

/**
 * Map locally UAR used in Tx queues for BlueFlame doorbell.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param fd
 *   Verbs file descriptor to map UAR pages.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_tx_uar_remap(struct priv *priv, int fd)
{
	unsigned int i, j;
	uintptr_t pages[priv->txqs_n];
	unsigned int pages_n = 0;
	uintptr_t uar_va;
	void *addr;
	struct mlx5_txq_data *txq;
	struct mlx5_txq_ctrl *txq_ctrl;
	size_t page_size = sysconf(_SC_PAGESIZE);

	/*
	 * As in rdma-core, UARs are mapped with OS page size granularity.
	 * Use an aligned address to avoid duplicate mmap.
	 * Ref to libmlx5 function: mlx5_init_context()
	 */
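	/* mmap() is done with MAP_FIXED at the page containing bf_reg, so the
	 * bf_reg pointers already stored in the Tx queues stay valid after
	 * the remap. */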
	for (i = 0; i != priv->txqs_n; ++i) {
		txq = (*priv->txqs)[i];
		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
		uar_va = (uintptr_t)txq_ctrl->txq.bf_reg;
		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size);
		for (j = 0; j != pages_n; ++j) {
			if (pages[j] == uar_va) {
				/* This UAR page is already mapped. */
				break;
			}
		}
		if (j != pages_n)
			continue;
		pages[pages_n++] = uar_va;
		addr = mmap((void *)uar_va, page_size,
			    PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
			    txq_ctrl->uar_mmap_offset);
		if (addr != (void *)uar_va) {
			ERROR("call to mmap failed on UAR for txq %d\n", i);
			return EINVAL;
		}
	}
	return 0;
}

/**
 * Create the Tx queue Verbs object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   Queue index in DPDK Tx queue array.
 *
 * @return
 *   The Verbs object initialised if it can be created, NULL otherwise.
 */
struct mlx5_txq_ibv *
mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx)
{
	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
	struct mlx5_txq_ctrl *txq_ctrl =
		container_of(txq_data, struct mlx5_txq_ctrl, txq);
	struct mlx5_txq_ibv tmpl;
	struct mlx5_txq_ibv *txq_ibv;
	union {
		struct ibv_qp_init_attr_ex init;
		struct ibv_cq_init_attr_ex cq;
		struct ibv_qp_attr mod;
		struct ibv_cq_ex cq_attr;
	} attr;
	unsigned int cqe_n;
	struct mlx5dv_qp qp;
	struct mlx5dv_cq cq_info;
	struct mlx5dv_obj obj;
	const int desc = 1 << txq_data->elts_n;
	int ret = 0;
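	/* Overall flow: create the CQ and QP through Verbs, walk the QP up to
	 * RTS, then use mlx5dv to retrieve the raw queue layout (buffers,
	 * doorbell records, BlueFlame register) accessed directly by the
	 * data path. */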

	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
		ERROR("MLX5_ENABLE_CQE_COMPRESSION must never be set");
		return NULL;
	}
	memset(&tmpl, 0, sizeof(struct mlx5_txq_ibv));
	/* MRs will be registered in mp2mr[] later. */
	attr.cq = (struct ibv_cq_init_attr_ex){
		.comp_mask = 0,
	};
	cqe_n = ((desc / MLX5_TX_COMP_THRESH) - 1) ?
		((desc / MLX5_TX_COMP_THRESH) - 1) : 1;
	if (priv->mps == MLX5_MPW_ENHANCED)
		cqe_n += MLX5_TX_COMP_THRESH_INLINE_DIV;
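	/* The CQ only needs one entry per completion-request interval
	 * (MLX5_TX_COMP_THRESH WQEs), plus extra headroom when enhanced
	 * MPW is enabled. */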
	tmpl.cq = ibv_create_cq(priv->ctx, cqe_n, NULL, NULL, 0);
	if (tmpl.cq == NULL) {
		ERROR("%p: CQ creation failure", (void *)txq_ctrl);
		goto error;
	}
	attr.init = (struct ibv_qp_init_attr_ex){
		/* CQ to be associated with the send queue. */
		.send_cq = tmpl.cq,
		/* CQ to be associated with the receive queue. */
		.recv_cq = tmpl.cq,
		.cap = {
			/* Max number of outstanding WRs. */
			.max_send_wr =
				((priv->device_attr.orig_attr.max_qp_wr <
				  desc) ?
				 priv->device_attr.orig_attr.max_qp_wr :
				 desc),
			/*
			 * Max number of scatter/gather elements in a WR,
			 * must be 1 to prevent libmlx5 from trying to affect
			 * too much memory. TX gather is not impacted by the
			 * priv->device_attr.max_sge limit and will still work
			 * properly.
			 */
			.max_send_sge = 1,
		},
		.qp_type = IBV_QPT_RAW_PACKET,
		/*
		 * Do *NOT* enable this, completion events are managed per
		 * Tx burst.
		 */
		.sq_sig_all = 0,
		.pd = priv->pd,
		.comp_mask = IBV_QP_INIT_ATTR_PD,
	};
	if (txq_data->inline_en)
		attr.init.cap.max_inline_data = txq_ctrl->max_inline_data;
	if (txq_data->tso_en) {
		attr.init.max_tso_header = txq_ctrl->max_tso_header;
		attr.init.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER;
	}
	tmpl.qp = ibv_create_qp_ex(priv->ctx, &attr.init);
	if (tmpl.qp == NULL) {
		ERROR("%p: QP creation failure", (void *)txq_ctrl);
		goto error;
	}
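	/* A freshly created QP starts in the RESET state; it has to be moved
	 * through INIT, RTR and RTS before it can post send requests. */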
	attr.mod = (struct ibv_qp_attr){
		/* Move the QP to this state. */
		.qp_state = IBV_QPS_INIT,
		/* Primary port number. */
		.port_num = priv->port
	};
	ret = ibv_modify_qp(tmpl.qp, &attr.mod, (IBV_QP_STATE | IBV_QP_PORT));
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_INIT failed", (void *)txq_ctrl);
		goto error;
	}
	attr.mod = (struct ibv_qp_attr){
		.qp_state = IBV_QPS_RTR
	};
	ret = ibv_modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE);
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_RTR failed", (void *)txq_ctrl);
		goto error;
	}
	attr.mod.qp_state = IBV_QPS_RTS;
	ret = ibv_modify_qp(tmpl.qp, &attr.mod, IBV_QP_STATE);
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_RTS failed", (void *)txq_ctrl);
		goto error;
	}
	txq_ibv = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_txq_ibv), 0,
				    txq_ctrl->socket);
	if (!txq_ibv) {
		ERROR("%p: cannot allocate memory", (void *)txq_ctrl);
		goto error;
	}
	obj.cq.in = tmpl.cq;
	obj.cq.out = &cq_info;
	obj.qp.in = tmpl.qp;
	obj.qp.out = &qp;
	ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP);
	if (ret != 0)
		goto error;
	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
		goto error;
	}
	txq_data->cqe_n = log2above(cq_info.cqe_cnt);
	txq_data->qp_num_8s = tmpl.qp->qp_num << 8;
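	/* qp_num_8s keeps the QP number pre-shifted by 8 so the data path can
	 * OR it directly into WQE control segments. */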
	txq_data->wqes = qp.sq.buf;
	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
	txq_data->bf_reg = qp.bf.reg;
	txq_data->cq_db = cq_info.dbrec;
	txq_data->cqes =
		(volatile struct mlx5_cqe (*)[])
		(uintptr_t)cq_info.buf;
	txq_data->wqe_ci = 0;
	txq_data->wqe_pi = 0;
	txq_ibv->qp = tmpl.qp;
	txq_ibv->cq = tmpl.cq;
	rte_atomic32_inc(&txq_ibv->refcnt);
	DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv,
	      (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt));
	LIST_INSERT_HEAD(&priv->txqsibv, txq_ibv, next);
	return txq_ibv;
error:
	if (tmpl.cq)
		claim_zero(ibv_destroy_cq(tmpl.cq));
	if (tmpl.qp)
		claim_zero(ibv_destroy_qp(tmpl.qp));
	return NULL;
}

/**
 * Get a Tx queue Verbs object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   Queue index in DPDK Tx queue array.
 *
 * @return
 *   The Verbs object if it exists.
 */
struct mlx5_txq_ibv *
mlx5_priv_txq_ibv_get(struct priv *priv, uint16_t idx)
{
	struct mlx5_txq_ctrl *txq_ctrl;

	if (idx >= priv->txqs_n)
		return NULL;
	if (!(*priv->txqs)[idx])
		return NULL;
	txq_ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
	if (txq_ctrl->ibv) {
		rte_atomic32_inc(&txq_ctrl->ibv->refcnt);
		DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv,
		      (void *)txq_ctrl->ibv,
		      rte_atomic32_read(&txq_ctrl->ibv->refcnt));
	}
	return txq_ctrl->ibv;
}

/**
 * Release a Tx Verbs queue object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param txq_ibv
 *   Verbs Tx queue object.
 *
 * @return
 *   0 on success, errno on failure.
 */
int
mlx5_priv_txq_ibv_release(struct priv *priv, struct mlx5_txq_ibv *txq_ibv)
{
	DEBUG("%p: Verbs Tx queue %p: refcnt %d", (void *)priv,
	      (void *)txq_ibv, rte_atomic32_read(&txq_ibv->refcnt));
	if (rte_atomic32_dec_and_test(&txq_ibv->refcnt)) {
		claim_zero(ibv_destroy_qp(txq_ibv->qp));
		claim_zero(ibv_destroy_cq(txq_ibv->cq));
		LIST_REMOVE(txq_ibv, next);
		rte_free(txq_ibv);
		return 0;
	}
	return EBUSY;
}

/**
 * Return true if a single reference exists on the object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param txq_ibv
 *   Verbs Tx queue object.
 */
int
mlx5_priv_txq_ibv_releasable(struct priv *priv, struct mlx5_txq_ibv *txq_ibv)
{
	(void)priv;
	return (rte_atomic32_read(&txq_ibv->refcnt) == 1);
}

/**
 * Verify the Verbs Tx queue list is empty.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return the number of objects not released.
 */
int
mlx5_priv_txq_ibv_verify(struct priv *priv)
{
	int ret = 0;
	struct mlx5_txq_ibv *txq_ibv;

	LIST_FOREACH(txq_ibv, &priv->txqsibv, next) {
		DEBUG("%p: Verbs Tx queue %p still referenced", (void *)priv,
		      (void *)txq_ibv);
		++ret;
	}
	return ret;
}