4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #pragma GCC diagnostic ignored "-pedantic"
45 #include <infiniband/verbs.h>
47 #pragma GCC diagnostic error "-pedantic"
50 /* DPDK headers don't like -pedantic. */
52 #pragma GCC diagnostic ignored "-pedantic"
55 #include <rte_malloc.h>
56 #include <rte_ethdev.h>
57 #include <rte_common.h>
59 #pragma GCC diagnostic error "-pedantic"
63 #include "mlx5_rxtx.h"
64 #include "mlx5_utils.h"
65 #include "mlx5_defs.h"
/*
 * NOTE(review): this file is an excerpt — the leading number on each line is
 * the original file's line number and several lines (opening braces, error
 * labels, `return`/`goto` statements) are missing from this view.  Comments
 * below describe only what the visible code establishes.
 */
68 * Allocate RX queue elements.
71 * Pointer to RX queue structure.
73 * Number of elements to allocate.
75 * If not NULL, fetch buffers from this array instead of allocating them
76 * with rte_pktmbuf_alloc().
79 * 0 on success, errno value on failure.
82 rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
/* Allocate the whole elts[] array in one zeroed chunk (VLA-typed pointer),
 * presumably on the queue's NUMA socket — allocation-failure path follows. */
85 struct rxq_elt (*elts)[elts_n] =
86 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
91 ERROR("%p: can't allocate packets array", (void *)rxq);
95 /* For each WR (packet). */
96 for (i = 0; (i != elts_n); ++i) {
97 struct rxq_elt *elt = &(*elts)[i];
98 struct ibv_recv_wr *wr = &elt->wr;
99 struct ibv_sge *sge = &(*elts)[i].sge;
100 struct rte_mbuf *buf;
/* Two sourcing modes: reuse a caller-provided mbuf (reset it) or allocate a
 * fresh one from the queue's mempool.  The branch structure between lines
 * 100-107 is partially missing from this view. */
105 rte_pktmbuf_reset(buf);
107 buf = rte_pktmbuf_alloc(rxq->mp);
/* rte_pktmbuf_alloc() failure: only reachable when no pool was supplied. */
109 assert(pool == NULL);
110 ERROR("%p: empty mbuf pool", (void *)rxq);
114 /* Configure WR. Work request ID contains its own index in
115 * the elts array and the offset between SGE buffer header and
117 WR_ID(wr->wr_id).id = i;
118 WR_ID(wr->wr_id).offset =
119 (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
/* Chain this WR to the next element; the final link is broken below. */
121 wr->next = &(*elts)[(i + 1)].wr;
124 /* Headroom is reserved by rte_pktmbuf_alloc(). */
125 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
126 /* Buffer is supposed to be empty. */
127 assert(rte_pktmbuf_data_len(buf) == 0);
128 assert(rte_pktmbuf_pkt_len(buf) == 0);
129 /* sge->addr must be able to store a pointer. */
130 assert(sizeof(sge->addr) >= sizeof(uintptr_t));
131 /* SGE keeps its headroom. */
132 sge->addr = (uintptr_t)
133 ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
134 sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
135 sge->lkey = rxq->mr->lkey;
136 /* Redundant check for tailroom. */
137 assert(sge->length == rte_pktmbuf_tailroom(buf));
138 /* Make sure elts index and SGE mbuf pointer can be deduced
/* Runtime verification of the WR-ID encoding: the index must round-trip and
 * (sge->addr - offset) must point back at the originating mbuf. */
140 if ((WR_ID(wr->wr_id).id != i) ||
141 ((void *)((uintptr_t)sge->addr -
142 WR_ID(wr->wr_id).offset) != buf)) {
143 ERROR("%p: cannot store index and offset in WR ID",
146 rte_pktmbuf_free(buf);
151 /* The last WR pointer must be NULL. */
152 (*elts)[(i - 1)].wr.next = NULL;
153 DEBUG("%p: allocated and configured %u single-segment WRs",
154 (void *)rxq, elts_n);
155 rxq->elts_n = elts_n;
157 rxq->elts.no_sp = elts;
/* Error path: free every mbuf that was populated so far.  An element with
 * sge.addr == 0 was never filled in and is skipped (skip statement missing
 * from this view). */
162 assert(pool == NULL);
163 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
164 struct rxq_elt *elt = &(*elts)[i];
165 struct rte_mbuf *buf;
167 if (elt->sge.addr == 0)
169 assert(WR_ID(elt->wr.wr_id).id == i);
170 buf = (void *)((uintptr_t)elt->sge.addr -
171 WR_ID(elt->wr.wr_id).offset);
172 rte_pktmbuf_free_seg(buf);
176 DEBUG("%p: failed, freed everything", (void *)rxq);
182 * Free RX queue elements.
185 * Pointer to RX queue structure.
/* Releases every mbuf referenced by the queue's no_sp elts[] array and
 * detaches the array from the queue.  NOTE(review): lines are missing from
 * this excerpt (e.g. the NULL-array early return and the `continue` under
 * the sge.addr == 0 test); code kept byte-identical. */
188 rxq_free_elts(struct rxq *rxq)
191 unsigned int elts_n = rxq->elts_n;
192 struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
194 DEBUG("%p: freeing WRs", (void *)rxq);
/* Detach first so the queue never points at freed elements. */
196 rxq->elts.no_sp = NULL;
199 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
200 struct rxq_elt *elt = &(*elts)[i];
201 struct rte_mbuf *buf;
/* sge.addr == 0 marks an element that was never populated — skip it. */
203 if (elt->sge.addr == 0)
205 assert(WR_ID(elt->wr.wr_id).id == i);
/* Recover the mbuf pointer from the WR-ID offset encoding set up by
 * rxq_alloc_elts(): buf = sge.addr - stored offset. */
206 buf = (void *)((uintptr_t)elt->sge.addr -
207 WR_ID(elt->wr.wr_id).offset);
208 rte_pktmbuf_free_seg(buf);
214 * Clean up a RX queue.
216 * Destroy objects, free allocated memory and reset the structure for reuse.
219 * Pointer to RX queue structure.
/* Teardown order matters: burst interfaces (if_qp/if_cq) are released before
 * the QP/CQ they wrap, the QP before the CQ, and the resource domain and MR
 * last.  NOTE(review): excerpt — struct-initializer fields and several
 * closing braces are missing from this view. */
222 rxq_cleanup(struct rxq *rxq)
224 struct ibv_exp_release_intf_params params;
226 DEBUG("cleaning up %p", (void *)rxq);
/* Release the QP burst-interface handle, if one was queried. */
228 if (rxq->if_qp != NULL) {
229 assert(rxq->priv != NULL);
230 assert(rxq->priv->ctx != NULL);
231 assert(rxq->qp != NULL);
232 params = (struct ibv_exp_release_intf_params){
235 claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
/* Release the CQ burst-interface handle, if one was queried. */
239 if (rxq->if_cq != NULL) {
240 assert(rxq->priv != NULL);
241 assert(rxq->priv->ctx != NULL);
242 assert(rxq->cq != NULL);
243 params = (struct ibv_exp_release_intf_params){
246 claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
250 if (rxq->qp != NULL) {
251 claim_zero(ibv_destroy_qp(rxq->qp));
254 claim_zero(ibv_destroy_cq(rxq->cq));
255 if (rxq->rd != NULL) {
256 struct ibv_exp_destroy_res_domain_attr attr = {
260 assert(rxq->priv != NULL);
261 assert(rxq->priv->ctx != NULL);
262 claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
267 claim_zero(ibv_dereg_mr(rxq->mr));
/* Zero the whole structure so the rxq can be configured again from scratch. */
268 memset(rxq, 0, sizeof(*rxq));
272 * Allocate a Queue Pair.
273 * Optionally setup inline receive if supported.
276 * Pointer to private structure.
278 * Completion queue to associate with QP.
280 * Number of descriptors in QP (hint only).
283 * QP pointer or NULL in case of error.
285 static struct ibv_qp *
286 rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
287 struct ibv_exp_res_domain *rd)
/* Builds a raw-packet QP bound to the given CQ and resource domain.
 * NOTE(review): excerpt — some initializer fields (e.g. send/recv CQ
 * assignments, pd, res_domain) are missing from this view. */
289 struct ibv_exp_qp_init_attr attr = {
290 /* CQ to be associated with the send queue. */
292 /* CQ to be associated with the receive queue. */
295 /* Max number of outstanding WRs. */
/* Clamp the descriptor hint to the device's max_qp_wr capability. */
296 .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
297 priv->device_attr.max_qp_wr :
299 /* Max number of scatter/gather elements in a WR. */
/* Likewise clamp the per-WR SGE count to the device limit. */
300 .max_recv_sge = ((priv->device_attr.max_sge <
302 priv->device_attr.max_sge :
305 .qp_type = IBV_QPT_RAW_PACKET,
306 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
307 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
/* Returns NULL on failure; caller reads errno for the cause. */
312 return ibv_exp_create_qp(priv->ctx, &attr);
318 * Allocate a RSS Queue Pair.
319 * Optionally setup inline receive if supported.
322 * Pointer to private structure.
324 * Completion queue to associate with QP.
326 * Number of descriptors in QP (hint only).
328 * If nonzero, create a parent QP, otherwise a child.
331 * QP pointer or NULL in case of error.
333 static struct ibv_qp *
334 rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
335 int parent, struct ibv_exp_res_domain *rd)
/* RSS variant of rxq_setup_qp(): additionally fills the QP-group (QPG)
 * attributes so the QP is either the RSS parent or one of its RX children.
 * NOTE(review): excerpt — several initializer fields and the if/else
 * around the parent/child branches are missing from this view. */
337 struct ibv_exp_qp_init_attr attr = {
338 /* CQ to be associated with the send queue. */
340 /* CQ to be associated with the receive queue. */
343 /* Max number of outstanding WRs. */
/* Clamp the descriptor hint to the device's max_qp_wr capability. */
344 .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
345 priv->device_attr.max_qp_wr :
347 /* Max number of scatter/gather elements in a WR. */
348 .max_recv_sge = ((priv->device_attr.max_sge <
350 priv->device_attr.max_sge :
353 .qp_type = IBV_QPT_RAW_PACKET,
354 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
355 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
356 IBV_EXP_QP_INIT_ATTR_QPG),
/* Parent QP: owns the RSS group; one RX child per configured RX queue. */
362 attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
363 /* TSS isn't necessary. */
364 attr.qpg.parent_attrib.tss_child_count = 0;
365 attr.qpg.parent_attrib.rss_child_count = priv->rxqs_n;
366 DEBUG("initializing parent RSS queue");
/* Child QP: attaches to the previously created parent (priv->rxq_parent). */
368 attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
369 attr.qpg.qpg_parent = priv->rxq_parent.qp;
370 DEBUG("initializing child RSS queue");
372 return ibv_exp_create_qp(priv->ctx, &attr);
375 #endif /* RSS_SUPPORT */
378 * Configure a RX queue.
381 * Pointer to Ethernet device structure.
383 * Pointer to RX queue structure.
385 * Number of descriptors to configure in queue.
387 * NUMA socket on which memory must be allocated.
389 * Thresholds parameters.
391 * Memory pool for buffer allocations.
394 * 0 on success, errno value on failure.
/* Builds a complete RX queue into a local template and installs it into
 * *rxq only after every verbs object has been created successfully.
 * NOTE(review): excerpt — the `tmpl` declaration, `attr` union declaration,
 * error labels and many closing braces are missing from this view; code is
 * kept byte-identical. */
397 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
398 unsigned int socket, const struct rte_eth_rxconf *conf,
399 struct rte_mempool *mp)
401 struct priv *priv = dev->data->dev_private;
407 struct ibv_exp_qp_attr mod;
409 struct ibv_exp_query_intf_params params;
410 struct ibv_exp_cq_init_attr cq;
411 struct ibv_exp_res_domain_init_attr rd;
413 enum ibv_exp_query_intf_status status;
414 struct ibv_recv_wr *bad_wr;
415 struct rte_mbuf *buf;
/* The RSS parent queue is the special rxq embedded in priv itself. */
417 int parent = (rxq == &priv->rxq_parent);
419 (void)conf; /* Thresholds configuration (ignored). */
421 * If this is a parent queue, hardware must support RSS and
422 * RSS must be enabled.
424 assert((!parent) || ((priv->hw_rss) && (priv->rss)));
426 /* Even if unused, ibv_create_cq() requires at least one
/* Validate descriptor count: nonzero and a multiple of the SGEs per WR. */
431 if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
432 ERROR("%p: invalid number of RX descriptors (must be a"
433 " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
436 /* Get mbuf length. */
/* Probe the mempool with a throwaway mbuf to learn the buffer length. */
437 buf = rte_pktmbuf_alloc(mp);
439 ERROR("%p: unable to allocate mbuf", (void *)dev);
442 tmpl.mb_len = buf->buf_len;
443 assert((rte_pktmbuf_headroom(buf) +
444 rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
445 assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
446 rte_pktmbuf_free(buf);
447 /* Use the entire RX mempool as the memory region. */
448 tmpl.mr = ibv_reg_mr(priv->pd,
449 (void *)mp->elt_va_start,
450 (mp->elt_va_end - mp->elt_va_start),
451 (IBV_ACCESS_LOCAL_WRITE |
452 IBV_ACCESS_REMOTE_WRITE));
453 if (tmpl.mr == NULL) {
455 ERROR("%p: MR creation failure: %s",
456 (void *)dev, strerror(ret));
/* Create a single-threaded, high-bandwidth resource domain shared by this
 * queue's CQ and QP. */
460 attr.rd = (struct ibv_exp_res_domain_init_attr){
461 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
462 IBV_EXP_RES_DOMAIN_MSG_MODEL),
463 .thread_model = IBV_EXP_THREAD_SINGLE,
464 .msg_model = IBV_EXP_MSG_HIGH_BW,
466 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
467 if (tmpl.rd == NULL) {
469 ERROR("%p: RD creation failure: %s",
470 (void *)dev, strerror(ret));
473 attr.cq = (struct ibv_exp_cq_init_attr){
474 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
475 .res_domain = tmpl.rd,
477 tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
478 if (tmpl.cq == NULL) {
480 ERROR("%p: CQ creation failure: %s",
481 (void *)dev, strerror(ret));
484 DEBUG("priv->device_attr.max_qp_wr is %d",
485 priv->device_attr.max_qp_wr);
486 DEBUG("priv->device_attr.max_sge is %d",
487 priv->device_attr.max_sge);
/* QP creation: RSS variant when compiled with RSS_SUPPORT, plain otherwise
 * (the surrounding #ifdef/#else lines are missing from this excerpt). */
490 tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
493 #endif /* RSS_SUPPORT */
494 tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd);
495 if (tmpl.qp == NULL) {
496 ret = (errno ? errno : EINVAL);
497 ERROR("%p: QP creation failure: %s",
498 (void *)dev, strerror(ret));
/* Transition the QP RESET -> INIT, binding it to the physical port. */
501 mod = (struct ibv_exp_qp_attr){
502 /* Move the QP to this state. */
503 .qp_state = IBV_QPS_INIT,
504 /* Primary port number. */
505 .port_num = priv->port
507 ret = ibv_exp_modify_qp(tmpl.qp, &mod,
510 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
511 #endif /* RSS_SUPPORT */
514 ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
515 (void *)dev, strerror(ret));
518 /* Allocate descriptors for RX queues, except for the RSS parent. */
521 ret = rxq_alloc_elts(&tmpl, desc, NULL);
523 ERROR("%p: RXQ allocation failed: %s",
524 (void *)dev, strerror(ret));
/* Post the whole chained WR list built by rxq_alloc_elts() in one call. */
527 ret = ibv_post_recv(tmpl.qp,
528 &(*tmpl.elts.no_sp)[0].wr,
531 ERROR("%p: ibv_post_recv() failed for WR %p: %s",
/* INIT -> RTR: the QP starts receiving once in Ready-To-Receive state. */
538 mod = (struct ibv_exp_qp_attr){
539 .qp_state = IBV_QPS_RTR
541 ret = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
543 ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
544 (void *)dev, strerror(ret));
548 tmpl.port_id = dev->data->port_id;
549 DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
/* Query the direct-verbs burst interfaces used by the datapath (faster
 * than the generic poll/post entry points). */
550 attr.params = (struct ibv_exp_query_intf_params){
551 .intf_scope = IBV_EXP_INTF_GLOBAL,
552 .intf = IBV_EXP_INTF_CQ,
555 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
556 if (tmpl.if_cq == NULL) {
557 ERROR("%p: CQ interface family query failed with status %d",
558 (void *)dev, status);
561 attr.params = (struct ibv_exp_query_intf_params){
562 .intf_scope = IBV_EXP_INTF_GLOBAL,
563 .intf = IBV_EXP_INTF_QP_BURST,
566 tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
567 if (tmpl.if_qp == NULL) {
568 ERROR("%p: QP interface family query failed with status %d",
569 (void *)dev, status);
572 /* Clean up rxq in case we're reinitializing it. */
573 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
/* Commit: the fully built template replaces the old queue contents. */
576 DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
586 * DPDK callback to configure a RX queue.
589 * Pointer to Ethernet device structure.
593 * Number of descriptors to configure in queue.
595 * NUMA socket on which memory must be allocated.
597 * Thresholds parameters.
599 * Memory pool for buffer allocations.
602 * 0 on success, negative errno value on failure.
/* Public rx_queue_setup entry point: validates the index, reuses or
 * allocates the per-queue structure, delegates the heavy lifting to
 * rxq_setup() and registers the queue plus the RX burst callback.
 * NOTE(review): excerpt — locking, several error returns and closing
 * braces are missing from this view; code kept byte-identical. */
605 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
606 unsigned int socket, const struct rte_eth_rxconf *conf,
607 struct rte_mempool *mp)
609 struct priv *priv = dev->data->dev_private;
610 struct rxq *rxq = (*priv->rxqs)[idx];
614 DEBUG("%p: configuring queue %u for %u descriptors",
615 (void *)dev, idx, desc);
616 if (idx >= priv->rxqs_n) {
617 ERROR("%p: queue index out of range (%u >= %u)",
618 (void *)dev, idx, priv->rxqs_n);
/* Existing queue at this index: it is reconfigured in place rather than
 * reallocated (detached from the list while being rebuilt). */
623 DEBUG("%p: reusing already allocated queue index %u (%p)",
624 (void *)dev, idx, (void *)rxq);
629 (*priv->rxqs)[idx] = NULL;
/* Fresh queue: zeroed allocation on the requested NUMA socket. */
632 rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
634 ERROR("%p: unable to allocate queue index %u",
640 ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
644 DEBUG("%p: adding RX queue %p to list",
645 (void *)dev, (void *)rxq);
646 (*priv->rxqs)[idx] = rxq;
647 /* Update receive callback. */
648 dev->rx_pkt_burst = mlx5_rx_burst;
655 * DPDK callback to release a RX queue.
658 * Generic RX queue pointer.
661 mlx5_rx_queue_release(void *dpdk_rxq)
663 struct rxq *rxq = (struct rxq *)dpdk_rxq;
671 assert(rxq != &priv->rxq_parent);
672 for (i = 0; (i != priv->rxqs_n); ++i)
673 if ((*priv->rxqs)[i] == rxq) {
674 DEBUG("%p: removing RX queue %p from list",
675 (void *)priv->dev, (void *)rxq);
676 (*priv->rxqs)[i] = NULL;