4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #pragma GCC diagnostic ignored "-pedantic"
45 #include <infiniband/verbs.h>
47 #pragma GCC diagnostic error "-pedantic"
50 /* DPDK headers don't like -pedantic. */
52 #pragma GCC diagnostic ignored "-pedantic"
55 #include <rte_malloc.h>
56 #include <rte_ethdev.h>
57 #include <rte_common.h>
59 #pragma GCC diagnostic error "-pedantic"
63 #include "mlx5_rxtx.h"
64 #include "mlx5_utils.h"
65 #include "mlx5_defs.h"
68 * Allocate RX queue elements.
71 * Pointer to RX queue structure.
73 * Number of elements to allocate.
75 * If not NULL, fetch buffers from this array instead of allocating them
76 * with rte_pktmbuf_alloc().
79 * 0 on success, errno value on failure.
82 rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
/* Allocate the elts array (one WR + SGE pair per descriptor) on the
 * queue's NUMA socket; zeroed so unused SGE addresses read as 0.
 * NOTE(review): several original lines are elided from this view (opening
 * brace, local declarations, error gotos); code is kept byte-identical. */
85 struct rxq_elt (*elts)[elts_n] =
86 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
91 ERROR("%p: can't allocate packets array", (void *)rxq);
95 /* For each WR (packet). */
96 for (i = 0; (i != elts_n); ++i) {
97 struct rxq_elt *elt = &(*elts)[i];
98 struct ibv_recv_wr *wr = &elt->wr;
99 struct ibv_sge *sge = &(*elts)[i].sge;
100 struct rte_mbuf *buf;
/* Buffer comes from the caller-supplied pool when given (reset to a
 * pristine state), otherwise freshly allocated from the queue mempool. */
105 rte_pktmbuf_reset(buf);
107 buf = rte_pktmbuf_alloc(rxq->mp);
109 assert(pool == NULL);
110 ERROR("%p: empty mbuf pool", (void *)rxq);
114 /* Configure WR. Work request ID contains its own index in
115 * the elts array and the offset between SGE buffer header and
/* wr_id packs both the elts index and the SGE-address-to-mbuf offset so
 * the mbuf can be recovered from a completed WR without extra storage. */
117 WR_ID(wr->wr_id).id = i;
118 WR_ID(wr->wr_id).offset =
119 (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
/* Chain WRs; the link of the last element is reset to NULL below. */
121 wr->next = &(*elts)[(i + 1)].wr;
124 /* Headroom is reserved by rte_pktmbuf_alloc(). */
125 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
126 /* Buffer is supposed to be empty. */
127 assert(rte_pktmbuf_data_len(buf) == 0);
128 assert(rte_pktmbuf_pkt_len(buf) == 0);
129 /* sge->addr must be able to store a pointer. */
130 assert(sizeof(sge->addr) >= sizeof(uintptr_t));
131 /* SGE keeps its headroom. */
132 sge->addr = (uintptr_t)
133 ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
134 sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
/* lkey of the memory region covering the whole mempool (registered in
 * rxq_setup()); required by the HCA for DMA into the buffer. */
135 sge->lkey = rxq->mr->lkey;
136 /* Redundant check for tailroom. */
137 assert(sge->length == rte_pktmbuf_tailroom(buf));
138 /* Make sure elts index and SGE mbuf pointer can be deduced
/* Round-trip check: if wr_id's bit-fields cannot hold the index or the
 * offset, the packed encoding is unusable — bail out. */
140 if ((WR_ID(wr->wr_id).id != i) ||
141 ((void *)((uintptr_t)sge->addr -
142 WR_ID(wr->wr_id).offset) != buf)) {
143 ERROR("%p: cannot store index and offset in WR ID",
146 rte_pktmbuf_free(buf);
151 /* The last WR pointer must be NULL. */
152 (*elts)[(i - 1)].wr.next = NULL;
153 DEBUG("%p: allocated and configured %u single-segment WRs",
154 (void *)rxq, elts_n);
155 rxq->elts_n = elts_n;
157 rxq->elts.no_sp = elts;
/* Error path (reached via elided goto): free every mbuf attached so far.
 * Only entries with a nonzero SGE address hold a buffer; the mbuf pointer
 * is recovered from the packed wr_id offset, mirroring the encoding above. */
162 assert(pool == NULL);
163 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
164 struct rxq_elt *elt = &(*elts)[i];
165 struct rte_mbuf *buf;
167 if (elt->sge.addr == 0)
169 assert(WR_ID(elt->wr.wr_id).id == i);
170 buf = (void *)((uintptr_t)elt->sge.addr -
171 WR_ID(elt->wr.wr_id).offset);
172 rte_pktmbuf_free_seg(buf);
176 DEBUG("%p: failed, freed everything", (void *)rxq);
182 * Free RX queue elements.
185 * Pointer to RX queue structure.
/* Free all RX queue elements: detach the elts array from the queue, then
 * release every mbuf still referenced by an SGE.
 * NOTE(review): some original lines are elided (braces, NULL checks,
 * rte_free of the array); code is kept byte-identical. */
188 rxq_free_elts(struct rxq *rxq)
191 unsigned int elts_n = rxq->elts_n;
192 struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
194 DEBUG("%p: freeing WRs", (void *)rxq);
/* Clear the queue's reference first so nothing else can touch the array
 * while it is being torn down. */
196 rxq->elts.no_sp = NULL;
199 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
200 struct rxq_elt *elt = &(*elts)[i];
201 struct rte_mbuf *buf;
/* A zero SGE address means no mbuf was ever attached to this entry. */
203 if (elt->sge.addr == 0)
205 assert(WR_ID(elt->wr.wr_id).id == i);
/* Recover the mbuf pointer from the offset packed into wr_id by
 * rxq_alloc_elts(). */
206 buf = (void *)((uintptr_t)elt->sge.addr -
207 WR_ID(elt->wr.wr_id).offset);
208 rte_pktmbuf_free_seg(buf);
214 * Clean up a RX queue.
216 * Destroy objects, free allocated memory and reset the structure for reuse.
219 * Pointer to RX queue structure.
/* Tear down every verbs object owned by the RX queue, in dependency order
 * (interface handles, then QP, CQ, resource domain, MR), and finally zero
 * the structure so it can be reused.
 * NOTE(review): some original lines are elided (NULL checks on elts/cq/mr,
 * designated-initializer fields, call arguments); code is byte-identical. */
222 rxq_cleanup(struct rxq *rxq)
224 struct ibv_exp_release_intf_params params;
226 DEBUG("cleaning up %p", (void *)rxq);
/* Release the QP burst interface before destroying the QP itself. */
228 if (rxq->if_qp != NULL) {
229 assert(rxq->priv != NULL);
230 assert(rxq->priv->ctx != NULL);
231 assert(rxq->qp != NULL);
232 params = (struct ibv_exp_release_intf_params){
235 claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
/* Likewise release the CQ interface before destroying the CQ. */
239 if (rxq->if_cq != NULL) {
240 assert(rxq->priv != NULL);
241 assert(rxq->priv->ctx != NULL);
242 assert(rxq->cq != NULL);
243 params = (struct ibv_exp_release_intf_params){
246 claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
250 if (rxq->qp != NULL) {
/* Flow/MAC steering rules are attached to the QP; remove them first. */
251 rxq_mac_addrs_del(rxq);
252 claim_zero(ibv_destroy_qp(rxq->qp));
255 claim_zero(ibv_destroy_cq(rxq->cq));
256 if (rxq->rd != NULL) {
257 struct ibv_exp_destroy_res_domain_attr attr = {
261 assert(rxq->priv != NULL);
262 assert(rxq->priv->ctx != NULL);
263 claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
/* Deregister the memory region covering the mempool (see rxq_setup()). */
268 claim_zero(ibv_dereg_mr(rxq->mr));
/* Reset for reuse; rxq_setup() relies on a zeroed structure. */
269 memset(rxq, 0, sizeof(*rxq));
273 * Allocate a Queue Pair.
274 * Optionally setup inline receive if supported.
277 * Pointer to private structure.
279 * Completion queue to associate with QP.
281 * Number of descriptors in QP (hint only).
284 * QP pointer or NULL in case of error.
/* Create a raw-packet QP for a plain (non-RSS) RX queue inside the given
 * resource domain, clamping WR and SGE counts to device limits.
 * NOTE(review): some initializer fields are elided from this view
 * (send/recv CQ assignment, pd, res_domain); code is byte-identical. */
286 static struct ibv_qp *
287 rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
288 struct ibv_exp_res_domain *rd)
290 struct ibv_exp_qp_init_attr attr = {
291 /* CQ to be associated with the send queue. */
293 /* CQ to be associated with the receive queue. */
296 /* Max number of outstanding WRs. */
/* Clamp the descriptor count to the device's per-QP WR limit. */
297 .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
298 priv->device_attr.max_qp_wr :
300 /* Max number of scatter/gather elements in a WR. */
/* Likewise clamp the per-WR SGE count to the device limit. */
301 .max_recv_sge = ((priv->device_attr.max_sge <
303 priv->device_attr.max_sge :
306 .qp_type = IBV_QPT_RAW_PACKET,
307 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
308 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
/* Returns NULL on failure with errno set by the verbs library. */
313 return ibv_exp_create_qp(priv->ctx, &attr);
319 * Allocate a RSS Queue Pair.
320 * Optionally setup inline receive if supported.
323 * Pointer to private structure.
325 * Completion queue to associate with QP.
327 * Number of descriptors in QP (hint only).
329 * If nonzero, create a parent QP, otherwise a child.
332 * QP pointer or NULL in case of error.
/* RSS variant of rxq_setup_qp(): creates either the parent QP of an RSS
 * group (sized for priv->rxqs_n RX children) or a child QP attached to the
 * existing parent. Compiled only under RSS_SUPPORT (the matching #if is
 * elided from this view).
 * NOTE(review): some initializer fields are elided (CQ/pd/res_domain
 * assignments, the if/else braces); code is byte-identical. */
334 static struct ibv_qp *
335 rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
336 int parent, struct ibv_exp_res_domain *rd)
338 struct ibv_exp_qp_init_attr attr = {
339 /* CQ to be associated with the send queue. */
341 /* CQ to be associated with the receive queue. */
344 /* Max number of outstanding WRs. */
/* Clamp descriptor count to the device's per-QP WR limit. */
345 .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
346 priv->device_attr.max_qp_wr :
348 /* Max number of scatter/gather elements in a WR. */
349 .max_recv_sge = ((priv->device_attr.max_sge <
351 priv->device_attr.max_sge :
354 .qp_type = IBV_QPT_RAW_PACKET,
/* QPG flag enables the QP-group (RSS parent/child) attributes below. */
355 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
356 IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
357 IBV_EXP_QP_INIT_ATTR_QPG),
/* Parent QP: owns the RSS group; it carries no traffic itself. */
363 attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
364 /* TSS isn't necessary. */
365 attr.qpg.parent_attrib.tss_child_count = 0;
366 attr.qpg.parent_attrib.rss_child_count = priv->rxqs_n;
367 DEBUG("initializing parent RSS queue");
/* Child QP: joins the group owned by the previously created parent. */
369 attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
370 attr.qpg.qpg_parent = priv->rxq_parent.qp;
371 DEBUG("initializing child RSS queue");
373 return ibv_exp_create_qp(priv->ctx, &attr);
376 #endif /* RSS_SUPPORT */
379 * Configure a RX queue.
382 * Pointer to Ethernet device structure.
384 * Pointer to RX queue structure.
386 * Number of descriptors to configure in queue.
388 * NUMA socket on which memory must be allocated.
390 * Thresholds parameters.
392 * Memory pool for buffer allocations.
395 * 0 on success, errno value on failure.
/* Configure an RX queue from scratch into a temporary structure (tmpl,
 * declaration elided from this view), then swap it into *rxq once every
 * step has succeeded: MR registration, resource domain, CQ, QP, QP state
 * transitions, MAC flow attachment, element allocation, WR posting and
 * interface-family queries.
 * NOTE(review): many original lines are elided (tmpl declaration, goto
 * error labels, several initializer fields and call arguments); code is
 * kept byte-identical. */
398 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
399 unsigned int socket, const struct rte_eth_rxconf *conf,
400 struct rte_mempool *mp)
402 struct priv *priv = dev->data->dev_private;
408 struct ibv_exp_qp_attr mod;
410 struct ibv_exp_query_intf_params params;
411 struct ibv_exp_cq_init_attr cq;
412 struct ibv_exp_res_domain_init_attr rd;
414 enum ibv_exp_query_intf_status status;
415 struct ibv_recv_wr *bad_wr;
416 struct rte_mbuf *buf;
/* The RSS parent queue is the special rxq embedded in priv. */
418 int parent = (rxq == &priv->rxq_parent);
420 (void)conf; /* Thresholds configuration (ignored). */
422 * If this is a parent queue, hardware must support RSS and
423 * RSS must be enabled.
425 assert((!parent) || ((priv->hw_rss) && (priv->rss)));
427 /* Even if unused, ibv_create_cq() requires at least one
/* Descriptor count must be a nonzero multiple of the per-WR SGE count. */
432 if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
433 ERROR("%p: invalid number of RX descriptors (must be a"
434 " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
437 /* Get mbuf length. */
/* Allocate a throwaway mbuf just to discover the pool's buffer geometry. */
438 buf = rte_pktmbuf_alloc(mp);
440 ERROR("%p: unable to allocate mbuf", (void *)dev);
443 tmpl.mb_len = buf->buf_len;
444 assert((rte_pktmbuf_headroom(buf) +
445 rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
446 assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
447 rte_pktmbuf_free(buf);
448 /* Use the entire RX mempool as the memory region. */
449 tmpl.mr = ibv_reg_mr(priv->pd,
450 (void *)mp->elt_va_start,
451 (mp->elt_va_end - mp->elt_va_start),
452 (IBV_ACCESS_LOCAL_WRITE |
453 IBV_ACCESS_REMOTE_WRITE));
454 if (tmpl.mr == NULL) {
456 ERROR("%p: MR creation failure: %s",
457 (void *)dev, strerror(ret));
/* Resource domain: single-threaded, high-bandwidth message model. */
461 attr.rd = (struct ibv_exp_res_domain_init_attr){
462 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
463 IBV_EXP_RES_DOMAIN_MSG_MODEL),
464 .thread_model = IBV_EXP_THREAD_SINGLE,
465 .msg_model = IBV_EXP_MSG_HIGH_BW,
467 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
468 if (tmpl.rd == NULL) {
470 ERROR("%p: RD creation failure: %s",
471 (void *)dev, strerror(ret));
474 attr.cq = (struct ibv_exp_cq_init_attr){
475 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
476 .res_domain = tmpl.rd,
/* One CQE per descriptor. */
478 tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
479 if (tmpl.cq == NULL) {
481 ERROR("%p: CQ creation failure: %s",
482 (void *)dev, strerror(ret));
485 DEBUG("priv->device_attr.max_qp_wr is %d",
486 priv->device_attr.max_qp_wr);
487 DEBUG("priv->device_attr.max_sge is %d",
488 priv->device_attr.max_sge);
/* With RSS compiled in, use the group-aware QP constructor; the fallback
 * branch below is the non-RSS build (the #if/#else lines are elided). */
491 tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
494 #endif /* RSS_SUPPORT */
495 tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd);
496 if (tmpl.qp == NULL) {
497 ret = (errno ? errno : EINVAL);
498 ERROR("%p: QP creation failure: %s",
499 (void *)dev, strerror(ret));
502 mod = (struct ibv_exp_qp_attr){
503 /* Move the QP to this state. */
504 .qp_state = IBV_QPS_INIT,
505 /* Primary port number. */
506 .port_num = priv->port
/* RESET -> INIT transition; the RSS parent also needs the GROUP_RSS
 * attribute flag (the full attr mask line is elided). */
508 ret = ibv_exp_modify_qp(tmpl.qp, &mod,
511 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
512 #endif /* RSS_SUPPORT */
515 ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
516 (void *)dev, strerror(ret));
/* Attach MAC flows here only for the parent or when RSS is off;
 * RSS children inherit steering from the parent. */
519 if ((parent) || (!priv->rss)) {
520 /* Configure MAC and broadcast addresses. */
521 ret = rxq_mac_addrs_add(&tmpl);
523 ERROR("%p: QP flow attachment failed: %s",
524 (void *)dev, strerror(ret));
528 /* Allocate descriptors for RX queues, except for the RSS parent. */
531 ret = rxq_alloc_elts(&tmpl, desc, NULL);
533 ERROR("%p: RXQ allocation failed: %s",
534 (void *)dev, strerror(ret));
/* Post the whole WR chain built by rxq_alloc_elts() in one call. */
537 ret = ibv_post_recv(tmpl.qp,
538 &(*tmpl.elts.no_sp)[0].wr,
541 ERROR("%p: ibv_post_recv() failed for WR %p: %s",
/* INIT -> RTR: the QP starts receiving once in Ready-To-Receive. */
548 mod = (struct ibv_exp_qp_attr){
549 .qp_state = IBV_QPS_RTR
551 ret = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
553 ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
554 (void *)dev, strerror(ret));
558 tmpl.port_id = dev->data->port_id;
559 DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
/* Query the fast-path verbs interface families used by the burst
 * functions (CQ poll and QP burst receive). */
560 attr.params = (struct ibv_exp_query_intf_params){
561 .intf_scope = IBV_EXP_INTF_GLOBAL,
562 .intf = IBV_EXP_INTF_CQ,
565 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
566 if (tmpl.if_cq == NULL) {
567 ERROR("%p: CQ interface family query failed with status %d",
568 (void *)dev, status);
571 attr.params = (struct ibv_exp_query_intf_params){
572 .intf_scope = IBV_EXP_INTF_GLOBAL,
573 .intf = IBV_EXP_INTF_QP_BURST,
576 tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
577 if (tmpl.if_qp == NULL) {
578 ERROR("%p: QP interface family query failed with status %d",
579 (void *)dev, status);
582 /* Clean up rxq in case we're reinitializing it. */
583 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
/* Success: the fully-built tmpl replaces the old queue contents
 * (the actual assignment and return are elided from this view). */
586 DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
596 * DPDK callback to configure a RX queue.
599 * Pointer to Ethernet device structure.
603 * Number of descriptors to configure in queue.
605 * NUMA socket on which memory must be allocated.
607 * Thresholds parameters.
609 * Memory pool for buffer allocations.
612 * 0 on success, negative errno value on failure.
/* DPDK rx_queue_setup callback: validate the queue index, reuse or
 * allocate the rxq structure, configure it via rxq_setup() and register
 * it in priv->rxqs, switching the device to the mlx5 RX burst function.
 * NOTE(review): some original lines are elided (locking, return
 * statements, the reuse/teardown branch bodies); code is byte-identical. */
615 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
616 unsigned int socket, const struct rte_eth_rxconf *conf,
617 struct rte_mempool *mp)
619 struct priv *priv = dev->data->dev_private;
620 struct rxq *rxq = (*priv->rxqs)[idx];
624 DEBUG("%p: configuring queue %u for %u descriptors",
625 (void *)dev, idx, desc);
626 if (idx >= priv->rxqs_n) {
627 ERROR("%p: queue index out of range (%u >= %u)",
628 (void *)dev, idx, priv->rxqs_n);
/* Slot already populated: the existing queue is torn down and its
 * structure reused (teardown details elided from this view). */
633 DEBUG("%p: reusing already allocated queue index %u (%p)",
634 (void *)dev, idx, (void *)rxq);
/* Detach from the array before reconfiguring. */
639 (*priv->rxqs)[idx] = NULL;
/* Fresh queue: allocate a zeroed rxq on the requested NUMA socket. */
642 rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
644 ERROR("%p: unable to allocate queue index %u",
650 ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
654 DEBUG("%p: adding RX queue %p to list",
655 (void *)dev, (void *)rxq);
656 (*priv->rxqs)[idx] = rxq;
657 /* Update receive callback. */
658 dev->rx_pkt_burst = mlx5_rx_burst;
665 * DPDK callback to release a RX queue.
668 * Generic RX queue pointer.
671 mlx5_rx_queue_release(void *dpdk_rxq)
673 struct rxq *rxq = (struct rxq *)dpdk_rxq;
681 assert(rxq != &priv->rxq_parent);
682 for (i = 0; (i != priv->rxqs_n); ++i)
683 if ((*priv->rxqs)[i] == rxq) {
684 DEBUG("%p: removing RX queue %p from list",
685 (void *)priv->dev, (void *)rxq);
686 (*priv->rxqs)[i] = NULL;