 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *   * Neither the name of 6WIND S.A. nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <infiniband/verbs.h>
#pragma GCC diagnostic error "-pedantic"

/* DPDK headers don't like -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#pragma GCC diagnostic error "-pedantic"

#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"
/* Initialization data for hash RX queues. */
static const struct hash_rxq_init hash_rxq_init[] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
			.type = IBV_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),

/* Number of entries in hash_rxq_init[]. */
static const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
		.max_size = -1u, /* Superseded by HW limitations. */
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
		.hash_types = 1 << HASH_RXQ_ETH,

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
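
/*
 * Illustrative example (names such as app_rss_key and the port/queue counts
 * are placeholders): an application can supply its own 40-byte Toeplitz key
 * instead of rss_hash_default_key through the usual DPDK configuration, e.g.:
 *
 *	static uint8_t app_rss_key[40] = { ... };
 *	struct rte_eth_conf conf = {
 *		.rx_adv_conf.rss_conf = {
 *			.rss_key = app_rss_key,
 *			.rss_key_len = sizeof(app_rss_key),
 *		},
 *	};
 *	rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
 *
 * Whichever key is in effect is expected to be carried by priv->rss_conf,
 * which the hash RX queue creation code below reads.
 */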
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still returned.
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. The allocated area must be
 *   large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 *   Total size of the flow attribute buffer. No errors are defined.
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
		   struct ibv_flow_attr *flow_attr,
		   size_t flow_attr_size)
	size_t offset = sizeof(*flow_attr);
	enum hash_rxq_type type = hash_rxq->type;
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(hash_rxq->priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_flow_attr){
		.type = IBV_FLOW_ATTR_NORMAL,
		.priority = init->flow_priority,
		.port = hash_rxq->priv->port,
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
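
/*
 * Typical usage sketch for hash_rxq_flow_attr() (illustrative only; the
 * actual callers are not part of this excerpt): query the required size with
 * a zero-sized buffer first, then allocate and fill it, e.g.:
 *
 *	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *	struct ibv_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL) {
 *		hash_rxq_flow_attr(hash_rxq, attr, size);
 *		struct ibv_flow *flow = ibv_create_flow(hash_rxq->qp, attr);
 *		rte_free(attr);
 *	}
 */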
 * Return the base-2 logarithm of the input value, rounded up; shifting 1
 * left by the result yields the nearest power of two above the input value.
 *   Log-2 of the input value, rounded up to the next integer.
log2above(unsigned int v)
	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
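
/*
 * For example, assuming the (elided) loop body ORs the shifted-out low bits
 * into r: log2above(8) == 3 and log2above(5) == 3, so (1 << log2above(n))
 * rounds n up to the next power of two (8 in both cases).
 */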
 * Return the type corresponding to the n'th bit set.
 *   The indirection table.
 *   The corresponding hash_rxq_type.
static enum hash_rxq_type
hash_rxq_type_from_n(const struct ind_table_init *table, unsigned int n)
	assert(n < table->hash_types_n);
	while (((table->hash_types >> n) & 0x1) == 0)
 * Filter out disabled hash RX queue types from ind_table_init[].
 *   Pointer to private structure.
 *   Number of table entries.
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (hash_rxq_init[i].hash_fields)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		/* j is increased only if the table has valid protocols. */
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
		(*table)[j].hash_types_n = nb;
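		/*
		 * For instance, with a single RX queue only HASH_RXQ_ETH is
		 * in hash_types_sup, so the TCPv4/UDPv4/IPv4 entry of
		 * ind_table_init[] ends up with no hash types and is filtered
		 * out; with several queues both entries are kept.
		 */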
 * Initialize hash RX queues and indirection table.
 *   Pointer to private structure.
 *   0 on success, errno value on failure.
priv_create_hash_rxqs(struct priv *priv)
	/* If the requested number of WQs is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
		(1 << log2above((priv->rxqs_n & (priv->rxqs_n - 1)) ?
				priv->ind_table_max_size :
	struct ibv_exp_wq *wqs[wqs_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	assert(priv->rss_conf != NULL);
	if (priv->rxqs_n == 0)
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
	if ((wqs_n < priv->rxqs_n) || (wqs_n > priv->ind_table_max_size)) {
		ERROR("cannot handle this many RX queues (%u)", priv->rxqs_n);
	if (wqs_n != priv->rxqs_n) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		DEBUG("indirection table extended to assume %u WQs", wqs_n);
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	for (i = 0, j = 0; (i != wqs_n); ++i) {
		wqs[i] = (*priv->rxqs)[j]->wq;
		if (++j == priv->rxqs_n)
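	/*
	 * For example, with 6 RX queues and an indirection table of, say, 512
	 * entries, wqs[] is filled 0,1,2,3,4,5,0,1,2,... so the first queues
	 * appear one extra time and receive slightly more traffic; hence the
	 * advice above to configure a power-of-two number of queues.
	 */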
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		ERROR("cannot allocate indirection tables container: %s",
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.log_ind_tbl_size = 0, /* Set below. */
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (wqs_n < ind_tbl_size)
			ind_tbl_size = wqs_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
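		/*
		 * E.g. with wqs_n == 8 and the unbounded max_size of -1u
		 * above, ind_tbl_size is clamped to 8 and log_ind_tbl_size
		 * becomes 3 (a 1 << 3 = 8 entry table).
		 */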
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		ERROR("cannot allocate hash RX queues container: %s",
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_n(&ind_table_init[j], k);
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = priv->rss_conf->rss_key_len,
			.rx_hash_key = priv->rss_conf->rss_key,
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		DEBUG("using indirection table %u for hash RX queue %u",
		*hash_rxq = (struct hash_rxq){
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
		if (++k < ind_table_init[j].hash_types_n)
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			claim_zero(ibv_destroy_qp(qp));
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =

			if (ind_table == NULL)
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		rte_free(ind_tables);

 * Clean up hash RX queues and indirection table.
 *   Pointer to private structure.
priv_destroy_hash_rxqs(struct priv *priv)
	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		assert(hash_rxq->allmulti_flow == NULL);
		assert(hash_rxq->promisc_flow == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
 * Allocate RX queue elements with scattered packets support.
 *   Pointer to RX queue structure.
 *   Number of elements to allocate.
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *   0 on success, errno value on failure.
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
		ERROR("%p: can't allocate packets array", (void *)rxq);
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

				rte_pktmbuf_reset(buf);
				buf = rte_pktmbuf_alloc(rxq->mp);
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
	rxq->elts_n = elts_n;
	assert(pool == NULL);
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			rte_pktmbuf_free_seg(buf);
	DEBUG("%p: failed, freed everything", (void *)rxq);
 * Free RX queue elements with scattered packets support.
 *   Pointer to RX queue structure.
rxq_free_elts_sp(struct rxq *rxq)
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			rte_pktmbuf_free_seg(buf);

 * Allocate RX queue elements.
 *   Pointer to RX queue structure.
 *   Number of elements to allocate.
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *   0 on success, errno value on failure.
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
		ERROR("%p: can't allocate packets array", (void *)rxq);
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

			rte_pktmbuf_reset(buf);
			buf = rte_pktmbuf_alloc(rxq->mp);
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	DEBUG("%p: allocated and configured %u single-segment WRs",
	      (void *)rxq, elts_n);
	rxq->elts_n = elts_n;
	rxq->elts.no_sp = elts;
	assert(pool == NULL);
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		rte_pktmbuf_free_seg(buf);
	DEBUG("%p: failed, freed everything", (void *)rxq);

 * Free RX queue elements.
 *   Pointer to RX queue structure.
rxq_free_elts(struct rxq *rxq)
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts.no_sp = NULL;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		rte_pktmbuf_free_seg(buf);

 * Clean up a RX queue.
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *   Pointer to RX queue structure.
rxq_cleanup(struct rxq *rxq)
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	rxq_free_elts_sp(rxq);
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs; allocating them from a thread other
 * than a control thread could corrupt the pool, so the existing buffers are
 * reused instead. In case of failure, the queue is left untouched.
 *   Pointer to Ethernet device structure.
 *   0 on success, errno value on failure.
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	struct rte_mbuf **pool;
	struct ibv_exp_wq_attr mod;

	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
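	/*
	 * For instance, a queue of 256 elements in scattered (sp) mode with
	 * MLX5_PMD_SGE_WR_N segments per WR (e.g. 4) currently owns
	 * 256 * 4 = 1024 descriptors and mbufs; without scattering it owns
	 * exactly 256.
	 */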
	/* Toggle RX checksum offload if hardware supports it. */
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		desc_n /= MLX5_PMD_SGE_WR_N;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
		ERROR("%p: cannot allocate memory", (void *)dev);
	/* Snatch mbufs from original queue. */
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	/* Clean up original data. */
	rte_free(rxq->elts.sp);
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
	assert(tmpl.if_wq != NULL);
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
					RTE_DIM((*elts)[i].sges));
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
		ERROR("%p: failed to post SGEs with error %d",
		/* Set err because it does not contain a valid errno value. */
 * Configure a RX queue.
 *   Pointer to Ethernet device structure.
 *   Pointer to RX queue structure.
 *   Number of descriptors to configure in queue.
 *   NUMA socket on which memory must be allocated.
 *   Thresholds parameters.
 *   Memory pool for buffer allocations.
 *   0 on success, errno value on failure.
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
	struct priv *priv = dev->data->dev_private;
	struct ibv_exp_wq_attr mod;
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
		ERROR("%p: unable to allocate mbuf", (void *)dev);
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		desc /= MLX5_PMD_SGE_WR_N;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
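	/*
	 * Worked example: with the default 128-byte RTE_PKTMBUF_HEADROOM and,
	 * say, 2176-byte mbuf buffers, a single buffer holds up to 2048 bytes
	 * of packet data; enabling jumbo frames with a 9000-byte
	 * max_rx_pkt_len therefore switches scattered mode on and divides the
	 * descriptor count by MLX5_PMD_SGE_WR_N, since each multi-segment WR
	 * consumes that many mbufs.
	 */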
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = ibv_reg_mr(priv->pd,
			     (void *)mp->elt_va_start,
			     (mp->elt_va_end - mp->elt_va_start),
			     (IBV_ACCESS_LOCAL_WRITE |
			      IBV_ACCESS_REMOTE_WRITE));
	if (tmpl.mr == NULL) {
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
	if (tmpl.cq == NULL) {
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
				priv->device_attr.max_qp_wr :
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = ((priv->device_attr.max_sge <
				  MLX5_PMD_SGE_WR_N) ?
				 priv->device_attr.max_sge :
		.comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
		.res_domain = tmpl.rd,
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
					RTE_DIM((*elts)[i].sges));
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
		ERROR("%p: failed to post SGEs with error %d",
		/* Set ret because it does not contain a valid errno value. */
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);

 * DPDK callback to configure a RX queue.
 *   Pointer to Ethernet device structure.
 *   Number of descriptors to configure in queue.
 *   NUMA socket on which memory must be allocated.
 *   Thresholds parameters.
 *   Memory pool for buffer allocations.
 *   0 on success, negative errno value on failure.
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];

	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
		(*priv->rxqs)[idx] = NULL;
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
			ERROR("%p: unable to allocate queue index %u",
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
			dev->rx_pkt_burst = mlx5_rx_burst;
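	/*
	 * Usage sketch (illustrative): applications reach this callback
	 * through rte_eth_rx_queue_setup(port_id, idx, desc, socket, conf,
	 * mp); once the port is started, rte_eth_rx_burst() then invokes
	 * whichever of mlx5_rx_burst_sp()/mlx5_rx_burst() was selected above.
	 */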
 * DPDK callback to release a RX queue.
 *   Generic RX queue pointer.
mlx5_rx_queue_release(void *dpdk_rxq)
	struct rxq *rxq = (struct rxq *)dpdk_rxq;

	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;