4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #pragma GCC diagnostic ignored "-pedantic"
45 #include <infiniband/verbs.h>
47 #pragma GCC diagnostic error "-pedantic"
50 /* DPDK headers don't like -pedantic. */
52 #pragma GCC diagnostic ignored "-pedantic"
55 #include <rte_malloc.h>
56 #include <rte_ethdev.h>
57 #include <rte_common.h>
59 #pragma GCC diagnostic error "-pedantic"
63 #include "mlx5_rxtx.h"
64 #include "mlx5_utils.h"
65 #include "mlx5_defs.h"
67 /* Initialization data for hash RX queues. */
68 const struct hash_rxq_init hash_rxq_init[] = {
70 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
71 IBV_EXP_RX_HASH_DST_IPV4 |
72 IBV_EXP_RX_HASH_SRC_PORT_TCP |
73 IBV_EXP_RX_HASH_DST_PORT_TCP),
74 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
76 .flow_spec.tcp_udp = {
77 .type = IBV_EXP_FLOW_SPEC_TCP,
78 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
80 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
83 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
84 IBV_EXP_RX_HASH_DST_IPV4 |
85 IBV_EXP_RX_HASH_SRC_PORT_UDP |
86 IBV_EXP_RX_HASH_DST_PORT_UDP),
87 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
89 .flow_spec.tcp_udp = {
90 .type = IBV_EXP_FLOW_SPEC_UDP,
91 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
93 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
96 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
97 IBV_EXP_RX_HASH_DST_IPV4),
98 .dpdk_rss_hf = (ETH_RSS_IPV4 |
102 .type = IBV_EXP_FLOW_SPEC_IPV4,
103 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
105 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
107 #ifdef HAVE_FLOW_SPEC_IPV6
109 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
110 IBV_EXP_RX_HASH_DST_IPV6 |
111 IBV_EXP_RX_HASH_SRC_PORT_TCP |
112 IBV_EXP_RX_HASH_DST_PORT_TCP),
113 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
115 .flow_spec.tcp_udp = {
116 .type = IBV_EXP_FLOW_SPEC_TCP,
117 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
119 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
122 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
123 IBV_EXP_RX_HASH_DST_IPV6 |
124 IBV_EXP_RX_HASH_SRC_PORT_UDP |
125 IBV_EXP_RX_HASH_DST_PORT_UDP),
126 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
128 .flow_spec.tcp_udp = {
129 .type = IBV_EXP_FLOW_SPEC_UDP,
130 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
132 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
135 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
136 IBV_EXP_RX_HASH_DST_IPV6),
137 .dpdk_rss_hf = (ETH_RSS_IPV6 |
141 .type = IBV_EXP_FLOW_SPEC_IPV6,
142 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
144 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
146 #endif /* HAVE_FLOW_SPEC_IPV6 */
152 .type = IBV_EXP_FLOW_SPEC_ETH,
153 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
159 /* Number of entries in hash_rxq_init[]. */
160 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
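/*
 * For instance, HASH_RXQ_TCPV4 chains through .underlayer to HASH_RXQ_IPV4,
 * which in turn chains to HASH_RXQ_ETH at the bottom of the stack;
 * hash_rxq_flow_attr() below walks this chain to emit one flow specification
 * per layer when building a flow steering rule.
 */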
162 /* Initialization data for hash RX queue indirection tables. */
163 static const struct ind_table_init ind_table_init[] = {
165 .max_size = -1u, /* Superseded by HW limitations. */
167 1 << HASH_RXQ_TCPV4 |
168 1 << HASH_RXQ_UDPV4 |
170 #ifdef HAVE_FLOW_SPEC_IPV6
171 1 << HASH_RXQ_TCPV6 |
172 1 << HASH_RXQ_UDPV6 |
174 #endif /* HAVE_FLOW_SPEC_IPV6 */
176 #ifdef HAVE_FLOW_SPEC_IPV6
178 #else /* HAVE_FLOW_SPEC_IPV6 */
180 #endif /* HAVE_FLOW_SPEC_IPV6 */
184 .hash_types = 1 << HASH_RXQ_ETH,
189 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
191 /* Default RSS hash key also used for ConnectX-3. */
192 uint8_t rss_hash_default_key[] = {
193 0x2c, 0xc6, 0x81, 0xd1,
194 0x5b, 0xdb, 0xf4, 0xf7,
195 0xfc, 0xa2, 0x83, 0x19,
196 0xdb, 0x1a, 0x3e, 0x94,
197 0x6b, 0x9e, 0x38, 0xd9,
198 0x2c, 0x9c, 0x03, 0xd1,
199 0xad, 0x99, 0x44, 0xa7,
200 0xd9, 0x56, 0x3d, 0x59,
201 0x06, 0x3c, 0x25, 0xf3,
202 0xfc, 0x1f, 0xdc, 0x2a,
205 /* Length of the default RSS hash key. */
206 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
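/*
 * Illustrative sketch (an assumption, not part of the original sources): an
 * application can override the default key above through the regular DPDK RSS
 * configuration before the hash RX queues are created. The port id, queue
 * counts and the MLX5_RXQ_EXAMPLES guard below are made up for the example.
 */
#ifdef MLX5_RXQ_EXAMPLES /* hypothetical guard, never defined here */
static int
example_configure_rss(uint8_t port_id, uint8_t *key, uint8_t key_len)
{
	struct rte_eth_conf conf = {
		.rxmode = {
			.mq_mode = ETH_MQ_RX_RSS,
		},
		.rx_adv_conf = {
			.rss_conf = {
				.rss_key = key,
				.rss_key_len = key_len,
				.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
			},
		},
	};

	/* 4 RX and 4 TX queues; RSS then spreads traffic over the RX ones. */
	return rte_eth_dev_configure(port_id, 4, 4, &conf);
}
#endif /* MLX5_RXQ_EXAMPLES */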
209 * Populate flow steering rule for a given hash RX queue type using
210 * information from hash_rxq_init[]. Nothing is written to flow_attr when
211 * flow_attr_size is not large enough, but the required size is still returned.
213 * @param[in] hash_rxq
214 * Pointer to hash RX queue.
215 * @param[out] flow_attr
216 * Pointer to flow attribute structure to fill. Note that the allocated
217 * area must be large enough to hold the structure itself and all trailing flow specifications.
218 * @param flow_attr_size
219 * Entire size of flow_attr and trailing room for flow specifications.
222 * Total size of the flow attribute buffer. No errors are defined.
225 hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
226 struct ibv_exp_flow_attr *flow_attr,
227 size_t flow_attr_size)
229 size_t offset = sizeof(*flow_attr);
230 enum hash_rxq_type type = hash_rxq->type;
231 const struct hash_rxq_init *init = &hash_rxq_init[type];
233 assert(hash_rxq->priv != NULL);
234 assert((size_t)type < RTE_DIM(hash_rxq_init));
236 offset += init->flow_spec.hdr.size;
237 init = init->underlayer;
238 } while (init != NULL);
239 if (offset > flow_attr_size)
241 flow_attr_size = offset;
242 init = &hash_rxq_init[type];
243 *flow_attr = (struct ibv_exp_flow_attr){
244 .type = IBV_EXP_FLOW_ATTR_NORMAL,
245 .priority = init->flow_priority,
247 .port = hash_rxq->priv->port,
251 offset -= init->flow_spec.hdr.size;
252 memcpy((void *)((uintptr_t)flow_attr + offset),
254 init->flow_spec.hdr.size);
255 ++flow_attr->num_of_specs;
256 init = init->underlayer;
257 } while (init != NULL);
258 return flow_attr_size;
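/*
 * Usage sketch (an assumption for illustration only): because nothing is
 * written when the buffer is too small, a caller can query the required size
 * first and fill a large enough buffer next. The example_ prefix, the
 * rte_malloc() allocation and the MLX5_RXQ_EXAMPLES guard are hypothetical.
 */
#ifdef MLX5_RXQ_EXAMPLES /* hypothetical guard, never defined here */
static struct ibv_exp_flow_attr *
example_flow_attr_new(const struct hash_rxq *hash_rxq)
{
	/* First pass: a zero-sized buffer is never written to. */
	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);

	if (attr == NULL)
		return NULL;
	/* Second pass: the buffer is now large enough to be filled. */
	hash_rxq_flow_attr(hash_rxq, attr, size);
	return attr;
}
#endif /* MLX5_RXQ_EXAMPLES */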
262 * Return the base-2 logarithm of the input value, rounded up.
268 * Base-2 logarithm of the input value, rounded up; e.g. log2above(4) is 2 and log2above(5) is 3.
271 log2above(unsigned int v)
276 for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
282 * Return the type corresponding to the n'th bit set.
285 * The indirection table.
290 * The corresponding hash_rxq_type.
292 static enum hash_rxq_type
293 hash_rxq_type_from_n(const struct ind_table_init *table, unsigned int n)
295 assert(n < table->hash_types_n);
296 while (((table->hash_types >> n) & 0x1) == 0)
302 * Filter out disabled hash RX queue types from ind_table_init[].
305 * Pointer to private structure.
310 * Number of table entries.
313 priv_make_ind_table_init(struct priv *priv,
314 struct ind_table_init (*table)[IND_TABLE_INIT_N])
319 unsigned int table_n = 0;
320 /* Mandatory to receive frames not handled by normal hash RX queues. */
321 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
323 rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
324 /* Process other protocols only if more than one queue. */
325 if (priv->rxqs_n > 1)
326 for (i = 0; (i != hash_rxq_init_n); ++i)
327 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
328 hash_types_sup |= (1 << i);
330 /* Filter out entries whose protocols are not in the set. */
331 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
335 /* j is increased only if the table has valid protocols. */
337 (*table)[j] = ind_table_init[i];
338 (*table)[j].hash_types &= hash_types_sup;
339 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
340 if (((*table)[j].hash_types >> h) & 0x1)
342 (*table)[j].hash_types_n = nb;
352 * Initialize hash RX queues and indirection table.
355 * Pointer to private structure.
358 * 0 on success, errno value on failure.
361 priv_create_hash_rxqs(struct priv *priv)
363 /* If the requested number of WQs is not a power of two, use the
364 * maximum indirection table size for better balancing.
365 * The result is always rounded to the next power of two. */
367 (1 << log2above((priv->rxqs_n & (priv->rxqs_n - 1)) ?
368 priv->ind_table_max_size :
370 struct ibv_exp_wq *wqs[wqs_n];
371 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
372 unsigned int ind_tables_n =
373 priv_make_ind_table_init(priv, &ind_table_init);
374 unsigned int hash_rxqs_n = 0;
375 struct hash_rxq (*hash_rxqs)[] = NULL;
376 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
382 assert(priv->ind_tables == NULL);
383 assert(priv->ind_tables_n == 0);
384 assert(priv->hash_rxqs == NULL);
385 assert(priv->hash_rxqs_n == 0);
386 assert(priv->pd != NULL);
387 assert(priv->ctx != NULL);
388 if (priv->rxqs_n == 0)
390 assert(priv->rxqs != NULL);
391 if (ind_tables_n == 0) {
392 ERROR("all hash RX queue types have been filtered out,"
393 " indirection table cannot be created");
396 if ((wqs_n < priv->rxqs_n) || (wqs_n > priv->ind_table_max_size)) {
397 ERROR("cannot handle this many RX queues (%u)", priv->rxqs_n);
401 if (wqs_n != priv->rxqs_n) {
402 INFO("%u RX queues are configured, consider rounding this"
403 " number to the next power of two for better balancing",
405 DEBUG("indirection table extended to assume %u WQs", wqs_n);
407 /* When the number of RX queues is not a power of two, the remaining
408 * table entries are padded with reused WQs and hashes are not spread
410 for (i = 0, j = 0; (i != wqs_n); ++i) {
411 wqs[i] = (*priv->rxqs)[j]->wq;
412 if (++j == priv->rxqs_n)
415 /* Get number of hash RX queues to configure. */
416 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
417 hash_rxqs_n += ind_table_init[i].hash_types_n;
418 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
419 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
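/*
 * Worked example of the padding above (a table size of 8 is assumed for
 * brevity): with 6 RX queues, the loop fills the indirection table with WQs
 * [0 1 2 3 4 5 0 1], so the last two entries reuse WQs 0 and 1 and hashes
 * are not spread evenly across queues.
 */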
420 /* Create indirection tables. */
421 ind_tables = rte_calloc(__func__, ind_tables_n,
422 sizeof((*ind_tables)[0]), 0);
423 if (ind_tables == NULL) {
425 ERROR("cannot allocate indirection tables container: %s",
429 for (i = 0; (i != ind_tables_n); ++i) {
430 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
432 .log_ind_tbl_size = 0, /* Set below. */
436 unsigned int ind_tbl_size = ind_table_init[i].max_size;
437 struct ibv_exp_rwq_ind_table *ind_table;
439 if (wqs_n < ind_tbl_size)
440 ind_tbl_size = wqs_n;
441 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
443 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
445 if (ind_table != NULL) {
446 (*ind_tables)[i] = ind_table;
449 /* Not clear whether errno is set. */
450 err = (errno ? errno : EINVAL);
451 ERROR("RX indirection table creation failed with error %d: %s",
455 /* Allocate array that holds hash RX queues and related data. */
456 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
457 sizeof((*hash_rxqs)[0]), 0);
458 if (hash_rxqs == NULL) {
460 ERROR("cannot allocate hash RX queues container: %s",
464 for (i = 0, j = 0, k = 0;
465 ((i != hash_rxqs_n) && (j != ind_tables_n));
467 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
468 enum hash_rxq_type type =
469 hash_rxq_type_from_n(&ind_table_init[j], k);
470 struct rte_eth_rss_conf *priv_rss_conf =
471 (*priv->rss_conf)[type];
472 struct ibv_exp_rx_hash_conf hash_conf = {
473 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
474 .rx_hash_key_len = (priv_rss_conf ?
475 priv_rss_conf->rss_key_len :
476 rss_hash_default_key_len),
477 .rx_hash_key = (priv_rss_conf ?
478 priv_rss_conf->rss_key :
479 rss_hash_default_key),
480 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
481 .rwq_ind_tbl = (*ind_tables)[j],
483 struct ibv_exp_qp_init_attr qp_init_attr = {
484 .max_inl_recv = 0, /* Currently not supported. */
485 .qp_type = IBV_QPT_RAW_PACKET,
486 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
487 IBV_EXP_QP_INIT_ATTR_RX_HASH),
489 .rx_hash_conf = &hash_conf,
490 .port_num = priv->port,
493 DEBUG("using indirection table %u for hash RX queue %u",
495 *hash_rxq = (struct hash_rxq){
497 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
500 if (hash_rxq->qp == NULL) {
501 err = (errno ? errno : EINVAL);
502 ERROR("Hash RX QP creation failure: %s",
506 if (++k < ind_table_init[j].hash_types_n)
508 /* Switch to the next indirection table and reset hash RX
509 * queue type array index. */
513 priv->ind_tables = ind_tables;
514 priv->ind_tables_n = ind_tables_n;
515 priv->hash_rxqs = hash_rxqs;
516 priv->hash_rxqs_n = hash_rxqs_n;
520 if (hash_rxqs != NULL) {
521 for (i = 0; (i != hash_rxqs_n); ++i) {
522 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
526 claim_zero(ibv_destroy_qp(qp));
530 if (ind_tables != NULL) {
531 for (j = 0; (j != ind_tables_n); ++j) {
532 struct ibv_exp_rwq_ind_table *ind_table =
535 if (ind_table == NULL)
537 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
539 rte_free(ind_tables);
545 * Clean up hash RX queues and indirection table.
548 * Pointer to private structure.
551 priv_destroy_hash_rxqs(struct priv *priv)
555 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
556 if (priv->hash_rxqs_n == 0) {
557 assert(priv->hash_rxqs == NULL);
558 assert(priv->ind_tables == NULL);
561 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
562 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
565 assert(hash_rxq->priv == priv);
566 assert(hash_rxq->qp != NULL);
567 /* Also check that there are no remaining flows. */
568 assert(hash_rxq->allmulti_flow == NULL);
569 assert(hash_rxq->promisc_flow == NULL);
570 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
571 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
572 assert(hash_rxq->mac_flow[j][k] == NULL);
573 claim_zero(ibv_destroy_qp(hash_rxq->qp));
575 priv->hash_rxqs_n = 0;
576 rte_free(priv->hash_rxqs);
577 priv->hash_rxqs = NULL;
578 for (i = 0; (i != priv->ind_tables_n); ++i) {
579 struct ibv_exp_rwq_ind_table *ind_table =
580 (*priv->ind_tables)[i];
582 assert(ind_table != NULL);
583 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
585 priv->ind_tables_n = 0;
586 rte_free(priv->ind_tables);
587 priv->ind_tables = NULL;
591 * Check whether a given flow type is allowed.
594 * Pointer to private structure.
596 * Flow type to check.
599 * Nonzero if the given flow type is allowed.
602 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
604 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
605 * has been requested. */
606 if (priv->promisc_req)
607 return (type == HASH_RXQ_FLOW_TYPE_PROMISC);
609 case HASH_RXQ_FLOW_TYPE_PROMISC:
610 return !!priv->promisc_req;
611 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
612 return !!priv->allmulti_req;
613 case HASH_RXQ_FLOW_TYPE_MAC:
620 * Allocate RX queue elements with scattered packets support.
623 * Pointer to RX queue structure.
625 * Number of elements to allocate.
627 * If not NULL, fetch buffers from this array instead of allocating them
628 * with rte_pktmbuf_alloc().
631 * 0 on success, errno value on failure.
634 rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
635 struct rte_mbuf **pool)
638 struct rxq_elt_sp (*elts)[elts_n] =
639 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
644 ERROR("%p: can't allocate packets array", (void *)rxq);
648 /* For each WR (packet). */
649 for (i = 0; (i != elts_n); ++i) {
651 struct rxq_elt_sp *elt = &(*elts)[i];
652 struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
654 /* These two arrays must have the same size. */
655 assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
656 /* For each SGE (segment). */
657 for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
658 struct ibv_sge *sge = &(*sges)[j];
659 struct rte_mbuf *buf;
664 rte_pktmbuf_reset(buf);
666 buf = rte_pktmbuf_alloc(rxq->mp);
668 assert(pool == NULL);
669 ERROR("%p: empty mbuf pool", (void *)rxq);
674 /* Headroom is reserved by rte_pktmbuf_alloc(). */
675 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
676 /* Buffer is supposed to be empty. */
677 assert(rte_pktmbuf_data_len(buf) == 0);
678 assert(rte_pktmbuf_pkt_len(buf) == 0);
679 /* sge->addr must be able to store a pointer. */
680 assert(sizeof(sge->addr) >= sizeof(uintptr_t));
682 /* The first SGE keeps its headroom. */
683 sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
684 sge->length = (buf->buf_len -
685 RTE_PKTMBUF_HEADROOM);
687 /* Subsequent SGEs lose theirs. */
688 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
689 SET_DATA_OFF(buf, 0);
690 sge->addr = (uintptr_t)buf->buf_addr;
691 sge->length = buf->buf_len;
693 sge->lkey = rxq->mr->lkey;
694 /* Redundant check for tailroom. */
695 assert(sge->length == rte_pktmbuf_tailroom(buf));
698 DEBUG("%p: allocated and configured %u WRs (%zu segments)",
699 (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
700 rxq->elts_n = elts_n;
707 assert(pool == NULL);
708 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
710 struct rxq_elt_sp *elt = &(*elts)[i];
712 for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
713 struct rte_mbuf *buf = elt->bufs[j];
716 rte_pktmbuf_free_seg(buf);
721 DEBUG("%p: failed, freed everything", (void *)rxq);
727 * Free RX queue elements with scattered packets support.
730 * Pointer to RX queue structure.
733 rxq_free_elts_sp(struct rxq *rxq)
736 unsigned int elts_n = rxq->elts_n;
737 struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
739 DEBUG("%p: freeing WRs", (void *)rxq);
744 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
746 struct rxq_elt_sp *elt = &(*elts)[i];
748 for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
749 struct rte_mbuf *buf = elt->bufs[j];
752 rte_pktmbuf_free_seg(buf);
759 * Allocate RX queue elements.
762 * Pointer to RX queue structure.
764 * Number of elements to allocate.
766 * If not NULL, fetch buffers from this array instead of allocating them
767 * with rte_pktmbuf_alloc().
770 * 0 on success, errno value on failure.
773 rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
776 struct rxq_elt (*elts)[elts_n] =
777 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
782 ERROR("%p: can't allocate packets array", (void *)rxq);
786 /* For each WR (packet). */
787 for (i = 0; (i != elts_n); ++i) {
788 struct rxq_elt *elt = &(*elts)[i];
789 struct ibv_sge *sge = &(*elts)[i].sge;
790 struct rte_mbuf *buf;
795 rte_pktmbuf_reset(buf);
797 buf = rte_pktmbuf_alloc(rxq->mp);
799 assert(pool == NULL);
800 ERROR("%p: empty mbuf pool", (void *)rxq);
805 /* Headroom is reserved by rte_pktmbuf_alloc(). */
806 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
807 /* Buffer is supposed to be empty. */
808 assert(rte_pktmbuf_data_len(buf) == 0);
809 assert(rte_pktmbuf_pkt_len(buf) == 0);
810 /* sge->addr must be able to store a pointer. */
811 assert(sizeof(sge->addr) >= sizeof(uintptr_t));
812 /* SGE keeps its headroom. */
813 sge->addr = (uintptr_t)
814 ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
815 sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
816 sge->lkey = rxq->mr->lkey;
817 /* Redundant check for tailroom. */
818 assert(sge->length == rte_pktmbuf_tailroom(buf));
820 DEBUG("%p: allocated and configured %u single-segment WRs",
821 (void *)rxq, elts_n);
822 rxq->elts_n = elts_n;
824 rxq->elts.no_sp = elts;
829 assert(pool == NULL);
830 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
831 struct rxq_elt *elt = &(*elts)[i];
832 struct rte_mbuf *buf = elt->buf;
835 rte_pktmbuf_free_seg(buf);
839 DEBUG("%p: failed, freed everything", (void *)rxq);
845 * Free RX queue elements.
848 * Pointer to RX queue structure.
851 rxq_free_elts(struct rxq *rxq)
854 unsigned int elts_n = rxq->elts_n;
855 struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
857 DEBUG("%p: freeing WRs", (void *)rxq);
859 rxq->elts.no_sp = NULL;
862 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
863 struct rxq_elt *elt = &(*elts)[i];
864 struct rte_mbuf *buf = elt->buf;
867 rte_pktmbuf_free_seg(buf);
873 * Clean up an RX queue.
875 * Destroy objects, free allocated memory and reset the structure for reuse.
878 * Pointer to RX queue structure.
881 rxq_cleanup(struct rxq *rxq)
883 struct ibv_exp_release_intf_params params;
885 DEBUG("cleaning up %p", (void *)rxq);
887 rxq_free_elts_sp(rxq);
890 if (rxq->if_wq != NULL) {
891 assert(rxq->priv != NULL);
892 assert(rxq->priv->ctx != NULL);
893 assert(rxq->wq != NULL);
894 params = (struct ibv_exp_release_intf_params){
897 claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
901 if (rxq->if_cq != NULL) {
902 assert(rxq->priv != NULL);
903 assert(rxq->priv->ctx != NULL);
904 assert(rxq->cq != NULL);
905 params = (struct ibv_exp_release_intf_params){
908 claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
913 claim_zero(ibv_exp_destroy_wq(rxq->wq));
915 claim_zero(ibv_destroy_cq(rxq->cq));
916 if (rxq->rd != NULL) {
917 struct ibv_exp_destroy_res_domain_attr attr = {
921 assert(rxq->priv != NULL);
922 assert(rxq->priv->ctx != NULL);
923 claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
928 claim_zero(ibv_dereg_mr(rxq->mr));
929 memset(rxq, 0, sizeof(*rxq));
933 * Reconfigure an RX queue with new parameters.
935 * rxq_rehash() does not allocate mbufs, because doing so outside the right
936 * thread (such as a control thread) may corrupt the pool.
937 * In case of failure, the queue is left untouched.
940 * Pointer to Ethernet device structure.
945 * 0 on success, errno value on failure.
948 rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
950 struct priv *priv = rxq->priv;
951 struct rxq tmpl = *rxq;
954 struct rte_mbuf **pool;
956 struct ibv_exp_wq_attr mod;
959 DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
960 /* Number of descriptors and mbufs currently allocated. */
961 desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
963 /* Toggle RX checksum offload if hardware supports it. */
965 tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
966 rxq->csum = tmpl.csum;
968 if (priv->hw_csum_l2tun) {
969 tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
970 rxq->csum_l2tun = tmpl.csum_l2tun;
972 /* Enable scattered packets support for this queue if necessary. */
973 if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
974 (dev->data->dev_conf.rxmode.max_rx_pkt_len >
975 (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
977 desc_n /= MLX5_PMD_SGE_WR_N;
980 DEBUG("%p: %s scattered packets support (%u WRs)",
981 (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
982 /* If scatter mode is the same as before, nothing to do. */
983 if (tmpl.sp == rxq->sp) {
984 DEBUG("%p: nothing to do", (void *)dev);
987 /* From now on, any failure will render the queue unusable.
988 * Reinitialize WQ. */
989 mod = (struct ibv_exp_wq_attr){
990 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
991 .wq_state = IBV_EXP_WQS_RESET,
993 err = ibv_exp_modify_wq(tmpl.wq, &mod);
995 ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
1000 pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
1002 ERROR("%p: cannot allocate memory", (void *)dev);
1005 /* Snatch mbufs from original queue. */
1008 struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
1010 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1011 struct rxq_elt_sp *elt = &(*elts)[i];
1014 for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
1015 assert(elt->bufs[j] != NULL);
1016 pool[k++] = elt->bufs[j];
1020 struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
1022 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1023 struct rxq_elt *elt = &(*elts)[i];
1024 struct rte_mbuf *buf = elt->buf;
1029 assert(k == mbuf_n);
1031 tmpl.elts.sp = NULL;
1032 assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
1034 rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
1035 rxq_alloc_elts(&tmpl, desc_n, pool));
1037 ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
1042 assert(tmpl.elts_n == desc_n);
1043 assert(tmpl.elts.sp != NULL);
1045 /* Clean up original data. */
1047 rte_free(rxq->elts.sp);
1048 rxq->elts.sp = NULL;
1049 /* Change queue state to ready. */
1050 mod = (struct ibv_exp_wq_attr){
1051 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1052 .wq_state = IBV_EXP_WQS_RDY,
1054 err = ibv_exp_modify_wq(tmpl.wq, &mod);
1056 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1057 (void *)dev, strerror(err));
1061 assert(tmpl.if_wq != NULL);
1063 struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
1065 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1066 err = tmpl.if_wq->recv_sg_list
1069 RTE_DIM((*elts)[i].sges));
1074 struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
1076 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1077 err = tmpl.if_wq->recv_burst(
1086 ERROR("%p: failed to post SGEs with error %d",
1088 /* Set err because it does not contain a valid errno value. */
1099 * Configure an RX queue.
1102 * Pointer to Ethernet device structure.
1104 * Pointer to RX queue structure.
1106 * Number of descriptors to configure in queue.
1108 * NUMA socket on which memory must be allocated.
1110 * Thresholds parameters.
1112 * Memory pool for buffer allocations.
1115 * 0 on success, errno value on failure.
1118 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
1119 unsigned int socket, const struct rte_eth_rxconf *conf,
1120 struct rte_mempool *mp)
1122 struct priv *priv = dev->data->dev_private;
1128 struct ibv_exp_wq_attr mod;
1130 struct ibv_exp_query_intf_params params;
1131 struct ibv_exp_cq_init_attr cq;
1132 struct ibv_exp_res_domain_init_attr rd;
1133 struct ibv_exp_wq_init_attr wq;
1135 enum ibv_exp_query_intf_status status;
1136 struct rte_mbuf *buf;
1139 unsigned int cq_size = desc;
1141 (void)conf; /* Thresholds configuration (ignored). */
1142 if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
1143 ERROR("%p: invalid number of RX descriptors (must be a"
1144 " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
1147 /* Get mbuf length. */
1148 buf = rte_pktmbuf_alloc(mp);
1150 ERROR("%p: unable to allocate mbuf", (void *)dev);
1153 tmpl.mb_len = buf->buf_len;
1154 assert((rte_pktmbuf_headroom(buf) +
1155 rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
1156 assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
1157 rte_pktmbuf_free(buf);
1158 /* Toggle RX checksum offload if hardware supports it. */
1160 tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1161 if (priv->hw_csum_l2tun)
1162 tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1163 /* Enable scattered packets support for this queue if necessary. */
1164 if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
1165 (dev->data->dev_conf.rxmode.max_rx_pkt_len >
1166 (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
1168 desc /= MLX5_PMD_SGE_WR_N;
1170 DEBUG("%p: %s scattered packets support (%u WRs)",
1171 (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
1172 /* Use the entire RX mempool as the memory region. */
1173 tmpl.mr = ibv_reg_mr(priv->pd,
1174 (void *)mp->elt_va_start,
1175 (mp->elt_va_end - mp->elt_va_start),
1176 (IBV_ACCESS_LOCAL_WRITE |
1177 IBV_ACCESS_REMOTE_WRITE));
1178 if (tmpl.mr == NULL) {
1180 ERROR("%p: MR creation failure: %s",
1181 (void *)dev, strerror(ret));
1184 attr.rd = (struct ibv_exp_res_domain_init_attr){
1185 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
1186 IBV_EXP_RES_DOMAIN_MSG_MODEL),
1187 .thread_model = IBV_EXP_THREAD_SINGLE,
1188 .msg_model = IBV_EXP_MSG_HIGH_BW,
1190 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
1191 if (tmpl.rd == NULL) {
1193 ERROR("%p: RD creation failure: %s",
1194 (void *)dev, strerror(ret));
1197 attr.cq = (struct ibv_exp_cq_init_attr){
1198 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
1199 .res_domain = tmpl.rd,
1201 tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
1203 if (tmpl.cq == NULL) {
1205 ERROR("%p: CQ creation failure: %s",
1206 (void *)dev, strerror(ret));
1209 DEBUG("priv->device_attr.max_qp_wr is %d",
1210 priv->device_attr.max_qp_wr);
1211 DEBUG("priv->device_attr.max_sge is %d",
1212 priv->device_attr.max_sge);
1213 attr.wq = (struct ibv_exp_wq_init_attr){
1214 .wq_context = NULL, /* Could be useful in the future. */
1215 .wq_type = IBV_EXP_WQT_RQ,
1216 /* Max number of outstanding WRs. */
1217 .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
1218 priv->device_attr.max_qp_wr :
1220 /* Max number of scatter/gather elements in a WR. */
1221 .max_recv_sge = ((priv->device_attr.max_sge <
1222 MLX5_PMD_SGE_WR_N) ?
1223 priv->device_attr.max_sge :
1227 .comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
1228 .res_domain = tmpl.rd,
1230 tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1231 if (tmpl.wq == NULL) {
1232 ret = (errno ? errno : EINVAL);
1233 ERROR("%p: WQ creation failure: %s",
1234 (void *)dev, strerror(ret));
1238 ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
1240 ret = rxq_alloc_elts(&tmpl, desc, NULL);
1242 ERROR("%p: RXQ allocation failed: %s",
1243 (void *)dev, strerror(ret));
1247 tmpl.port_id = dev->data->port_id;
1248 DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
1249 attr.params = (struct ibv_exp_query_intf_params){
1250 .intf_scope = IBV_EXP_INTF_GLOBAL,
1251 .intf = IBV_EXP_INTF_CQ,
1254 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1255 if (tmpl.if_cq == NULL) {
1256 ERROR("%p: CQ interface family query failed with status %d",
1257 (void *)dev, status);
1260 attr.params = (struct ibv_exp_query_intf_params){
1261 .intf_scope = IBV_EXP_INTF_GLOBAL,
1262 .intf = IBV_EXP_INTF_WQ,
1265 tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1266 if (tmpl.if_wq == NULL) {
1267 ERROR("%p: WQ interface family query failed with status %d",
1268 (void *)dev, status);
1271 /* Change queue state to ready. */
1272 mod = (struct ibv_exp_wq_attr){
1273 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1274 .wq_state = IBV_EXP_WQS_RDY,
1276 ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1278 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1279 (void *)dev, strerror(ret));
1284 struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;
1286 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1287 ret = tmpl.if_wq->recv_sg_list
1290 RTE_DIM((*elts)[i].sges));
1295 struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;
1297 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1298 ret = tmpl.if_wq->recv_burst(
1307 ERROR("%p: failed to post SGEs with error %d",
1309 /* Set ret because it does not contain a valid errno value. */
1313 /* Clean up rxq in case we're reinitializing it. */
1314 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
1317 DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
1327 * DPDK callback to configure an RX queue.
1330 * Pointer to Ethernet device structure.
1334 * Number of descriptors to configure in queue.
1336 * NUMA socket on which memory must be allocated.
1338 * Thresholds parameters.
1340 * Memory pool for buffer allocations.
1343 * 0 on success, negative errno value on failure.
1346 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1347 unsigned int socket, const struct rte_eth_rxconf *conf,
1348 struct rte_mempool *mp)
1350 struct priv *priv = dev->data->dev_private;
1351 struct rxq *rxq = (*priv->rxqs)[idx];
1355 DEBUG("%p: configuring queue %u for %u descriptors",
1356 (void *)dev, idx, desc);
1357 if (idx >= priv->rxqs_n) {
1358 ERROR("%p: queue index out of range (%u >= %u)",
1359 (void *)dev, idx, priv->rxqs_n);
1364 DEBUG("%p: reusing already allocated queue index %u (%p)",
1365 (void *)dev, idx, (void *)rxq);
1366 if (priv->started) {
1370 (*priv->rxqs)[idx] = NULL;
1373 rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
1375 ERROR("%p: unable to allocate queue index %u",
1381 ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
1385 rxq->stats.idx = idx;
1386 DEBUG("%p: adding RX queue %p to list",
1387 (void *)dev, (void *)rxq);
1388 (*priv->rxqs)[idx] = rxq;
1389 /* Update receive callback. */
1391 dev->rx_pkt_burst = mlx5_rx_burst_sp;
1393 dev->rx_pkt_burst = mlx5_rx_burst;
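/*
 * Illustrative sketch (an assumption, not part of the original sources): this
 * callback is reached through the generic rte_eth_rx_queue_setup() API. The
 * pool name, sizes and MLX5_RXQ_EXAMPLES guard below are made up for the
 * example.
 */
#ifdef MLX5_RXQ_EXAMPLES /* hypothetical guard, never defined here */
static int
example_setup_rx_queue(uint8_t port_id, uint16_t queue_id)
{
	struct rte_mempool *mp;

	/* 4096 mbufs with the default data room and a 256-entry cache. */
	mp = rte_pktmbuf_pool_create("example_rx_pool", 4096, 256, 0,
				     RTE_MBUF_DEFAULT_BUF_SIZE,
				     rte_eth_dev_socket_id(port_id));
	if (mp == NULL)
		return -ENOMEM;
	/* 512 descriptors; rxq_setup() above requires a multiple of
	 * MLX5_PMD_SGE_WR_N. */
	return rte_eth_rx_queue_setup(port_id, queue_id, 512,
				      rte_eth_dev_socket_id(port_id),
				      NULL, /* default RX configuration */
				      mp);
}
#endif /* MLX5_RXQ_EXAMPLES */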
1400 * DPDK callback to release an RX queue.
1403 * Generic RX queue pointer.
1406 mlx5_rx_queue_release(void *dpdk_rxq)
1408 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1416 for (i = 0; (i != priv->rxqs_n); ++i)
1417 if ((*priv->rxqs)[i] == rxq) {
1418 DEBUG("%p: removing RX queue %p from list",
1419 (void *)priv->dev, (void *)rxq);
1420 (*priv->rxqs)[i] = NULL;