/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"

/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_EXP_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#ifdef HAVE_FLOW_SPEC_IPV6
	[HASH_RXQ_TCPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_UDPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_IPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6),
		.dpdk_rss_hf = (ETH_RSS_IPV6 |
				ETH_RSS_FRAG_IPV6),
		.flow_priority = 1,
		.flow_spec.ipv6 = {
			.type = IBV_EXP_FLOW_SPEC_IPV6,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#endif /* HAVE_FLOW_SPEC_IPV6 */
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.dpdk_rss_hf = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_EXP_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
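
/*
 * Each entry's underlayer pointer chains flow specifications from most to
 * least specific. For example, a steering rule for HASH_RXQ_TCPV4 is built
 * by walking TCPV4 -> IPV4 -> ETH, which stacks an Ethernet spec, an IPv4
 * spec and a TCP spec (in that memory order) into a single
 * ibv_exp_flow_attr; see priv_flow_attr() below.
 */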

/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
#ifdef HAVE_FLOW_SPEC_IPV6
			1 << HASH_RXQ_TCPV6 |
			1 << HASH_RXQ_UDPV6 |
			1 << HASH_RXQ_IPV6 |
#endif /* HAVE_FLOW_SPEC_IPV6 */
			0,
#ifdef HAVE_FLOW_SPEC_IPV6
		.hash_types_n = 6,
#else /* HAVE_FLOW_SPEC_IPV6 */
		.hash_types_n = 3,
#endif /* HAVE_FLOW_SPEC_IPV6 */
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)

/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
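
/*
 * The default key is a 40-byte Toeplitz key. Applications may override it
 * through the standard DPDK RSS configuration; a minimal sketch, assuming
 * a user-provided 40-byte user_key array:
 *
 *	struct rte_eth_rss_conf rss_conf = {
 *		.rss_key = user_key,
 *		.rss_key_len = 40,
 *		.rss_hf = ETH_RSS_IP | ETH_RSS_TCP,
 *	};
 *	rte_eth_dev_rss_hash_update(port_id, &rss_conf);
 */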

/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 * @param type
 *   Hash RX queue type to use for flow steering rule.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
	       size_t flow_attr_size, enum hash_rxq_type type)
{
	size_t offset = sizeof(*flow_attr);
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_exp_flow_attr){
		.type = IBV_EXP_FLOW_ATTR_NORMAL,
#ifdef MLX5_FDIR_SUPPORT
		/* Priorities < 3 are reserved for flow director. */
		.priority = init->flow_priority + 3,
#else /* MLX5_FDIR_SUPPORT */
		.priority = init->flow_priority,
#endif /* MLX5_FDIR_SUPPORT */
		.num_of_specs = 0,
		.port = priv->port,
		.flags = 0,
	};
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}
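
/*
 * Typical usage, as a sketch: call once with a zero size to learn the
 * required buffer size, then again with a properly sized buffer.
 *
 *	size_t size = priv_flow_attr(priv, NULL, 0, HASH_RXQ_TCPV4);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		priv_flow_attr(priv, attr, size, HASH_RXQ_TCPV4);
 */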

/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 *
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
	enum hash_rxq_type type = 0;

	assert(pos < table->hash_types_n);
	do {
		if ((table->hash_types & (1 << type)) && (pos-- == 0))
			break;
		++type;
	} while (1);
	return type;
}
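
/*
 * For example, with hash_types == (1 << HASH_RXQ_TCPV4) |
 * (1 << HASH_RXQ_UDPV4) | (1 << HASH_RXQ_IPV4), position 0 maps to
 * HASH_RXQ_TCPV4, position 1 to HASH_RXQ_UDPV4 and position 2 to
 * HASH_RXQ_IPV4; types absent from the bit-field are skipped.
 */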

/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++table_n;
			++j;
		}
	}
	return table_n;
}
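
/*
 * As a rough example: with rss_hf == ETH_RSS_NONFRAG_IPV4_TCP and more
 * than one RX queue, hash_types_sup covers HASH_RXQ_TCPV4 and
 * HASH_RXQ_ETH, so the first ind_table_init[] entry shrinks to the TCPv4
 * bit while the mandatory single-entry Ethernet table is kept as-is.
 */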

/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	struct ibv_exp_wq *wqs[priv->reta_idx_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs",
		      priv->reta_idx_n);
	}
	for (i = 0; (i != priv->reta_idx_n); ++i)
		wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (priv->reta_idx_n < ind_tbl_size)
			ind_tbl_size = priv->reta_idx_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_pos(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};

		DEBUG("using indirection table %u for hash RX queue %u type %d",
		      j, i, type);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		k = 0;
		++j;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	assert(err == 0);
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}
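
/*
 * Expected usage from the control path, as a sketch: hash RX queues only
 * exist while the port is started.
 *
 *	err = priv_create_hash_rxqs(priv);
 *	if (err)
 *		return err;
 *	(...)
 *	priv_destroy_hash_rxqs(priv);
 */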

/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
			assert(hash_rxq->special_flow[j] == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}

/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
	 * has been requested. */
	if (priv->promisc_req)
		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
	switch (type) {
	case HASH_RXQ_FLOW_TYPE_PROMISC:
		return !!priv->promisc_req;
	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
		return !!priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_BROADCAST:
#ifdef HAVE_FLOW_SPEC_IPV6
	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
#endif /* HAVE_FLOW_SPEC_IPV6 */
		/* If allmulti is enabled, broadcast and ipv6multi
		 * are unnecessary. */
		return !priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_MAC:
		return 1;
	default:
		/* Unsupported flow type is not allowed. */
		return 0;
	}
	return 0;
}
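
/*
 * In other words: with promiscuous mode requested, only the promiscuous
 * flow survives; with allmulticast requested, broadcast and IPv6 multicast
 * flows are dropped as redundant; MAC flows are otherwise always allowed.
 */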

/**
 * Automatically enable/disable flows according to configuration.
 *
 * @param priv
 *   Private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_rehash_flows(struct priv *priv)
{
	unsigned int i;

	for (i = 0; (i != RTE_DIM((*priv->hash_rxqs)[0].special_flow)); ++i)
		if (!priv_allow_flow_type(priv, i)) {
			priv_special_flow_disable(priv, i);
		} else {
			int ret = priv_special_flow_enable(priv, i);

			if (ret)
				return ret;
		}
	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
		return priv_mac_addrs_enable(priv);
	priv_mac_addrs_disable(priv);
	return 0;
}
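
/*
 * A typical caller, as a sketch: after toggling promiscuous or
 * allmulticast mode, re-evaluate every flow in one go instead of tracking
 * individual transitions.
 *
 *	priv->promisc_req = 1;
 *	priv_rehash_flows(priv);
 */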

/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

			if (pool != NULL) {
				buf = *(pool++);
				assert(buf != NULL);
				rte_pktmbuf_reset(buf);
			} else
				buf = rte_pktmbuf_alloc(rxq->mp);
			if (buf == NULL) {
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
				ret = ENOMEM;
				goto error;
			}
			elt->bufs[j] = buf;
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			if (j == 0) {
				/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			} else {
				/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			}
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
		}
	}
	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
	rxq->elts_n = elts_n;
	rxq->elts_head = 0;
	rxq->elts.sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			unsigned int j;
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				struct rte_mbuf *buf = elt->bufs[j];

				if (buf != NULL)
					rte_pktmbuf_free_seg(buf);
			}
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}
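
/*
 * The resulting SGE layout for one element, roughly: the first segment
 * starts RTE_PKTMBUF_HEADROOM bytes into its mbuf so applications can
 * still prepend data, while the remaining segments expose their whole
 * data room. For instance, a 2048-byte data room with 128 bytes of
 * headroom yields a 1920-byte first segment followed by 2048-byte ones.
 */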

/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
	}
	rte_free(elts);
}

/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

		if (pool != NULL) {
			buf = *(pool++);
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
		} else
			buf = rte_pktmbuf_alloc(rxq->mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
			ret = ENOMEM;
			goto error;
		}
		elt->buf = buf;
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	}
	DEBUG("%p: allocated and configured %u single-segment WRs",
	      (void *)rxq, elts_n);
	rxq->elts_n = elts_n;
	rxq->elts_head = 0;
	rxq->elts.no_sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}
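
/*
 * Unlike the scattered variant, each element here posts a single SGE whose
 * usable length is the mbuf data room minus RTE_PKTMBUF_HEADROOM; packets
 * larger than that require the scattered path above.
 */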

/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.no_sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		if (buf != NULL)
			rte_pktmbuf_free_seg(buf);
	}
	rte_free(elts);
}

/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	if (rxq->sp)
		rxq_free_elts_sp(rxq);
	else
		rxq_free_elts(rxq);
	rxq->poll = NULL;
	rxq->recv = NULL;
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_wq,
						&params));
	}
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_cq,
						&params));
	}
	if (rxq->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
	if (rxq->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
						      rxq->rd,
						      &attr));
	}
	if (rxq->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
}

/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_wq_attr mod;
	int err;

	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
	mbuf_n = desc_n;
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX5_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOBUFS;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	err = ((tmpl.sp) ?
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
	if (err) {
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
		rte_free(pool);
		assert(err > 0);
		return err;
	}
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Post SGEs. */
	assert(tmpl.if_wq != NULL);
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (err)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (err)
				break;
		}
	}
	if (err) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, err);
		/* Set err because it does not contain a valid errno value. */
		err = EIO;
		goto error;
	}
	if (tmpl.sp)
		tmpl.recv = tmpl.if_wq->recv_sg_list;
	else
		tmpl.recv = tmpl.if_wq->recv_burst;
error:
	*rxq = tmpl;
	assert(err >= 0);
	return err;
}
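
/*
 * Example of the descriptor arithmetic above: a queue created with 512
 * single-segment elements that switches to scattered mode with
 * MLX5_PMD_SGE_WR_N == 4 ends up with 128 elements of 4 segments each;
 * the 512 mbufs snatched from the original queue are reused as-is.
 */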

/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	int ret = 0;
	unsigned int i;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
	if (buf == NULL) {
		ERROR("%p: unable to allocate mbuf", (void *)dev);
		return ENOMEM;
	}
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX5_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	/* Configure VLAN stripping. */
	tmpl.vlan_strip = dev->data->dev_conf.rxmode.hw_vlan_strip;
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
				priv->device_attr.max_qp_wr :
				(int)cq_size),
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = ((priv->device_attr.max_sge <
				  MLX5_PMD_SGE_WR_N) ?
				 priv->device_attr.max_sge :
				 MLX5_PMD_SGE_WR_N),
		.pd = priv->pd,
		.cq = tmpl.cq,
		.comp_mask =
			IBV_EXP_CREATE_WQ_RES_DOMAIN |
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
			0,
		.res_domain = tmpl.rd,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
		.vlan_offloads = (tmpl.vlan_strip ?
				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
				  0),
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
	};
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
		.intf_version = 1,
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Post SGEs. */
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (ret)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (ret)
				break;
		}
	}
	if (ret) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, ret);
		/* Set ret because it does not contain a valid errno value. */
		ret = EIO;
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	rxq_cleanup(rxq);
	*rxq = tmpl;
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
	assert(ret == 0);
	/* Assign function in queue. */
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
	rxq->poll = rxq->if_cq->poll_length_flags_cvlan;
#else /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
	rxq->poll = rxq->if_cq->poll_length_flags;
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
	if (rxq->sp)
		rxq->recv = rxq->if_wq->recv_sg_list;
	else
		rxq->recv = rxq->if_wq->recv_burst;
	return 0;
error:
	rxq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}
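
/*
 * A configuration sketch for the check above: since desc must be a
 * nonzero multiple of MLX5_PMD_SGE_WR_N, a build with
 * MLX5_PMD_SGE_WR_N == 4 accepts 512 descriptors (128 elements of 4
 * segments each in scattered mode) but rejects 510.
 */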

/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int ret;

	priv_lock(priv);
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx5_rx_burst;
	}
	priv_unlock(priv);
	return -ret;
}
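
/*
 * Applications reach this callback through the generic ethdev API, e.g.:
 *
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				     NULL, mbuf_pool);
 *
 * A negative return value is the negated errno value set above.
 */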

/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (rxq == NULL)
		return;
	priv = rxq->priv;
	priv_lock(priv);
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq);
	rte_free(rxq);
	priv_unlock(priv);
}