/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"
/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_EXP_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#ifdef HAVE_FLOW_SPEC_IPV6
	[HASH_RXQ_TCPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_UDPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_IPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6),
		.dpdk_rss_hf = (ETH_RSS_IPV6 |
				ETH_RSS_FRAG_IPV6),
		.flow_priority = 1,
		.flow_spec.ipv6 = {
			.type = IBV_EXP_FLOW_SPEC_IPV6,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#endif /* HAVE_FLOW_SPEC_IPV6 */
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.dpdk_rss_hf = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_EXP_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
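
/*
 * Illustration (informational, derived from the table above): each entry
 * describes one hash RX queue type and links to the specification it is
 * built upon through .underlayer. For HASH_RXQ_TCPV4 the chain is:
 *
 *	TCPV4 -> IPV4 -> ETH -> NULL
 *
 * hash_rxq_flow_attr() below walks this chain once to compute the total
 * flow specification size, then copies the specifications back to front,
 * so the resulting buffer is ordered ETH, IPv4, TCP as verbs expects.
 */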
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
#ifdef HAVE_FLOW_SPEC_IPV6
			1 << HASH_RXQ_TCPV6 |
			1 << HASH_RXQ_UDPV6 |
			1 << HASH_RXQ_IPV6 |
#endif /* HAVE_FLOW_SPEC_IPV6 */
			0,
#ifdef HAVE_FLOW_SPEC_IPV6
		.hash_types_n = 6,
#else /* HAVE_FLOW_SPEC_IPV6 */
		.hash_types_n = 3,
#endif /* HAVE_FLOW_SPEC_IPV6 */
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
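
/*
 * This 40-byte Toeplitz key is used whenever the application does not
 * provide its own. A minimal application-side sketch (not part of this
 * file; assumes the standard rte_ethdev RSS configuration path) showing
 * how a key of the same length can be supplied instead:
 *
 *	static uint8_t app_key[40] = { 0 };
 *	struct rte_eth_conf conf = {
 *		.rx_adv_conf.rss_conf = {
 *			.rss_key = app_key,
 *			.rss_key_len = sizeof(app_key),
 *			.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
 *		},
 *	};
 */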
/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
		   struct ibv_exp_flow_attr *flow_attr,
		   size_t flow_attr_size)
{
	size_t offset = sizeof(*flow_attr);
	enum hash_rxq_type type = hash_rxq->type;
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(hash_rxq->priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_exp_flow_attr){
		.type = IBV_EXP_FLOW_ATTR_NORMAL,
		.priority = init->flow_priority,
		.num_of_specs = 0,
		.port = hash_rxq->priv->port,
		.flags = 0,
	};
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}
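
/*
 * Usage sketch (hypothetical caller, not from this file): since nothing is
 * written when the buffer is too small, the required size can be queried
 * first by passing a zero size:
 *
 *	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		(void)hash_rxq_flow_attr(hash_rxq, attr, size);
 */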
/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 *
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
	enum hash_rxq_type type = 0;

	assert(pos < table->hash_types_n);
	do {
		if ((table->hash_types & (1 << type)) && (pos-- == 0))
			break;
		++type;
	} while (1);
	return type;
}
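
/*
 * Example: with hash_types = (1 << HASH_RXQ_TCPV4) | (1 << HASH_RXQ_UDPV4) |
 * (1 << HASH_RXQ_IPV4), pos 0 yields HASH_RXQ_TCPV4, pos 1 HASH_RXQ_UDPV4
 * and pos 2 HASH_RXQ_IPV4: the function returns the type whose bit is the
 * pos-th set bit in the mask.
 */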
/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= (i + 1));
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++j;
			++table_n;
		}
	}
	return table_n;
}
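
/*
 * Example (informational): with a single RX queue, or when rss_hf selects
 * none of the protocols above, hash_types_sup contains only
 * (1 << HASH_RXQ_ETH). The first ind_table_init[] entry is then left with
 * zero hash types and dropped, so only the single-entry HASH_RXQ_ETH
 * indirection table remains and all traffic goes through one hash RX queue.
 */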
/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	struct ibv_exp_wq *wqs[priv->reta_idx_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs",
		      priv->reta_idx_n);
	}
	for (i = 0; (i != priv->reta_idx_n); ++i)
		wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (priv->reta_idx_n < ind_tbl_size)
			ind_tbl_size = priv->reta_idx_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_pos(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};

		DEBUG("using indirection table %u for hash RX queue %u type %d",
		      j, i, type);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		++j;
		k = 0;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	assert(err == 0);
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}
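
/*
 * Note on the power-of-two INFO message above (informational, derived from
 * the code): the verbs indirection table size is expressed as a log2
 * (log_ind_tbl_size), so its effective size is always a power of two. When
 * the number of RX queues is not, reta_idx_n is extended and some queues
 * necessarily occupy more table slots than others, which slightly skews RSS
 * balancing; hence the advice to round up to a power of two.
 */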
/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
			assert(hash_rxq->special_flow[j] == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}
/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
	 * has been requested. */
	if (priv->promisc_req)
		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
	switch (type) {
	case HASH_RXQ_FLOW_TYPE_PROMISC:
		return !!priv->promisc_req;
	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
		return !!priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_BROADCAST:
#ifdef HAVE_FLOW_SPEC_IPV6
	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
#endif /* HAVE_FLOW_SPEC_IPV6 */
		/* If allmulti is enabled, broadcast and ipv6multi
		 * are unnecessary. */
		return !priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_MAC:
		return 1;
	default:
		/* Unsupported flow type is not allowed. */
		return 0;
	}
	return 0;
}
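
/*
 * Example (informational): with promisc_req = 0 and allmulti_req = 1, MAC
 * and ALLMULTI flows are allowed while BROADCAST (and IPV6MULTI when
 * supported) are reported as unnecessary, since the allmulti flow already
 * matches those frames.
 */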
/**
 * Automatically enable/disable flows according to configuration.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_rehash_flows(struct priv *priv)
{
	unsigned int i;

	for (i = 0; (i != RTE_DIM((*priv->hash_rxqs)[0].special_flow)); ++i)
		if (!priv_allow_flow_type(priv, i)) {
			priv_special_flow_disable(priv, i);
		} else {
			int ret = priv_special_flow_enable(priv, i);

			if (ret)
				return ret;
		}
	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
		return priv_mac_addrs_enable(priv);
	priv_mac_addrs_disable(priv);
	return 0;
}
/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

			if (pool != NULL) {
				buf = *(pool++);
				assert(buf != NULL);
				rte_pktmbuf_reset(buf);
			} else
				buf = rte_pktmbuf_alloc(rxq->mp);
			if (buf == NULL) {
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
				ret = ENOMEM;
				goto error;
			}
			elt->bufs[j] = buf;
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			if (j == 0) {
				/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			} else {
				/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			}
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
		}
	}
	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
	rxq->elts_n = elts_n;
	rxq->elts_head = 0;
	rxq->elts.sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			unsigned int j;
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				struct rte_mbuf *buf = elt->bufs[j];

				if (buf != NULL)
					rte_pktmbuf_free_seg(buf);
			}
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}
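
/*
 * Resulting SGE layout for one element (illustration): the first segment
 * keeps RTE_PKTMBUF_HEADROOM in front and exposes (buf_len - headroom)
 * bytes, while the remaining RTE_DIM(elt->sges) - 1 segments each expose
 * the full buf_len, their headroom having been stripped with
 * SET_DATA_OFF(buf, 0) so received data fills them contiguously.
 */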
/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
	}
	rte_free(elts);
}
/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

		if (pool != NULL) {
			buf = *(pool++);
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
		} else
			buf = rte_pktmbuf_alloc(rxq->mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
			ret = ENOMEM;
			goto error;
		}
		elt->buf = buf;
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	}
	DEBUG("%p: allocated and configured %u single-segment WRs",
	      (void *)rxq, elts_n);
	rxq->elts_n = elts_n;
	rxq->elts_head = 0;
	rxq->elts.no_sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}
/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.no_sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		if (buf != NULL)
			rte_pktmbuf_free_seg(buf);
	}
	rte_free(elts);
}
/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	if (rxq->sp)
		rxq_free_elts_sp(rxq);
	else
		rxq_free_elts(rxq);
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_wq,
						&params));
	}
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_cq,
						&params));
	}
	if (rxq->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
	if (rxq->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
						      rxq->rd,
						      &attr));
	}
	if (rxq->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
}
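
/*
 * Teardown order matters here (informational): the queried interfaces
 * (if_wq/if_cq) are released before the WQ and CQ they refer to, the WQ
 * before the CQ it consumes, and the resource domain and memory region
 * last, mirroring the reverse of the creation order in rxq_setup().
 */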
/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_wq_attr mod;
	int err;

	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
	mbuf_n = desc_n;
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX5_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOBUFS;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	err = ((tmpl.sp) ?
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
	if (err) {
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
		rte_free(pool);
		assert(err > 0);
		return err;
	}
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Post SGEs. */
	assert(tmpl.if_wq != NULL);
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (err)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (err)
				break;
		}
	}
	if (err) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, err);
		/* Set err because it does not contain a valid errno value. */
		err = EIO;
		goto error;
	}
error:
	*rxq = tmpl;
	assert(err >= 0);
	return err;
}
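
/*
 * Why the mbuf "snatching" above works (informational): switching scatter
 * mode only regroups the same buffers into different SGE layouts. The old
 * queue holds exactly mbuf_n buffers, and the new layout consumes exactly
 * desc_n * MLX5_PMD_SGE_WR_N (scattered) or desc_n (non-scattered) buffers,
 * both equal to mbuf_n, so no allocation is needed and a control thread
 * cannot corrupt the pool by allocating from it.
 */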
/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket,
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	int ret = 0;
	unsigned int i;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
	if (buf == NULL) {
		ERROR("%p: unable to allocate mbuf", (void *)dev);
		return ENOMEM;
	}
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX5_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = ibv_reg_mr(priv->pd,
			     (void *)mp->elt_va_start,
			     (mp->elt_va_end - mp->elt_va_start),
			     (IBV_ACCESS_LOCAL_WRITE |
			      IBV_ACCESS_REMOTE_WRITE));
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
				priv->device_attr.max_qp_wr :
				(int)cq_size),
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = ((priv->device_attr.max_sge <
				  MLX5_PMD_SGE_WR_N) ?
				 priv->device_attr.max_sge :
				 MLX5_PMD_SGE_WR_N),
		.pd = priv->pd,
		.cq = tmpl.cq,
		.comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Post SGEs. */
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (ret)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (ret)
				break;
		}
	}
	if (ret) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, ret);
		/* Set ret because it does not contain a valid errno value. */
		ret = EIO;
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	rxq_cleanup(rxq);
	*rxq = tmpl;
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
	assert(ret == 0);
	return 0;
error:
	rxq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}
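
/*
 * Design note (informational): all verbs objects are first created into the
 * local tmpl structure and only committed to *rxq once every step has
 * succeeded; the error path can therefore tear down tmpl with rxq_cleanup()
 * while any previous configuration in *rxq remains intact.
 */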
/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int ret;

	priv_lock(priv);
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx5_rx_burst;
	}
	priv_unlock(priv);
	return -ret;
}
/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (rxq == NULL)
		return;
	priv = rxq->priv;
	priv_lock(priv);
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq);
	rte_free(rxq);
	priv_unlock(priv);
}