/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <infiniband/verbs.h>
#pragma GCC diagnostic error "-pedantic"

/* DPDK headers don't like -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#pragma GCC diagnostic error "-pedantic"

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"

/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_spec.ipv4 = {
			.type = IBV_EXP_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#ifdef HAVE_FLOW_SPEC_IPV6
	[HASH_RXQ_TCPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_UDPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_IPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6),
		.dpdk_rss_hf = (ETH_RSS_IPV6 |
				ETH_RSS_FRAG_IPV6),
		.flow_spec.ipv6 = {
			.type = IBV_EXP_FLOW_SPEC_IPV6,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#endif /* HAVE_FLOW_SPEC_IPV6 */
	[HASH_RXQ_ETH] = {
		.flow_spec.eth = {
			.type = IBV_EXP_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
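
/*
 * Illustrative sketch (not part of the driver): hash_rxq_init[] is what ties
 * DPDK RSS flags to Verbs hash fields. Given a DPDK rss_hf bit-mask, the
 * matching hash RX queue types can be found by scanning the table, e.g.:
 *
 *	unsigned int i;
 *
 *	for (i = 0; (i != hash_rxq_init_n); ++i)
 *		if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
 *			printf("type %u hashes on fields %#llx\n", i,
 *			       (unsigned long long)
 *			       hash_rxq_init[i].hash_fields);
 */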

/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
#ifdef HAVE_FLOW_SPEC_IPV6
			1 << HASH_RXQ_TCPV6 |
			1 << HASH_RXQ_UDPV6 |
			1 << HASH_RXQ_IPV6 |
#endif /* HAVE_FLOW_SPEC_IPV6 */
			0,
#ifdef HAVE_FLOW_SPEC_IPV6
		.hash_types_n = 6,
#else /* HAVE_FLOW_SPEC_IPV6 */
		.hash_types_n = 3,
#endif /* HAVE_FLOW_SPEC_IPV6 */
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)

/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
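
/*
 * Illustrative sketch (not part of the driver): applications that want RSS
 * to spread flows differently may provide their own 40-byte Toeplitz key
 * instead of rss_hash_default_key through the standard DPDK call, e.g.:
 *
 *	struct rte_eth_rss_conf rss_conf = {
 *		.rss_key = my_key,	// assumed 40-byte array
 *		.rss_key_len = 40,
 *		.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
 *	};
 *
 *	rte_eth_dev_rss_hash_update(port_id, &rss_conf);
 */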

/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still returned.
 *
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
		   struct ibv_exp_flow_attr *flow_attr,
		   size_t flow_attr_size)
{
	size_t offset = sizeof(*flow_attr);
	enum hash_rxq_type type = hash_rxq->type;
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(hash_rxq->priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_exp_flow_attr){
		.type = IBV_EXP_FLOW_ATTR_NORMAL,
		.priority = init->flow_priority,
		.num_of_specs = 0,
		.port = hash_rxq->priv->port,
		.flags = 0,
	};
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}
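
/*
 * Illustrative usage sketch (assumption, not taken from this file): callers
 * typically invoke hash_rxq_flow_attr() twice, first to learn the required
 * size, then to fill a buffer of that size:
 *
 *	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL) {
 *		hash_rxq_flow_attr(hash_rxq, attr, size);
 *		// attr can then be completed and passed to
 *		// ibv_exp_create_flow().
 *	}
 */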

/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
	enum hash_rxq_type type = 0;

	assert(pos < table->hash_types_n);
	do {
		if ((table->hash_types & (1 << type)) && (pos-- == 0))
			break;
		++type;
	} while (1);
	return type;
}

/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++j;
			++table_n;
		}
	}
	return table_n;
}
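
/*
 * Worked example (assumed configuration, for illustration only): with more
 * than one RX queue and rss_hf = ETH_RSS_NONFRAG_IPV4_TCP |
 * ETH_RSS_NONFRAG_IPV4_UDP, the supported set becomes HASH_RXQ_ETH,
 * HASH_RXQ_TCPV4 and HASH_RXQ_UDPV4. The first ind_table_init[] entry is
 * then kept with hash_types_n == 2 (TCPv4 and UDPv4), the single-WQ
 * Ethernet entry is always kept, and priv_make_ind_table_init() returns 2.
 */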

/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	struct ibv_exp_wq *wqs[priv->reta_idx_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs",
		      priv->reta_idx_n);
	}
	for (i = 0; (i != priv->reta_idx_n); ++i)
		wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (priv->reta_idx_n < ind_tbl_size)
			ind_tbl_size = priv->reta_idx_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_pos(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};

		DEBUG("using indirection table %u for hash RX queue %u type %d",
		      j, i, type);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		++j;
		k = 0;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	assert(err == 0);
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}
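
/*
 * Big picture (informal sketch, not additional code): priv_create_hash_rxqs()
 * turns the configured RX queues into the Verbs object chain used for RSS:
 *
 *	rxq[0..n-1].wq --> indirection table(s) --> one hash RX QP per
 *	                                            enabled hash type
 *
 * The RETA (priv->reta_idx[]) decides which WQ backs each indirection table
 * slot, and hash_rxq_init[] decides which hash fields each QP uses.
 */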

/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
			assert(hash_rxq->special_flow[j] == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}

/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
	 * has been requested. */
	if (priv->promisc_req)
		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
	switch (type) {
	case HASH_RXQ_FLOW_TYPE_PROMISC:
		return !!priv->promisc_req;
	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
		return !!priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_MAC:
		return 1;
	}
	return 0;
}

/**
 * Automatically enable/disable flows according to configuration.
 *
 * @param priv
 *   Pointer to private structure.
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_rehash_flows(struct priv *priv)
{
	unsigned int i;

	for (i = 0; (i != RTE_DIM((*priv->hash_rxqs)[0].special_flow)); ++i)
		if (!priv_allow_flow_type(priv, i)) {
			priv_special_flow_disable(priv, i);
		} else {
			int ret = priv_special_flow_enable(priv, i);

			if (ret)
				return ret;
		}
	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
		return priv_mac_addrs_enable(priv);
	priv_mac_addrs_disable(priv);
	return 0;
}
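
/*
 * Illustrative sketch (assumption): priv_rehash_flows() is meant to be called
 * after a configuration change that affects which flows are allowed, e.g.
 * when toggling promiscuous mode:
 *
 *	priv->promisc_req = 1;
 *	err = priv_rehash_flows(priv);
 *	if (err)
 *		ERROR("cannot rehash flows: %s", strerror(err));
 */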

/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

			if (pool != NULL) {
				buf = *(pool++);
				assert(buf != NULL);
				rte_pktmbuf_reset(buf);
			} else
				buf = rte_pktmbuf_alloc(rxq->mp);
			if (buf == NULL) {
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
				ret = ENOMEM;
				goto error;
			}
			elt->bufs[j] = buf;
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			if (j == 0) {
				/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			} else {
				/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			}
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
		}
	}
	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
	rxq->elts_n = elts_n;
	rxq->elts.sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			unsigned int j;
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				struct rte_mbuf *buf = elt->bufs[j];

				if (buf == NULL)
					continue;
				rte_pktmbuf_free_seg(buf);
			}
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}

/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			if (buf == NULL)
				continue;
			rte_pktmbuf_free_seg(buf);
		}
	}
	rte_free(elts);
}
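
/*
 * Layout reminder (informal, derived from the allocation code above): in
 * scattered mode each rxq_elt_sp describes one work request made of several
 * SGEs, and only the first SGE keeps the mbuf headroom:
 *
 *	elt->sges[0].addr = mbuf 0 data start (headroom preserved)
 *	elt->sges[1].addr = mbuf 1 buffer start (headroom removed)
 *	...
 */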

/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

		if (pool != NULL) {
			buf = *(pool++);
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
		} else
			buf = rte_pktmbuf_alloc(rxq->mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
			ret = ENOMEM;
			goto error;
		}
		elt->buf = buf;
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	}
	DEBUG("%p: allocated and configured %u single-segment WRs",
	      (void *)rxq, elts_n);
	rxq->elts_n = elts_n;
	rxq->elts.no_sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			if (buf == NULL)
				continue;
			rte_pktmbuf_free_seg(buf);
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}

/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.no_sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		if (buf == NULL)
			continue;
		rte_pktmbuf_free_seg(buf);
	}
	rte_free(elts);
}

/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	if (rxq->sp)
		rxq_free_elts_sp(rxq);
	else
		rxq_free_elts(rxq);
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_wq,
						&params));
	}
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_cq,
						&params));
	}
	if (rxq->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
	if (rxq->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
						      rxq->rd,
						      &attr));
	}
	if (rxq->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
}
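
/*
 * Informal usage sketch: callers tear a queue down with rxq_cleanup() and
 * only then release the container, mirroring what mlx5_rx_queue_release()
 * does at the end of this file:
 *
 *	rxq_cleanup(rxq);
 *	rte_free(rxq);
 */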

/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_wq_attr mod;
	int err;

	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
	mbuf_n = desc_n;
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX5_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOBUFS;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			assert(buf != NULL);
			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	err = ((tmpl.sp) ?
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
	if (err) {
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
		rte_free(pool);
		assert(err > 0);
		return err;
	}
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Post SGEs. */
	assert(tmpl.if_wq != NULL);
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (err)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (err)
				break;
		}
	}
	if (err) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, err);
		/* Set err because it does not contain a valid errno value. */
		err = EIO;
		goto error;
	}
error:
	*rxq = tmpl;
	assert(err >= 0);
	return err;
}
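
/*
 * Illustrative sketch (assumption): rxq_rehash() exists so that a change such
 * as a new MTU can switch a live queue between single-segment and scattered
 * reception without recreating it, typically along the lines of:
 *
 *	for (i = 0; (i != priv->rxqs_n); ++i) {
 *		struct rxq *rxq = (*priv->rxqs)[i];
 *
 *		if (rxq == NULL)
 *			continue;
 *		if (rxq_rehash(dev, rxq))
 *			break;
 *	}
 */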

/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	int ret = 0;
	unsigned int i;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
	if (buf == NULL) {
		ERROR("%p: unable to allocate mbuf", (void *)dev);
		return ENOMEM;
	}
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX5_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = ibv_reg_mr(priv->pd,
			     (void *)mp->elt_va_start,
			     (mp->elt_va_end - mp->elt_va_start),
			     (IBV_ACCESS_LOCAL_WRITE |
			      IBV_ACCESS_REMOTE_WRITE));
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
				priv->device_attr.max_qp_wr :
				(int)cq_size),
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = ((priv->device_attr.max_sge <
				  MLX5_PMD_SGE_WR_N) ?
				 priv->device_attr.max_sge :
				 MLX5_PMD_SGE_WR_N),
		.pd = priv->pd,
		.cq = tmpl.cq,
		.comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Post SGEs. */
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (ret)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (ret)
				break;
		}
	}
	if (ret) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, ret);
		/* Set ret because it does not contain a valid errno value. */
		ret = EIO;
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	rxq_cleanup(rxq);
	*rxq = tmpl;
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
	assert(ret == 0);
	return 0;
error:
	rxq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}

/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int ret;

	priv_lock(priv);
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx5_rx_burst;
	}
	priv_unlock(priv);
	return -ret;
}
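
/*
 * Illustrative application-side usage (standard DPDK API, shown only as an
 * example of how this callback gets exercised; names are placeholders):
 *
 *	struct rte_mempool *mp =
 *		rte_pktmbuf_pool_create("rx_pool", 4096, 256, 0,
 *					RTE_MBUF_DEFAULT_BUF_SIZE,
 *					rte_socket_id());
 *
 *	if (mp != NULL)
 *		rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				       NULL, mp);
 */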

/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (rxq == NULL)
		return;
	priv = rxq->priv;
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;