/*
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of 6WIND S.A. nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <infiniband/verbs.h>
#pragma GCC diagnostic error "-pedantic"

/* DPDK headers don't like -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#pragma GCC diagnostic error "-pedantic"

#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"
/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
        .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
                        IBV_EXP_RX_HASH_DST_IPV4 |
                        IBV_EXP_RX_HASH_SRC_PORT_TCP |
                        IBV_EXP_RX_HASH_DST_PORT_TCP),
        .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
        .flow_spec.tcp_udp = {
                .type = IBV_EXP_FLOW_SPEC_TCP,
                .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
        .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
        .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
                        IBV_EXP_RX_HASH_DST_IPV4 |
                        IBV_EXP_RX_HASH_SRC_PORT_UDP |
                        IBV_EXP_RX_HASH_DST_PORT_UDP),
        .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
        .flow_spec.tcp_udp = {
                .type = IBV_EXP_FLOW_SPEC_UDP,
                .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
        .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
        .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
                        IBV_EXP_RX_HASH_DST_IPV4),
        .dpdk_rss_hf = (ETH_RSS_IPV4 |
                .type = IBV_EXP_FLOW_SPEC_IPV4,
                .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
        .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
#ifdef HAVE_FLOW_SPEC_IPV6
        .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
                        IBV_EXP_RX_HASH_DST_IPV6 |
                        IBV_EXP_RX_HASH_SRC_PORT_TCP |
                        IBV_EXP_RX_HASH_DST_PORT_TCP),
        .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
        .flow_spec.tcp_udp = {
                .type = IBV_EXP_FLOW_SPEC_TCP,
                .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
        .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
        .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
                        IBV_EXP_RX_HASH_DST_IPV6 |
                        IBV_EXP_RX_HASH_SRC_PORT_UDP |
                        IBV_EXP_RX_HASH_DST_PORT_UDP),
        .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
        .flow_spec.tcp_udp = {
                .type = IBV_EXP_FLOW_SPEC_UDP,
                .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
        .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
        .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
                        IBV_EXP_RX_HASH_DST_IPV6),
        .dpdk_rss_hf = (ETH_RSS_IPV6 |
                .type = IBV_EXP_FLOW_SPEC_IPV6,
                .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
        .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
#endif /* HAVE_FLOW_SPEC_IPV6 */
        .type = IBV_EXP_FLOW_SPEC_ETH,
        .size = sizeof(hash_rxq_init[0].flow_spec.eth),

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
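
/*
 * Note: each entry above stacks on the entry referenced by its .underlayer
 * pointer; hash_rxq_flow_attr() below walks that chain to emit one flow
 * specification per layer. As a sketch, the TCPv4 chain is expected to be:
 *
 *   TCPV4 (flow_spec.tcp_udp, TCP)
 *     -> IPV4 (flow_spec.ipv4)
 *       -> ETH (flow_spec.eth)
 *         -> NULL (end of chain)
 */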
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
        .max_size = -1u, /* Superseded by HW limitations. */
                1 << HASH_RXQ_TCPV4 |
                1 << HASH_RXQ_UDPV4 |
#ifdef HAVE_FLOW_SPEC_IPV6
                1 << HASH_RXQ_TCPV6 |
                1 << HASH_RXQ_UDPV6 |
#endif /* HAVE_FLOW_SPEC_IPV6 */
#ifdef HAVE_FLOW_SPEC_IPV6
#else /* HAVE_FLOW_SPEC_IPV6 */
#endif /* HAVE_FLOW_SPEC_IPV6 */
        .hash_types = 1 << HASH_RXQ_ETH,

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
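
/*
 * Reading the initializers above: the first entry describes one large
 * indirection table spreading all hashed (L3/L4) queue types across the
 * configured RX queues, while the last one only carries HASH_RXQ_ETH,
 * i.e. traffic that hash RX queues cannot classify. These entries are
 * filtered by priv_make_ind_table_init() according to the enabled RSS
 * hash fields.
 */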
/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
        0x2c, 0xc6, 0x81, 0xd1,
        0x5b, 0xdb, 0xf4, 0xf7,
        0xfc, 0xa2, 0x83, 0x19,
        0xdb, 0x1a, 0x3e, 0x94,
        0x6b, 0x9e, 0x38, 0xd9,
        0x2c, 0x9c, 0x03, 0xd1,
        0xad, 0x99, 0x44, 0xa7,
        0xd9, 0x56, 0x3d, 0x59,
        0x06, 0x3c, 0x25, 0xf3,
        0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
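
/*
 * Applications may provide their own 40-byte key instead of the default
 * above through the standard DPDK configuration path. Minimal sketch
 * (my_key, port_id and the queue counts are placeholders):
 *
 *   static uint8_t my_key[40] = { 0 };
 *   struct rte_eth_conf conf = {
 *           .rxmode = { .mq_mode = ETH_MQ_RX_RSS },
 *           .rx_adv_conf.rss_conf = {
 *                   .rss_key = my_key,
 *                   .rss_key_len = sizeof(my_key),
 *                   .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
 *           },
 *   };
 *   rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
 */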
/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. The allocated area must be
 *   large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
static size_t
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
                   struct ibv_exp_flow_attr *flow_attr,
                   size_t flow_attr_size)
{
        size_t offset = sizeof(*flow_attr);
        enum hash_rxq_type type = hash_rxq->type;
        const struct hash_rxq_init *init = &hash_rxq_init[type];

        assert(hash_rxq->priv != NULL);
        assert((size_t)type < RTE_DIM(hash_rxq_init));
        do {
                offset += init->flow_spec.hdr.size;
                init = init->underlayer;
        } while (init != NULL);
        if (offset > flow_attr_size)
                return offset;
        flow_attr_size = offset;
        init = &hash_rxq_init[type];
        *flow_attr = (struct ibv_exp_flow_attr){
                .type = IBV_EXP_FLOW_ATTR_NORMAL,
                .priority = init->flow_priority,
                .port = hash_rxq->priv->port,
        };
        do {
                offset -= init->flow_spec.hdr.size;
                memcpy((void *)((uintptr_t)flow_attr + offset),
                       &init->flow_spec,
                       init->flow_spec.hdr.size);
                ++flow_attr->num_of_specs;
                init = init->underlayer;
        } while (init != NULL);
        return flow_attr_size;
}
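
/*
 * Typical call pattern (sketch, "attr" is a hypothetical local): query the
 * required size first, then allocate and fill.
 *
 *   size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *   struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *   if (attr != NULL)
 *           hash_rxq_flow_attr(hash_rxq, attr, size);
 */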
/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 *
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
        enum hash_rxq_type type = 0;

        assert(pos < table->hash_types_n);
        if ((table->hash_types & (1 << type)) && (pos-- == 0))
/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
                         struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
        uint64_t rss_hf;
        unsigned int i, j;
        unsigned int table_n = 0;
        /* Mandatory to receive frames not handled by normal hash RX queues. */
        unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

        rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
        /* Process other protocols only if more than one queue. */
        if (priv->rxqs_n > 1)
                for (i = 0; (i != hash_rxq_init_n); ++i)
                        if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
                                hash_types_sup |= (1 << i);
        /* Filter out entries whose protocols are not in the set. */
        for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
                unsigned int nb;
                unsigned int h;

                /* j is increased only if the table has valid protocols. */
                (*table)[j] = ind_table_init[i];
                (*table)[j].hash_types &= hash_types_sup;
                for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
                        if (((*table)[j].hash_types >> h) & 0x1)
                                ++nb;
                (*table)[j].hash_types_n = nb;
                if (nb) {
                        ++table_n;
                        ++j;
                }
        }
        return table_n;
}
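
/*
 * Illustration: with a single RX queue or with rss_hf left at 0, only
 * HASH_RXQ_ETH stays in hash_types_sup and a single one-entry table
 * remains; with several queues and rss_hf covering TCP/UDP/IP, both
 * ind_table_init[] entries survive the filtering.
 */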
/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
        struct ibv_exp_wq *wqs[priv->reta_idx_n];
        struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
        unsigned int ind_tables_n =
                priv_make_ind_table_init(priv, &ind_table_init);
        unsigned int hash_rxqs_n = 0;
        struct hash_rxq (*hash_rxqs)[] = NULL;
        struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;

        assert(priv->ind_tables == NULL);
        assert(priv->ind_tables_n == 0);
        assert(priv->hash_rxqs == NULL);
        assert(priv->hash_rxqs_n == 0);
        assert(priv->pd != NULL);
        assert(priv->ctx != NULL);
        if (priv->rxqs_n == 0)
        assert(priv->rxqs != NULL);
        if (ind_tables_n == 0) {
                ERROR("all hash RX queue types have been filtered out,"
                      " indirection table cannot be created");
        if (priv->rxqs_n & (priv->rxqs_n - 1)) {
                INFO("%u RX queues are configured, consider rounding this"
                     " number to the next power of two for better balancing",
                DEBUG("indirection table extended to assume %u WQs",
        for (i = 0; (i != priv->reta_idx_n); ++i)
                wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
        /* Get number of hash RX queues to configure. */
        for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
                hash_rxqs_n += ind_table_init[i].hash_types_n;
        DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
              hash_rxqs_n, priv->rxqs_n, ind_tables_n);
        /* Create indirection tables. */
        ind_tables = rte_calloc(__func__, ind_tables_n,
                                sizeof((*ind_tables)[0]), 0);
        if (ind_tables == NULL) {
                ERROR("cannot allocate indirection tables container: %s",
        for (i = 0; (i != ind_tables_n); ++i) {
                struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
                        .log_ind_tbl_size = 0, /* Set below. */
                unsigned int ind_tbl_size = ind_table_init[i].max_size;
                struct ibv_exp_rwq_ind_table *ind_table;

                if (priv->reta_idx_n < ind_tbl_size)
                        ind_tbl_size = priv->reta_idx_n;
                ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
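                /*
                 * log2above() rounds up, so a non-power-of-two RETA size
                 * still yields a valid table; e.g. 6 entries give
                 * log_ind_tbl_size = 3, i.e. a hardware table of 8 entries
                 * in which some WQs necessarily appear more than once.
                 */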
                ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
                if (ind_table != NULL) {
                        (*ind_tables)[i] = ind_table;
                /* Not clear whether errno is set. */
                err = (errno ? errno : EINVAL);
                ERROR("RX indirection table creation failed with error %d: %s",
        /* Allocate array that holds hash RX queues and related data. */
        hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
                               sizeof((*hash_rxqs)[0]), 0);
        if (hash_rxqs == NULL) {
                ERROR("cannot allocate hash RX queues container: %s",
        for (i = 0, j = 0, k = 0;
             ((i != hash_rxqs_n) && (j != ind_tables_n));
                struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
                enum hash_rxq_type type =
                        hash_rxq_type_from_pos(&ind_table_init[j], k);
                struct rte_eth_rss_conf *priv_rss_conf =
                        (*priv->rss_conf)[type];
                struct ibv_exp_rx_hash_conf hash_conf = {
                        .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
                        .rx_hash_key_len = (priv_rss_conf ?
                                            priv_rss_conf->rss_key_len :
                                            rss_hash_default_key_len),
                        .rx_hash_key = (priv_rss_conf ?
                                        priv_rss_conf->rss_key :
                                        rss_hash_default_key),
                        .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
                        .rwq_ind_tbl = (*ind_tables)[j],
                struct ibv_exp_qp_init_attr qp_init_attr = {
                        .max_inl_recv = 0, /* Currently not supported. */
                        .qp_type = IBV_QPT_RAW_PACKET,
                        .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
                                      IBV_EXP_QP_INIT_ATTR_RX_HASH),
                        .rx_hash_conf = &hash_conf,
                        .port_num = priv->port,

                DEBUG("using indirection table %u for hash RX queue %u type %d",
                *hash_rxq = (struct hash_rxq){
                        .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
                if (hash_rxq->qp == NULL) {
                        err = (errno ? errno : EINVAL);
                        ERROR("Hash RX QP creation failure: %s",
                if (++k < ind_table_init[j].hash_types_n)
                /* Switch to the next indirection table and reset hash RX
                 * queue type array index. */
        priv->ind_tables = ind_tables;
        priv->ind_tables_n = ind_tables_n;
        priv->hash_rxqs = hash_rxqs;
        priv->hash_rxqs_n = hash_rxqs_n;
        if (hash_rxqs != NULL) {
                for (i = 0; (i != hash_rxqs_n); ++i) {
                        struct ibv_qp *qp = (*hash_rxqs)[i].qp;

                        claim_zero(ibv_destroy_qp(qp));
        if (ind_tables != NULL) {
                for (j = 0; (j != ind_tables_n); ++j) {
                        struct ibv_exp_rwq_ind_table *ind_table =
                        if (ind_table == NULL)
                        claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
                rte_free(ind_tables);
/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
        DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
        if (priv->hash_rxqs_n == 0) {
                assert(priv->hash_rxqs == NULL);
                assert(priv->ind_tables == NULL);
        for (i = 0; (i != priv->hash_rxqs_n); ++i) {
                struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];

                assert(hash_rxq->priv == priv);
                assert(hash_rxq->qp != NULL);
                /* Also check that there are no remaining flows. */
                assert(hash_rxq->allmulti_flow == NULL);
                assert(hash_rxq->promisc_flow == NULL);
                for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
                        for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
                                assert(hash_rxq->mac_flow[j][k] == NULL);
                claim_zero(ibv_destroy_qp(hash_rxq->qp));
        priv->hash_rxqs_n = 0;
        rte_free(priv->hash_rxqs);
        priv->hash_rxqs = NULL;
        for (i = 0; (i != priv->ind_tables_n); ++i) {
                struct ibv_exp_rwq_ind_table *ind_table =
                        (*priv->ind_tables)[i];

                assert(ind_table != NULL);
                claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
        priv->ind_tables_n = 0;
        rte_free(priv->ind_tables);
        priv->ind_tables = NULL;
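        /*
         * Ordering matters here: the hash RX queue QPs were destroyed
         * before the indirection tables they reference, since an
         * indirection table normally cannot be destroyed while a QP is
         * still using it.
         */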
/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
        /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
         * has been requested. */
        if (priv->promisc_req)
                return (type == HASH_RXQ_FLOW_TYPE_PROMISC);
        switch (type) {
        case HASH_RXQ_FLOW_TYPE_PROMISC:
                return !!priv->promisc_req;
        case HASH_RXQ_FLOW_TYPE_ALLMULTI:
                return !!priv->allmulti_req;
        case HASH_RXQ_FLOW_TYPE_MAC:
/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
                  struct rte_mbuf **pool)
{
        struct rxq_elt_sp (*elts)[elts_n] =
                rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
                ERROR("%p: can't allocate packets array", (void *)rxq);
        /* For each WR (packet). */
        for (i = 0; (i != elts_n); ++i) {
                struct rxq_elt_sp *elt = &(*elts)[i];
                struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

                /* These two arrays must have the same size. */
                assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
                /* For each SGE (segment). */
                for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
                        struct ibv_sge *sge = &(*sges)[j];
                        struct rte_mbuf *buf;

                        rte_pktmbuf_reset(buf);
                        buf = rte_pktmbuf_alloc(rxq->mp);
                        assert(pool == NULL);
                        ERROR("%p: empty mbuf pool", (void *)rxq);
                        /* Headroom is reserved by rte_pktmbuf_alloc(). */
                        assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
                        /* Buffer is supposed to be empty. */
                        assert(rte_pktmbuf_data_len(buf) == 0);
                        assert(rte_pktmbuf_pkt_len(buf) == 0);
                        /* sge->addr must be able to store a pointer. */
                        assert(sizeof(sge->addr) >= sizeof(uintptr_t));
                        if (j == 0) {
                                /* The first SGE keeps its headroom. */
                                sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
                                sge->length = (buf->buf_len -
                                               RTE_PKTMBUF_HEADROOM);
                        } else {
                                /* Subsequent SGEs lose theirs. */
                                assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
                                SET_DATA_OFF(buf, 0);
                                sge->addr = (uintptr_t)buf->buf_addr;
                                sge->length = buf->buf_len;
                        }
                        sge->lkey = rxq->mr->lkey;
                        /* Redundant check for tailroom. */
                        assert(sge->length == rte_pktmbuf_tailroom(buf));
        DEBUG("%p: allocated and configured %u WRs (%zu segments)",
              (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
        rxq->elts_n = elts_n;
        assert(pool == NULL);
        for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                struct rxq_elt_sp *elt = &(*elts)[i];

                for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
                        struct rte_mbuf *buf = elt->bufs[j];

                        rte_pktmbuf_free_seg(buf);
        DEBUG("%p: failed, freed everything", (void *)rxq);
/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
        unsigned int elts_n = rxq->elts_n;
        struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

        DEBUG("%p: freeing WRs", (void *)rxq);
        for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                struct rxq_elt_sp *elt = &(*elts)[i];

                for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
                        struct rte_mbuf *buf = elt->bufs[j];

                        rte_pktmbuf_free_seg(buf);
/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
        struct rxq_elt (*elts)[elts_n] =
                rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
                ERROR("%p: can't allocate packets array", (void *)rxq);
        /* For each WR (packet). */
        for (i = 0; (i != elts_n); ++i) {
                struct rxq_elt *elt = &(*elts)[i];
                struct ibv_sge *sge = &(*elts)[i].sge;
                struct rte_mbuf *buf;

                rte_pktmbuf_reset(buf);
                buf = rte_pktmbuf_alloc(rxq->mp);
                assert(pool == NULL);
                ERROR("%p: empty mbuf pool", (void *)rxq);
                /* Headroom is reserved by rte_pktmbuf_alloc(). */
                assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
                /* Buffer is supposed to be empty. */
                assert(rte_pktmbuf_data_len(buf) == 0);
                assert(rte_pktmbuf_pkt_len(buf) == 0);
                /* sge->addr must be able to store a pointer. */
                assert(sizeof(sge->addr) >= sizeof(uintptr_t));
                /* SGE keeps its headroom. */
                sge->addr = (uintptr_t)
                        ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
                sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
                sge->lkey = rxq->mr->lkey;
                /* Redundant check for tailroom. */
                assert(sge->length == rte_pktmbuf_tailroom(buf));
        DEBUG("%p: allocated and configured %u single-segment WRs",
              (void *)rxq, elts_n);
        rxq->elts_n = elts_n;
        rxq->elts.no_sp = elts;
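        /*
         * rxq->elts holds either the scattered (sp) or the single-segment
         * (no_sp) element array; rxq->sp tells which one is valid and both
         * members are expected to share the same storage (see the aliasing
         * assertion in rxq_rehash()).
         */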
        assert(pool == NULL);
        for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                struct rxq_elt *elt = &(*elts)[i];
                struct rte_mbuf *buf = elt->buf;

                rte_pktmbuf_free_seg(buf);
        DEBUG("%p: failed, freed everything", (void *)rxq);
/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
        unsigned int elts_n = rxq->elts_n;
        struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

        DEBUG("%p: freeing WRs", (void *)rxq);
        rxq->elts.no_sp = NULL;
        for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                struct rxq_elt *elt = &(*elts)[i];
                struct rte_mbuf *buf = elt->buf;

                rte_pktmbuf_free_seg(buf);
/**
 * Clean up an RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
        struct ibv_exp_release_intf_params params;

        DEBUG("cleaning up %p", (void *)rxq);
        rxq_free_elts_sp(rxq);
        if (rxq->if_wq != NULL) {
                assert(rxq->priv != NULL);
                assert(rxq->priv->ctx != NULL);
                assert(rxq->wq != NULL);
                params = (struct ibv_exp_release_intf_params){
                claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
        if (rxq->if_cq != NULL) {
                assert(rxq->priv != NULL);
                assert(rxq->priv->ctx != NULL);
                assert(rxq->cq != NULL);
                params = (struct ibv_exp_release_intf_params){
                claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
        claim_zero(ibv_exp_destroy_wq(rxq->wq));
        claim_zero(ibv_destroy_cq(rxq->cq));
        if (rxq->rd != NULL) {
                struct ibv_exp_destroy_res_domain_attr attr = {

                assert(rxq->priv != NULL);
                assert(rxq->priv->ctx != NULL);
                claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
        claim_zero(ibv_dereg_mr(rxq->mr));
        memset(rxq, 0, sizeof(*rxq));
/**
 * Reconfigure an RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, since doing so from the wrong
 * thread (such as a control thread) may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
        struct priv *priv = rxq->priv;
        struct rxq tmpl = *rxq;
        struct rte_mbuf **pool;
        struct ibv_exp_wq_attr mod;

        DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
        /* Number of descriptors and mbufs currently allocated. */
        desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
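        /*
         * E.g. a queue of 256 elements in scattered (sp) mode accounts for
         * 256 * MLX5_PMD_SGE_WR_N mbufs, while the same queue in
         * single-segment mode holds exactly 256.
         */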
        /* Toggle RX checksum offload if hardware supports it. */
        tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
        rxq->csum = tmpl.csum;
        if (priv->hw_csum_l2tun) {
                tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
                rxq->csum_l2tun = tmpl.csum_l2tun;
        /* Enable scattered packets support for this queue if necessary. */
        if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
            (dev->data->dev_conf.rxmode.max_rx_pkt_len >
             (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
                desc_n /= MLX5_PMD_SGE_WR_N;
        DEBUG("%p: %s scattered packets support (%u WRs)",
              (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
        /* If scatter mode is the same as before, nothing to do. */
        if (tmpl.sp == rxq->sp) {
                DEBUG("%p: nothing to do", (void *)dev);
        /* From now on, any failure will render the queue unusable.
         * Reinitialize WQ. */
        mod = (struct ibv_exp_wq_attr){
                .attr_mask = IBV_EXP_WQ_ATTR_STATE,
                .wq_state = IBV_EXP_WQS_RESET,
        err = ibv_exp_modify_wq(tmpl.wq, &mod);
                ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
        pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
                ERROR("%p: cannot allocate memory", (void *)dev);
        /* Snatch mbufs from original queue. */
                struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

                for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                        struct rxq_elt_sp *elt = &(*elts)[i];

                        for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
                                assert(elt->bufs[j] != NULL);
                                pool[k++] = elt->bufs[j];
                struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

                for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                        struct rxq_elt *elt = &(*elts)[i];
                        struct rte_mbuf *buf = elt->buf;
        tmpl.elts.sp = NULL;
        assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
               rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
               rxq_alloc_elts(&tmpl, desc_n, pool));
                ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
        assert(tmpl.elts_n == desc_n);
        assert(tmpl.elts.sp != NULL);
        /* Clean up original data. */
        rte_free(rxq->elts.sp);
        rxq->elts.sp = NULL;
        /* Change queue state to ready. */
        mod = (struct ibv_exp_wq_attr){
                .attr_mask = IBV_EXP_WQ_ATTR_STATE,
                .wq_state = IBV_EXP_WQS_RDY,
        err = ibv_exp_modify_wq(tmpl.wq, &mod);
                ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
                      (void *)dev, strerror(err));
        assert(tmpl.if_wq != NULL);
                struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

                for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                        err = tmpl.if_wq->recv_sg_list
                                RTE_DIM((*elts)[i].sges));
                struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

                for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                        err = tmpl.if_wq->recv_burst(
                ERROR("%p: failed to post SGEs with error %d",
        /* Set err because it does not contain a valid errno value. */
/**
 * Configure an RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
          unsigned int socket, const struct rte_eth_rxconf *conf,
          struct rte_mempool *mp)
{
        struct priv *priv = dev->data->dev_private;
        struct ibv_exp_wq_attr mod;
        union {
                struct ibv_exp_query_intf_params params;
                struct ibv_exp_cq_init_attr cq;
                struct ibv_exp_res_domain_init_attr rd;
                struct ibv_exp_wq_init_attr wq;
        } attr;
        enum ibv_exp_query_intf_status status;
        struct rte_mbuf *buf;
        unsigned int cq_size = desc;

        (void)conf; /* Thresholds configuration (ignored). */
        if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
                ERROR("%p: invalid number of RX descriptors (must be a"
                      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
        /* Get mbuf length. */
        buf = rte_pktmbuf_alloc(mp);
                ERROR("%p: unable to allocate mbuf", (void *)dev);
        tmpl.mb_len = buf->buf_len;
        assert((rte_pktmbuf_headroom(buf) +
                rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
        assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
        rte_pktmbuf_free(buf);
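        /*
         * The mbuf allocated above is only a probe: it is freed right away
         * once its buffer length and headroom have been recorded, so that
         * the scattered packets decision below relies on the actual data
         * room provided by this mempool.
         */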
        /* Toggle RX checksum offload if hardware supports it. */
        tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
        if (priv->hw_csum_l2tun)
                tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
        /* Enable scattered packets support for this queue if necessary. */
        if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
            (dev->data->dev_conf.rxmode.max_rx_pkt_len >
             (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
                desc /= MLX5_PMD_SGE_WR_N;
        DEBUG("%p: %s scattered packets support (%u WRs)",
              (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
        /* Use the entire RX mempool as the memory region. */
        tmpl.mr = ibv_reg_mr(priv->pd,
                             (void *)mp->elt_va_start,
                             (mp->elt_va_end - mp->elt_va_start),
                             (IBV_ACCESS_LOCAL_WRITE |
                              IBV_ACCESS_REMOTE_WRITE));
        if (tmpl.mr == NULL) {
                ERROR("%p: MR creation failure: %s",
                      (void *)dev, strerror(ret));
        attr.rd = (struct ibv_exp_res_domain_init_attr){
                .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
                              IBV_EXP_RES_DOMAIN_MSG_MODEL),
                .thread_model = IBV_EXP_THREAD_SINGLE,
                .msg_model = IBV_EXP_MSG_HIGH_BW,
        tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
        if (tmpl.rd == NULL) {
                ERROR("%p: RD creation failure: %s",
                      (void *)dev, strerror(ret));
        attr.cq = (struct ibv_exp_cq_init_attr){
                .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
                .res_domain = tmpl.rd,
        tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
        if (tmpl.cq == NULL) {
                ERROR("%p: CQ creation failure: %s",
                      (void *)dev, strerror(ret));
        DEBUG("priv->device_attr.max_qp_wr is %d",
              priv->device_attr.max_qp_wr);
        DEBUG("priv->device_attr.max_sge is %d",
              priv->device_attr.max_sge);
        attr.wq = (struct ibv_exp_wq_init_attr){
                .wq_context = NULL, /* Could be useful in the future. */
                .wq_type = IBV_EXP_WQT_RQ,
                /* Max number of outstanding WRs. */
                .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
                                priv->device_attr.max_qp_wr :
                /* Max number of scatter/gather elements in a WR. */
                .max_recv_sge = ((priv->device_attr.max_sge <
                                  MLX5_PMD_SGE_WR_N) ?
                                 priv->device_attr.max_sge :
                .comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
                .res_domain = tmpl.rd,
        tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
        if (tmpl.wq == NULL) {
                ret = (errno ? errno : EINVAL);
                ERROR("%p: WQ creation failure: %s",
                      (void *)dev, strerror(ret));
                ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
                ret = rxq_alloc_elts(&tmpl, desc, NULL);
                ERROR("%p: RXQ allocation failed: %s",
                      (void *)dev, strerror(ret));
        tmpl.port_id = dev->data->port_id;
        DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
        attr.params = (struct ibv_exp_query_intf_params){
                .intf_scope = IBV_EXP_INTF_GLOBAL,
                .intf = IBV_EXP_INTF_CQ,
        tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
        if (tmpl.if_cq == NULL) {
                ERROR("%p: CQ interface family query failed with status %d",
                      (void *)dev, status);
        attr.params = (struct ibv_exp_query_intf_params){
                .intf_scope = IBV_EXP_INTF_GLOBAL,
                .intf = IBV_EXP_INTF_WQ,
        tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
        if (tmpl.if_wq == NULL) {
                ERROR("%p: WQ interface family query failed with status %d",
                      (void *)dev, status);
        /* Change queue state to ready. */
        mod = (struct ibv_exp_wq_attr){
                .attr_mask = IBV_EXP_WQ_ATTR_STATE,
                .wq_state = IBV_EXP_WQS_RDY,
        ret = ibv_exp_modify_wq(tmpl.wq, &mod);
                ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
                      (void *)dev, strerror(ret));
                struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

                for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                        ret = tmpl.if_wq->recv_sg_list
                                RTE_DIM((*elts)[i].sges));
                struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

                for (i = 0; (i != RTE_DIM(*elts)); ++i) {
                        ret = tmpl.if_wq->recv_burst(
                ERROR("%p: failed to post SGEs with error %d",
        /* Set ret because it does not contain a valid errno value. */
        /* Clean up rxq in case we're reinitializing it. */
        DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
        DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
/**
 * DPDK callback to configure an RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                    unsigned int socket, const struct rte_eth_rxconf *conf,
                    struct rte_mempool *mp)
{
        struct priv *priv = dev->data->dev_private;
        struct rxq *rxq = (*priv->rxqs)[idx];

        DEBUG("%p: configuring queue %u for %u descriptors",
              (void *)dev, idx, desc);
        if (idx >= priv->rxqs_n) {
                ERROR("%p: queue index out of range (%u >= %u)",
                      (void *)dev, idx, priv->rxqs_n);
                DEBUG("%p: reusing already allocated queue index %u (%p)",
                      (void *)dev, idx, (void *)rxq);
                if (priv->started) {
                (*priv->rxqs)[idx] = NULL;
                rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
                        ERROR("%p: unable to allocate queue index %u",
        ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
                rxq->stats.idx = idx;
                DEBUG("%p: adding RX queue %p to list",
                      (void *)dev, (void *)rxq);
                (*priv->rxqs)[idx] = rxq;
                /* Update receive callback. */
                dev->rx_pkt_burst = mlx5_rx_burst_sp;
                dev->rx_pkt_burst = mlx5_rx_burst;
/**
 * DPDK callback to release an RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
        struct rxq *rxq = (struct rxq *)dpdk_rxq;

        for (i = 0; (i != priv->rxqs_n); ++i)
                if ((*priv->rxqs)[i] == rxq) {
                        DEBUG("%p: removing RX queue %p from list",
                              (void *)priv->dev, (void *)rxq);
                        (*priv->rxqs)[i] = NULL;