/*-
 *   BSD LICENSE
 *
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <infiniband/verbs.h>
#pragma GCC diagnostic error "-pedantic"

/* DPDK headers don't like -pedantic. */
#pragma GCC diagnostic ignored "-pedantic"
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#pragma GCC diagnostic error "-pedantic"

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"
/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_EXP_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#ifdef HAVE_FLOW_SPEC_IPV6
	[HASH_RXQ_TCPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_UDPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_EXP_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_IPV6] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
				IBV_EXP_RX_HASH_DST_IPV6),
		.dpdk_rss_hf = (ETH_RSS_IPV6 |
				ETH_RSS_FRAG_IPV6),
		.flow_priority = 1,
		.flow_spec.ipv6 = {
			.type = IBV_EXP_FLOW_SPEC_IPV6,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
#endif /* HAVE_FLOW_SPEC_IPV6 */
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.dpdk_rss_hf = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_EXP_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
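/*
 * Each hash_rxq_init[] entry chains to a less specific one through
 * .underlayer (e.g. TCPv4 -> IPv4 -> Ethernet), so a flow rule for a
 * given type can be built by stacking the specifications of its whole
 * chain; see hash_rxq_flow_attr() below.
 */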
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
#ifdef HAVE_FLOW_SPEC_IPV6
			1 << HASH_RXQ_TCPV6 |
			1 << HASH_RXQ_UDPV6 |
			1 << HASH_RXQ_IPV6 |
#endif /* HAVE_FLOW_SPEC_IPV6 */
			0,
#ifdef HAVE_FLOW_SPEC_IPV6
		.hash_types_n = 6,
#else /* HAVE_FLOW_SPEC_IPV6 */
		.hash_types_n = 3,
#endif /* HAVE_FLOW_SPEC_IPV6 */
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
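/*
 * ind_table_init[] therefore describes two indirection tables: a large
 * one spreading all RSS-enabled protocols across the configured WQs, and
 * a single-entry one for HASH_RXQ_ETH, the catch-all queue receiving
 * frames that no other hash RX queue handles.
 */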
/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
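/*
 * The default key is 40 bytes, the customary length for Toeplitz RSS
 * hashing; it is used verbatim as rx_hash_key whenever the application
 * does not provide its own key (see priv_create_hash_rxqs()).
 */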
/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. The allocated area must
 *   be large enough to hold the structure itself plus all trailing flow
 *   specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
		   struct ibv_exp_flow_attr *flow_attr,
		   size_t flow_attr_size)
{
	size_t offset = sizeof(*flow_attr);
	enum hash_rxq_type type = hash_rxq->type;
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(hash_rxq->priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	/* Sum the size of all flow specifications in the underlayer chain. */
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_exp_flow_attr){
		.type = IBV_EXP_FLOW_ATTR_NORMAL,
		.priority = init->flow_priority,
		.port = hash_rxq->priv->port,
	};
	/* Copy each flow specification backwards, most specific type last. */
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}
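/*
 * Example (sketch): since nothing is written when flow_attr_size is too
 * small, callers can query the required size with a first call, then
 * allocate and fill:
 *
 *	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		(void)hash_rxq_flow_attr(hash_rxq, attr, size);
 */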
/**
 * Return the type corresponding to the n'th bit set.
 *
 * @param table
 *   The indirection table.
 * @param n
 *   The n'th bit set.
 *
 * @return
 *   The corresponding hash_rxq_type.
 */
static enum hash_rxq_type
hash_rxq_type_from_n(const struct ind_table_init *table, unsigned int n)
{
	unsigned int type = 0;

	assert(n < table->hash_types_n);
	/* Scan hash_types, skipping cleared bits, until the n'th set bit. */
	while (1) {
		if ((table->hash_types >> type) & 0x1) {
			if (n == 0)
				break;
			--n;
		}
		++type;
	}
	return type;
}
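/*
 * For example, assuming the enum order matches hash_rxq_init[] and
 * hash_types is 0x6 (HASH_RXQ_UDPV4 and HASH_RXQ_IPV4 set), n == 0
 * yields HASH_RXQ_UDPV4 and n == 1 yields HASH_RXQ_IPV4; the cleared
 * HASH_RXQ_TCPV4 bit is skipped.
 */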
/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		/* Update the compacted entry at j, not the original index. */
		(*table)[j].hash_types_n = nb;
		if (nb != 0) {
			++j;
			++table_n;
		}
	}
	return table_n;
}
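/*
 * For instance, with several RX queues and only ETH_RSS_NONFRAG_IPV4_UDP
 * enabled, hash_types_sup holds the HASH_RXQ_UDPV4 and HASH_RXQ_ETH bits:
 * the first output table keeps a single protocol and the ETH table is
 * copied unchanged.
 */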
/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	struct ibv_exp_wq *wqs[priv->reta_idx_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs",
		      priv->reta_idx_n);
	}
	for (i = 0; (i != priv->reta_idx_n); ++i)
		wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (priv->reta_idx_n < ind_tbl_size)
			ind_tbl_size = priv->reta_idx_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_n(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};

		DEBUG("using indirection table %u for hash RX queue %u",
		      j, i);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		++j;
		k = 0;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	assert(err == 0);
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}
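/*
 * In the loop above, i walks hash RX queues, j indirection tables and k
 * hash types within the current table: each hash RX queue is a QP whose
 * RSS configuration points at the indirection table serving its type,
 * which is why WQs and indirection tables must exist before the QPs.
 */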
/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		assert(hash_rxq->allmulti_flow == NULL);
		assert(hash_rxq->promisc_flow == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}
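/*
 * Teardown mirrors priv_create_hash_rxqs() in reverse: hash QPs are
 * destroyed before the indirection tables they reference, and the
 * container arrays are freed last.
 */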
/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
	 * has been requested. */
	if (priv->promisc_req)
		return (type == HASH_RXQ_FLOW_TYPE_PROMISC);
	switch (type) {
	case HASH_RXQ_FLOW_TYPE_PROMISC:
		return !!priv->promisc_req;
	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
		return !!priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_MAC:
		return 1;
	}
	return 0;
}
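/*
 * Note that a requested promiscuous mode overrides everything else: MAC
 * and allmulti flows are then reported as disallowed, presumably so that
 * callers drop them and rely on the single promiscuous flow instead.
 */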
/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

			if (pool != NULL) {
				buf = *(pool++);
				assert(buf != NULL);
				rte_pktmbuf_reset(buf);
			} else
				buf = rte_pktmbuf_alloc(rxq->mp);
			if (buf == NULL) {
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
				ret = ENOMEM;
				goto error;
			}
			elt->bufs[j] = buf;
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			if (j == 0) {
				/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			} else {
				/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			}
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
		}
	}
	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
	rxq->elts_n = elts_n;
	rxq->elts.sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			unsigned int j;
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				struct rte_mbuf *buf = elt->bufs[j];

				if (buf != NULL)
					rte_pktmbuf_free_seg(buf);
			}
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}
/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
	}
	rte_free(elts);
}
/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

		if (pool != NULL) {
			buf = *(pool++);
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
		} else
			buf = rte_pktmbuf_alloc(rxq->mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
			ret = ENOMEM;
			goto error;
		}
		elt->buf = buf;
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	}
	DEBUG("%p: allocated and configured %u single-segment WRs",
	      (void *)rxq, elts_n);
	rxq->elts_n = elts_n;
	rxq->elts.no_sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}
/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.no_sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		if (buf != NULL)
			rte_pktmbuf_free_seg(buf);
	}
	rte_free(elts);
}
/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	if (rxq->sp)
		rxq_free_elts_sp(rxq);
	else
		rxq_free_elts(rxq);
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_wq,
						&params));
	}
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_cq,
						&params));
	}
	if (rxq->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
	if (rxq->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
						      rxq->rd,
						      &attr));
	}
	if (rxq->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
}
/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_wq_attr mod;
	int err;

	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
	mbuf_n = desc_n;
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX5_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOBUFS;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	err = ((tmpl.sp) ?
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
	if (err) {
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
		rte_free(pool);
		assert(err > 0);
		return err;
	}
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Post SGEs. */
	assert(tmpl.if_wq != NULL);
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (err)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (err)
				break;
		}
	}
	if (err) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, err);
		/* Set err because it does not contain a valid errno value. */
		err = EIO;
	}
error:
	*rxq = tmpl;
	assert(err >= 0);
	return err;
}
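/*
 * rxq_rehash() thus proceeds as reset WQ -> snatch mbufs -> reallocate
 * elements -> WQ back to ready -> repost SGEs, recycling the original
 * mbufs through pool[] so no new allocation happens on this path.
 */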
/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	int ret = 0;
	unsigned int i;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
	if (buf == NULL) {
		ERROR("%p: unable to allocate mbuf", (void *)dev);
		return ENOMEM;
	}
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX5_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = ibv_reg_mr(priv->pd,
			     (void *)mp->elt_va_start,
			     (mp->elt_va_end - mp->elt_va_start),
			     (IBV_ACCESS_LOCAL_WRITE |
			      IBV_ACCESS_REMOTE_WRITE));
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
				priv->device_attr.max_qp_wr :
				(int)cq_size),
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = ((priv->device_attr.max_sge <
				  MLX5_PMD_SGE_WR_N) ?
				 priv->device_attr.max_sge :
				 MLX5_PMD_SGE_WR_N),
		.pd = priv->pd,
		.cq = tmpl.cq,
		.comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Post SGEs. */
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (ret)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (ret)
				break;
		}
	}
	if (ret) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, ret);
		/* Set ret because it does not contain a valid errno value. */
		ret = EIO;
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	rxq_cleanup(rxq);
	*rxq = tmpl;
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
	assert(ret == 0);
	return 0;
error:
	rxq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}
/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int ret;

	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			return -ENOMEM;
		}
	}
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx5_rx_burst;
	}
	return -ret;
}
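/*
 * The RX burst callback is a per-device rather than per-queue setting:
 * the queue configured last decides whether the device uses
 * mlx5_rx_burst_sp or mlx5_rx_burst for all of its queues.
 */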
/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (rxq == NULL)
		return;
	priv = rxq->priv;
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq);
	rte_free(rxq);
}