/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"

/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.dpdk_rss_hf = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);

/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
			0,
		.hash_types_n = 3,
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)

/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
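
/*
 * The default key above is 40 bytes long (ten rows of four); it is used
 * whenever the application does not provide its own key through rss_conf,
 * see priv_create_hash_rxqs() below.
 */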

/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
		   struct ibv_flow_attr *flow_attr,
		   size_t flow_attr_size)
{
	size_t offset = sizeof(*flow_attr);
	enum hash_rxq_type type = hash_rxq->type;
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(hash_rxq->priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
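	/*
	 * The loop above walked the underlayer chain from hash_rxq_init[]
	 * (e.g. TCPv4 -> IPv4 -> ETH) to total the size of all flow
	 * specifications; report that size without writing anything when
	 * the provided buffer cannot hold them.
	 */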
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_flow_attr){
		.type = IBV_FLOW_ATTR_NORMAL,
		.priority = init->flow_priority,
		.num_of_specs = 0,
		.port = hash_rxq->priv->port,
		.flags = 0,
	};
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}
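
/*
 * Illustrative two-call usage sketch for hash_rxq_flow_attr() (not part of
 * the driver): query the required size first, then fill the rule once a
 * large enough buffer is available.
 *
 *	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *	struct ibv_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		(void)hash_rxq_flow_attr(hash_rxq, attr, size);
 */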

/**
 * Return the binary logarithm of the input value, rounded up (the exponent
 * of the nearest power of two above the input value).
 *
 * @param v
 *   Input value.
 *
 * @return
 *   Binary logarithm of the input value, rounded up.
 */
static unsigned int
log2above(unsigned int v)
{
	unsigned int l;
	unsigned int r;

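	/*
	 * l ends up as the position of the highest set bit (floor(log2(v)))
	 * while r records whether any lower bit is set; r stays 0 exactly
	 * when v is a power of two, so (l + r) is log2(v) rounded up.
	 */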
	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
		r |= (v & 1);
	return (l + r);
}

/**
 * Return the type corresponding to the n'th bit set.
 *
 * @param table
 *   The indirection table.
 * @param n
 *   The n'th bit set.
 *
 * @return
 *   The corresponding hash_rxq_type.
 */
static enum hash_rxq_type
hash_rxq_type_from_n(const struct ind_table_init *table, unsigned int n)
{
	assert(n < table->hash_types_n);
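	/* Skip cleared bits until the n'th set one is reached. */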
	while (((table->hash_types >> n) & 0x1) == 0)
		++n;
	return n;
}

/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);
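
	/*
	 * Example: with rss_hf set to ETH_RSS_NONFRAG_IPV4_UDP and more than
	 * one RX queue, hash_types_sup becomes
	 * (1 << HASH_RXQ_UDPV4) | (1 << HASH_RXQ_ETH).
	 */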
	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++table_n;
			++j;
		}
	}
	return table_n;
}

/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	/* If the requested number of WQs is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
	unsigned int wqs_n =
		(1 << log2above((priv->rxqs_n & (priv->rxqs_n - 1)) ?
				priv->ind_table_max_size :
				priv->rxqs_n));
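	/*
	 * Example: 5 RX queues are not a power of two, so assuming an
	 * ind_table_max_size of 256 this yields wqs_n = 256, whereas
	 * 4 RX queues yield wqs_n = 4.
	 */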
	struct ibv_exp_wq *wqs[wqs_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if ((wqs_n < priv->rxqs_n) || (wqs_n > priv->ind_table_max_size)) {
		ERROR("cannot handle this many RX queues (%u)", priv->rxqs_n);
		err = ERANGE;
		goto error;
	}
	if (wqs_n != priv->rxqs_n) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs", wqs_n);
	}
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * evenly. */
	for (i = 0, j = 0; (i != wqs_n); ++i) {
		wqs[i] = (*priv->rxqs)[j]->wq;
		if (++j == priv->rxqs_n)
			j = 0;
	}
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
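	/*
	 * When nothing has been filtered out, the entries of
	 * ind_table_init[] above contribute 3 + 1 types, i.e. 4 hash RX
	 * queues.
	 */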
369 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
370 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
371 /* Create indirection tables. */
372 ind_tables = rte_calloc(__func__, ind_tables_n,
373 sizeof((*ind_tables)[0]), 0);
374 if (ind_tables == NULL) {
376 ERROR("cannot allocate indirection tables container: %s",
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (wqs_n < ind_tbl_size)
			ind_tbl_size = wqs_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_n(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};
444 DEBUG("using indirection table %u for hash RX queue %u",
446 *hash_rxq = (struct hash_rxq){
448 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
451 if (hash_rxq->qp == NULL) {
452 err = (errno ? errno : EINVAL);
453 ERROR("Hash RX QP creation failure: %s",
457 if (++k < ind_table_init[j].hash_types_n)
459 /* Switch to the next indirection table and reset hash RX
460 * queue type array index. */
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}

/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		assert(hash_rxq->allmulti_flow == NULL);
		assert(hash_rxq->promisc_flow == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}

/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

			if (pool != NULL) {
				buf = *(pool++);
				assert(buf != NULL);
				rte_pktmbuf_reset(buf);
			} else
				buf = rte_pktmbuf_alloc(rxq->mp);
			if (buf == NULL) {
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
				ret = ENOMEM;
				goto error;
			}
			elt->bufs[j] = buf;
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			if (j == 0) {
				/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			} else {
				/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			}
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
		}
	}
620 DEBUG("%p: allocated and configured %u WRs (%zu segments)",
621 (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
622 rxq->elts_n = elts_n;
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			unsigned int j;
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				struct rte_mbuf *buf = elt->bufs[j];

				if (buf != NULL)
					rte_pktmbuf_free_seg(buf);
			}
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}

/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
	}
	rte_free(elts);
}

/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

		if (pool != NULL) {
			buf = *(pool++);
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
		} else
			buf = rte_pktmbuf_alloc(rxq->mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
			ret = ENOMEM;
			goto error;
		}
		elt->buf = buf;
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	}
742 DEBUG("%p: allocated and configured %u single-segment WRs",
743 (void *)rxq, elts_n);
744 rxq->elts_n = elts_n;
746 rxq->elts.no_sp = elts;
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}

/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.no_sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		if (buf != NULL)
			rte_pktmbuf_free_seg(buf);
	}
	rte_free(elts);
}

/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	if (rxq->sp)
		rxq_free_elts_sp(rxq);
	else
		rxq_free_elts(rxq);
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_wq,
						&params));
	}
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_cq,
						&params));
	}
	if (rxq->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
	if (rxq->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
						      rxq->rd,
						      &attr));
	}
	if (rxq->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
}

/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_wq_attr mod;
	int err;
881 DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
882 /* Number of descriptors and mbufs currently allocated. */
883 desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
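	/*
	 * E.g. 128 elements in scattered mode yield, with MLX5_PMD_SGE_WR_N
	 * assumed to be 4, desc_n = 512 descriptors backed by 512 mbufs.
	 */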
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX5_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOMEM;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
954 assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
956 rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
957 rxq_alloc_elts(&tmpl, desc_n, pool));
959 ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Post SGEs. */
	assert(tmpl.if_wq != NULL);
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (err)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (err)
				break;
		}
	}
1008 ERROR("%p: failed to post SGEs with error %d",
1010 /* Set err because it does not contain a valid errno value. */

/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	int ret = 0;
	unsigned int i;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
	if (buf == NULL) {
		ERROR("%p: unable to allocate mbuf", (void *)dev);
		return ENOMEM;
	}
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX5_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = ibv_reg_mr(priv->pd,
			     (void *)mp->elt_va_start,
			     (mp->elt_va_end - mp->elt_va_start),
			     (IBV_ACCESS_LOCAL_WRITE |
			      IBV_ACCESS_REMOTE_WRITE));
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
1131 DEBUG("priv->device_attr.max_qp_wr is %d",
1132 priv->device_attr.max_qp_wr);
1133 DEBUG("priv->device_attr.max_sge is %d",
1134 priv->device_attr.max_sge);
1135 attr.wq = (struct ibv_exp_wq_init_attr){
1136 .wq_context = NULL, /* Could be useful in the future. */
1137 .wq_type = IBV_EXP_WQT_RQ,
1138 /* Max number of outstanding WRs. */
1139 .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
1140 priv->device_attr.max_qp_wr :
1142 /* Max number of scatter/gather elements in a WR. */
1143 .max_recv_sge = ((priv->device_attr.max_sge <
1144 MLX5_PMD_SGE_WR_N) ?
1145 priv->device_attr.max_sge :
1149 .comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
1150 .res_domain = tmpl.rd,
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Post SGEs. */
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (ret)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (ret)
				break;
		}
	}
1229 ERROR("%p: failed to post SGEs with error %d",
1231 /* Set ret because it does not contain a valid errno value. */
1235 /* Clean up rxq in case we're reinitializing it. */
1236 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
1239 DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);

/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int ret;

	priv_lock(priv);
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx5_rx_burst;
	}
	priv_unlock(priv);
	return -ret;
}

/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (rxq == NULL)
		return;
	priv = rxq->priv;
	priv_lock(priv);
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq);
	rte_free(rxq);
	priv_unlock(priv);
}