/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_defs.h"

/* Initialization data for hash RX queues. */
static const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_TCP |
				IBV_EXP_RX_HASH_DST_PORT_TCP),
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4 |
				IBV_EXP_RX_HASH_SRC_PORT_UDP |
				IBV_EXP_RX_HASH_DST_PORT_UDP),
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
				IBV_EXP_RX_HASH_DST_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
static const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);

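/*
 * Illustration (not from the original sources): each entry above chains to
 * the layer it is built upon through ->underlayer, so walking the chain
 * from HASH_RXQ_TCPV4 visits HASH_RXQ_IPV4 and then HASH_RXQ_ETH.
 * hash_rxq_flow_attr() below relies on this to emit one flow specification
 * per layer, ordered from the outermost header to the innermost:
 *
 *   HASH_RXQ_TCPV4: [ ETH spec | IPV4 spec | TCP spec ]
 *   HASH_RXQ_IPV4:  [ ETH spec | IPV4 spec ]
 *   HASH_RXQ_ETH:   [ ETH spec ]
 */
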
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
			0,
		.hash_types_n = 3,
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)

/* Default RSS hash key also used for ConnectX-3. */
static uint8_t hash_rxq_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param[in] hash_rxq
 *   Pointer to hash RX queue.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. The allocated area must be
 *   large enough to hold the flow attribute itself and all trailing flow
 *   specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr, including trailing room for flow
 *   specifications.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,
		   struct ibv_flow_attr *flow_attr,
		   size_t flow_attr_size)
{
	size_t offset = sizeof(*flow_attr);
	enum hash_rxq_type type = hash_rxq->type;
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(hash_rxq->priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_flow_attr){
		.type = IBV_FLOW_ATTR_NORMAL,
		.priority = init->flow_priority,
		.num_of_specs = 0,
		.port = hash_rxq->priv->port,
		.flags = 0,
	};
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}

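/*
 * Usage sketch (hypothetical caller, for illustration only; nothing below
 * is part of the driver flow). Because the required size is returned even
 * when the buffer is too small, a caller can size the buffer in two passes:
 *
 *	size_t size = hash_rxq_flow_attr(hash_rxq, NULL, 0);
 *	struct ibv_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		hash_rxq_flow_attr(hash_rxq, attr, size);
 *
 * On return, attr is followed by attr->num_of_specs flow specifications.
 */
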
/**
 * Return the base-2 logarithm of the input value, rounded up (that is, the
 * exponent of the nearest power of two above the input value).
 *
 * @param v
 *   Input value.
 *
 * @return
 *   Base-2 logarithm of the input value, rounded up.
 */
static unsigned int
log2above(unsigned int v)
{
	unsigned int l;
	unsigned int r;

	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
		r |= (v & 1);
	return (l + r);
}

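/*
 * For illustration: log2above(4) == 2, log2above(5) == 3 and
 * log2above(8) == 3, so (1 << log2above(v)) rounds v up to the next power
 * of two, which is how priv_create_hash_rxqs() uses it below.
 */
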
/**
 * Return the type corresponding to the n'th bit set.
 *
 * @param table
 *   The indirection table.
 * @param n
 *   The n'th bit set.
 *
 * @return
 *   The corresponding hash_rxq_type.
 */
static enum hash_rxq_type
hash_rxq_type_from_n(const struct ind_table_init *table, unsigned int n)
{
	assert(n < table->hash_types_n);
	while (((table->hash_types >> n) & 0x1) == 0)
		++n;
	return n;
}

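/*
 * For illustration: with the Ethernet-only table entry from
 * ind_table_init[] (hash_types == 1 << HASH_RXQ_ETH, hash_types_n == 1),
 * hash_rxq_type_from_n(table, 0) skips the cleared low-order bits and
 * returns HASH_RXQ_ETH.
 */
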
/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (hash_rxq_init[i].hash_fields)
				hash_types_sup |= (1 << i);

	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++table_n;
			++j;
		}
	}
	return table_n;
}

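/*
 * For illustration: with a single RX queue, hash_types_sup only contains
 * HASH_RXQ_ETH, so the TCP/UDP/IPv4 entry is filtered out and a single
 * one-entry indirection table remains (RSS is effectively disabled).
 * With several queues, both ind_table_init[] entries survive unchanged.
 */
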
/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	/* If the requested number of WQs is not a power of two, use the
	 * maximum indirection table size for better balancing.
	 * The result is always rounded to the next power of two. */
	unsigned int wqs_n =
		(1 << log2above((priv->rxqs_n & (priv->rxqs_n - 1)) ?
				priv->ind_table_max_size :
				priv->rxqs_n));
	struct ibv_exp_wq *wqs[wqs_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if ((wqs_n < priv->rxqs_n) || (wqs_n > priv->ind_table_max_size)) {
		ERROR("cannot handle this many RX queues (%u)", priv->rxqs_n);
		err = ERANGE;
		goto error;
	}
	if (wqs_n != priv->rxqs_n) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs", wqs_n);
	}
	/* When the number of RX queues is not a power of two, the remaining
	 * table entries are padded with reused WQs and hashes are not spread
	 * uniformly. */
	for (i = 0, j = 0; (i != wqs_n); ++i) {
		wqs[i] = (*priv->rxqs)[j]->wq;
		if (++j == priv->rxqs_n)
			j = 0;
	}
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
			.pd = priv->pd,
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_exp_rwq_ind_table *ind_table;

		if (wqs_n < ind_tbl_size)
			ind_tbl_size = wqs_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
							 &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_n(&ind_table_init[j], k);
		struct ibv_exp_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = sizeof(hash_rxq_default_key),
			.rx_hash_key = hash_rxq_default_key,
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
			.rwq_ind_tbl = (*ind_tables)[j],
		};
		struct ibv_exp_qp_init_attr qp_init_attr = {
			.max_inl_recv = 0, /* Currently not supported. */
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
				      IBV_EXP_QP_INIT_ATTR_RX_HASH),
			.pd = priv->pd,
			.rx_hash_conf = &hash_conf,
			.port_num = priv->port,
		};

		DEBUG("using indirection table %u for hash RX queue %u",
		      j, i);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		++j;
		k = 0;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	assert(err == 0);
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_exp_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}

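/*
 * Sizing illustration: with 8 RX queues (a power of two), wqs_n == 8 and
 * each WQ appears the same number of times in the indirection table. With
 * 6 RX queues, wqs_n is derived from priv->ind_table_max_size instead and
 * the WQ array wraps around the 6 queues, so some of them are referenced
 * one extra time and the hash spread is uneven, as the INFO message above
 * warns.
 */
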
/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		assert(hash_rxq->allmulti_flow == NULL);
		assert(hash_rxq->promisc_flow == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_exp_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}

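/*
 * Note: hash RX queue QPs are destroyed before the indirection tables they
 * reference; an indirection table must outlive every QP created on top of
 * it, mirroring the creation order in priv_create_hash_rxqs().
 */
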
/**
 * Allocate RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
		  struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt_sp (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];
		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;

		/* These two arrays must have the same size. */
		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
		/* For each SGE (segment). */
		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct ibv_sge *sge = &(*sges)[j];
			struct rte_mbuf *buf;

			if (pool != NULL) {
				buf = *(pool++);
				assert(buf != NULL);
				rte_pktmbuf_reset(buf);
			} else
				buf = rte_pktmbuf_alloc(rxq->mp);
			if (buf == NULL) {
				assert(pool == NULL);
				ERROR("%p: empty mbuf pool", (void *)rxq);
				ret = ENOMEM;
				goto error;
			}
			elt->bufs[j] = buf;
			/* Headroom is reserved by rte_pktmbuf_alloc(). */
			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
			/* Buffer is supposed to be empty. */
			assert(rte_pktmbuf_data_len(buf) == 0);
			assert(rte_pktmbuf_pkt_len(buf) == 0);
			/* sge->addr must be able to store a pointer. */
			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
			if (j == 0) {
				/* The first SGE keeps its headroom. */
				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
				sge->length = (buf->buf_len -
					       RTE_PKTMBUF_HEADROOM);
			} else {
				/* Subsequent SGEs lose theirs. */
				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
				SET_DATA_OFF(buf, 0);
				sge->addr = (uintptr_t)buf->buf_addr;
				sge->length = buf->buf_len;
			}
			sge->lkey = rxq->mr->lkey;
			/* Redundant check for tailroom. */
			assert(sge->length == rte_pktmbuf_tailroom(buf));
		}
	}
	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
	rxq->elts_n = elts_n;
	rxq->elts.sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			unsigned int j;
			struct rxq_elt_sp *elt = &(*elts)[i];

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				struct rte_mbuf *buf = elt->bufs[j];

				if (buf != NULL)
					rte_pktmbuf_free_seg(buf);
			}
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}

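/*
 * For illustration: an element spans RTE_DIM(elt->sges) segments, so one
 * scattered packet may occupy up to that many mbufs. Only the first SGE
 * keeps RTE_PKTMBUF_HEADROOM; the following ones give it up so the whole
 * buffer length is exposed to the hardware.
 */
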
/**
 * Free RX queue elements with scattered packets support.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts_sp(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		unsigned int j;
		struct rxq_elt_sp *elt = &(*elts)[i];

		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
			struct rte_mbuf *buf = elt->bufs[j];

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
	}
	rte_free(elts);
}

/**
 * Allocate RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
{
	unsigned int i;
	struct rxq_elt (*elts)[elts_n] =
		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
				  rxq->socket);
	int ret = 0;

	if (elts == NULL) {
		ERROR("%p: can't allocate packets array", (void *)rxq);
		ret = ENOMEM;
		goto error;
	}
	/* For each WR (packet). */
	for (i = 0; (i != elts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct ibv_sge *sge = &(*elts)[i].sge;
		struct rte_mbuf *buf;

		if (pool != NULL) {
			buf = *(pool++);
			assert(buf != NULL);
			rte_pktmbuf_reset(buf);
		} else
			buf = rte_pktmbuf_alloc(rxq->mp);
		if (buf == NULL) {
			assert(pool == NULL);
			ERROR("%p: empty mbuf pool", (void *)rxq);
			ret = ENOMEM;
			goto error;
		}
		elt->buf = buf;
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* sge->addr must be able to store a pointer. */
		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
		/* SGE keeps its headroom. */
		sge->addr = (uintptr_t)
			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
		sge->lkey = rxq->mr->lkey;
		/* Redundant check for tailroom. */
		assert(sge->length == rte_pktmbuf_tailroom(buf));
	}
	DEBUG("%p: allocated and configured %u single-segment WRs",
	      (void *)rxq, elts_n);
	rxq->elts_n = elts_n;
	rxq->elts.no_sp = elts;
	assert(ret == 0);
	return 0;
error:
	if (elts != NULL) {
		assert(pool == NULL);
		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			if (buf != NULL)
				rte_pktmbuf_free_seg(buf);
		}
		rte_free(elts);
	}
	DEBUG("%p: failed, freed everything", (void *)rxq);
	assert(ret > 0);
	return ret;
}

/**
 * Free RX queue elements.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq *rxq)
{
	unsigned int i;
	unsigned int elts_n = rxq->elts_n;
	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;

	DEBUG("%p: freeing WRs", (void *)rxq);
	rxq->elts_n = 0;
	rxq->elts.no_sp = NULL;
	if (elts == NULL)
		return;
	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
		struct rxq_elt *elt = &(*elts)[i];
		struct rte_mbuf *buf = elt->buf;

		if (buf != NULL)
			rte_pktmbuf_free_seg(buf);
	}
	rte_free(elts);
}

/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq *rxq)
{
	struct ibv_exp_release_intf_params params;

	DEBUG("cleaning up %p", (void *)rxq);
	if (rxq->sp)
		rxq_free_elts_sp(rxq);
	else
		rxq_free_elts(rxq);
	if (rxq->if_wq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->wq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_wq,
						&params));
	}
	if (rxq->if_cq != NULL) {
		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		assert(rxq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
						rxq->if_cq,
						&params));
	}
	if (rxq->wq != NULL)
		claim_zero(ibv_exp_destroy_wq(rxq->wq));
	if (rxq->cq != NULL)
		claim_zero(ibv_destroy_cq(rxq->cq));
	if (rxq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(rxq->priv != NULL);
		assert(rxq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
						      rxq->rd,
						      &attr));
	}
	if (rxq->mr != NULL)
		claim_zero(ibv_dereg_mr(rxq->mr));
	memset(rxq, 0, sizeof(*rxq));
}

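/*
 * Note: teardown happens essentially in reverse creation order (see
 * rxq_setup()): burst interfaces are released first, then the WQ and CQ
 * are destroyed, then the resource domain, and the memory region last.
 */
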
/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_wq_attr mod;
	int err;

	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
	mbuf_n = desc_n;
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX5_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize WQ. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RESET,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOBUFS;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = elt->buf;

			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	err = ((tmpl.sp) ?
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
	if (err) {
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
		rte_free(pool);
		assert(err > 0);
		return err;
	}
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	err = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (err) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(err));
		goto error;
	}
	/* Post SGEs. */
	assert(tmpl.if_wq != NULL);
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (err)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			err = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (err)
				break;
		}
	}
	if (err) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, err);
		/* Set err because it does not contain a valid errno value. */
		err = EIO;
		goto error;
	}
error:
	*rxq = tmpl;
	assert(err >= 0);
	return err;
}

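/*
 * For illustration: rxq_rehash() recycles the mbufs already posted on the
 * queue (the "snatch" step above) instead of allocating new ones, which is
 * what makes it safe to call from a control thread, as noted in the
 * function comment.
 */
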
/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket
	};
	struct ibv_exp_wq_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
		struct ibv_exp_wq_init_attr wq;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct rte_mbuf *buf;
	int ret = 0;
	unsigned int i;
	unsigned int cq_size = desc;

	(void)conf; /* Thresholds configuration (ignored). */
	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Get mbuf length. */
	buf = rte_pktmbuf_alloc(mp);
	if (buf == NULL) {
		ERROR("%p: unable to allocate mbuf", (void *)dev);
		return ENOMEM;
	}
	tmpl.mb_len = buf->buf_len;
	assert((rte_pktmbuf_headroom(buf) +
		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
	rte_pktmbuf_free(buf);
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX5_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = ibv_reg_mr(priv->pd,
			     (void *)mp->elt_va_start,
			     (mp->elt_va_end - mp->elt_va_start),
			     (IBV_ACCESS_LOCAL_WRITE |
			      IBV_ACCESS_REMOTE_WRITE));
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
				    &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
	attr.wq = (struct ibv_exp_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_EXP_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
				priv->device_attr.max_qp_wr :
				(int)cq_size),
		/* Max number of scatter/gather elements in a WR. */
		.max_recv_sge = ((priv->device_attr.max_sge <
				  MLX5_PMD_SGE_WR_N) ?
				 priv->device_attr.max_sge :
				 MLX5_PMD_SGE_WR_N),
		.pd = priv->pd,
		.cq = tmpl.cq,
		.comp_mask = IBV_EXP_CREATE_WQ_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
	if (tmpl.wq == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: WQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_WQ,
		.obj = tmpl.wq,
	};
	tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_wq == NULL) {
		ERROR("%p: WQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_exp_wq_attr){
		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
		.wq_state = IBV_EXP_WQS_RDY,
	};
	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Post SGEs. */
	if (tmpl.sp) {
		struct rxq_elt_sp (*elts)[tmpl.elts_n] = tmpl.elts.sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_sg_list
				(tmpl.wq,
				 (*elts)[i].sges,
				 RTE_DIM((*elts)[i].sges));
			if (ret)
				break;
		}
	} else {
		struct rxq_elt (*elts)[tmpl.elts_n] = tmpl.elts.no_sp;

		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
			ret = tmpl.if_wq->recv_burst(
				tmpl.wq,
				&(*elts)[i].sge,
				1);
			if (ret)
				break;
		}
	}
	if (ret) {
		ERROR("%p: failed to post SGEs with error %d",
		      (void *)dev, ret);
		/* Set ret because it does not contain a valid errno value. */
		ret = EIO;
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	rxq_cleanup(rxq);
	*rxq = tmpl;
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
	assert(ret == 0);
	return 0;
error:
	rxq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}

/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int ret;

	priv_lock(priv);
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx5_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx5_rx_burst;
	}
	priv_unlock(priv);
	return -ret;
}

/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (rxq == NULL)
		return;
	priv = rxq->priv;
	priv_lock(priv);
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq);
	rte_free(rxq);
	priv_unlock(priv);
}