/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stddef.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/queue.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_debug.h>
#include <rte_io.h>

#include "mlx5.h"
#include "mlx5_rxtx.h"
#include "mlx5_utils.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
	[HASH_RXQ_TCPV4] = {
		.hash_fields = (IBV_RX_HASH_SRC_IPV4 |
				IBV_RX_HASH_DST_IPV4 |
				IBV_RX_HASH_SRC_PORT_TCP |
				IBV_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_UDPV4] = {
		.hash_fields = (IBV_RX_HASH_SRC_IPV4 |
				IBV_RX_HASH_DST_IPV4 |
				IBV_RX_HASH_SRC_PORT_UDP |
				IBV_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
	},
	[HASH_RXQ_IPV4] = {
		.hash_fields = (IBV_RX_HASH_SRC_IPV4 |
				IBV_RX_HASH_DST_IPV4),
		.dpdk_rss_hf = (ETH_RSS_IPV4 |
				ETH_RSS_FRAG_IPV4),
		.flow_priority = 1,
		.flow_spec.ipv4 = {
			.type = IBV_FLOW_SPEC_IPV4,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
	[HASH_RXQ_TCPV6] = {
		.hash_fields = (IBV_RX_HASH_SRC_IPV6 |
				IBV_RX_HASH_DST_IPV6 |
				IBV_RX_HASH_SRC_PORT_TCP |
				IBV_RX_HASH_DST_PORT_TCP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_TCP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_UDPV6] = {
		.hash_fields = (IBV_RX_HASH_SRC_IPV6 |
				IBV_RX_HASH_DST_IPV6 |
				IBV_RX_HASH_SRC_PORT_UDP |
				IBV_RX_HASH_DST_PORT_UDP),
		.dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
		.flow_priority = 0,
		.flow_spec.tcp_udp = {
			.type = IBV_FLOW_SPEC_UDP,
			.size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
	},
	[HASH_RXQ_IPV6] = {
		.hash_fields = (IBV_RX_HASH_SRC_IPV6 |
				IBV_RX_HASH_DST_IPV6),
		.dpdk_rss_hf = (ETH_RSS_IPV6 |
				ETH_RSS_FRAG_IPV6),
		.flow_priority = 1,
		.flow_spec.ipv6 = {
			.type = IBV_FLOW_SPEC_IPV6,
			.size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
		},
		.underlayer = &hash_rxq_init[HASH_RXQ_ETH],
	},
	[HASH_RXQ_ETH] = {
		.hash_fields = 0,
		.dpdk_rss_hf = 0,
		.flow_priority = 2,
		.flow_spec.eth = {
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(hash_rxq_init[0].flow_spec.eth),
		},
		.underlayer = NULL,
	},
};

/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
	{
		.max_size = -1u, /* Superseded by HW limitations. */
		.hash_types =
			1 << HASH_RXQ_TCPV4 |
			1 << HASH_RXQ_UDPV4 |
			1 << HASH_RXQ_IPV4 |
			1 << HASH_RXQ_TCPV6 |
			1 << HASH_RXQ_UDPV6 |
			1 << HASH_RXQ_IPV6 |
			0,
		.hash_types_n = 6,
	},
	{
		.max_size = 1,
		.hash_types = 1 << HASH_RXQ_ETH,
		.hash_types_n = 1,
	},
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
	0x2c, 0xc6, 0x81, 0xd1,
	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,
	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,
	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,
	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,
	0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still returned.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 * @param type
 *   Hash RX queue type to use for flow steering rule.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
priv_flow_attr(struct priv *priv, struct ibv_flow_attr *flow_attr,
	       size_t flow_attr_size, enum hash_rxq_type type)
{
	size_t offset = sizeof(*flow_attr);
	const struct hash_rxq_init *init = &hash_rxq_init[type];

	assert(priv != NULL);
	assert((size_t)type < RTE_DIM(hash_rxq_init));
	do {
		offset += init->flow_spec.hdr.size;
		init = init->underlayer;
	} while (init != NULL);
	if (offset > flow_attr_size)
		return offset;
	flow_attr_size = offset;
	init = &hash_rxq_init[type];
	*flow_attr = (struct ibv_flow_attr){
		.type = IBV_FLOW_ATTR_NORMAL,
		/* Priorities < 3 are reserved for flow director. */
		.priority = init->flow_priority + 3,
		.num_of_specs = 0,
		.port = priv->port,
		.flags = 0,
	};
	do {
		offset -= init->flow_spec.hdr.size;
		memcpy((void *)((uintptr_t)flow_attr + offset),
		       &init->flow_spec,
		       init->flow_spec.hdr.size);
		++flow_attr->num_of_specs;
		init = init->underlayer;
	} while (init != NULL);
	return flow_attr_size;
}
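
/*
 * Usage sketch (hypothetical helper, not part of the driver API): since
 * priv_flow_attr() returns the required size even when the buffer is too
 * small, callers can size the attribute in two passes. example_alloc_flow_attr
 * is an assumed name for illustration only.
 */
static __rte_unused struct ibv_flow_attr *
example_alloc_flow_attr(struct priv *priv, enum hash_rxq_type type)
{
	/* First pass: no buffer, only the required size is computed. */
	size_t size = priv_flow_attr(priv, NULL, 0, type);
	struct ibv_flow_attr *attr = rte_zmalloc(__func__, size, 0);

	if (attr == NULL)
		return NULL;
	/* Second pass: buffer is large enough, specifications are filled. */
	priv_flow_attr(priv, attr, size, type);
	return attr;
}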
/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 *
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
	enum hash_rxq_type type = HASH_RXQ_TCPV4;

	assert(pos < table->hash_types_n);
	do {
		if ((table->hash_types & (1 << type)) && (pos-- == 0))
			break;
		++type;
	} while (1);
	return type;
}
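
/*
 * Illustration (hypothetical debug helper): walk every position of an
 * indirection table initializer and log the hash RX queue type it maps to.
 */
static __rte_unused void
example_dump_hash_types(const struct ind_table_init *table)
{
	unsigned int pos;

	for (pos = 0; pos != table->hash_types_n; ++pos)
		DEBUG("position %u -> hash RX queue type %d",
		      pos, (int)hash_rxq_type_from_pos(table, pos));
}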
/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
			 struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
	uint64_t rss_hf;
	unsigned int i;
	unsigned int j;
	unsigned int table_n = 0;
	/* Mandatory to receive frames not handled by normal hash RX queues. */
	unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

	rss_hf = priv->rss_hf;
	/* Process other protocols only if more than one queue. */
	if (priv->rxqs_n > 1)
		for (i = 0; (i != hash_rxq_init_n); ++i)
			if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
				hash_types_sup |= (1 << i);
	/* Filter out entries whose protocols are not in the set. */
	for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
		unsigned int nb;
		unsigned int h;

		/* j is increased only if the table has valid protocols. */
		assert(j <= i);
		(*table)[j] = ind_table_init[i];
		(*table)[j].hash_types &= hash_types_sup;
		for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
			if (((*table)[j].hash_types >> h) & 0x1)
				++nb;
		/* Update entry j, the one just filled, not entry i. */
		(*table)[j].hash_types_n = nb;
		if (nb) {
			++table_n;
			++j;
		}
	}
	return table_n;
}
/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
	struct ibv_wq *wqs[priv->reta_idx_n];
	struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
	unsigned int ind_tables_n =
		priv_make_ind_table_init(priv, &ind_table_init);
	unsigned int hash_rxqs_n = 0;
	struct hash_rxq (*hash_rxqs)[] = NULL;
	struct ibv_rwq_ind_table *(*ind_tables)[] = NULL;
	unsigned int i;
	unsigned int j;
	unsigned int k;
	int err = 0;

	assert(priv->ind_tables == NULL);
	assert(priv->ind_tables_n == 0);
	assert(priv->hash_rxqs == NULL);
	assert(priv->hash_rxqs_n == 0);
	assert(priv->pd != NULL);
	assert(priv->ctx != NULL);
	if (priv->rxqs_n == 0)
		return EINVAL;
	assert(priv->rxqs != NULL);
	if (ind_tables_n == 0) {
		ERROR("all hash RX queue types have been filtered out,"
		      " indirection table cannot be created");
		return EINVAL;
	}
	if (priv->rxqs_n & (priv->rxqs_n - 1)) {
		INFO("%u RX queues are configured, consider rounding this"
		     " number to the next power of two for better balancing",
		     priv->rxqs_n);
		DEBUG("indirection table extended to assume %u WQs",
		      priv->reta_idx_n);
	}
	for (i = 0; (i != priv->reta_idx_n); ++i) {
		struct mlx5_rxq_ctrl *rxq_ctrl;

		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
					struct mlx5_rxq_ctrl, rxq);
		wqs[i] = rxq_ctrl->ibv->wq;
	}
	/* Get number of hash RX queues to configure. */
	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
		hash_rxqs_n += ind_table_init[i].hash_types_n;
	DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
	      hash_rxqs_n, priv->rxqs_n, ind_tables_n);
	/* Create indirection tables. */
	ind_tables = rte_calloc(__func__, ind_tables_n,
				sizeof((*ind_tables)[0]), 0);
	if (ind_tables == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate indirection tables container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0; (i != ind_tables_n); ++i) {
		struct ibv_rwq_ind_table_init_attr ind_init_attr = {
			.log_ind_tbl_size = 0, /* Set below. */
			.ind_tbl = wqs,
			.comp_mask = 0,
		};
		unsigned int ind_tbl_size = ind_table_init[i].max_size;
		struct ibv_rwq_ind_table *ind_table;

		if (priv->reta_idx_n < ind_tbl_size)
			ind_tbl_size = priv->reta_idx_n;
		ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
		errno = 0;
		ind_table = ibv_create_rwq_ind_table(priv->ctx,
						     &ind_init_attr);
		if (ind_table != NULL) {
			(*ind_tables)[i] = ind_table;
			continue;
		}
		/* Not clear whether errno is set. */
		err = (errno ? errno : EINVAL);
		ERROR("RX indirection table creation failed with error %d: %s",
		      err, strerror(err));
		goto error;
	}
	/* Allocate array that holds hash RX queues and related data. */
	hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
			       sizeof((*hash_rxqs)[0]), 0);
	if (hash_rxqs == NULL) {
		err = ENOMEM;
		ERROR("cannot allocate hash RX queues container: %s",
		      strerror(err));
		goto error;
	}
	for (i = 0, j = 0, k = 0;
	     ((i != hash_rxqs_n) && (j != ind_tables_n));
	     ++i) {
		struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
		enum hash_rxq_type type =
			hash_rxq_type_from_pos(&ind_table_init[j], k);
		struct rte_eth_rss_conf *priv_rss_conf =
			(*priv->rss_conf)[type];
		struct ibv_rx_hash_conf hash_conf = {
			.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len = (priv_rss_conf ?
					    priv_rss_conf->rss_key_len :
					    rss_hash_default_key_len),
			.rx_hash_key = (priv_rss_conf ?
					priv_rss_conf->rss_key :
					rss_hash_default_key),
			.rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
		};
		struct ibv_qp_init_attr_ex qp_init_attr = {
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask = (IBV_QP_INIT_ATTR_PD |
				      IBV_QP_INIT_ATTR_IND_TABLE |
				      IBV_QP_INIT_ATTR_RX_HASH),
			.rx_hash_conf = hash_conf,
			.rwq_ind_tbl = (*ind_tables)[j],
			.pd = priv->pd,
		};

		DEBUG("using indirection table %u for hash RX queue %u type %d",
		      j, i, type);
		*hash_rxq = (struct hash_rxq){
			.priv = priv,
			.qp = ibv_create_qp_ex(priv->ctx, &qp_init_attr),
			.type = type,
		};
		if (hash_rxq->qp == NULL) {
			err = (errno ? errno : EINVAL);
			ERROR("Hash RX QP creation failure: %s",
			      strerror(err));
			goto error;
		}
		if (++k < ind_table_init[j].hash_types_n)
			continue;
		/* Switch to the next indirection table and reset hash RX
		 * queue type array index. */
		++j;
		k = 0;
	}
	priv->ind_tables = ind_tables;
	priv->ind_tables_n = ind_tables_n;
	priv->hash_rxqs = hash_rxqs;
	priv->hash_rxqs_n = hash_rxqs_n;
	return 0;
error:
	if (hash_rxqs != NULL) {
		for (i = 0; (i != hash_rxqs_n); ++i) {
			struct ibv_qp *qp = (*hash_rxqs)[i].qp;

			if (qp == NULL)
				continue;
			claim_zero(ibv_destroy_qp(qp));
		}
		rte_free(hash_rxqs);
	}
	if (ind_tables != NULL) {
		for (j = 0; (j != ind_tables_n); ++j) {
			struct ibv_rwq_ind_table *ind_table =
				(*ind_tables)[j];

			if (ind_table == NULL)
				continue;
			claim_zero(ibv_destroy_rwq_ind_table(ind_table));
		}
		rte_free(ind_tables);
	}
	return err;
}
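
/*
 * Start-time sketch (hypothetical helper): hash RX queues are expected to
 * exist before flows are (re)applied; the actual sequence lives in the
 * device start path, this only illustrates the dependency.
 */
static __rte_unused int
example_start_rss(struct priv *priv)
{
	int err = priv_create_hash_rxqs(priv);

	if (err)
		return err;
	return priv_rehash_flows(priv);
}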
/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
	unsigned int i;

	DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
	if (priv->hash_rxqs_n == 0) {
		assert(priv->hash_rxqs == NULL);
		assert(priv->ind_tables == NULL);
		return;
	}
	for (i = 0; (i != priv->hash_rxqs_n); ++i) {
		struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
		unsigned int j, k;

		assert(hash_rxq->priv == priv);
		assert(hash_rxq->qp != NULL);
		/* Also check that there are no remaining flows. */
		for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
			for (k = 0;
			     (k != RTE_DIM(hash_rxq->special_flow[j]));
			     ++k)
				assert(hash_rxq->special_flow[j][k] == NULL);
		for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
			for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
				assert(hash_rxq->mac_flow[j][k] == NULL);
		claim_zero(ibv_destroy_qp(hash_rxq->qp));
	}
	priv->hash_rxqs_n = 0;
	rte_free(priv->hash_rxqs);
	priv->hash_rxqs = NULL;
	for (i = 0; (i != priv->ind_tables_n); ++i) {
		struct ibv_rwq_ind_table *ind_table =
			(*priv->ind_tables)[i];

		assert(ind_table != NULL);
		claim_zero(ibv_destroy_rwq_ind_table(ind_table));
	}
	priv->ind_tables_n = 0;
	rte_free(priv->ind_tables);
	priv->ind_tables = NULL;
}
/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
	/* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
	 * has been requested. */
	if (priv->promisc_req)
		return type == HASH_RXQ_FLOW_TYPE_PROMISC;
	switch (type) {
	case HASH_RXQ_FLOW_TYPE_PROMISC:
		return !!priv->promisc_req;
	case HASH_RXQ_FLOW_TYPE_ALLMULTI:
		return !!priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_BROADCAST:
	case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
		/* If allmulti is enabled, broadcast and ipv6multi
		 * are unnecessary. */
		return !priv->allmulti_req;
	case HASH_RXQ_FLOW_TYPE_MAC:
		return 1;
	default:
		/* Unsupported flow type is not allowed. */
		return 0;
	}
	return 0;
}
/**
 * Automatically enable/disable flows according to configuration.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_rehash_flows(struct priv *priv)
{
	size_t i;

	for (i = 0; i != RTE_DIM((*priv->hash_rxqs)[0].special_flow); ++i)
		if (!priv_allow_flow_type(priv, i)) {
			priv_special_flow_disable(priv, i);
		} else {
			int ret = priv_special_flow_enable(priv, i);

			if (ret)
				return ret;
		}
	if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
		return priv_mac_addrs_enable(priv);
	priv_mac_addrs_disable(priv);
	return 0;
}
/**
 * Allocate RX queue elements.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
{
	const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
	unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
	unsigned int i;
	int ret = 0;

	/* Iterate on segments. */
	for (i = 0; (i != elts_n); ++i) {
		struct rte_mbuf *buf;

		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
		if (buf == NULL) {
			ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
			ret = ENOMEM;
			goto error;
		}
		/* Headroom is reserved by rte_pktmbuf_alloc(). */
		assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
		/* Buffer is supposed to be empty. */
		assert(rte_pktmbuf_data_len(buf) == 0);
		assert(rte_pktmbuf_pkt_len(buf) == 0);
		/* Only the first segment keeps headroom. */
		if (i % sges_n)
			SET_DATA_OFF(buf, 0);
		PORT(buf) = rxq_ctrl->rxq.port_id;
		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
		PKT_LEN(buf) = DATA_LEN(buf);
		NB_SEGS(buf) = 1;
		(*rxq_ctrl->rxq.elts)[i] = buf;
	}
	/* If Rx vector is activated. */
	if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
		int j;

		/* Initialize default rearm_data for vPMD. */
		mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
		rte_mbuf_refcnt_set(mbuf_init, 1);
		mbuf_init->nb_segs = 1;
		mbuf_init->port = rxq->port_id;
		/*
		 * prevent compiler reordering:
		 * rearm_data covers previous fields.
		 */
		rte_compiler_barrier();
		rxq->mbuf_initializer =
			*(uint64_t *)&mbuf_init->rearm_data;
		/* Padding with a fake mbuf for vectorized Rx. */
		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
	}
	DEBUG("%p: allocated and configured %u segments (max %u packets)",
	      (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
	return 0;
error:
	elts_n = i;
	for (i = 0; (i != elts_n); ++i) {
		if ((*rxq_ctrl->rxq.elts)[i] != NULL)
			rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
		(*rxq_ctrl->rxq.elts)[i] = NULL;
	}
	DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
	assert(ret > 0);
	return ret;
}
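
/*
 * Worked example for the allocation above (hypothetical values): with
 * elts_n = 8 and sges_n = 2, the ring holds 256 segments forming
 * 256 / 4 = 64 packets; within each 4-segment group only the first mbuf
 * keeps its headroom, the others are rewound with SET_DATA_OFF(buf, 0)
 * so the hardware can scatter one packet contiguously across them.
 */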
/**
 * Free RX queue elements.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
{
	struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
	const uint16_t q_n = (1 << rxq->elts_n);
	const uint16_t q_mask = q_n - 1;
	uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
	uint16_t i;

	DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
	if (rxq->elts == NULL)
		return;
	/**
	 * Some mbufs in the ring still belong to the application;
	 * they cannot be freed.
	 */
	if (rxq_check_vec_support(rxq) > 0) {
		for (i = 0; i < used; ++i)
			(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
		rxq->rq_pi = rxq->rq_ci;
	}
	for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
		if ((*rxq->elts)[i] != NULL)
			rte_pktmbuf_free_seg((*rxq->elts)[i]);
		(*rxq->elts)[i] = NULL;
	}
}
/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 */
void
mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
{
	DEBUG("cleaning up %p", (void *)rxq_ctrl);
	if (rxq_ctrl->ibv)
		mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
}
/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
	struct mlx5_rxq_ctrl *rxq_ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	int ret = 0;

	(void)conf;
	if (mlx5_is_secondary())
		return -E_RTE_SECONDARY;
	priv_lock(priv);
	if (!rte_is_power_of_2(desc)) {
		desc = 1 << log2above(desc);
		WARN("%p: increased number of descriptors in RX queue %u"
		     " to the next power of two (%d)",
		     (void *)dev, idx, desc);
	}
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (!mlx5_priv_rxq_releasable(priv, idx)) {
		ret = EBUSY;
		ERROR("%p: unable to release queue index %u",
		      (void *)dev, idx);
		goto out;
	}
	mlx5_priv_rxq_release(priv, idx);
	rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, mp);
	if (!rxq_ctrl) {
		ERROR("%p: unable to allocate queue index %u",
		      (void *)dev, idx);
		ret = ENOMEM;
		goto out;
	}
	DEBUG("%p: adding RX queue %p to list",
	      (void *)dev, (void *)rxq_ctrl);
	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
out:
	priv_unlock(priv);
	return -ret;
}
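
/*
 * Application-side sketch (hypothetical values): this callback is reached
 * through the generic ethdev API, e.g.:
 *
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				     NULL, mbuf_pool);
 *
 * A descriptor count that is not a power of two would be rounded up with
 * the warning above.
 */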
/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
	struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
	struct mlx5_rxq_ctrl *rxq_ctrl;
	struct priv *priv;

	if (mlx5_is_secondary())
		return;
	if (rxq == NULL)
		return;
	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	priv = rxq_ctrl->priv;
	priv_lock(priv);
	if (!mlx5_priv_rxq_releasable(priv, rxq_ctrl->rxq.stats.idx))
		rte_panic("Rx queue %p is still used by a flow and cannot be"
			  " removed\n", (void *)rxq_ctrl);
	mlx5_priv_rxq_release(priv, rxq_ctrl->rxq.stats.idx);
	priv_unlock(priv);
}
/**
 * Allocate queue vector and fill epoll fd list for Rx interrupts.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
priv_rx_intr_vec_enable(struct priv *priv)
{
	unsigned int i;
	unsigned int rxqs_n = priv->rxqs_n;
	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
	unsigned int count = 0;
	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;

	assert(!mlx5_is_secondary());
	if (!priv->dev->data->dev_conf.intr_conf.rxq)
		return 0;
	priv_rx_intr_vec_disable(priv);
	/* Allocate one entry per queue, not the size of a single entry. */
	intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
	if (intr_handle->intr_vec == NULL) {
		ERROR("failed to allocate memory for interrupt vector,"
		      " Rx interrupts will not be supported");
		return -ENOMEM;
	}
	intr_handle->type = RTE_INTR_HANDLE_EXT;
	for (i = 0; i != n; ++i) {
		/* This rxq ibv must not be released in this function. */
		struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, i);
		int fd;
		int flags;
		int rc;

		/* Skip queues that cannot request interrupts. */
		if (!rxq_ibv || !rxq_ibv->channel) {
			/* Use invalid intr_vec[] index to disable entry. */
			intr_handle->intr_vec[i] =
				RTE_INTR_VEC_RXTX_OFFSET +
				RTE_MAX_RXTX_INTR_VEC_ID;
			continue;
		}
		if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
			ERROR("too many Rx queues for interrupt vector size"
			      " (%d), Rx interrupts cannot be enabled",
			      RTE_MAX_RXTX_INTR_VEC_ID);
			priv_rx_intr_vec_disable(priv);
			return -1;
		}
		fd = rxq_ibv->channel->fd;
		flags = fcntl(fd, F_GETFL);
		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
		if (rc < 0) {
			ERROR("failed to make Rx interrupt file descriptor"
			      " %d non-blocking for queue index %d", fd, i);
			priv_rx_intr_vec_disable(priv);
			return -1;
		}
		intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
		intr_handle->efds[count] = fd;
		count++;
	}
	if (!count)
		priv_rx_intr_vec_disable(priv);
	else
		intr_handle->nb_efd = count;
	return 0;
}
/**
 * Clean up Rx interrupts handler.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_rx_intr_vec_disable(struct priv *priv)
{
	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
	unsigned int i;
	unsigned int rxqs_n = priv->rxqs_n;
	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);

	if (!priv->dev->data->dev_conf.intr_conf.rxq)
		return;
	for (i = 0; i != n; ++i) {
		struct mlx5_rxq_ctrl *rxq_ctrl;
		struct mlx5_rxq_data *rxq_data;

		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
		    RTE_MAX_RXTX_INTR_VEC_ID)
			continue;
		/**
		 * Need to access directly the queue to release the reference
		 * kept in priv_rx_intr_vec_enable().
		 */
		rxq_data = (*priv->rxqs)[i];
		rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
		mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv);
	}
	rte_intr_free_epoll_fd(intr_handle);
	free(intr_handle->intr_vec);
	intr_handle->nb_efd = 0;
	intr_handle->intr_vec = NULL;
}
/**
 * MLX5 CQ notification.
 *
 * @param rxq
 *   Pointer to receive queue structure.
 * @param sq_n_rxq
 *   Sequence number per receive queue.
 */
static inline void
mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
{
	int sq_n = 0;
	uint32_t doorbell_hi;
	uint64_t doorbell;
	void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;

	sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
	doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
	doorbell = (uint64_t)doorbell_hi << 32;
	doorbell |= rxq->cqn;
	rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
	rte_wmb();
	rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg);
}
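
/*
 * Sketch of the arm/wait/ack cycle this doorbell belongs to (hypothetical
 * caller, assuming the queue was created with a completion channel; the
 * driver's own versions are mlx5_rx_intr_enable()/mlx5_rx_intr_disable()
 * below).
 */
static __rte_unused int
example_wait_rx_event(struct mlx5_rxq_data *rxq, struct mlx5_rxq_ibv *rxq_ibv)
{
	struct ibv_cq *ev_cq;
	void *ev_ctx;

	/* Arm the CQ with the current sequence number. */
	mlx5_arm_cq(rxq, rxq->cq_arm_sn);
	/* Block on the channel until an event fires, then validate it. */
	if (ibv_get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx) ||
	    ev_cq != rxq_ibv->cq)
		return -1;
	rxq->cq_arm_sn++;
	ibv_ack_cq_events(rxq_ibv->cq, 1);
	return 0;
}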
/**
 * DPDK callback for Rx queue interrupt enable.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rx_queue_id
 *   Rx queue number.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct priv *priv = mlx5_get_priv(dev);
	struct mlx5_rxq_data *rxq_data;
	struct mlx5_rxq_ctrl *rxq_ctrl;
	int ret = 0;

	priv_lock(priv);
	rxq_data = (*priv->rxqs)[rx_queue_id];
	if (!rxq_data) {
		ret = EINVAL;
		goto exit;
	}
	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
	if (rxq_ctrl->irq) {
		struct mlx5_rxq_ibv *rxq_ibv;

		rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
		if (!rxq_ibv) {
			ret = EINVAL;
			goto exit;
		}
		mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
		mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
	}
exit:
	priv_unlock(priv);
	if (ret)
		WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
	return -ret;
}
/**
 * DPDK callback for Rx queue interrupt disable.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rx_queue_id
 *   Rx queue number.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
	struct priv *priv = mlx5_get_priv(dev);
	struct mlx5_rxq_data *rxq_data;
	struct mlx5_rxq_ctrl *rxq_ctrl;
	struct mlx5_rxq_ibv *rxq_ibv = NULL;
	struct ibv_cq *ev_cq;
	void *ev_ctx;
	int ret = 0;

	priv_lock(priv);
	rxq_data = (*priv->rxqs)[rx_queue_id];
	if (!rxq_data) {
		ret = EINVAL;
		goto exit;
	}
	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
	if (!rxq_ctrl->irq)
		goto exit;
	rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
	if (!rxq_ibv) {
		ret = EINVAL;
		goto exit;
	}
	ret = ibv_get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx);
	if (ret || ev_cq != rxq_ibv->cq) {
		ret = EINVAL;
		goto exit;
	}
	rxq_data->cq_arm_sn++;
	ibv_ack_cq_events(rxq_ibv->cq, 1);
exit:
	if (rxq_ibv)
		mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
	priv_unlock(priv);
	if (ret)
		WARN("unable to disable interrupt on rx queue %d",
		     rx_queue_id);
	return -ret;
}
/**
 * Create the Rx queue Verbs object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   Queue index in DPDK Rx queue array.
 *
 * @return
 *   The Verbs object initialised if it can be created.
 */
struct mlx5_rxq_ibv*
mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx)
{
	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
	struct mlx5_rxq_ctrl *rxq_ctrl =
		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
	struct ibv_wq_attr mod;
	union {
		struct ibv_cq_init_attr_ex cq;
		struct ibv_wq_init_attr wq;
		struct ibv_cq_ex cq_attr;
	} attr;
	unsigned int cqe_n = (1 << rxq_data->elts_n) - 1;
	struct mlx5_rxq_ibv *tmpl;
	struct mlx5dv_cq cq_info;
	struct mlx5dv_rwq rwq;
	unsigned int i;
	int ret = 0;
	struct mlx5dv_obj obj;

	assert(rxq_data);
	assert(!rxq_ctrl->ibv);
	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
				 rxq_ctrl->socket);
	if (!tmpl) {
		ERROR("%p: cannot allocate verbs resources",
		      (void *)rxq_ctrl);
		/* Nothing to clean up yet. */
		return NULL;
	}
	tmpl->rxq_ctrl = rxq_ctrl;
	/* Use the entire RX mempool as the memory region. */
	tmpl->mr = priv_mr_get(priv, rxq_data->mp);
	if (!tmpl->mr) {
		tmpl->mr = priv_mr_new(priv, rxq_data->mp);
		if (!tmpl->mr) {
			ERROR("%p: MR creation failure", (void *)rxq_ctrl);
			goto error;
		}
	}
	if (rxq_ctrl->irq) {
		tmpl->channel = ibv_create_comp_channel(priv->ctx);
		if (!tmpl->channel) {
			ERROR("%p: Comp Channel creation failure",
			      (void *)rxq_ctrl);
			goto error;
		}
	}
	attr.cq = (struct ibv_cq_init_attr_ex){
		.comp_mask = 0,
	};
	if (priv->cqe_comp) {
		attr.cq.comp_mask |= IBV_CQ_INIT_ATTR_MASK_FLAGS;
		attr.cq.flags |= MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
		/*
		 * For vectorized Rx, it must not be doubled in order to
		 * make cq_ci and rq_ci aligned.
		 */
		if (rxq_check_vec_support(rxq_data) < 0)
			cqe_n *= 2;
	}
	tmpl->cq = ibv_create_cq(priv->ctx, cqe_n, NULL, tmpl->channel, 0);
	if (tmpl->cq == NULL) {
		ERROR("%p: CQ creation failure", (void *)rxq_ctrl);
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.orig_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.orig_attr.max_sge);
	attr.wq = (struct ibv_wq_init_attr){
		.wq_context = NULL, /* Could be useful in the future. */
		.wq_type = IBV_WQT_RQ,
		/* Max number of outstanding WRs. */
		.max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n,
		/* Max number of scatter/gather elements in a WR. */
		.max_sge = 1 << rxq_data->sges_n,
		.pd = priv->pd,
		.cq = tmpl->cq,
		.comp_mask =
			IBV_WQ_FLAGS_CVLAN_STRIPPING |
			0,
		.create_flags = (rxq_data->vlan_strip ?
				 IBV_WQ_FLAGS_CVLAN_STRIPPING :
				 0),
	};
	/* By default, FCS (CRC) is stripped by hardware. */
	if (rxq_data->crc_present) {
		attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
	}
#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
	if (priv->hw_padding) {
		attr.wq.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
		attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
	}
#endif
	tmpl->wq = ibv_create_wq(priv->ctx, &attr.wq);
	if (tmpl->wq == NULL) {
		ERROR("%p: WQ creation failure", (void *)rxq_ctrl);
		goto error;
	}
	/*
	 * Make sure number of WRs*SGEs match expectations since a queue
	 * cannot allocate more than "desc" buffers.
	 */
	if (((int)attr.wq.max_wr !=
	     ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) ||
	    ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) {
		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
		      (void *)rxq_ctrl,
		      ((1 << rxq_data->elts_n) >> rxq_data->sges_n),
		      (1 << rxq_data->sges_n),
		      attr.wq.max_wr, attr.wq.max_sge);
		goto error;
	}
	/* Change queue state to ready. */
	mod = (struct ibv_wq_attr){
		.attr_mask = IBV_WQ_ATTR_STATE,
		.wq_state = IBV_WQS_RDY,
	};
	ret = ibv_modify_wq(tmpl->wq, &mod);
	if (ret) {
		ERROR("%p: WQ state to IBV_WQS_RDY failed",
		      (void *)rxq_ctrl);
		goto error;
	}
	obj.cq.in = tmpl->cq;
	obj.cq.out = &cq_info;
	obj.rwq.in = tmpl->wq;
	obj.rwq.out = &rwq;
	ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ);
	if (ret != 0)
		goto error;
	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
		goto error;
	}
	/* Fill the rings. */
	rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
		(uintptr_t)rwq.buf;
	for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) {
		struct rte_mbuf *buf = (*rxq_data->elts)[i];
		volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i];

		/* scat->addr must be able to store a pointer. */
		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
		*scat = (struct mlx5_wqe_data_seg){
			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
								  uintptr_t)),
			.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
			.lkey = tmpl->mr->lkey,
		};
	}
	rxq_data->rq_db = rwq.dbrec;
	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
	rxq_data->cq_ci = 0;
	rxq_data->rq_ci = 0;
	rxq_data->rq_pi = 0;
	rxq_data->zip = (struct rxq_zip){
		.ai = 0,
	};
	rxq_data->cq_db = cq_info.dbrec;
	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
	/* Update doorbell counter. */
	rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n;
	rte_wmb();
	*rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)tmpl);
	rte_atomic32_inc(&tmpl->refcnt);
	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
	LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
	return tmpl;
error:
	if (tmpl->wq)
		claim_zero(ibv_destroy_wq(tmpl->wq));
	if (tmpl->cq)
		claim_zero(ibv_destroy_cq(tmpl->cq));
	if (tmpl->channel)
		claim_zero(ibv_destroy_comp_channel(tmpl->channel));
	if (tmpl->mr)
		priv_mr_release(priv, tmpl->mr);
	return NULL;
}
/**
 * Get an Rx queue Verbs object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   Queue index in DPDK Rx queue array.
 *
 * @return
 *   The Verbs object if it exists.
 */
struct mlx5_rxq_ibv*
mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx)
{
	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
	struct mlx5_rxq_ctrl *rxq_ctrl;

	if (idx >= priv->rxqs_n)
		return NULL;
	if (!rxq_data)
		return NULL;
	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
	if (rxq_ctrl->ibv) {
		priv_mr_get(priv, rxq_data->mp);
		rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
		DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
		      (void *)rxq_ctrl->ibv,
		      rte_atomic32_read(&rxq_ctrl->ibv->refcnt));
	}
	return rxq_ctrl->ibv;
}
/**
 * Release an Rx verbs queue object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param rxq_ibv
 *   Verbs Rx queue object.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
{
	int ret;

	assert(rxq_ibv);
	assert(rxq_ibv->wq);
	assert(rxq_ibv->cq);
	assert(rxq_ibv->mr);
	ret = priv_mr_release(priv, rxq_ibv->mr);
	if (!ret)
		rxq_ibv->mr = NULL;
	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
	      (void *)rxq_ibv, rte_atomic32_read(&rxq_ibv->refcnt));
	if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) {
		rxq_free_elts(rxq_ibv->rxq_ctrl);
		claim_zero(ibv_destroy_wq(rxq_ibv->wq));
		claim_zero(ibv_destroy_cq(rxq_ibv->cq));
		if (rxq_ibv->channel)
			claim_zero(ibv_destroy_comp_channel(rxq_ibv->channel));
		LIST_REMOVE(rxq_ibv, next);
		rte_free(rxq_ibv);
		return 0;
	}
	return EBUSY;
}
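
/*
 * Refcounting sketch (hypothetical helper): every successful _get() must be
 * balanced by a _release(); the Verbs resources are destroyed only when the
 * last reference is dropped.
 */
static __rte_unused void
example_peek_rxq_ibv(struct priv *priv, uint16_t idx)
{
	struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, idx);

	if (!rxq_ibv)
		return;
	/* ... inspect rxq_ibv->cq or rxq_ibv->wq here ... */
	mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
}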
/**
 * Verify the Verbs Rx queue list is empty.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   The number of objects not released.
 */
int
mlx5_priv_rxq_ibv_verify(struct priv *priv)
{
	int ret = 0;
	struct mlx5_rxq_ibv *rxq_ibv;

	LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
		DEBUG("%p: Verbs Rx queue %p still referenced", (void *)priv,
		      (void *)rxq_ibv);
		++ret;
	}
	return ret;
}
/**
 * Return true if a single reference exists on the object.
 *
 * @param priv
 *   Pointer to private structure.
 * @param rxq_ibv
 *   Verbs Rx queue object.
 */
int
mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
{
	(void)priv;
	assert(rxq_ibv);
	return (rte_atomic32_read(&rxq_ibv->refcnt) == 1);
}
/**
 * Create a DPDK Rx queue.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   A DPDK queue object on success.
 */
struct mlx5_rxq_ctrl*
mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc,
		  unsigned int socket, struct rte_mempool *mp)
{
	struct rte_eth_dev *dev = priv->dev;
	struct mlx5_rxq_ctrl *tmpl;
	const uint16_t desc_n =
		desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);

	tmpl = rte_calloc_socket("RXQ", 1,
				 sizeof(*tmpl) +
				 desc_n * sizeof(struct rte_mbuf *),
				 0, socket);
	if (!tmpl)
		return NULL;
	tmpl->socket = socket;
	if (priv->dev->data->dev_conf.intr_conf.rxq)
		tmpl->irq = 1;
	/* Enable scattered packets support for this queue if necessary. */
	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
		tmpl->rxq.sges_n = 0;
	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
		unsigned int size =
			RTE_PKTMBUF_HEADROOM +
			dev->data->dev_conf.rxmode.max_rx_pkt_len;
		unsigned int sges_n;

		/*
		 * Determine the number of SGEs needed for a full packet
		 * and round it to the next power of two.
		 */
		sges_n = log2above((size / mb_len) + !!(size % mb_len));
		tmpl->rxq.sges_n = sges_n;
		/* Make sure rxq.sges_n did not overflow. */
		size = mb_len * (1 << tmpl->rxq.sges_n);
		size -= RTE_PKTMBUF_HEADROOM;
		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
			ERROR("%p: too many SGEs (%u) needed to handle"
			      " requested maximum packet size %u",
			      (void *)dev,
			      1 << sges_n,
			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
			goto error;
		}
	} else {
		WARN("%p: the requested maximum Rx packet size (%u) is"
		     " larger than a single mbuf (%u) and scattered"
		     " mode has not been requested",
		     (void *)dev,
		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
		     mb_len - RTE_PKTMBUF_HEADROOM);
	}
	DEBUG("%p: maximum number of segments per packet: %u",
	      (void *)dev, 1 << tmpl->rxq.sges_n);
	if (desc % (1 << tmpl->rxq.sges_n)) {
		ERROR("%p: number of RX queue descriptors (%u) is not a"
		      " multiple of SGEs per packet (%u)",
		      (void *)dev,
		      desc,
		      1 << tmpl->rxq.sges_n);
		goto error;
	}
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl->rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl->rxq.csum_l2tun =
			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Configure VLAN stripping. */
	tmpl->rxq.vlan_strip = (priv->hw_vlan_strip &&
				!!dev->data->dev_conf.rxmode.hw_vlan_strip);
	/* By default, FCS (CRC) is stripped by hardware. */
	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
		tmpl->rxq.crc_present = 0;
	} else if (priv->hw_fcs_strip) {
		tmpl->rxq.crc_present = 1;
	} else {
		WARN("%p: CRC stripping has been disabled but will still"
		     " be performed by hardware, make sure MLNX_OFED and"
		     " firmware are up to date",
		     (void *)dev);
		tmpl->rxq.crc_present = 0;
	}
	DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
	      " incoming frames to hide it",
	      (void *)dev,
	      tmpl->rxq.crc_present ? "disabled" : "enabled",
	      tmpl->rxq.crc_present << 2);
	/* Save port ID. */
	tmpl->rxq.rss_hash = priv->rxqs_n > 1;
	tmpl->rxq.port_id = dev->data->port_id;
	tmpl->priv = priv;
	tmpl->rxq.mp = mp;
	tmpl->rxq.stats.idx = idx;
	tmpl->rxq.elts_n = log2above(desc);
	tmpl->rxq.elts =
		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
	rte_atomic32_inc(&tmpl->refcnt);
	DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
	      (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
	LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
	return tmpl;
error:
	rte_free(tmpl);
	return NULL;
}
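
/*
 * Worked example for the scattered Rx sizing above (hypothetical values):
 * with mb_len = 2048, RTE_PKTMBUF_HEADROOM = 128 and
 * max_rx_pkt_len = 9000, size = 9128 and
 * sges_n = log2above(9128 / 2048 + 1) = log2above(5) = 3, i.e. 8 SGEs per
 * packet; desc must then be a multiple of 8.
 */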
/**
 * Get a Rx queue.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   RX queue index.
 *
 * @return
 *   A pointer to the queue if it exists.
 */
struct mlx5_rxq_ctrl*
mlx5_priv_rxq_get(struct priv *priv, uint16_t idx)
{
	struct mlx5_rxq_ctrl *rxq_ctrl = NULL;

	if ((*priv->rxqs)[idx]) {
		rxq_ctrl = container_of((*priv->rxqs)[idx],
					struct mlx5_rxq_ctrl,
					rxq);

		mlx5_priv_rxq_ibv_get(priv, idx);
		rte_atomic32_inc(&rxq_ctrl->refcnt);
		DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
		      (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
	}
	return rxq_ctrl;
}
/**
 * Release a Rx queue.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   RX queue index.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_priv_rxq_release(struct priv *priv, uint16_t idx)
{
	struct mlx5_rxq_ctrl *rxq_ctrl;

	if (!(*priv->rxqs)[idx])
		return 0;
	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
	assert(rxq_ctrl->priv);
	if (rxq_ctrl->ibv) {
		int ret;

		ret = mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
		if (!ret)
			rxq_ctrl->ibv = NULL;
	}
	DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
	      (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
	if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
		LIST_REMOVE(rxq_ctrl, next);
		rte_free(rxq_ctrl);
		(*priv->rxqs)[idx] = NULL;
		return 0;
	}
	return EBUSY;
}
/**
 * Verify if the queue can be released.
 *
 * @param priv
 *   Pointer to private structure.
 * @param idx
 *   RX queue index.
 *
 * @return
 *   1 if the queue can be released.
 */
int
mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx)
{
	struct mlx5_rxq_ctrl *rxq_ctrl;

	if (!(*priv->rxqs)[idx])
		return -1;
	rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
	return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
}
/**
 * Verify the Rx queue list is empty.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   The number of objects not released.
 */
int
mlx5_priv_rxq_verify(struct priv *priv)
{
	struct mlx5_rxq_ctrl *rxq_ctrl;
	int ret = 0;

	LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
		DEBUG("%p: Rx queue %p still referenced", (void *)priv,
		      (void *)rxq_ctrl);
		++ret;
	}
	return ret;
}
/**
 * Create an indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 * @param queues
 *   Queues entering in the indirection table.
 * @param queues_n
 *   Number of queues in the array.
 *
 * @return
 *   A new indirection table.
 */
struct mlx5_ind_table_ibv*
mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[],
			    uint16_t queues_n)
{
	struct mlx5_ind_table_ibv *ind_tbl;
	/* wq_n is a log2 value, hence log2above() on both branches. */
	const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
		log2above(queues_n) :
		log2above(priv->ind_table_max_size);
	struct ibv_wq *wq[1 << wq_n];
	unsigned int i;
	unsigned int j;

	ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
			     queues_n * sizeof(uint16_t), 0);
	if (!ind_tbl)
		return NULL;
	for (i = 0; i != queues_n; ++i) {
		struct mlx5_rxq_ctrl *rxq =
			mlx5_priv_rxq_get(priv, queues[i]);

		if (!rxq)
			goto error;
		wq[i] = rxq->ibv->wq;
		ind_tbl->queues[i] = queues[i];
	}
	ind_tbl->queues_n = queues_n;
	/* Finalise indirection table. */
	for (j = 0; i != (unsigned int)(1 << wq_n); ++i, ++j)
		wq[i] = wq[j];
	ind_tbl->ind_table = ibv_create_rwq_ind_table(
		priv->ctx,
		&(struct ibv_rwq_ind_table_init_attr){
			.log_ind_tbl_size = wq_n,
			.ind_tbl = wq,
			.comp_mask = 0,
		});
	if (!ind_tbl->ind_table)
		goto error;
	rte_atomic32_inc(&ind_tbl->refcnt);
	LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
	DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
	      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
	return ind_tbl;
error:
	rte_free(ind_tbl);
	DEBUG("%p cannot create indirection table", (void *)priv);
	return NULL;
}
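
/*
 * Example of the wrap-around fill above (hypothetical values): for
 * queues {0, 1, 2} with priv->ind_table_max_size = 4, wq_n = 2, the table
 * holds 4 entries and the tail is filled cyclically from the head,
 * yielding WQs {0, 1, 2, 0}.
 */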
/**
 * Get an indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 * @param queues
 *   Queues entering in the indirection table.
 * @param queues_n
 *   Number of queues in the array.
 *
 * @return
 *   An indirection table if found.
 */
struct mlx5_ind_table_ibv*
mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[],
			    uint16_t queues_n)
{
	struct mlx5_ind_table_ibv *ind_tbl;

	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
		if ((ind_tbl->queues_n == queues_n) &&
		    (memcmp(ind_tbl->queues, queues,
			    ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
		     == 0))
			break;
	}
	if (ind_tbl) {
		unsigned int i;

		rte_atomic32_inc(&ind_tbl->refcnt);
		DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
		      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
		for (i = 0; i != ind_tbl->queues_n; ++i)
			mlx5_priv_rxq_get(priv, ind_tbl->queues[i]);
	}
	return ind_tbl;
}
/**
 * Release an indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 * @param ind_tbl
 *   Indirection table to release.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_priv_ind_table_ibv_release(struct priv *priv,
				struct mlx5_ind_table_ibv *ind_tbl)
{
	unsigned int i;

	DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
	      (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
	if (rte_atomic32_dec_and_test(&ind_tbl->refcnt))
		claim_zero(ibv_destroy_rwq_ind_table(ind_tbl->ind_table));
	for (i = 0; i != ind_tbl->queues_n; ++i)
		claim_nonzero(mlx5_priv_rxq_release(priv, ind_tbl->queues[i]));
	if (!rte_atomic32_read(&ind_tbl->refcnt)) {
		LIST_REMOVE(ind_tbl, next);
		rte_free(ind_tbl);
		return 0;
	}
	return EBUSY;
}
/**
 * Verify the Verbs indirection table list is empty.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   The number of objects not released.
 */
int
mlx5_priv_ind_table_ibv_verify(struct priv *priv)
{
	struct mlx5_ind_table_ibv *ind_tbl;
	int ret = 0;

	LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
		DEBUG("%p: Verbs indirection table %p still referenced",
		      (void *)priv, (void *)ind_tbl);
		++ret;
	}
	return ret;
}
/**
 * Create a hash Rx queue.
 *
 * @param priv
 *   Pointer to private structure.
 * @param rss_key
 *   RSS key for the Rx hash queue.
 * @param rss_key_len
 *   RSS key length.
 * @param hash_fields
 *   Verbs protocol hash field to make the RSS on.
 * @param queues
 *   Queues entering in hash queue.
 * @param queues_n
 *   Number of queues.
 *
 * @return
 *   A hash Rx queue on success.
 */
struct mlx5_hrxq*
mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
		   uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
{
	struct mlx5_hrxq *hrxq;
	struct mlx5_ind_table_ibv *ind_tbl;
	struct ibv_qp *qp = NULL;

	ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
	if (!ind_tbl)
		ind_tbl = mlx5_priv_ind_table_ibv_new(priv, queues, queues_n);
	if (!ind_tbl)
		return NULL;
	qp = ibv_create_qp_ex(
		priv->ctx,
		&(struct ibv_qp_init_attr_ex){
			.qp_type = IBV_QPT_RAW_PACKET,
			.comp_mask =
				IBV_QP_INIT_ATTR_PD |
				IBV_QP_INIT_ATTR_IND_TABLE |
				IBV_QP_INIT_ATTR_RX_HASH,
			.rx_hash_conf = (struct ibv_rx_hash_conf){
				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
				.rx_hash_key_len = rss_key_len,
				.rx_hash_key = rss_key,
				.rx_hash_fields_mask = hash_fields,
			},
			.rwq_ind_tbl = ind_tbl->ind_table,
			.pd = priv->pd,
		});
	if (!qp)
		goto error;
	hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
	if (!hrxq)
		goto error;
	hrxq->ind_table = ind_tbl;
	hrxq->qp = qp;
	hrxq->rss_key_len = rss_key_len;
	hrxq->hash_fields = hash_fields;
	memcpy(hrxq->rss_key, rss_key, rss_key_len);
	rte_atomic32_inc(&hrxq->refcnt);
	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
	DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
	      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
	return hrxq;
error:
	mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
	if (qp)
		claim_zero(ibv_destroy_qp(qp));
	return NULL;
}
/**
 * Get a hash Rx queue.
 *
 * @param priv
 *   Pointer to private structure.
 * @param rss_key
 *   RSS key for the Rx hash queue.
 * @param rss_key_len
 *   RSS key length.
 * @param hash_fields
 *   Verbs protocol hash field to make the RSS on.
 * @param queues
 *   Queues entering in hash queue.
 * @param queues_n
 *   Number of queues.
 *
 * @return
 *   A hash Rx queue on success.
 */
struct mlx5_hrxq*
mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
		   uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
{
	struct mlx5_hrxq *hrxq;

	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
		struct mlx5_ind_table_ibv *ind_tbl;

		if (hrxq->rss_key_len != rss_key_len)
			continue;
		if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
			continue;
		if (hrxq->hash_fields != hash_fields)
			continue;
		ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
		if (!ind_tbl)
			continue;
		if (ind_tbl != hrxq->ind_table) {
			mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
			continue;
		}
		rte_atomic32_inc(&hrxq->refcnt);
		DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
		      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
		return hrxq;
	}
	return NULL;
}
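
/*
 * Lookup-or-create sketch (hypothetical helper), mirroring how callers are
 * expected to obtain a hash Rx queue; the key and hash fields below are
 * example values only.
 */
static __rte_unused struct mlx5_hrxq *
example_get_hrxq(struct priv *priv, uint16_t queues[], uint16_t queues_n)
{
	uint64_t hash_fields = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
	struct mlx5_hrxq *hrxq;

	hrxq = mlx5_priv_hrxq_get(priv, rss_hash_default_key,
				  (uint8_t)rss_hash_default_key_len,
				  hash_fields, queues, queues_n);
	if (!hrxq)
		hrxq = mlx5_priv_hrxq_new(priv, rss_hash_default_key,
					  (uint8_t)rss_hash_default_key_len,
					  hash_fields, queues, queues_n);
	return hrxq;
}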
/**
 * Release the hash Rx queue.
 *
 * @param priv
 *   Pointer to private structure.
 * @param hrxq
 *   Pointer to Hash Rx queue to release.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
mlx5_priv_hrxq_release(struct priv *priv, struct mlx5_hrxq *hrxq)
{
	DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
	      (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
	if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
		claim_zero(ibv_destroy_qp(hrxq->qp));
		mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table);
		LIST_REMOVE(hrxq, next);
		rte_free(hrxq);
		return 0;
	}
	claim_nonzero(mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table));
	return EBUSY;
}
/**
 * Verify the hash Rx queue list is empty.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   The number of objects not released.
 */
int
mlx5_priv_hrxq_ibv_verify(struct priv *priv)
{
	struct mlx5_hrxq *hrxq;
	int ret = 0;

	LIST_FOREACH(hrxq, &priv->hrxqs, next) {
		DEBUG("%p: Verbs Hash Rx queue %p still referenced",
		      (void *)priv, (void *)hrxq);
		++ret;
	}
	return ret;
}