4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
44 #pragma GCC diagnostic ignored "-Wpedantic"
46 #include <infiniband/verbs.h>
47 #include <infiniband/arch.h>
48 #include <infiniband/mlx5_hw.h>
50 #pragma GCC diagnostic error "-Wpedantic"
54 #include <rte_malloc.h>
55 #include <rte_ethdev.h>
56 #include <rte_common.h>
57 #include <rte_interrupts.h>
58 #include <rte_debug.h>
61 #include "mlx5_rxtx.h"
62 #include "mlx5_utils.h"
63 #include "mlx5_autoconf.h"
64 #include "mlx5_defs.h"
66 /* Initialization data for hash RX queues. */
67 const struct hash_rxq_init hash_rxq_init[] = {
69 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
70 IBV_EXP_RX_HASH_DST_IPV4 |
71 IBV_EXP_RX_HASH_SRC_PORT_TCP |
72 IBV_EXP_RX_HASH_DST_PORT_TCP),
73 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
75 .flow_spec.tcp_udp = {
76 .type = IBV_EXP_FLOW_SPEC_TCP,
77 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
79 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
82 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
83 IBV_EXP_RX_HASH_DST_IPV4 |
84 IBV_EXP_RX_HASH_SRC_PORT_UDP |
85 IBV_EXP_RX_HASH_DST_PORT_UDP),
86 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
88 .flow_spec.tcp_udp = {
89 .type = IBV_EXP_FLOW_SPEC_UDP,
90 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
92 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
95 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
96 IBV_EXP_RX_HASH_DST_IPV4),
97 .dpdk_rss_hf = (ETH_RSS_IPV4 |
101 .type = IBV_EXP_FLOW_SPEC_IPV4,
102 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
104 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
107 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
108 IBV_EXP_RX_HASH_DST_IPV6 |
109 IBV_EXP_RX_HASH_SRC_PORT_TCP |
110 IBV_EXP_RX_HASH_DST_PORT_TCP),
111 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
113 .flow_spec.tcp_udp = {
114 .type = IBV_EXP_FLOW_SPEC_TCP,
115 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
117 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
120 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
121 IBV_EXP_RX_HASH_DST_IPV6 |
122 IBV_EXP_RX_HASH_SRC_PORT_UDP |
123 IBV_EXP_RX_HASH_DST_PORT_UDP),
124 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
126 .flow_spec.tcp_udp = {
127 .type = IBV_EXP_FLOW_SPEC_UDP,
128 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
130 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
133 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
134 IBV_EXP_RX_HASH_DST_IPV6),
135 .dpdk_rss_hf = (ETH_RSS_IPV6 |
139 .type = IBV_EXP_FLOW_SPEC_IPV6,
140 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
142 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
149 .type = IBV_EXP_FLOW_SPEC_ETH,
150 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
156 /* Number of entries in hash_rxq_init[]. */
157 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
159 /* Initialization data for hash RX queue indirection tables. */
160 static const struct ind_table_init ind_table_init[] = {
162 .max_size = -1u, /* Superseded by HW limitations. */
164 1 << HASH_RXQ_TCPV4 |
165 1 << HASH_RXQ_UDPV4 |
167 1 << HASH_RXQ_TCPV6 |
168 1 << HASH_RXQ_UDPV6 |
175 .hash_types = 1 << HASH_RXQ_ETH,
180 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
182 /* Default RSS hash key also used for ConnectX-3. */
183 uint8_t rss_hash_default_key[] = {
184 0x2c, 0xc6, 0x81, 0xd1,
185 0x5b, 0xdb, 0xf4, 0xf7,
186 0xfc, 0xa2, 0x83, 0x19,
187 0xdb, 0x1a, 0x3e, 0x94,
188 0x6b, 0x9e, 0x38, 0xd9,
189 0x2c, 0x9c, 0x03, 0xd1,
190 0xad, 0x99, 0x44, 0xa7,
191 0xd9, 0x56, 0x3d, 0x59,
192 0x06, 0x3c, 0x25, 0xf3,
193 0xfc, 0x1f, 0xdc, 0x2a,
196 /* Length of the default RSS hash key. */
197 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
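/* The key above is 40 bytes long, the usual Toeplitz RSS hash key length. */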
200 * Populate flow steering rule for a given hash RX queue type using
201 * information from hash_rxq_init[]. Nothing is written to flow_attr when
202 * flow_attr_size is not large enough, but the required size is still returned.
205 * Pointer to private structure.
206 * @param[out] flow_attr
207 * Pointer to flow attribute structure to fill. Note that the allocated
208 * area must be large enough to hold all flow specifications.
209 * @param flow_attr_size
210 * Entire size of flow_attr and trailing room for flow specifications.
212 * Hash RX queue type to use for flow steering rule.
215 * Total size of the flow attribute buffer. No errors are defined.
218 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
219 size_t flow_attr_size, enum hash_rxq_type type)
221 size_t offset = sizeof(*flow_attr);
222 const struct hash_rxq_init *init = &hash_rxq_init[type];
224 assert(priv != NULL);
225 assert((size_t)type < RTE_DIM(hash_rxq_init));
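/* First pass: walk the underlayer chain to add up the room needed for every flow specification, down to the Ethernet layer. */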
227 offset += init->flow_spec.hdr.size;
228 init = init->underlayer;
229 } while (init != NULL);
230 if (offset > flow_attr_size)
232 flow_attr_size = offset;
233 init = &hash_rxq_init[type];
234 *flow_attr = (struct ibv_exp_flow_attr){
235 .type = IBV_EXP_FLOW_ATTR_NORMAL,
236 /* Priorities < 3 are reserved for flow director. */
237 .priority = init->flow_priority + 3,
243 offset -= init->flow_spec.hdr.size;
244 memcpy((void *)((uintptr_t)flow_attr + offset),
246 init->flow_spec.hdr.size);
247 ++flow_attr->num_of_specs;
248 init = init->underlayer;
249 } while (init != NULL);
250 return flow_attr_size;
254 * Convert hash type position in indirection table initializer to
255 * hash RX queue type.
258 * Indirection table initializer.
260 * Hash type position.
263 * Hash RX queue type.
265 static enum hash_rxq_type
266 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
268 enum hash_rxq_type type = HASH_RXQ_TCPV4;
270 assert(pos < table->hash_types_n);
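/* Walk enabled hash type bits in ascending order; the pos-th set bit gives the hash RX queue type to return. */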
272 if ((table->hash_types & (1 << type)) && (pos-- == 0))
280 * Filter out disabled hash RX queue types from ind_table_init[].
283 * Pointer to private structure.
288 * Number of table entries.
291 priv_make_ind_table_init(struct priv *priv,
292 struct ind_table_init (*table)[IND_TABLE_INIT_N])
297 unsigned int table_n = 0;
298 /* Mandatory to receive frames not handled by normal hash RX queues. */
299 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
301 rss_hf = priv->rss_hf;
302 /* Process other protocols only if more than one queue. */
303 if (priv->rxqs_n > 1)
304 for (i = 0; (i != hash_rxq_init_n); ++i)
305 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
306 hash_types_sup |= (1 << i);
308 /* Filter out entries whose protocols are not in the set. */
309 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
313 /* j is increased only if the table has valid protocols. */
315 (*table)[j] = ind_table_init[i];
316 (*table)[j].hash_types &= hash_types_sup;
317 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
318 if (((*table)[j].hash_types >> h) & 0x1)
320 (*table)[j].hash_types_n = nb;
330 * Initialize hash RX queues and indirection table.
333 * Pointer to private structure.
336 * 0 on success, errno value on failure.
339 priv_create_hash_rxqs(struct priv *priv)
341 struct ibv_exp_wq *wqs[priv->reta_idx_n];
342 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
343 unsigned int ind_tables_n =
344 priv_make_ind_table_init(priv, &ind_table_init);
345 unsigned int hash_rxqs_n = 0;
346 struct hash_rxq (*hash_rxqs)[] = NULL;
347 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
353 assert(priv->ind_tables == NULL);
354 assert(priv->ind_tables_n == 0);
355 assert(priv->hash_rxqs == NULL);
356 assert(priv->hash_rxqs_n == 0);
357 assert(priv->pd != NULL);
358 assert(priv->ctx != NULL);
361 if (priv->rxqs_n == 0)
363 assert(priv->rxqs != NULL);
364 if (ind_tables_n == 0) {
365 ERROR("all hash RX queue types have been filtered out,"
366 " indirection table cannot be created");
369 if (priv->rxqs_n & (priv->rxqs_n - 1)) {
370 INFO("%u RX queues are configured, consider rounding this"
371 " number to the next power of two for better balancing",
373 DEBUG("indirection table extended to assume %u WQs",
376 for (i = 0; (i != priv->reta_idx_n); ++i) {
377 struct rxq_ctrl *rxq_ctrl;
379 rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
380 struct rxq_ctrl, rxq);
381 wqs[i] = rxq_ctrl->wq;
383 /* Get number of hash RX queues to configure. */
384 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
385 hash_rxqs_n += ind_table_init[i].hash_types_n;
386 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
387 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
388 /* Create indirection tables. */
389 ind_tables = rte_calloc(__func__, ind_tables_n,
390 sizeof((*ind_tables)[0]), 0);
391 if (ind_tables == NULL) {
393 ERROR("cannot allocate indirection tables container: %s",
397 for (i = 0; (i != ind_tables_n); ++i) {
398 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
400 .log_ind_tbl_size = 0, /* Set below. */
404 unsigned int ind_tbl_size = ind_table_init[i].max_size;
405 struct ibv_exp_rwq_ind_table *ind_table;
407 if (priv->reta_idx_n < ind_tbl_size)
408 ind_tbl_size = priv->reta_idx_n;
409 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
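/* log_ind_tbl_size: Verbs takes the table size as a log2, so the effective size ends up rounded to the next power of two. */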
411 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
413 if (ind_table != NULL) {
414 (*ind_tables)[i] = ind_table;
417 /* Not clear whether errno is set. */
418 err = (errno ? errno : EINVAL);
419 ERROR("RX indirection table creation failed with error %d: %s",
423 /* Allocate array that holds hash RX queues and related data. */
424 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
425 sizeof((*hash_rxqs)[0]), 0);
426 if (hash_rxqs == NULL) {
428 ERROR("cannot allocate hash RX queues container: %s",
432 for (i = 0, j = 0, k = 0;
433 ((i != hash_rxqs_n) && (j != ind_tables_n));
435 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
436 enum hash_rxq_type type =
437 hash_rxq_type_from_pos(&ind_table_init[j], k);
438 struct rte_eth_rss_conf *priv_rss_conf =
439 (*priv->rss_conf)[type];
440 struct ibv_exp_rx_hash_conf hash_conf = {
441 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
442 .rx_hash_key_len = (priv_rss_conf ?
443 priv_rss_conf->rss_key_len :
444 rss_hash_default_key_len),
445 .rx_hash_key = (priv_rss_conf ?
446 priv_rss_conf->rss_key :
447 rss_hash_default_key),
448 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
449 .rwq_ind_tbl = (*ind_tables)[j],
451 struct ibv_exp_qp_init_attr qp_init_attr = {
452 .max_inl_recv = 0, /* Currently not supported. */
453 .qp_type = IBV_QPT_RAW_PACKET,
454 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
455 IBV_EXP_QP_INIT_ATTR_RX_HASH),
457 .rx_hash_conf = &hash_conf,
458 .port_num = priv->port,
461 DEBUG("using indirection table %u for hash RX queue %u type %d",
463 *hash_rxq = (struct hash_rxq){
465 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
468 if (hash_rxq->qp == NULL) {
469 err = (errno ? errno : EINVAL);
470 ERROR("Hash RX QP creation failure: %s",
474 if (++k < ind_table_init[j].hash_types_n)
476 /* Switch to the next indirection table and reset hash RX
477 * queue type array index. */
481 priv->ind_tables = ind_tables;
482 priv->ind_tables_n = ind_tables_n;
483 priv->hash_rxqs = hash_rxqs;
484 priv->hash_rxqs_n = hash_rxqs_n;
488 if (hash_rxqs != NULL) {
489 for (i = 0; (i != hash_rxqs_n); ++i) {
490 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
494 claim_zero(ibv_destroy_qp(qp));
498 if (ind_tables != NULL) {
499 for (j = 0; (j != ind_tables_n); ++j) {
500 struct ibv_exp_rwq_ind_table *ind_table =
503 if (ind_table == NULL)
505 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
507 rte_free(ind_tables);
513 * Clean up hash RX queues and indirection tables.
516 * Pointer to private structure.
519 priv_destroy_hash_rxqs(struct priv *priv)
523 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
524 if (priv->hash_rxqs_n == 0) {
525 assert(priv->hash_rxqs == NULL);
526 assert(priv->ind_tables == NULL);
529 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
530 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
533 assert(hash_rxq->priv == priv);
534 assert(hash_rxq->qp != NULL);
535 /* Also check that there are no remaining flows. */
536 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
538 (k != RTE_DIM(hash_rxq->special_flow[j]));
540 assert(hash_rxq->special_flow[j][k] == NULL);
541 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
542 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
543 assert(hash_rxq->mac_flow[j][k] == NULL);
544 claim_zero(ibv_destroy_qp(hash_rxq->qp));
546 priv->hash_rxqs_n = 0;
547 rte_free(priv->hash_rxqs);
548 priv->hash_rxqs = NULL;
549 for (i = 0; (i != priv->ind_tables_n); ++i) {
550 struct ibv_exp_rwq_ind_table *ind_table =
551 (*priv->ind_tables)[i];
553 assert(ind_table != NULL);
554 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
556 priv->ind_tables_n = 0;
557 rte_free(priv->ind_tables);
558 priv->ind_tables = NULL;
562 * Check whether a given flow type is allowed.
565 * Pointer to private structure.
567 * Flow type to check.
570 * Nonzero if the given flow type is allowed.
573 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
575 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
576 * has been requested. */
577 if (priv->promisc_req)
578 return type == HASH_RXQ_FLOW_TYPE_PROMISC;
580 case HASH_RXQ_FLOW_TYPE_PROMISC:
581 return !!priv->promisc_req;
582 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
583 return !!priv->allmulti_req;
584 case HASH_RXQ_FLOW_TYPE_BROADCAST:
585 case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
586 /* If allmulti is enabled, broadcast and ipv6multi
587 * are unnecessary. */
588 return !priv->allmulti_req;
589 case HASH_RXQ_FLOW_TYPE_MAC:
592 /* Unsupported flow type is not allowed. */
599 * Automatically enable/disable flows according to configuration.
605 * 0 on success, errno value on failure.
608 priv_rehash_flows(struct priv *priv)
610 enum hash_rxq_flow_type i;
612 for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
613 i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
615 if (!priv_allow_flow_type(priv, i)) {
616 priv_special_flow_disable(priv, i);
618 int ret = priv_special_flow_enable(priv, i);
623 if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
624 return priv_mac_addrs_enable(priv);
625 priv_mac_addrs_disable(priv);
630 * Allocate RX queue elements.
633 * Pointer to RX queue structure.
635 * Number of elements to allocate.
638 * 0 on success, errno value on failure.
641 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n)
643 const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
647 /* Iterate on segments. */
648 for (i = 0; (i != elts_n); ++i) {
649 struct rte_mbuf *buf;
650 volatile struct mlx5_wqe_data_seg *scat =
651 &(*rxq_ctrl->rxq.wqes)[i];
653 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
655 ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
659 /* Headroom is reserved by rte_pktmbuf_alloc(). */
660 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
661 /* Buffer is supposed to be empty. */
662 assert(rte_pktmbuf_data_len(buf) == 0);
663 assert(rte_pktmbuf_pkt_len(buf) == 0);
665 /* Only the first segment keeps headroom. */
667 SET_DATA_OFF(buf, 0);
668 PORT(buf) = rxq_ctrl->rxq.port_id;
669 DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
670 PKT_LEN(buf) = DATA_LEN(buf);
672 /* scat->addr must be able to store a pointer. */
673 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
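/* Fill the receive WQE data segment: buffer address, byte count and memory region lkey, all in big-endian as the HW expects. */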
674 *scat = (struct mlx5_wqe_data_seg){
676 rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)),
677 .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
678 .lkey = rte_cpu_to_be_32(rxq_ctrl->mr->lkey),
680 (*rxq_ctrl->rxq.elts)[i] = buf;
682 if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
683 struct rxq *rxq = &rxq_ctrl->rxq;
684 struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
686 assert(rxq->elts_n == rxq->cqe_n);
687 /* Initialize default rearm_data for vPMD. */
688 mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
689 rte_mbuf_refcnt_set(mbuf_init, 1);
690 mbuf_init->nb_segs = 1;
691 mbuf_init->port = rxq->port_id;
693 * prevent compiler reordering:
694 * rearm_data covers previous fields.
696 rte_compiler_barrier();
697 rxq->mbuf_initializer = *(uint64_t *)&mbuf_init->rearm_data;
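/* rearm_data overlays data_off, refcnt, nb_segs and port, so the vectorized path can restore them with a single 64-bit store. */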
698 /* Padding with a fake mbuf for vectorized Rx. */
699 for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
700 (*rxq->elts)[elts_n + i] = &rxq->fake_mbuf;
702 DEBUG("%p: allocated and configured %u segments (max %u packets)",
703 (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
708 for (i = 0; (i != elts_n); ++i) {
709 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
710 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
711 (*rxq_ctrl->rxq.elts)[i] = NULL;
713 DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
719 * Free RX queue elements.
722 * Pointer to RX queue structure.
725 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
727 struct rxq *rxq = &rxq_ctrl->rxq;
728 const uint16_t q_n = (1 << rxq->elts_n);
729 const uint16_t q_mask = q_n - 1;
730 uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
733 DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
734 if (rxq->elts == NULL)
737 * Some mbufs in the ring still belong to the application; they cannot be
740 if (rxq_check_vec_support(rxq) > 0) {
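/* In the vectorized path, slots starting at rq_ci no longer own their mbufs; clear them so the loop below does not free them. */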
741 for (i = 0; i < used; ++i)
742 (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
743 rxq->rq_pi = rxq->rq_ci;
745 for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
746 if ((*rxq->elts)[i] != NULL)
747 rte_pktmbuf_free_seg((*rxq->elts)[i]);
748 (*rxq->elts)[i] = NULL;
753 * Clean up an RX queue.
755 * Destroy objects, free allocated memory and reset the structure for reuse.
758 * Pointer to RX queue structure.
761 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
763 DEBUG("cleaning up %p", (void *)rxq_ctrl);
764 rxq_free_elts(rxq_ctrl);
765 if (rxq_ctrl->fdir_queue != NULL)
766 priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
767 if (rxq_ctrl->wq != NULL)
768 claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
769 if (rxq_ctrl->cq != NULL)
770 claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
771 if (rxq_ctrl->channel != NULL)
772 claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
773 if (rxq_ctrl->mr != NULL)
774 claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
775 memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
779 * Initialize RX queue.
782 * Pointer to RX queue control template.
785 * 0 on success, errno value on failure.
788 rxq_setup(struct rxq_ctrl *tmpl)
790 struct ibv_cq *ibcq = tmpl->cq;
791 struct ibv_mlx5_cq_info cq_info;
792 struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
793 const uint16_t desc_n =
794 (1 << tmpl->rxq.elts_n) + tmpl->priv->rx_vec_en *
795 MLX5_VPMD_DESCS_PER_LOOP;
796 struct rte_mbuf *(*elts)[desc_n] =
797 rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
798 if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
799 ERROR("Unable to query CQ info. check your OFED.");
802 if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
803 ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
804 "it should be set to %u", RTE_CACHE_LINE_SIZE);
809 tmpl->rxq.rq_db = rwq->rq.db;
810 tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
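/* Queue sizes are stored as log2 values so the data path can derive wrap-around masks with shifts instead of divisions. */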
814 tmpl->rxq.cq_db = cq_info.dbrec;
816 (volatile struct mlx5_wqe_data_seg (*)[])
817 (uintptr_t)rwq->rq.buff;
819 (volatile struct mlx5_cqe (*)[])
820 (uintptr_t)cq_info.buf;
821 tmpl->rxq.elts = elts;
826 * Configure an RX queue.
829 * Pointer to Ethernet device structure.
831 * Pointer to RX queue structure.
833 * Number of descriptors to configure in queue.
835 * NUMA socket on which memory must be allocated.
837 * Thresholds parameters.
839 * Memory pool for buffer allocations.
842 * 0 on success, errno value on failure.
845 rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
846 uint16_t desc, unsigned int socket,
847 const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
849 struct priv *priv = dev->data->dev_private;
850 struct rxq_ctrl tmpl = {
854 .elts_n = log2above(desc),
856 .rss_hash = priv->rxqs_n > 1,
859 struct ibv_exp_wq_attr mod;
861 struct ibv_exp_cq_init_attr cq;
862 struct ibv_exp_wq_init_attr wq;
863 struct ibv_exp_cq_attr cq_attr;
865 unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
866 unsigned int cqe_n = desc - 1;
867 const uint16_t desc_n =
868 desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
869 struct rte_mbuf *(*elts)[desc_n] = NULL;
872 (void)conf; /* Thresholds configuration (ignored). */
873 /* Enable scattered packet support for this queue if necessary. */
874 assert(mb_len >= RTE_PKTMBUF_HEADROOM);
875 if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
876 (mb_len - RTE_PKTMBUF_HEADROOM)) {
878 } else if (dev->data->dev_conf.rxmode.enable_scatter) {
880 RTE_PKTMBUF_HEADROOM +
881 dev->data->dev_conf.rxmode.max_rx_pkt_len;
885 * Determine the number of SGEs needed for a full packet
886 * and round it to the next power of two.
888 sges_n = log2above((size / mb_len) + !!(size % mb_len));
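/* E.g. with the default 2176-byte mbuf data room, 128 bytes of headroom and a 9000-byte maximum packet, 5 buffers are needed, rounded up to 8 SGEs (sges_n = 3). */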
889 tmpl.rxq.sges_n = sges_n;
890 /* Make sure rxq.sges_n did not overflow. */
891 size = mb_len * (1 << tmpl.rxq.sges_n);
892 size -= RTE_PKTMBUF_HEADROOM;
893 if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
894 ERROR("%p: too many SGEs (%u) needed to handle"
895 " requested maximum packet size %u",
898 dev->data->dev_conf.rxmode.max_rx_pkt_len);
902 WARN("%p: the requested maximum Rx packet size (%u) is"
903 " larger than a single mbuf (%u) and scattered"
904 " mode has not been requested",
906 dev->data->dev_conf.rxmode.max_rx_pkt_len,
907 mb_len - RTE_PKTMBUF_HEADROOM);
909 DEBUG("%p: maximum number of segments per packet: %u",
910 (void *)dev, 1 << tmpl.rxq.sges_n);
911 if (desc % (1 << tmpl.rxq.sges_n)) {
912 ERROR("%p: number of RX queue descriptors (%u) is not a"
913 " multiple of SGEs per packet (%u)",
916 1 << tmpl.rxq.sges_n);
919 /* Toggle RX checksum offload if hardware supports it. */
921 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
922 if (priv->hw_csum_l2tun)
923 tmpl.rxq.csum_l2tun =
924 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
925 /* Use the entire RX mempool as the memory region. */
926 tmpl.mr = mlx5_mp2mr(priv->pd, mp);
927 if (tmpl.mr == NULL) {
929 ERROR("%p: MR creation failure: %s",
930 (void *)dev, strerror(ret));
933 if (dev->data->dev_conf.intr_conf.rxq) {
934 tmpl.channel = ibv_create_comp_channel(priv->ctx);
935 if (tmpl.channel == NULL) {
937 ERROR("%p: Rx interrupt completion channel creation"
939 (void *)dev, strerror(ret));
943 attr.cq = (struct ibv_exp_cq_init_attr){
946 if (priv->cqe_comp) {
947 attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
948 attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
950 * For vectorized Rx, the CQE count must not be doubled, in order to
951 * make cq_ci and rq_ci aligned.
953 if (rxq_check_vec_support(&tmpl.rxq) < 0)
954 cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
956 tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
958 if (tmpl.cq == NULL) {
960 ERROR("%p: CQ creation failure: %s",
961 (void *)dev, strerror(ret));
964 DEBUG("priv->device_attr.max_qp_wr is %d",
965 priv->device_attr.max_qp_wr);
966 DEBUG("priv->device_attr.max_sge is %d",
967 priv->device_attr.max_sge);
968 /* Configure VLAN stripping. */
969 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
970 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
971 attr.wq = (struct ibv_exp_wq_init_attr){
972 .wq_context = NULL, /* Could be useful in the future. */
973 .wq_type = IBV_EXP_WQT_RQ,
974 /* Max number of outstanding WRs. */
975 .max_recv_wr = desc >> tmpl.rxq.sges_n,
976 /* Max number of scatter/gather elements in a WR. */
977 .max_recv_sge = 1 << tmpl.rxq.sges_n,
981 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
983 .vlan_offloads = (tmpl.rxq.vlan_strip ?
984 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
987 /* By default, FCS (CRC) is stripped by hardware. */
988 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
989 tmpl.rxq.crc_present = 0;
990 } else if (priv->hw_fcs_strip) {
991 /* Ask HW/Verbs to leave CRC in place when supported. */
992 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
993 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
994 tmpl.rxq.crc_present = 1;
996 WARN("%p: CRC stripping has been disabled but will still"
997 " be performed by hardware, make sure MLNX_OFED and"
998 " firmware are up to date",
1000 tmpl.rxq.crc_present = 0;
1002 DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1003 " incoming frames to hide it",
1005 tmpl.rxq.crc_present ? "disabled" : "enabled",
1006 tmpl.rxq.crc_present << 2);
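/* crc_present is 0 or 1, so crc_present << 2 is the 4-byte Ethernet FCS length reported above. */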
1007 if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1008 ; /* Nothing else to do. */
1009 else if (priv->hw_padding) {
1010 INFO("%p: enabling packet padding on queue %p",
1011 (void *)dev, (void *)rxq_ctrl);
1012 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1013 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1015 WARN("%p: packet padding has been requested but is not"
1016 " supported, make sure MLNX_OFED and firmware are"
1020 tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1021 if (tmpl.wq == NULL) {
1022 ret = (errno ? errno : EINVAL);
1023 ERROR("%p: WQ creation failure: %s",
1024 (void *)dev, strerror(ret));
1028 * Make sure the numbers of WRs and SGEs match expectations since a queue
1029 * cannot allocate more than "desc" buffers.
1031 if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1032 ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1033 ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1035 (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1036 attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1041 tmpl.rxq.port_id = dev->data->port_id;
1042 DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1043 /* Change queue state to ready. */
1044 mod = (struct ibv_exp_wq_attr){
1045 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1046 .wq_state = IBV_EXP_WQS_RDY,
1048 ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1050 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1051 (void *)dev, strerror(ret));
1054 ret = rxq_setup(&tmpl);
1056 ERROR("%p: cannot initialize RX queue structure: %s",
1057 (void *)dev, strerror(ret));
1060 ret = rxq_alloc_elts(&tmpl, desc);
1062 ERROR("%p: RXQ allocation failed: %s",
1063 (void *)dev, strerror(ret));
1066 /* Clean up rxq in case we're reinitializing it. */
1067 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1068 rxq_cleanup(rxq_ctrl);
1069 /* Move mbuf pointers to dedicated storage area in RX queue. */
1070 elts = (void *)(rxq_ctrl + 1);
1071 rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
1073 memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1075 rte_free(tmpl.rxq.elts);
1076 tmpl.rxq.elts = elts;
1078 /* Update doorbell counter. */
1079 rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1081 *rxq_ctrl->rxq.rq_db = rte_cpu_to_be_32(rxq_ctrl->rxq.rq_ci);
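/* Each receive WQE spans 1 << sges_n descriptors, so the doorbell records desc >> sges_n posted WQEs. */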
1082 DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1086 elts = tmpl.rxq.elts;
1094 * DPDK callback to configure an RX queue.
1097 * Pointer to Ethernet device structure.
1101 * Number of descriptors to configure in queue.
1103 * NUMA socket on which memory must be allocated.
1105 * Thresholds parameters.
1107 * Memory pool for buffer allocations.
1110 * 0 on success, negative errno value on failure.
1113 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1114 unsigned int socket, const struct rte_eth_rxconf *conf,
1115 struct rte_mempool *mp)
1117 struct priv *priv = dev->data->dev_private;
1118 struct rxq *rxq = (*priv->rxqs)[idx];
1119 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1120 const uint16_t desc_n =
1121 desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
1124 if (mlx5_is_secondary())
1125 return -E_RTE_SECONDARY;
1128 if (!rte_is_power_of_2(desc)) {
1129 desc = 1 << log2above(desc);
1130 WARN("%p: increased number of descriptors in RX queue %u"
1131 " to the next power of two (%d)",
1132 (void *)dev, idx, desc);
1134 DEBUG("%p: configuring queue %u for %u descriptors",
1135 (void *)dev, idx, desc);
1136 if (idx >= priv->rxqs_n) {
1137 ERROR("%p: queue index out of range (%u >= %u)",
1138 (void *)dev, idx, priv->rxqs_n);
1143 DEBUG("%p: reusing already allocated queue index %u (%p)",
1144 (void *)dev, idx, (void *)rxq);
1145 if (priv->started) {
1149 (*priv->rxqs)[idx] = NULL;
1150 rxq_cleanup(rxq_ctrl);
1151 /* Resize if rxq size is changed. */
1152 if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
1153 rxq_ctrl = rte_realloc(rxq_ctrl,
1154 sizeof(*rxq_ctrl) + desc_n *
1155 sizeof(struct rte_mbuf *),
1156 RTE_CACHE_LINE_SIZE);
1158 ERROR("%p: unable to reallocate queue index %u",
1165 rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1167 sizeof(struct rte_mbuf *),
1169 if (rxq_ctrl == NULL) {
1170 ERROR("%p: unable to allocate queue index %u",
1176 ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1180 rxq_ctrl->rxq.stats.idx = idx;
1181 DEBUG("%p: adding RX queue %p to list",
1182 (void *)dev, (void *)rxq_ctrl);
1183 (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1190 * DPDK callback to release an RX queue.
1193 * Generic RX queue pointer.
1196 mlx5_rx_queue_release(void *dpdk_rxq)
1198 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1199 struct rxq_ctrl *rxq_ctrl;
1203 if (mlx5_is_secondary())
1208 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1209 priv = rxq_ctrl->priv;
1211 if (priv_flow_rxq_in_use(priv, rxq))
1212 rte_panic("Rx queue %p is still used by a flow and cannot be"
1213 " removed\n", (void *)rxq_ctrl);
1214 for (i = 0; (i != priv->rxqs_n); ++i)
1215 if ((*priv->rxqs)[i] == rxq) {
1216 DEBUG("%p: removing RX queue %p from list",
1217 (void *)priv->dev, (void *)rxq_ctrl);
1218 (*priv->rxqs)[i] = NULL;
1221 rxq_cleanup(rxq_ctrl);
1227 * Allocate queue vector and fill epoll fd list for Rx interrupts.
1230 * Pointer to private structure.
1233 * 0 on success, negative on failure.
1236 priv_rx_intr_vec_enable(struct priv *priv)
1239 unsigned int rxqs_n = priv->rxqs_n;
1240 unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1241 unsigned int count = 0;
1242 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1244 if (!priv->dev->data->dev_conf.intr_conf.rxq)
1246 priv_rx_intr_vec_disable(priv);
1247 intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
1248 if (intr_handle->intr_vec == NULL) {
1249 ERROR("failed to allocate memory for interrupt vector,"
1250 " Rx interrupts will not be supported");
1253 intr_handle->type = RTE_INTR_HANDLE_EXT;
1254 for (i = 0; i != n; ++i) {
1255 struct rxq *rxq = (*priv->rxqs)[i];
1256 struct rxq_ctrl *rxq_ctrl =
1257 container_of(rxq, struct rxq_ctrl, rxq);
1262 /* Skip queues that cannot request interrupts. */
1263 if (!rxq || !rxq_ctrl->channel) {
1264 /* Use invalid intr_vec[] index to disable entry. */
1265 intr_handle->intr_vec[i] =
1266 RTE_INTR_VEC_RXTX_OFFSET +
1267 RTE_MAX_RXTX_INTR_VEC_ID;
1270 if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
1271 ERROR("too many Rx queues for interrupt vector size"
1272 " (%d), Rx interrupts cannot be enabled",
1273 RTE_MAX_RXTX_INTR_VEC_ID);
1274 priv_rx_intr_vec_disable(priv);
1277 fd = rxq_ctrl->channel->fd;
1278 flags = fcntl(fd, F_GETFL);
1279 rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1281 ERROR("failed to make Rx interrupt file descriptor"
1282 " %d non-blocking for queue index %d", fd, i);
1283 priv_rx_intr_vec_disable(priv);
1286 intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
1287 intr_handle->efds[count] = fd;
1291 priv_rx_intr_vec_disable(priv);
1293 intr_handle->nb_efd = count;
1298 * Clean up the Rx interrupt handler.
1301 * Pointer to private structure.
1304 priv_rx_intr_vec_disable(struct priv *priv)
1306 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1308 rte_intr_free_epoll_fd(intr_handle);
1309 free(intr_handle->intr_vec);
1310 intr_handle->nb_efd = 0;
1311 intr_handle->intr_vec = NULL;
1314 #ifdef HAVE_UPDATE_CQ_CI
1317 * DPDK callback for Rx queue interrupt enable.
1320 * Pointer to Ethernet device structure.
1321 * @param rx_queue_id
1325 * 0 on success, negative on failure.
1328 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1330 struct priv *priv = mlx5_get_priv(dev);
1331 struct rxq *rxq = (*priv->rxqs)[rx_queue_id];
1332 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1335 if (!rxq || !rxq_ctrl->channel) {
1338 ibv_mlx5_exp_update_cq_ci(rxq_ctrl->cq, rxq->cq_ci);
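/* Sync the Verbs view of the CQ consumer index with the PMD's before arming, so only completions not yet processed raise the interrupt. */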
1339 ret = ibv_req_notify_cq(rxq_ctrl->cq, 0);
1342 WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
1347 * DPDK callback for Rx queue interrupt disable.
1350 * Pointer to Ethernet device structure.
1351 * @param rx_queue_id
1355 * 0 on success, negative on failure.
1358 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1360 struct priv *priv = mlx5_get_priv(dev);
1361 struct rxq *rxq = (*priv->rxqs)[rx_queue_id];
1362 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1363 struct ibv_cq *ev_cq;
1367 if (!rxq || !rxq_ctrl->channel) {
1370 ret = ibv_get_cq_event(rxq_ctrl->cq->channel, &ev_cq, &ev_ctx);
1371 if (ret || ev_cq != rxq_ctrl->cq)
1375 WARN("unable to disable interrupt on rx queue %d",
1378 ibv_ack_cq_events(rxq_ctrl->cq, 1);
1382 #endif /* HAVE_UPDATE_CQ_CI */