4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
44 #pragma GCC diagnostic ignored "-Wpedantic"
46 #include <infiniband/verbs.h>
47 #include <infiniband/arch.h>
48 #include <infiniband/mlx5_hw.h>
50 #pragma GCC diagnostic error "-Wpedantic"
53 /* DPDK headers don't like -pedantic. */
55 #pragma GCC diagnostic ignored "-Wpedantic"
58 #include <rte_malloc.h>
59 #include <rte_ethdev.h>
60 #include <rte_common.h>
61 #include <rte_interrupts.h>
63 #pragma GCC diagnostic error "-Wpedantic"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_utils.h"
69 #include "mlx5_autoconf.h"
70 #include "mlx5_defs.h"
72 /* Initialization data for hash RX queues. */
/*
 * Per-type initialization table for hash RX queues.
 * Every entry visible here provides a Verbs RX hash field mask
 * (.hash_fields), the matching DPDK RSS flags (.dpdk_rss_hf), a flow
 * specification header (.type and .size) and an .underlayer pointer to
 * another entry of this same table. priv_flow_attr() walks the .underlayer
 * chain to stack flow specifications.
 * NOTE(review): the entry designators and closing braces are not visible in
 * this extract; entries are presumably indexed by enum hash_rxq_type, as
 * suggested by the &hash_rxq_init[HASH_RXQ_*] references -- confirm.
 */
73 const struct hash_rxq_init hash_rxq_init[] = {
/* TCP over IPv4: hash on IPv4 addresses and TCP ports. */
75 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
76 IBV_EXP_RX_HASH_DST_IPV4 |
77 IBV_EXP_RX_HASH_SRC_PORT_TCP |
78 IBV_EXP_RX_HASH_DST_PORT_TCP),
79 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
81 .flow_spec.tcp_udp = {
82 .type = IBV_EXP_FLOW_SPEC_TCP,
83 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
85 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
/* UDP over IPv4: hash on IPv4 addresses and UDP ports. */
88 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
89 IBV_EXP_RX_HASH_DST_IPV4 |
90 IBV_EXP_RX_HASH_SRC_PORT_UDP |
91 IBV_EXP_RX_HASH_DST_PORT_UDP),
92 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
94 .flow_spec.tcp_udp = {
95 .type = IBV_EXP_FLOW_SPEC_UDP,
96 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
98 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
/* Plain IPv4: hash on addresses only, layered on the Ethernet entry. */
101 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
102 IBV_EXP_RX_HASH_DST_IPV4),
103 .dpdk_rss_hf = (ETH_RSS_IPV4 |
107 .type = IBV_EXP_FLOW_SPEC_IPV4,
108 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
110 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
/* TCP over IPv6: hash on IPv6 addresses and TCP ports. */
113 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
114 IBV_EXP_RX_HASH_DST_IPV6 |
115 IBV_EXP_RX_HASH_SRC_PORT_TCP |
116 IBV_EXP_RX_HASH_DST_PORT_TCP),
117 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
119 .flow_spec.tcp_udp = {
120 .type = IBV_EXP_FLOW_SPEC_TCP,
121 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
123 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
/* UDP over IPv6: hash on IPv6 addresses and UDP ports. */
126 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
127 IBV_EXP_RX_HASH_DST_IPV6 |
128 IBV_EXP_RX_HASH_SRC_PORT_UDP |
129 IBV_EXP_RX_HASH_DST_PORT_UDP),
130 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
132 .flow_spec.tcp_udp = {
133 .type = IBV_EXP_FLOW_SPEC_UDP,
134 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
136 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
/* Plain IPv6: hash on addresses only, layered on the Ethernet entry. */
139 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
140 IBV_EXP_RX_HASH_DST_IPV6),
141 .dpdk_rss_hf = (ETH_RSS_IPV6 |
145 .type = IBV_EXP_FLOW_SPEC_IPV6,
146 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
148 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
/* Ethernet: only the flow specification header is visible for this entry. */
155 .type = IBV_EXP_FLOW_SPEC_ETH,
156 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
162 /* Number of entries in hash_rxq_init[]. */
/* Computed with RTE_DIM(); used in this file as the loop bound when scanning hash RX queue types. */
163 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
165 /* Initialization data for hash RX queue indirection tables. */
/*
 * Each entry describes one indirection table: a maximum size (.max_size) and
 * a bit-field of hash RX queue types (.hash_types, one bit per
 * enum hash_rxq_type value, built with 1 << HASH_RXQ_*).
 * priv_make_ind_table_init() copies these entries and masks .hash_types
 * against the types enabled at run time.
 * NOTE(review): some members (e.g. .hash_types_n) and bit-field terms are not
 * visible in this extract.
 */
166 static const struct ind_table_init ind_table_init[] = {
168 .max_size = -1u, /* Superseded by HW limitations. */
170 1 << HASH_RXQ_TCPV4 |
171 1 << HASH_RXQ_UDPV4 |
173 1 << HASH_RXQ_TCPV6 |
174 1 << HASH_RXQ_UDPV6 |
/* Entry restricted to the Ethernet hash RX queue type. */
181 .hash_types = 1 << HASH_RXQ_ETH,
/* Number of entries in ind_table_init[]; also sizes the caller-provided copy. */
186 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
188 /* Default RSS hash key also used for ConnectX-3. */
/*
 * Fallback key handed to the hash configuration in priv_create_hash_rxqs()
 * when no per-type RSS configuration is set. 40 bytes are visible below.
 * The array is not const-qualified.
 */
189 uint8_t rss_hash_default_key[] = {
190 0x2c, 0xc6, 0x81, 0xd1,
191 0x5b, 0xdb, 0xf4, 0xf7,
192 0xfc, 0xa2, 0x83, 0x19,
193 0xdb, 0x1a, 0x3e, 0x94,
194 0x6b, 0x9e, 0x38, 0xd9,
195 0x2c, 0x9c, 0x03, 0xd1,
196 0xad, 0x99, 0x44, 0xa7,
197 0xd9, 0x56, 0x3d, 0x59,
198 0x06, 0x3c, 0x25, 0xf3,
199 0xfc, 0x1f, 0xdc, 0x2a,
202 /* Length of the default RSS hash key. */
/* Derived with sizeof so it always follows the array definition above. */
203 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
206 * Populate flow steering rule for a given hash RX queue type using
207 * information from hash_rxq_init[]. Nothing is written to flow_attr when
208 * flow_attr_size is not large enough, but the required size is still returned.
211 * Pointer to private structure.
212 * @param[out] flow_attr
213 * Pointer to flow attribute structure to fill. Note that the allocated
214 * area must be larger than the structure itself, large enough to also hold all flow specifications.
215 * @param flow_attr_size
216 * Entire size of flow_attr and trailing room for flow specifications.
218 * Hash RX queue type to use for flow steering rule.
221 * Total size of the flow attribute buffer. No errors are defined.
/*
 * Build a flow steering attribute for hash RX queue type "type".
 * Pass 1 sums sizeof(*flow_attr) and the size of every flow specification
 * found by following the .underlayer chain from hash_rxq_init[type].
 * Pass 2 walks the same chain again and copies each specification at a
 * decreasing offset, so the specification of "type" itself ends up last in
 * the buffer and the deepest underlayer sits right after the header.
 */
224 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
225 size_t flow_attr_size, enum hash_rxq_type type)
/* offset starts past the fixed-size attribute header. */
227 size_t offset = sizeof(*flow_attr);
228 const struct hash_rxq_init *init = &hash_rxq_init[type];
230 assert(priv != NULL);
231 assert((size_t)type < RTE_DIM(hash_rxq_init));
/* Pass 1: accumulate the size of every specification in the chain. */
233 offset += init->flow_spec.hdr.size;
234 init = init->underlayer;
235 } while (init != NULL);
/*
 * Caller buffer too small: per the function documentation nothing is written
 * and the required size is returned (the statement itself is not visible in
 * this extract).
 */
236 if (offset > flow_attr_size)
/* From here on flow_attr_size holds the exact size being written. */
238 flow_attr_size = offset;
239 init = &hash_rxq_init[type];
240 *flow_attr = (struct ibv_exp_flow_attr){
241 .type = IBV_EXP_FLOW_ATTR_NORMAL,
242 /* Priorities < 3 are reserved for flow director. */
243 .priority = init->flow_priority + 3,
/* Pass 2: copy specifications from the end of the buffer towards the header. */
249 offset -= init->flow_spec.hdr.size;
250 memcpy((void *)((uintptr_t)flow_attr + offset),
252 init->flow_spec.hdr.size);
253 ++flow_attr->num_of_specs;
254 init = init->underlayer;
255 } while (init != NULL);
256 return flow_attr_size;
260 * Convert hash type position in indirection table initializer to
261 * hash RX queue type.
264 * Indirection table initializer.
266 * Hash type position.
269 * Hash RX queue type.
271 static enum hash_rxq_type
272 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
274 enum hash_rxq_type type = HASH_RXQ_TCPV4;
276 assert(pos < table->hash_types_n);
278 if ((table->hash_types & (1 << type)) && (pos-- == 0))
286 * Filter out disabled hash RX queue types from ind_table_init[].
289 * Pointer to private structure.
294 * Number of table entries.
297 priv_make_ind_table_init(struct priv *priv,
298 struct ind_table_init (*table)[IND_TABLE_INIT_N])
303 unsigned int table_n = 0;
304 /* Mandatory to receive frames not handled by normal hash RX queues. */
305 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
307 rss_hf = priv->rss_hf;
308 /* Process other protocols only if more than one queue. */
309 if (priv->rxqs_n > 1)
310 for (i = 0; (i != hash_rxq_init_n); ++i)
311 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
312 hash_types_sup |= (1 << i);
314 /* Filter out entries whose protocols are not in the set. */
315 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
319 /* j is increased only if the table has valid protocols. */
321 (*table)[j] = ind_table_init[i];
322 (*table)[j].hash_types &= hash_types_sup;
323 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
324 if (((*table)[j].hash_types >> h) & 0x1)
326 (*table)[i].hash_types_n = nb;
336 * Initialize hash RX queues and indirection table.
339 * Pointer to private structure.
342 * 0 on success, errno value on failure.
/*
 * Create the indirection tables and the hash RX queues (one QP per enabled
 * hash type of each indirection table) and store them in priv.
 * Steps visible below: collect the WQ of every RETA entry, size and allocate
 * the indirection table container, create each indirection table, allocate
 * the hash RX queue container, create one RAW_PACKET QP per hash RX queue,
 * then publish everything in priv. The trailing section releases QPs and
 * indirection tables created so far.
 */
345 priv_create_hash_rxqs(struct priv *priv)
/* Variable-length array: one WQ pointer per redirection table entry. */
347 struct ibv_exp_wq *wqs[priv->reta_idx_n];
/* Local, filtered copy; it shadows the file-scope ind_table_init[]. */
348 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
349 unsigned int ind_tables_n =
350 priv_make_ind_table_init(priv, &ind_table_init);
351 unsigned int hash_rxqs_n = 0;
352 struct hash_rxq (*hash_rxqs)[] = NULL;
353 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
/* This function must only run when no hash RX queue state exists yet. */
359 assert(priv->ind_tables == NULL);
360 assert(priv->ind_tables_n == 0);
361 assert(priv->hash_rxqs == NULL);
362 assert(priv->hash_rxqs_n == 0);
363 assert(priv->pd != NULL);
364 assert(priv->ctx != NULL);
365 if (priv->rxqs_n == 0)
367 assert(priv->rxqs != NULL);
368 if (ind_tables_n == 0) {
369 ERROR("all hash RX queue types have been filtered out,"
370 " indirection table cannot be created");
/* x & (x - 1) is nonzero when the queue count is not a power of two. */
373 if (priv->rxqs_n & (priv->rxqs_n - 1)) {
374 INFO("%u RX queues are configured, consider rounding this"
375 " number to the next power of two for better balancing",
377 DEBUG("indirection table extended to assume %u WQs",
/* Resolve every redirection table entry to the WQ of the RX queue it selects. */
380 for (i = 0; (i != priv->reta_idx_n); ++i) {
381 struct rxq_ctrl *rxq_ctrl;
383 rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
384 struct rxq_ctrl, rxq);
385 wqs[i] = rxq_ctrl->wq;
387 /* Get number of hash RX queues to configure. */
388 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
389 hash_rxqs_n += ind_table_init[i].hash_types_n;
390 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
391 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
392 /* Create indirection tables. */
393 ind_tables = rte_calloc(__func__, ind_tables_n,
394 sizeof((*ind_tables)[0]), 0);
395 if (ind_tables == NULL) {
397 ERROR("cannot allocate indirection tables container: %s",
401 for (i = 0; (i != ind_tables_n); ++i) {
402 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
404 .log_ind_tbl_size = 0, /* Set below. */
408 unsigned int ind_tbl_size = ind_table_init[i].max_size;
409 struct ibv_exp_rwq_ind_table *ind_table;
/* Clamp the table size to the number of redirection table entries. */
411 if (priv->reta_idx_n < ind_tbl_size)
412 ind_tbl_size = priv->reta_idx_n;
413 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
415 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
417 if (ind_table != NULL) {
418 (*ind_tables)[i] = ind_table;
421 /* Not clear whether errno is set. */
422 err = (errno ? errno : EINVAL);
423 ERROR("RX indirection table creation failed with error %d: %s",
427 /* Allocate array that holds hash RX queues and related data. */
428 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
429 sizeof((*hash_rxqs)[0]), 0);
430 if (hash_rxqs == NULL) {
432 ERROR("cannot allocate hash RX queues container: %s",
/*
 * i indexes hash RX queues, j the current indirection table and k the
 * position of the hash type inside that table.
 */
436 for (i = 0, j = 0, k = 0;
437 ((i != hash_rxqs_n) && (j != ind_tables_n));
439 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
440 enum hash_rxq_type type =
441 hash_rxq_type_from_pos(&ind_table_init[j], k);
442 struct rte_eth_rss_conf *priv_rss_conf =
443 (*priv->rss_conf)[type];
/* Use the per-type RSS key when configured, the default key otherwise. */
444 struct ibv_exp_rx_hash_conf hash_conf = {
445 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
446 .rx_hash_key_len = (priv_rss_conf ?
447 priv_rss_conf->rss_key_len :
448 rss_hash_default_key_len),
449 .rx_hash_key = (priv_rss_conf ?
450 priv_rss_conf->rss_key :
451 rss_hash_default_key),
452 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
453 .rwq_ind_tbl = (*ind_tables)[j],
455 struct ibv_exp_qp_init_attr qp_init_attr = {
456 .max_inl_recv = 0, /* Currently not supported. */
457 .qp_type = IBV_QPT_RAW_PACKET,
458 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
459 IBV_EXP_QP_INIT_ATTR_RX_HASH),
461 .rx_hash_conf = &hash_conf,
462 .port_num = priv->port,
465 DEBUG("using indirection table %u for hash RX queue %u type %d",
467 *hash_rxq = (struct hash_rxq){
469 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
472 if (hash_rxq->qp == NULL) {
473 err = (errno ? errno : EINVAL);
474 ERROR("Hash RX QP creation failure: %s",
478 if (++k < ind_table_init[j].hash_types_n)
480 /* Switch to the next indirection table and reset hash RX
481 * queue type array index. */
/* Success: hand ownership of both containers over to priv. */
485 priv->ind_tables = ind_tables;
486 priv->ind_tables_n = ind_tables_n;
487 priv->hash_rxqs = hash_rxqs;
488 priv->hash_rxqs_n = hash_rxqs_n;
/* Cleanup section: destroy whatever QPs and indirection tables exist. */
492 if (hash_rxqs != NULL) {
493 for (i = 0; (i != hash_rxqs_n); ++i) {
494 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
498 claim_zero(ibv_destroy_qp(qp));
502 if (ind_tables != NULL) {
503 for (j = 0; (j != ind_tables_n); ++j) {
504 struct ibv_exp_rwq_ind_table *ind_table =
507 if (ind_table == NULL)
509 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
511 rte_free(ind_tables);
517 * Clean up hash RX queues and indirection table.
520 * Pointer to private structure.
/*
 * Destroy every hash RX queue QP, then every indirection table, free both
 * containers and reset the matching priv fields to NULL/0.
 * Flows attached to the hash RX queues are expected to be gone already; this
 * is only checked through assertions.
 */
523 priv_destroy_hash_rxqs(struct priv *priv)
527 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
/* Nothing was created: both containers must be unset as well. */
528 if (priv->hash_rxqs_n == 0) {
529 assert(priv->hash_rxqs == NULL);
530 assert(priv->ind_tables == NULL);
533 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
534 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
537 assert(hash_rxq->priv == priv);
538 assert(hash_rxq->qp != NULL);
539 /* Also check that there are no remaining flows. */
540 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
542 (k != RTE_DIM(hash_rxq->special_flow[j]));
544 assert(hash_rxq->special_flow[j][k] == NULL);
545 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
546 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
547 assert(hash_rxq->mac_flow[j][k] == NULL);
548 claim_zero(ibv_destroy_qp(hash_rxq->qp));
/* The counter is cleared before the container is freed and unset. */
550 priv->hash_rxqs_n = 0;
551 rte_free(priv->hash_rxqs);
552 priv->hash_rxqs = NULL;
/* Indirection tables are destroyed after the QPs that reference them. */
553 for (i = 0; (i != priv->ind_tables_n); ++i) {
554 struct ibv_exp_rwq_ind_table *ind_table =
555 (*priv->ind_tables)[i];
557 assert(ind_table != NULL);
558 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
560 priv->ind_tables_n = 0;
561 rte_free(priv->ind_tables);
562 priv->ind_tables = NULL;
566 * Check whether a given flow type is allowed.
569 * Pointer to private structure.
571 * Flow type to check.
574 * Nonzero if the given flow type is allowed.
/*
 * Tell whether flows of the given type may be enabled under the current
 * promiscuous/allmulticast requests stored in priv.
 */
577 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
579 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
580 * has been requested. */
581 if (priv->promisc_req)
582 return type == HASH_RXQ_FLOW_TYPE_PROMISC;
/*
 * Past this point promisc_req is zero, so the PROMISC case below always
 * evaluates to 0.
 */
584 case HASH_RXQ_FLOW_TYPE_PROMISC:
585 return !!priv->promisc_req;
586 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
587 return !!priv->allmulti_req;
588 case HASH_RXQ_FLOW_TYPE_BROADCAST:
589 case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
590 /* If allmulti is enabled, broadcast and ipv6multi
591 * are unnecessary. */
592 return !priv->allmulti_req;
/* NOTE(review): the value returned for the MAC case is not visible in this extract. */
593 case HASH_RXQ_FLOW_TYPE_MAC:
596 /* Unsupported flow type is not allowed. */
603 * Automatically enable/disable flows according to configuration.
609 * 0 on success, errno value on failure.
/*
 * Bring special flows and MAC flows in line with the current configuration:
 * each special flow type is enabled or disabled according to
 * priv_allow_flow_type(), then MAC address flows are handled the same way.
 */
612 priv_rehash_flows(struct priv *priv)
614 enum hash_rxq_flow_type i;
/* Iterate over every special flow type, starting from PROMISC. */
616 for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
617 i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
619 if (!priv_allow_flow_type(priv, i)) {
620 priv_special_flow_disable(priv, i);
622 int ret = priv_special_flow_enable(priv, i);
/* MAC flows: the result of enabling them is returned to the caller. */
627 if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
628 return priv_mac_addrs_enable(priv);
629 priv_mac_addrs_disable(priv);
634 * Allocate RX queue elements.
637 * Pointer to RX queue structure.
639 * Number of elements to allocate.
641 * If not NULL, fetch buffers from this array instead of allocating them
642 * with rte_pktmbuf_alloc().
645 * 0 on success, errno value on failure.
/*
 * Attach one mbuf to each of the elts_n receive descriptors of the queue.
 * Buffers come either from the caller-provided pool array (reused buffers are
 * reset and get an extra reference) or from rte_pktmbuf_alloc() on the queue
 * mempool. Each buffer address, length and the memory region lkey are written
 * to the matching WQE data segment in network byte order.
 * On failure every element stored so far is freed and cleared.
 */
648 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
649 struct rte_mbuf *(*pool)[])
/* rxq.sges_n stores a log2 value; this is the segment count per packet. */
651 const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
655 /* Iterate on segments. */
656 for (i = 0; (i != elts_n); ++i) {
657 struct rte_mbuf *buf;
658 volatile struct mlx5_wqe_data_seg *scat =
659 &(*rxq_ctrl->rxq.wqes)[i];
/* Reused buffer: reset it and take an additional reference. */
664 rte_pktmbuf_reset(buf);
665 rte_pktmbuf_refcnt_update(buf, 1);
667 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
669 assert(pool == NULL);
670 ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
674 /* Headroom is reserved by rte_pktmbuf_alloc(). */
675 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
676 /* Buffer is supposed to be empty. */
677 assert(rte_pktmbuf_data_len(buf) == 0);
678 assert(rte_pktmbuf_pkt_len(buf) == 0);
680 /* Only the first segment keeps headroom. */
682 SET_DATA_OFF(buf, 0);
683 PORT(buf) = rxq_ctrl->rxq.port_id;
/* The whole tailroom is made available to the hardware. */
684 DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
685 PKT_LEN(buf) = DATA_LEN(buf);
687 /* scat->addr must be able to store a pointer. */
688 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
689 *scat = (struct mlx5_wqe_data_seg){
690 .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
691 .byte_count = htonl(DATA_LEN(buf)),
692 .lkey = htonl(rxq_ctrl->mr->lkey),
694 (*rxq_ctrl->rxq.elts)[i] = buf;
696 DEBUG("%p: allocated and configured %u segments (max %u packets)",
697 (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
/* Failure path: only reachable when buffers were freshly allocated. */
701 assert(pool == NULL);
703 for (i = 0; (i != elts_n); ++i) {
704 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
705 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
706 (*rxq_ctrl->rxq.elts)[i] = NULL;
708 DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
714 * Free RX queue elements.
717 * Pointer to RX queue structure.
720 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
724 DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
725 if (rxq_ctrl->rxq.elts == NULL)
728 for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
729 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
730 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
731 (*rxq_ctrl->rxq.elts)[i] = NULL;
736 * Clean up a RX queue.
738 * Destroy objects, free allocated memory and reset the structure for reuse.
741 * Pointer to RX queue structure.
744 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
746 DEBUG("cleaning up %p", (void *)rxq_ctrl);
747 rxq_free_elts(rxq_ctrl);
748 if (rxq_ctrl->fdir_queue != NULL)
749 priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
750 if (rxq_ctrl->wq != NULL)
751 claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
752 if (rxq_ctrl->cq != NULL)
753 claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
754 if (rxq_ctrl->channel != NULL)
755 claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
756 if (rxq_ctrl->mr != NULL)
757 claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
758 memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
762 * Reconfigure RX queue buffers.
764 * rxq_rehash() does not allocate mbufs, because allocating them from the
765 * wrong thread (such as a control thread) may corrupt the pool.
766 * In case of failure, the queue is left untouched.
769 * Pointer to Ethernet device structure.
774 * 0 on success, errno value on failure.
/*
 * Reconfigure the buffers of an existing RX queue without allocating mbufs:
 * the WQ is reset, the queue's own mbufs are re-registered through
 * rxq_alloc_elts(), the WQ is moved back to the ready state and the doorbell
 * record is updated.
 */
777 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
/* rxq.elts_n is a log2 value. */
779 unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
781 struct ibv_exp_wq_attr mod;
784 DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
785 (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
786 assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
787 /* From now on, any failure will render the queue unusable.
788 * Reinitialize WQ. */
789 mod = (struct ibv_exp_wq_attr){
790 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
791 .wq_state = IBV_EXP_WQS_RESET,
793 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
795 ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
799 /* Snatch mbufs from original queue. */
/*
 * The queue's own elts array is passed as the pool, so every mbuf is reused
 * in place and gains one reference in rxq_alloc_elts(); the loop below drops
 * that extra reference again.
 */
800 claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
801 for (i = 0; i != elts_n; ++i) {
802 struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
804 assert(rte_mbuf_refcnt_read(buf) == 2);
805 rte_pktmbuf_free_seg(buf);
807 /* Change queue state to ready. */
808 mod = (struct ibv_exp_wq_attr){
809 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
810 .wq_state = IBV_EXP_WQS_RDY,
812 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
814 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
815 (void *)dev, strerror(err));
818 /* Update doorbell counter. */
/* One WR covers 2^sges_n elements, hence the shift. */
819 rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
821 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
828 * Initialize RX queue.
831 * Pointer to RX queue control template.
834 * 0 on success, errno value on failure.
837 rxq_setup(struct rxq_ctrl *tmpl)
839 struct ibv_cq *ibcq = tmpl->cq;
840 struct mlx5_cq *cq = to_mxxx(cq, cq);
841 struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
842 struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
843 rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
845 if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
846 ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
847 "it should be set to %u", RTE_CACHE_LINE_SIZE);
852 tmpl->rxq.rq_db = rwq->rq.db;
853 tmpl->rxq.cqe_n = log2above(ibcq->cqe);
856 tmpl->rxq.cq_db = cq->dbrec;
858 (volatile struct mlx5_wqe_data_seg (*)[])
859 (uintptr_t)rwq->rq.buff;
861 (volatile struct mlx5_cqe (*)[])
862 (uintptr_t)cq->active_buf->buf;
863 tmpl->rxq.elts = elts;
868 * Configure a RX queue.
871 * Pointer to Ethernet device structure.
873 * Pointer to RX queue structure.
875 * Number of descriptors to configure in queue.
877 * NUMA socket on which memory must be allocated.
879 * Thresholds parameters.
881 * Memory pool for buffer allocations.
884 * 0 on success, errno value on failure.
/*
 * Configure a RX queue: everything is first built in a local template
 * (tmpl): scatter settings, checksum/VLAN/CRC/padding options, memory
 * region, optional completion channel, CQ and WQ. The WQ is then moved to
 * the ready state, the data-path fields are initialized with rxq_setup() and
 * buffers are attached with rxq_alloc_elts(). The previous content of
 * rxq_ctrl is cleaned up only once the template is fully set up.
 */
887 rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
888 uint16_t desc, unsigned int socket,
889 const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
891 struct priv *priv = dev->data->dev_private;
892 struct rxq_ctrl tmpl = {
/* The element count is stored as a log2 value. */
896 .elts_n = log2above(desc),
898 .rss_hash = priv->rxqs_n > 1,
901 struct ibv_exp_wq_attr mod;
/*
 * NOTE(review): the three members below are accessed later as attr.cq and
 * attr.wq; the enclosing declaration is not visible in this extract.
 */
903 struct ibv_exp_cq_init_attr cq;
904 struct ibv_exp_wq_init_attr wq;
905 struct ibv_exp_cq_attr cq_attr;
907 unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
908 unsigned int cqe_n = desc - 1;
909 struct rte_mbuf *(*elts)[desc] = NULL;
912 (void)conf; /* Thresholds configuration (ignored). */
913 /* Enable scattered packets support for this queue if necessary. */
914 assert(mb_len >= RTE_PKTMBUF_HEADROOM);
/* Case 1: a full packet fits in a single mbuf once headroom is deducted. */
915 if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
916 (mb_len - RTE_PKTMBUF_HEADROOM)) {
/* Case 2: it does not fit and scatter has been requested. */
918 } else if (dev->data->dev_conf.rxmode.enable_scatter) {
920 RTE_PKTMBUF_HEADROOM +
921 dev->data->dev_conf.rxmode.max_rx_pkt_len;
925 * Determine the number of SGEs needed for a full packet
926 * and round it to the next power of two.
928 sges_n = log2above((size / mb_len) + !!(size % mb_len));
929 tmpl.rxq.sges_n = sges_n;
930 /* Make sure rxq.sges_n did not overflow. */
931 size = mb_len * (1 << tmpl.rxq.sges_n);
932 size -= RTE_PKTMBUF_HEADROOM;
933 if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
934 ERROR("%p: too many SGEs (%u) needed to handle"
935 " requested maximum packet size %u",
938 dev->data->dev_conf.rxmode.max_rx_pkt_len);
/* Case 3: it does not fit and scatter was not requested; only warn. */
942 WARN("%p: the requested maximum Rx packet size (%u) is"
943 " larger than a single mbuf (%u) and scattered"
944 " mode has not been requested",
946 dev->data->dev_conf.rxmode.max_rx_pkt_len,
947 mb_len - RTE_PKTMBUF_HEADROOM);
949 DEBUG("%p: maximum number of segments per packet: %u",
950 (void *)dev, 1 << tmpl.rxq.sges_n);
/* Each packet consumes 2^sges_n descriptors, so desc must be a multiple. */
951 if (desc % (1 << tmpl.rxq.sges_n)) {
952 ERROR("%p: number of RX queue descriptors (%u) is not a"
953 " multiple of SGEs per packet (%u)",
956 1 << tmpl.rxq.sges_n);
959 /* Toggle RX checksum offload if hardware supports it. */
961 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
962 if (priv->hw_csum_l2tun)
963 tmpl.rxq.csum_l2tun =
964 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
965 /* Use the entire RX mempool as the memory region. */
966 tmpl.mr = mlx5_mp2mr(priv->pd, mp);
967 if (tmpl.mr == NULL) {
969 ERROR("%p: MR creation failure: %s",
970 (void *)dev, strerror(ret));
/* A completion channel is only created when RX interrupts are requested. */
973 if (dev->data->dev_conf.intr_conf.rxq) {
974 tmpl.channel = ibv_create_comp_channel(priv->ctx);
975 if (tmpl.channel == NULL) {
976 dev->data->dev_conf.intr_conf.rxq = 0;
978 ERROR("%p: Comp Channel creation failure: %s",
979 (void *)dev, strerror(ret));
983 attr.cq = (struct ibv_exp_cq_init_attr){
986 if (priv->cqe_comp) {
987 attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
988 attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
989 cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
991 tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
993 if (tmpl.cq == NULL) {
995 ERROR("%p: CQ creation failure: %s",
996 (void *)dev, strerror(ret));
999 DEBUG("priv->device_attr.max_qp_wr is %d",
1000 priv->device_attr.max_qp_wr);
1001 DEBUG("priv->device_attr.max_sge is %d",
1002 priv->device_attr.max_sge);
1003 /* Configure VLAN stripping. */
1004 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1005 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1006 attr.wq = (struct ibv_exp_wq_init_attr){
1007 .wq_context = NULL, /* Could be useful in the future. */
1008 .wq_type = IBV_EXP_WQT_RQ,
1009 /* Max number of outstanding WRs. */
1010 .max_recv_wr = desc >> tmpl.rxq.sges_n,
1011 /* Max number of scatter/gather elements in a WR. */
1012 .max_recv_sge = 1 << tmpl.rxq.sges_n,
1016 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1018 .vlan_offloads = (tmpl.rxq.vlan_strip ?
1019 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1022 /* By default, FCS (CRC) is stripped by hardware. */
1023 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1024 tmpl.rxq.crc_present = 0;
1025 } else if (priv->hw_fcs_strip) {
1026 /* Ask HW/Verbs to leave CRC in place when supported. */
1027 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1028 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1029 tmpl.rxq.crc_present = 1;
1031 WARN("%p: CRC stripping has been disabled but will still"
1032 " be performed by hardware, make sure MLNX_OFED and"
1033 " firmware are up to date",
1035 tmpl.rxq.crc_present = 0;
/* crc_present << 2 yields 4 bytes when the CRC is kept, 0 otherwise. */
1037 DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1038 " incoming frames to hide it",
1040 tmpl.rxq.crc_present ? "disabled" : "enabled",
1041 tmpl.rxq.crc_present << 2);
/* Padding is opt-in through the MLX5_PMD_ENABLE_PADDING environment variable. */
1042 if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1043 ; /* Nothing else to do. */
1044 else if (priv->hw_padding) {
1045 INFO("%p: enabling packet padding on queue %p",
1046 (void *)dev, (void *)rxq_ctrl);
1047 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1048 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1050 WARN("%p: packet padding has been requested but is not"
1051 " supported, make sure MLNX_OFED and firmware are"
1055 tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1056 if (tmpl.wq == NULL) {
1057 ret = (errno ? errno : EINVAL);
1058 ERROR("%p: WQ creation failure: %s",
1059 (void *)dev, strerror(ret));
1063 * Make sure number of WRs*SGEs match expectations since a queue
1064 * cannot allocate more than "desc" buffers.
1066 if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1067 ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1068 ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1070 (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1071 attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1076 tmpl.rxq.port_id = dev->data->port_id;
1077 DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1078 /* Change queue state to ready. */
1079 mod = (struct ibv_exp_wq_attr){
1080 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1081 .wq_state = IBV_EXP_WQS_RDY,
1083 ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1085 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1086 (void *)dev, strerror(ret));
1089 ret = rxq_setup(&tmpl);
1091 ERROR("%p: cannot initialize RX queue structure: %s",
1092 (void *)dev, strerror(ret));
1095 /* Reuse buffers from original queue if possible. */
1096 if (rxq_ctrl->rxq.elts_n) {
1097 assert(1 << rxq_ctrl->rxq.elts_n == desc);
1098 assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
1099 ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
1101 ret = rxq_alloc_elts(&tmpl, desc, NULL);
1103 ERROR("%p: RXQ allocation failed: %s",
1104 (void *)dev, strerror(ret));
1107 /* Clean up rxq in case we're reinitializing it. */
1108 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1109 rxq_cleanup(rxq_ctrl);
1110 /* Move mbuf pointers to dedicated storage area in RX queue. */
/*
 * The storage area starts right after the rxq_ctrl structure; it is part of
 * the same allocation made in mlx5_rx_queue_setup().
 */
1111 elts = (void *)(rxq_ctrl + 1);
1112 rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
/* NOTE(review): the 0x55 fill looks like debug poisoning of the old array; its guard is not visible here. */
1114 memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1116 rte_free(tmpl.rxq.elts);
1117 tmpl.rxq.elts = elts;
/*
 * NOTE(review): the statement copying tmpl into *rxq_ctrl is not visible in
 * this extract; the lines below read rxq_ctrl->rxq as already configured.
 */
1119 /* Update doorbell counter. */
1120 rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1122 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
1123 DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1127 elts = tmpl.rxq.elts;
1135 * DPDK callback to configure a RX queue.
1138 * Pointer to Ethernet device structure.
1142 * Number of descriptors to configure in queue.
1144 * NUMA socket on which memory must be allocated.
1146 * Thresholds parameters.
1148 * Memory pool for buffer allocations.
1151 * 0 on success, negative errno value on failure.
/*
 * DPDK callback configuring RX queue "idx" with "desc" descriptors.
 * The descriptor count is rounded up to a power of two, the queue control
 * structure is either reused (after cleanup, resized if needed) or freshly
 * allocated with room for "desc" mbuf pointers after it, then
 * rxq_ctrl_setup() does the actual configuration. On success the queue is
 * stored in priv->rxqs and the receive callback is refreshed.
 * Not available to secondary processes.
 */
1154 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1155 unsigned int socket, const struct rte_eth_rxconf *conf,
1156 struct rte_mempool *mp)
1158 struct priv *priv = dev->data->dev_private;
/*
 * NOTE(review): (*priv->rxqs)[idx] is read here, before the
 * "idx >= priv->rxqs_n" range check performed further down.
 */
1159 struct rxq *rxq = (*priv->rxqs)[idx];
1160 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1163 if (mlx5_is_secondary())
1164 return -E_RTE_SECONDARY;
1167 if (!rte_is_power_of_2(desc)) {
1168 desc = 1 << log2above(desc);
1169 WARN("%p: increased number of descriptors in RX queue %u"
1170 " to the next power of two (%d)",
1171 (void *)dev, idx, desc);
1173 DEBUG("%p: configuring queue %u for %u descriptors",
1174 (void *)dev, idx, desc);
1175 if (idx >= priv->rxqs_n) {
1176 ERROR("%p: queue index out of range (%u >= %u)",
1177 (void *)dev, idx, priv->rxqs_n);
1182 DEBUG("%p: reusing already allocated queue index %u (%p)",
1183 (void *)dev, idx, (void *)rxq);
/* A queue cannot be reconfigured while the port is started. */
1184 if (priv->started) {
1188 (*priv->rxqs)[idx] = NULL;
1189 rxq_cleanup(rxq_ctrl);
1190 /* Resize if rxq size is changed. */
/*
 * NOTE(review): rxq_cleanup() zeroes the whole structure, so rxq.elts_n reads
 * as 0 in the comparison below.
 * NOTE(review): the rte_realloc() result overwrites rxq_ctrl; if it returns
 * NULL the previous allocation is no longer referenced anywhere (the slot in
 * priv->rxqs was cleared above).
 */
1191 if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
1192 rxq_ctrl = rte_realloc(rxq_ctrl,
1194 desc * sizeof(struct rte_mbuf *),
1195 RTE_CACHE_LINE_SIZE);
1197 ERROR("%p: unable to reallocate queue index %u",
/* New queue: control structure followed by storage for desc mbuf pointers. */
1204 rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1205 desc * sizeof(struct rte_mbuf *),
1207 if (rxq_ctrl == NULL) {
1208 ERROR("%p: unable to allocate queue index %u",
1214 ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1218 rxq_ctrl->rxq.stats.idx = idx;
1219 DEBUG("%p: adding RX queue %p to list",
1220 (void *)dev, (void *)rxq_ctrl);
1221 (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1222 /* Update receive callback. */
1223 priv_select_rx_function(priv);
1230 * DPDK callback to release a RX queue.
1233 * Generic RX queue pointer.
/*
 * DPDK callback releasing a RX queue: the queue is removed from priv->rxqs
 * (searched by pointer) and its resources are destroyed with rxq_cleanup().
 * The secondary-process check comes first.
 */
1236 mlx5_rx_queue_release(void *dpdk_rxq)
1238 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1239 struct rxq_ctrl *rxq_ctrl;
1243 if (mlx5_is_secondary())
/* The control structure embeds the rxq handed to DPDK. */
1248 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1249 priv = rxq_ctrl->priv;
/* Unlink the queue from the port before destroying it. */
1251 for (i = 0; (i != priv->rxqs_n); ++i)
1252 if ((*priv->rxqs)[i] == rxq) {
1253 DEBUG("%p: removing RX queue %p from list",
1254 (void *)priv->dev, (void *)rxq_ctrl);
1255 (*priv->rxqs)[i] = NULL;
1258 rxq_cleanup(rxq_ctrl);
1264 * DPDK callback for RX in secondary processes.
1266 * This function configures all queues from primary process information
1267 * if necessary before reverting to the normal RX burst callback.
1270 * Generic pointer to RX queue structure.
1272 * Array to store received packets.
1274 * Maximum number of packets in array.
1277 * Number of packets successfully received (<= pkts_n).
/*
 * RX burst callback used by secondary processes: it obtains the secondary
 * private structure through mlx5_secondary_data_setup(), finds the index of
 * the given queue, then forwards the call to the device's regular
 * rx_pkt_burst callback using the queue stored at that index in priv.
 */
1280 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1283 struct rxq *rxq = dpdk_rxq;
1284 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1285 struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
1286 struct priv *primary_priv;
1292 mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1293 /* Look for queue index in both private structures. */
1294 for (index = 0; index != priv->rxqs_n; ++index)
1295 if (((*primary_priv->rxqs)[index] == rxq) ||
1296 ((*priv->rxqs)[index] == rxq))
/* index == rxqs_n means the queue was found in neither structure. */
1298 if (index == priv->rxqs_n)
/* Always continue with the queue as known by this process. */
1300 rxq = (*priv->rxqs)[index];
1301 return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
1305 * Fill epoll fd list for rxq interrupts.
1308 * Private structure.
1311 * 0 on success, negative on failure.
/*
 * Register the completion channel file descriptor of each RX queue in the
 * device interrupt handle (efds[] and nb_efd), after switching each
 * descriptor to non-blocking mode.
 */
1314 priv_intr_efd_enable(struct priv *priv)
1317 unsigned int rxqs_n = priv->rxqs_n;
/* Number of queues that can get an interrupt vector. */
1318 unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1319 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
/* NOTE(review): the message below contains the typo "supprt"; it is a runtime string and is left untouched here. */
1324 WARN("rxqs num is larger than EAL max interrupt vector "
1325 "%u > %u unable to supprt rxq interrupts",
1326 rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1329 intr_handle->type = RTE_INTR_HANDLE_EXT;
1330 for (i = 0; i != n; ++i) {
1331 struct rxq *rxq = (*priv->rxqs)[i];
1332 struct rxq_ctrl *rxq_ctrl =
1333 container_of(rxq, struct rxq_ctrl, rxq);
/*
 * NOTE(review): rxq_ctrl->channel is dereferenced without a NULL check; in
 * rxq_ctrl_setup() the channel is only created when intr_conf.rxq is set.
 */
1334 int fd = rxq_ctrl->channel->fd;
/* NOTE(review): the F_GETFL result is used without checking for -1. */
1338 flags = fcntl(fd, F_GETFL);
1339 rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1341 WARN("failed to change rxq interrupt file "
1342 "descriptor %d for queue index %d", fd, i);
1345 intr_handle->efds[i] = fd;
1347 intr_handle->nb_efd = n;
1352 * Clean epoll fd list for rxq interrupts.
1355 * Private structure.
1358 priv_intr_efd_disable(struct priv *priv)
1360 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1362 rte_intr_free_epoll_fd(intr_handle);
1366 * Create and init interrupt vector array.
1369 * Private structure.
1372 * 0 on success, negative on failure.
/*
 * Allocate the interrupt vector array of the device interrupt handle, one
 * int per RX queue, and fill it with a 1:1 queue-to-vector mapping starting
 * at RTE_INTR_VEC_RXTX_OFFSET.
 */
1375 priv_create_intr_vec(struct priv *priv)
1377 unsigned int rxqs_n = priv->rxqs_n;
1379 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
/* The array is released by priv_destroy_intr_vec() through rte_free(). */
1383 intr_handle->intr_vec = (int *)
1384 rte_malloc("intr_vec", rxqs_n * sizeof(int), 0);
1385 if (intr_handle->intr_vec == NULL) {
1386 WARN("Failed to allocate memory for intr_vec "
1387 "rxq interrupt will not be supported");
1390 for (i = 0; i != rxqs_n; ++i) {
1391 /* 1:1 mapping between rxq and interrupt. */
1392 intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
1398 * Destroy interrupt vector array.
1401 * Private structure.
1404 * 0 on success, negative on failure.
1407 priv_destroy_intr_vec(struct priv *priv)
1409 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1411 rte_free(intr_handle->intr_vec);