/*-
 *   BSD LICENSE
 *
 *   Copyright 2015 6WIND S.A.
 *   Copyright 2015 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#include <infiniband/arch.h>
#include <infiniband/mlx5_hw.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_debug.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
68 #include "mlx5_rxtx.h"
69 #include "mlx5_utils.h"
70 #include "mlx5_autoconf.h"
71 #include "mlx5_defs.h"
/* Initialization data for hash RX queues. */
const struct hash_rxq_init hash_rxq_init[] = {
        [HASH_RXQ_TCPV4] = {
                .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
                                IBV_EXP_RX_HASH_DST_IPV4 |
                                IBV_EXP_RX_HASH_SRC_PORT_TCP |
                                IBV_EXP_RX_HASH_DST_PORT_TCP),
                .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
                .flow_priority = 0,
                .flow_spec.tcp_udp = {
                        .type = IBV_EXP_FLOW_SPEC_TCP,
                        .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
                },
                .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
        },
        [HASH_RXQ_UDPV4] = {
                .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
                                IBV_EXP_RX_HASH_DST_IPV4 |
                                IBV_EXP_RX_HASH_SRC_PORT_UDP |
                                IBV_EXP_RX_HASH_DST_PORT_UDP),
                .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
                .flow_priority = 0,
                .flow_spec.tcp_udp = {
                        .type = IBV_EXP_FLOW_SPEC_UDP,
                        .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
                },
                .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
        },
        [HASH_RXQ_IPV4] = {
                .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
                                IBV_EXP_RX_HASH_DST_IPV4),
                .dpdk_rss_hf = (ETH_RSS_IPV4 |
                                ETH_RSS_FRAG_IPV4),
                .flow_priority = 1,
                .flow_spec.ipv4 = {
                        .type = IBV_EXP_FLOW_SPEC_IPV4,
                        .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
                },
                .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
        },
        [HASH_RXQ_TCPV6] = {
                .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
                                IBV_EXP_RX_HASH_DST_IPV6 |
                                IBV_EXP_RX_HASH_SRC_PORT_TCP |
                                IBV_EXP_RX_HASH_DST_PORT_TCP),
                .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
                .flow_priority = 0,
                .flow_spec.tcp_udp = {
                        .type = IBV_EXP_FLOW_SPEC_TCP,
                        .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
                },
                .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
        },
        [HASH_RXQ_UDPV6] = {
                .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
                                IBV_EXP_RX_HASH_DST_IPV6 |
                                IBV_EXP_RX_HASH_SRC_PORT_UDP |
                                IBV_EXP_RX_HASH_DST_PORT_UDP),
                .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
                .flow_priority = 0,
                .flow_spec.tcp_udp = {
                        .type = IBV_EXP_FLOW_SPEC_UDP,
                        .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
                },
                .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
        },
        [HASH_RXQ_IPV6] = {
                .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
                                IBV_EXP_RX_HASH_DST_IPV6),
                .dpdk_rss_hf = (ETH_RSS_IPV6 |
                                ETH_RSS_FRAG_IPV6),
                .flow_priority = 1,
                .flow_spec.ipv6 = {
                        .type = IBV_EXP_FLOW_SPEC_IPV6,
                        .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
                },
                .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
        },
        [HASH_RXQ_ETH] = {
                .hash_fields = 0,
                .dpdk_rss_hf = 0,
                .flow_priority = 2,
                .flow_spec.eth = {
                        .type = IBV_EXP_FLOW_SPEC_ETH,
                        .size = sizeof(hash_rxq_init[0].flow_spec.eth),
                },
                .underlayer = NULL,
        },
};
/* Number of entries in hash_rxq_init[]. */
const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
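
/*
 * Illustrative note (not part of the upstream code): each hash_rxq_init[]
 * entry chains to a less specific entry through .underlayer, e.g. for
 * HASH_RXQ_TCPV4 the chain is TCPv4 -> IPv4 -> Ethernet. priv_flow_attr()
 * below relies on this chain to size and emit one flow specification per
 * protocol layer:
 *
 *      const struct hash_rxq_init *init = &hash_rxq_init[HASH_RXQ_TCPV4];
 *
 *      assert(init->underlayer == &hash_rxq_init[HASH_RXQ_IPV4]);
 *      assert(init->underlayer->underlayer == &hash_rxq_init[HASH_RXQ_ETH]);
 *      assert(init->underlayer->underlayer->underlayer == NULL);
 */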
/* Initialization data for hash RX queue indirection tables. */
static const struct ind_table_init ind_table_init[] = {
        {
                .max_size = -1u, /* Superseded by HW limitations. */
                .hash_types =
                        1 << HASH_RXQ_TCPV4 |
                        1 << HASH_RXQ_UDPV4 |
                        1 << HASH_RXQ_IPV4 |
                        1 << HASH_RXQ_TCPV6 |
                        1 << HASH_RXQ_UDPV6 |
                        1 << HASH_RXQ_IPV6 |
                        0,
                .hash_types_n = 6,
        },
        {
                .max_size = 1,
                .hash_types = 1 << HASH_RXQ_ETH,
                .hash_types_n = 1,
        },
};

#define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
/* Default RSS hash key also used for ConnectX-3. */
uint8_t rss_hash_default_key[] = {
        0x2c, 0xc6, 0x81, 0xd1,
        0x5b, 0xdb, 0xf4, 0xf7,
        0xfc, 0xa2, 0x83, 0x19,
        0xdb, 0x1a, 0x3e, 0x94,
        0x6b, 0x9e, 0x38, 0xd9,
        0x2c, 0x9c, 0x03, 0xd1,
        0xad, 0x99, 0x44, 0xa7,
        0xd9, 0x56, 0x3d, 0x59,
        0x06, 0x3c, 0x25, 0xf3,
        0xfc, 0x1f, 0xdc, 0x2a,
};

/* Length of the default RSS hash key. */
const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
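
/*
 * Application-side sketch (illustrative, not part of this file): the 40-byte
 * Toeplitz key above is only a fallback; an application may provide its own
 * key through the standard DPDK RSS configuration:
 *
 *      static uint8_t app_rss_key[40] = { 0x01, 0x02, }; // hypothetical key
 *      struct rte_eth_rss_conf rss_conf = {
 *              .rss_key = app_rss_key,
 *              .rss_key_len = sizeof(app_rss_key),
 *              .rss_hf = ETH_RSS_IP | ETH_RSS_TCP,
 *      };
 *
 *      rte_eth_dev_rss_hash_update(port_id, &rss_conf);
 */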
/**
 * Populate flow steering rule for a given hash RX queue type using
 * information from hash_rxq_init[]. Nothing is written to flow_attr when
 * flow_attr_size is not large enough, but the required size is still
 * returned.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] flow_attr
 *   Pointer to flow attribute structure to fill. Note that the allocated
 *   area must be large enough to hold all flow specifications.
 * @param flow_attr_size
 *   Entire size of flow_attr and trailing room for flow specifications.
 * @param type
 *   Hash RX queue type to use for flow steering rule.
 *
 * @return
 *   Total size of the flow attribute buffer. No errors are defined.
 */
size_t
priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
               size_t flow_attr_size, enum hash_rxq_type type)
{
        size_t offset = sizeof(*flow_attr);
        const struct hash_rxq_init *init = &hash_rxq_init[type];

        assert(priv != NULL);
        assert((size_t)type < RTE_DIM(hash_rxq_init));
        do {
                offset += init->flow_spec.hdr.size;
                init = init->underlayer;
        } while (init != NULL);
        if (offset > flow_attr_size)
                return offset;
        flow_attr_size = offset;
        init = &hash_rxq_init[type];
        *flow_attr = (struct ibv_exp_flow_attr){
                .type = IBV_EXP_FLOW_ATTR_NORMAL,
                /* Priorities < 3 are reserved for flow director. */
                .priority = init->flow_priority + 3,
                .num_of_specs = 0,
                .port = priv->port,
                .flags = 0,
        };
        do {
                offset -= init->flow_spec.hdr.size;
                memcpy((void *)((uintptr_t)flow_attr + offset),
                       &init->flow_spec,
                       init->flow_spec.hdr.size);
                ++flow_attr->num_of_specs;
                init = init->underlayer;
        } while (init != NULL);
        return flow_attr_size;
}
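
/*
 * Usage sketch (illustrative): since nothing is written when the buffer is
 * too small, callers can query the required size with a zero-sized buffer,
 * then allocate and call again:
 *
 *      size_t size = priv_flow_attr(priv, NULL, 0, HASH_RXQ_TCPV4);
 *      struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *      if ((attr != NULL) &&
 *          (priv_flow_attr(priv, attr, size, HASH_RXQ_TCPV4) == size)) {
 *              // attr now carries the Ethernet, IPv4 and TCP specs.
 *      }
 */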
/**
 * Convert hash type position in indirection table initializer to
 * hash RX queue type.
 *
 * @param table
 *   Indirection table initializer.
 * @param pos
 *   Hash type position.
 *
 * @return
 *   Hash RX queue type.
 */
static enum hash_rxq_type
hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
{
        enum hash_rxq_type type = HASH_RXQ_TCPV4;

        assert(pos < table->hash_types_n);
        do {
                if ((table->hash_types & (1 << type)) && (pos-- == 0))
                        break;
                ++type;
        } while (1);
        return type;
}
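
/*
 * Worked example (illustrative): for a table whose hash_types is
 * (1 << HASH_RXQ_TCPV4) | (1 << HASH_RXQ_IPV4), position 0 maps to
 * HASH_RXQ_TCPV4 while position 1 skips the cleared HASH_RXQ_UDPV4 bit
 * and maps to HASH_RXQ_IPV4.
 */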
/**
 * Filter out disabled hash RX queue types from ind_table_init[].
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] table
 *   Output table.
 *
 * @return
 *   Number of table entries.
 */
static unsigned int
priv_make_ind_table_init(struct priv *priv,
                         struct ind_table_init (*table)[IND_TABLE_INIT_N])
{
        uint64_t rss_hf;
        unsigned int i;
        unsigned int j;
        unsigned int table_n = 0;
        /* Mandatory to receive frames not handled by normal hash RX queues. */
        unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;

        rss_hf = priv->rss_hf;
        /* Process other protocols only if more than one queue. */
        if (priv->rxqs_n > 1)
                for (i = 0; (i != hash_rxq_init_n); ++i)
                        if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
                                hash_types_sup |= (1 << i);

        /* Filter out entries whose protocols are not in the set. */
        for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
                unsigned int nb;
                unsigned int h;

                /* j is increased only if the table has valid protocols. */
                assert(j <= i);
                (*table)[j] = ind_table_init[i];
                (*table)[j].hash_types &= hash_types_sup;
                for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
                        if (((*table)[j].hash_types >> h) & 0x1)
                                ++nb;
                /* Entry j (not i) is the one being populated. */
                (*table)[j].hash_types_n = nb;
                if (nb) {
                        ++table_n;
                        ++j;
                }
        }
        return table_n;
}
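
/*
 * Worked example (illustrative): with several RX queues and
 * priv->rss_hf == ETH_RSS_NONFRAG_IPV4_TCP, hash_types_sup contains only
 * HASH_RXQ_TCPV4 plus the mandatory HASH_RXQ_ETH, so both initializer
 * entries survive with a single hash type each:
 *
 *      struct ind_table_init table[IND_TABLE_INIT_N];
 *
 *      assert(priv_make_ind_table_init(priv, &table) == 2);
 *      assert(table[0].hash_types_n == 1);     // HASH_RXQ_TCPV4
 *      assert(table[1].hash_types_n == 1);     // HASH_RXQ_ETH
 */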
/**
 * Initialize hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_create_hash_rxqs(struct priv *priv)
{
        struct ibv_exp_wq *wqs[priv->reta_idx_n];
        struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
        unsigned int ind_tables_n =
                priv_make_ind_table_init(priv, &ind_table_init);
        unsigned int hash_rxqs_n = 0;
        struct hash_rxq (*hash_rxqs)[] = NULL;
        struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
        unsigned int i;
        unsigned int j;
        unsigned int k;
        int err = 0;

        assert(priv->ind_tables == NULL);
        assert(priv->ind_tables_n == 0);
        assert(priv->hash_rxqs == NULL);
        assert(priv->hash_rxqs_n == 0);
        assert(priv->pd != NULL);
        assert(priv->ctx != NULL);
        if (priv->rxqs_n == 0)
                return EINVAL;
        assert(priv->rxqs != NULL);
        if (ind_tables_n == 0) {
                ERROR("all hash RX queue types have been filtered out,"
                      " indirection table cannot be created");
                return EINVAL;
        }
        if (priv->rxqs_n & (priv->rxqs_n - 1)) {
                INFO("%u RX queues are configured, consider rounding this"
                     " number to the next power of two for better balancing",
                     priv->rxqs_n);
                DEBUG("indirection table extended to assume %u WQs",
                      priv->reta_idx_n);
        }
        for (i = 0; (i != priv->reta_idx_n); ++i) {
                struct rxq_ctrl *rxq_ctrl;

                rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
                                        struct rxq_ctrl, rxq);
                wqs[i] = rxq_ctrl->wq;
        }
        /* Get number of hash RX queues to configure. */
        for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
                hash_rxqs_n += ind_table_init[i].hash_types_n;
        DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
              hash_rxqs_n, priv->rxqs_n, ind_tables_n);
        /* Create indirection tables. */
        ind_tables = rte_calloc(__func__, ind_tables_n,
                                sizeof((*ind_tables)[0]), 0);
        if (ind_tables == NULL) {
                err = ENOMEM;
                ERROR("cannot allocate indirection tables container: %s",
                      strerror(err));
                goto error;
        }
        for (i = 0; (i != ind_tables_n); ++i) {
                struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
                        .pd = priv->pd,
                        .log_ind_tbl_size = 0, /* Set below. */
                        .ind_tbl = wqs,
                        .comp_mask = 0,
                };
                unsigned int ind_tbl_size = ind_table_init[i].max_size;
                struct ibv_exp_rwq_ind_table *ind_table;

                if (priv->reta_idx_n < ind_tbl_size)
                        ind_tbl_size = priv->reta_idx_n;
                ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
                errno = 0;
                ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
                                                         &ind_init_attr);
                if (ind_table != NULL) {
                        (*ind_tables)[i] = ind_table;
                        continue;
                }
                /* Not clear whether errno is set. */
                err = (errno ? errno : EINVAL);
                ERROR("RX indirection table creation failed with error %d: %s",
                      err, strerror(err));
                goto error;
        }
        /* Allocate array that holds hash RX queues and related data. */
        hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
                               sizeof((*hash_rxqs)[0]), 0);
        if (hash_rxqs == NULL) {
                err = ENOMEM;
                ERROR("cannot allocate hash RX queues container: %s",
                      strerror(err));
                goto error;
        }
        for (i = 0, j = 0, k = 0;
             ((i != hash_rxqs_n) && (j != ind_tables_n));
             ++i) {
                struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
                enum hash_rxq_type type =
                        hash_rxq_type_from_pos(&ind_table_init[j], k);
                struct rte_eth_rss_conf *priv_rss_conf =
                        (*priv->rss_conf)[type];
                struct ibv_exp_rx_hash_conf hash_conf = {
                        .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
                        .rx_hash_key_len = (priv_rss_conf ?
                                            priv_rss_conf->rss_key_len :
                                            rss_hash_default_key_len),
                        .rx_hash_key = (priv_rss_conf ?
                                        priv_rss_conf->rss_key :
                                        rss_hash_default_key),
                        .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
                        .rwq_ind_tbl = (*ind_tables)[j],
                };
                struct ibv_exp_qp_init_attr qp_init_attr = {
                        .max_inl_recv = 0, /* Currently not supported. */
                        .qp_type = IBV_QPT_RAW_PACKET,
                        .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
                                      IBV_EXP_QP_INIT_ATTR_RX_HASH),
                        .pd = priv->pd,
                        .rx_hash_conf = &hash_conf,
                        .port_num = priv->port,
                };

                DEBUG("using indirection table %u for hash RX queue %u type %d",
                      j, i, type);
                *hash_rxq = (struct hash_rxq){
                        .priv = priv,
                        .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
                        .type = type,
                };
                if (hash_rxq->qp == NULL) {
                        err = (errno ? errno : EINVAL);
                        ERROR("Hash RX QP creation failure: %s",
                              strerror(err));
                        goto error;
                }
                if (++k < ind_table_init[j].hash_types_n)
                        continue;
                /* Switch to the next indirection table and reset hash RX
                 * queue type array index. */
                ++j;
                k = 0;
        }
        priv->ind_tables = ind_tables;
        priv->ind_tables_n = ind_tables_n;
        priv->hash_rxqs = hash_rxqs;
        priv->hash_rxqs_n = hash_rxqs_n;
        assert(err == 0);
        return 0;
error:
        if (hash_rxqs != NULL) {
                for (i = 0; (i != hash_rxqs_n); ++i) {
                        struct ibv_qp *qp = (*hash_rxqs)[i].qp;

                        if (qp == NULL)
                                continue;
                        claim_zero(ibv_destroy_qp(qp));
                }
                rte_free(hash_rxqs);
        }
        if (ind_tables != NULL) {
                for (j = 0; (j != ind_tables_n); ++j) {
                        struct ibv_exp_rwq_ind_table *ind_table =
                                (*ind_tables)[j];

                        if (ind_table == NULL)
                                continue;
                        claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
                }
                rte_free(ind_tables);
        }
        return err;
}
/**
 * Clean up hash RX queues and indirection table.
 *
 * @param priv
 *   Pointer to private structure.
 */
void
priv_destroy_hash_rxqs(struct priv *priv)
{
        unsigned int i;

        DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
        if (priv->hash_rxqs_n == 0) {
                assert(priv->hash_rxqs == NULL);
                assert(priv->ind_tables == NULL);
                return;
        }
        for (i = 0; (i != priv->hash_rxqs_n); ++i) {
                struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
                unsigned int j, k;

                assert(hash_rxq->priv == priv);
                assert(hash_rxq->qp != NULL);
                /* Also check that there are no remaining flows. */
                for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
                        for (k = 0;
                             (k != RTE_DIM(hash_rxq->special_flow[j]));
                             ++k)
                                assert(hash_rxq->special_flow[j][k] == NULL);
                for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
                        for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
                                assert(hash_rxq->mac_flow[j][k] == NULL);
                claim_zero(ibv_destroy_qp(hash_rxq->qp));
        }
        priv->hash_rxqs_n = 0;
        rte_free(priv->hash_rxqs);
        priv->hash_rxqs = NULL;
        for (i = 0; (i != priv->ind_tables_n); ++i) {
                struct ibv_exp_rwq_ind_table *ind_table =
                        (*priv->ind_tables)[i];

                assert(ind_table != NULL);
                claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
        }
        priv->ind_tables_n = 0;
        rte_free(priv->ind_tables);
        priv->ind_tables = NULL;
}
/**
 * Check whether a given flow type is allowed.
 *
 * @param priv
 *   Pointer to private structure.
 * @param type
 *   Flow type to check.
 *
 * @return
 *   Nonzero if the given flow type is allowed.
 */
int
priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
{
        /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
         * has been requested. */
        if (priv->promisc_req)
                return type == HASH_RXQ_FLOW_TYPE_PROMISC;
        switch (type) {
        case HASH_RXQ_FLOW_TYPE_PROMISC:
                return !!priv->promisc_req;
        case HASH_RXQ_FLOW_TYPE_ALLMULTI:
                return !!priv->allmulti_req;
        case HASH_RXQ_FLOW_TYPE_BROADCAST:
        case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
                /* If allmulti is enabled, broadcast and ipv6multi
                 * are unnecessary. */
                return !priv->allmulti_req;
        case HASH_RXQ_FLOW_TYPE_MAC:
                return 1;
        default:
                /* Unsupported flow type is not allowed. */
                return 0;
        }
        return 0;
}
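
/*
 * Illustrative example (not part of the upstream code): with allmulti
 * requested and promiscuous mode off, broadcast flows become redundant:
 *
 *      priv->promisc_req = 0;
 *      priv->allmulti_req = 1;
 *      assert(priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_ALLMULTI));
 *      assert(!priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_BROADCAST));
 *      assert(priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC));
 */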
/**
 * Automatically enable/disable flows according to configuration.
 *
 * @param priv
 *   Private structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
priv_rehash_flows(struct priv *priv)
{
        enum hash_rxq_flow_type i;

        for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
             i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
             ++i)
                if (!priv_allow_flow_type(priv, i)) {
                        priv_special_flow_disable(priv, i);
                } else {
                        int ret = priv_special_flow_enable(priv, i);

                        if (ret)
                                return ret;
                }
        if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
                return priv_mac_addrs_enable(priv);
        priv_mac_addrs_disable(priv);
        return 0;
}
/**
 * Allocate RX queue elements.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 * @param[in] pool
 *   If not NULL, fetch buffers from this array instead of allocating them
 *   with rte_pktmbuf_alloc().
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
               struct rte_mbuf *(*pool)[])
{
        const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
        unsigned int i;
        int ret = 0;

        /* Iterate on segments. */
        for (i = 0; (i != elts_n); ++i) {
                struct rte_mbuf *buf;
                volatile struct mlx5_wqe_data_seg *scat =
                        &(*rxq_ctrl->rxq.wqes)[i];

                if (pool != NULL) {
                        buf = (*pool)[i];
                        assert(buf != NULL);
                        rte_pktmbuf_reset(buf);
                        rte_pktmbuf_refcnt_update(buf, 1);
                } else
                        buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
                if (buf == NULL) {
                        assert(pool == NULL);
                        ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
                        ret = ENOMEM;
                        goto error;
                }
                /* Headroom is reserved by rte_pktmbuf_alloc(). */
                assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
                /* Buffer is supposed to be empty. */
                assert(rte_pktmbuf_data_len(buf) == 0);
                assert(rte_pktmbuf_pkt_len(buf) == 0);
                /* Only the first segment keeps headroom. */
                if (i % sges_n)
                        SET_DATA_OFF(buf, 0);
                PORT(buf) = rxq_ctrl->rxq.port_id;
                DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
                PKT_LEN(buf) = DATA_LEN(buf);
                NB_SEGS(buf) = 1;
                /* scat->addr must be able to store a pointer. */
                assert(sizeof(scat->addr) >= sizeof(uintptr_t));
                *scat = (struct mlx5_wqe_data_seg){
                        .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
                        .byte_count = htonl(DATA_LEN(buf)),
                        .lkey = htonl(rxq_ctrl->mr->lkey),
                };
                (*rxq_ctrl->rxq.elts)[i] = buf;
        }
        DEBUG("%p: allocated and configured %u segments (max %u packets)",
              (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
        assert(ret == 0);
        return 0;
error:
        assert(pool == NULL);
        elts_n = i;
        for (i = 0; (i != elts_n); ++i) {
                if ((*rxq_ctrl->rxq.elts)[i] != NULL)
                        rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
                (*rxq_ctrl->rxq.elts)[i] = NULL;
        }
        DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
        assert(ret > 0);
        return ret;
}
/**
 * Free RX queue elements.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 */
static void
rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
{
        unsigned int i;

        DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
        if (rxq_ctrl->rxq.elts == NULL)
                return;

        for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
                if ((*rxq_ctrl->rxq.elts)[i] != NULL)
                        rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
                (*rxq_ctrl->rxq.elts)[i] = NULL;
        }
}
/**
 * Clean up a RX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 */
void
rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
{
        DEBUG("cleaning up %p", (void *)rxq_ctrl);
        rxq_free_elts(rxq_ctrl);
        if (rxq_ctrl->fdir_queue != NULL)
                priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
        if (rxq_ctrl->wq != NULL)
                claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
        if (rxq_ctrl->cq != NULL)
                claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
        if (rxq_ctrl->channel != NULL)
                claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
        if (rxq_ctrl->mr != NULL)
                claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
        memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
}
/**
 * Reconfigure RX queue buffers.
 *
 * rxq_rehash() does not allocate mbufs, which, if not done from the right
 * thread (such as a control thread), may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq_ctrl
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
{
        unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
        unsigned int i;
        struct ibv_exp_wq_attr mod;
        int err;

        DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
              (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
        assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
        /* From now on, any failure will render the queue unusable.
         * Reinitialize WQ. */
        mod = (struct ibv_exp_wq_attr){
                .attr_mask = IBV_EXP_WQ_ATTR_STATE,
                .wq_state = IBV_EXP_WQS_RESET,
        };
        err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
        if (err) {
                ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
                assert(err > 0);
                return err;
        }
        /* Snatch mbufs from original queue. */
        claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
        for (i = 0; i != elts_n; ++i) {
                struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];

                assert(rte_mbuf_refcnt_read(buf) == 2);
                rte_pktmbuf_free_seg(buf);
        }
        /* Change queue state to ready. */
        mod = (struct ibv_exp_wq_attr){
                .attr_mask = IBV_EXP_WQ_ATTR_STATE,
                .wq_state = IBV_EXP_WQS_RDY,
        };
        err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
        if (err) {
                ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
                      (void *)dev, strerror(err));
                goto error;
        }
        /* Update doorbell counter. */
        rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
        rte_wmb();
        *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
error:
        assert(err >= 0);
        return err;
}
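
/*
 * Note on the mbuf "snatch" above (illustrative): passing the queue's own
 * elts array as the pool makes rxq_alloc_elts() reset each mbuf and bump
 * its reference count to 2 while re-registering it in the WQ; the
 * rte_pktmbuf_free_seg() that follows drops the count back to 1 instead of
 * returning the buffer to the mempool, so the queue keeps sole ownership
 * of its buffers without performing any allocation.
 */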
/**
 * Initialize RX queue.
 *
 * @param tmpl
 *   Pointer to RX queue control template.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static inline int
rxq_setup(struct rxq_ctrl *tmpl)
{
        struct ibv_cq *ibcq = tmpl->cq;
        struct ibv_mlx5_cq_info cq_info;
        struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
        struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
                rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);

        if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
                ERROR("Unable to query CQ info, check your OFED.");
                return ENOTSUP;
        }
        if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
                ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
                      "it should be set to %u", RTE_CACHE_LINE_SIZE);
                return EINVAL;
        }
        if (elts == NULL)
                return ENOMEM;
        tmpl->rxq.rq_db = rwq->rq.db;
        tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
        tmpl->rxq.cq_ci = 0;
        tmpl->rxq.rq_ci = 0;
        tmpl->rxq.cq_db = cq_info.dbrec;
        tmpl->rxq.wqes =
                (volatile struct mlx5_wqe_data_seg (*)[])
                (uintptr_t)rwq->rq.buff;
        tmpl->rxq.cqes =
                (volatile struct mlx5_cqe (*)[])
                (uintptr_t)cq_info.buf;
        tmpl->rxq.elts = elts;
        return 0;
}
/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq_ctrl
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
int
rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
               uint16_t desc, unsigned int socket,
               const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
{
        struct priv *priv = dev->data->dev_private;
        struct rxq_ctrl tmpl = {
                .priv = priv,
                .socket = socket,
                .rxq = {
                        .elts_n = log2above(desc),
                        .mp = mp,
                        .rss_hash = priv->rxqs_n > 1,
                },
        };
        struct ibv_exp_wq_attr mod;
        union {
                struct ibv_exp_cq_init_attr cq;
                struct ibv_exp_wq_init_attr wq;
                struct ibv_exp_cq_attr cq_attr;
        } attr;
        unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
        unsigned int cqe_n = desc - 1;
        struct rte_mbuf *(*elts)[desc] = NULL;
        int ret = 0;

        (void)conf; /* Thresholds configuration (ignored). */
        /* Enable scattered packets support for this queue if necessary. */
        assert(mb_len >= RTE_PKTMBUF_HEADROOM);
        if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
            (mb_len - RTE_PKTMBUF_HEADROOM)) {
                tmpl.rxq.sges_n = 0;
        } else if (dev->data->dev_conf.rxmode.enable_scatter) {
                unsigned int size =
                        RTE_PKTMBUF_HEADROOM +
                        dev->data->dev_conf.rxmode.max_rx_pkt_len;
                unsigned int sges_n;

                /*
                 * Determine the number of SGEs needed for a full packet
                 * and round it to the next power of two.
                 */
                sges_n = log2above((size / mb_len) + !!(size % mb_len));
                tmpl.rxq.sges_n = sges_n;
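                /*
                 * Worked example (illustrative): with mb_len == 2048,
                 * RTE_PKTMBUF_HEADROOM == 128 and max_rx_pkt_len == 9000,
                 * size == 9128, so 5 mbufs are needed, rounded up to
                 * 8 (sges_n == 3).
                 */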
                /* Make sure rxq.sges_n did not overflow. */
                size = mb_len * (1 << tmpl.rxq.sges_n);
                size -= RTE_PKTMBUF_HEADROOM;
                if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
                        ERROR("%p: too many SGEs (%u) needed to handle"
                              " requested maximum packet size %u",
                              (void *)dev,
                              1 << sges_n,
                              dev->data->dev_conf.rxmode.max_rx_pkt_len);
                        return EOVERFLOW;
                }
        } else {
                WARN("%p: the requested maximum Rx packet size (%u) is"
                     " larger than a single mbuf (%u) and scattered"
                     " mode has not been requested",
                     (void *)dev,
                     dev->data->dev_conf.rxmode.max_rx_pkt_len,
                     mb_len - RTE_PKTMBUF_HEADROOM);
        }
        DEBUG("%p: maximum number of segments per packet: %u",
              (void *)dev, 1 << tmpl.rxq.sges_n);
        if (desc % (1 << tmpl.rxq.sges_n)) {
                ERROR("%p: number of RX queue descriptors (%u) is not a"
                      " multiple of SGEs per packet (%u)",
                      (void *)dev,
                      desc,
                      1 << tmpl.rxq.sges_n);
                return EINVAL;
        }
        /* Toggle RX checksum offload if hardware supports it. */
        if (priv->hw_csum)
                tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
        if (priv->hw_csum_l2tun)
                tmpl.rxq.csum_l2tun =
                        !!dev->data->dev_conf.rxmode.hw_ip_checksum;
        /* Use the entire RX mempool as the memory region. */
        tmpl.mr = mlx5_mp2mr(priv->pd, mp);
        if (tmpl.mr == NULL) {
                ret = EINVAL;
                ERROR("%p: MR creation failure: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
        if (dev->data->dev_conf.intr_conf.rxq) {
                tmpl.channel = ibv_create_comp_channel(priv->ctx);
                if (tmpl.channel == NULL) {
                        dev->data->dev_conf.intr_conf.rxq = 0;
                        ret = ENOMEM;
                        ERROR("%p: Comp Channel creation failure: %s",
                              (void *)dev, strerror(ret));
                        goto error;
                }
        }
        attr.cq = (struct ibv_exp_cq_init_attr){
                .comp_mask = 0,
        };
        if (priv->cqe_comp) {
                attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
                attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
                cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
        }
        tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
                                    &attr.cq);
        if (tmpl.cq == NULL) {
                ret = ENOMEM;
                ERROR("%p: CQ creation failure: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
1004 DEBUG("priv->device_attr.max_qp_wr is %d",
1005 priv->device_attr.max_qp_wr);
1006 DEBUG("priv->device_attr.max_sge is %d",
1007 priv->device_attr.max_sge);
1008 /* Configure VLAN stripping. */
1009 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1010 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1011 attr.wq = (struct ibv_exp_wq_init_attr){
1012 .wq_context = NULL, /* Could be useful in the future. */
1013 .wq_type = IBV_EXP_WQT_RQ,
1014 /* Max number of outstanding WRs. */
1015 .max_recv_wr = desc >> tmpl.rxq.sges_n,
1016 /* Max number of scatter/gather elements in a WR. */
1017 .max_recv_sge = 1 << tmpl.rxq.sges_n,
1021 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1023 .vlan_offloads = (tmpl.rxq.vlan_strip ?
1024 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
        /* By default, FCS (CRC) is stripped by hardware. */
        if (dev->data->dev_conf.rxmode.hw_strip_crc) {
                tmpl.rxq.crc_present = 0;
        } else if (priv->hw_fcs_strip) {
                /* Ask HW/Verbs to leave CRC in place when supported. */
                attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
                attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
                tmpl.rxq.crc_present = 1;
        } else {
                WARN("%p: CRC stripping has been disabled but will still"
                     " be performed by hardware, make sure MLNX_OFED and"
                     " firmware are up to date",
                     (void *)dev);
                tmpl.rxq.crc_present = 0;
        }
        DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
              " incoming frames to hide it",
              (void *)dev,
              tmpl.rxq.crc_present ? "disabled" : "enabled",
              tmpl.rxq.crc_present << 2);
        if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
                ; /* Nothing else to do. */
        else if (priv->hw_padding) {
                INFO("%p: enabling packet padding on queue %p",
                     (void *)dev, (void *)rxq_ctrl);
                attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
                attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
        } else
                WARN("%p: packet padding has been requested but is not"
                     " supported, make sure MLNX_OFED and firmware are"
                     " up to date",
                     (void *)dev);

        tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
        if (tmpl.wq == NULL) {
                ret = (errno ? errno : EINVAL);
                ERROR("%p: WQ creation failure: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
        /*
         * Make sure number of WRs*SGEs match expectations since a queue
         * cannot allocate more than "desc" buffers.
         */
        if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
            ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
                ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
                      (void *)dev,
                      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
                      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
                ret = EINVAL;
                goto error;
        }
        /* Save port ID. */
        tmpl.rxq.port_id = dev->data->port_id;
        DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
        /* Change queue state to ready. */
        mod = (struct ibv_exp_wq_attr){
                .attr_mask = IBV_EXP_WQ_ATTR_STATE,
                .wq_state = IBV_EXP_WQS_RDY,
        };
        ret = ibv_exp_modify_wq(tmpl.wq, &mod);
        if (ret) {
                ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
        ret = rxq_setup(&tmpl);
        if (ret) {
                ERROR("%p: cannot initialize RX queue structure: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
        /* Reuse buffers from original queue if possible. */
        if (rxq_ctrl->rxq.elts_n) {
                assert(1 << rxq_ctrl->rxq.elts_n == desc);
                assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
                ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
        } else
                ret = rxq_alloc_elts(&tmpl, desc, NULL);
        if (ret) {
                ERROR("%p: RXQ allocation failed: %s",
                      (void *)dev, strerror(ret));
                goto error;
        }
        /* Clean up rxq in case we're reinitializing it. */
        DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
        rxq_cleanup(rxq_ctrl);
        /* Move mbuf pointers to dedicated storage area in RX queue. */
        elts = (void *)(rxq_ctrl + 1);
        rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
#ifndef NDEBUG
        memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
#endif
        rte_free(tmpl.rxq.elts);
        tmpl.rxq.elts = elts;
        *rxq_ctrl = tmpl;
        /* Update doorbell counter. */
        rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
        rte_wmb();
        *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
        DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
        assert(ret == 0);
        return 0;
error:
        elts = tmpl.rxq.elts;
        rxq_cleanup(&tmpl);
        rte_free(elts);
        assert(ret > 0);
        return ret;
}
/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                    unsigned int socket, const struct rte_eth_rxconf *conf,
                    struct rte_mempool *mp)
{
        struct priv *priv = dev->data->dev_private;
        struct rxq *rxq = (*priv->rxqs)[idx];
        struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
        int ret;

        if (mlx5_is_secondary())
                return -E_RTE_SECONDARY;

        priv_lock(priv);
        if (!rte_is_power_of_2(desc)) {
                desc = 1 << log2above(desc);
                WARN("%p: increased number of descriptors in RX queue %u"
                     " to the next power of two (%d)",
                     (void *)dev, idx, desc);
        }
        DEBUG("%p: configuring queue %u for %u descriptors",
              (void *)dev, idx, desc);
        if (idx >= priv->rxqs_n) {
                ERROR("%p: queue index out of range (%u >= %u)",
                      (void *)dev, idx, priv->rxqs_n);
                priv_unlock(priv);
                return -EOVERFLOW;
        }
        if (rxq != NULL) {
                DEBUG("%p: reusing already allocated queue index %u (%p)",
                      (void *)dev, idx, (void *)rxq);
                if (priv->started) {
                        priv_unlock(priv);
                        return -EEXIST;
                }
                (*priv->rxqs)[idx] = NULL;
                rxq_cleanup(rxq_ctrl);
                /* Resize if rxq size is changed. */
                if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
                        rxq_ctrl = rte_realloc(rxq_ctrl,
                                               sizeof(*rxq_ctrl) +
                                               desc * sizeof(struct rte_mbuf *),
                                               RTE_CACHE_LINE_SIZE);
                        if (rxq_ctrl == NULL) {
                                ERROR("%p: unable to reallocate queue index %u",
                                      (void *)dev, idx);
                                priv_unlock(priv);
                                return -ENOMEM;
                        }
                }
        } else {
                rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
                                             desc * sizeof(struct rte_mbuf *),
                                             0, socket);
                if (rxq_ctrl == NULL) {
                        ERROR("%p: unable to allocate queue index %u",
                              (void *)dev, idx);
                        priv_unlock(priv);
                        return -ENOMEM;
                }
        }
        ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
        if (ret)
                rte_free(rxq_ctrl);
        else {
                rxq_ctrl->rxq.stats.idx = idx;
                DEBUG("%p: adding RX queue %p to list",
                      (void *)dev, (void *)rxq_ctrl);
                (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
                /* Update receive callback. */
                priv_select_rx_function(priv);
        }
        priv_unlock(priv);
        return -ret;
}
/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
void
mlx5_rx_queue_release(void *dpdk_rxq)
{
        struct rxq *rxq = (struct rxq *)dpdk_rxq;
        struct rxq_ctrl *rxq_ctrl;
        struct priv *priv;
        unsigned int i;

        if (mlx5_is_secondary())
                return;

        if (rxq == NULL)
                return;
        rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
        priv = rxq_ctrl->priv;
        priv_lock(priv);
        if (priv_flow_rxq_in_use(priv, rxq))
                rte_panic("Rx queue %p is still used by a flow and cannot be"
                          " removed\n", (void *)rxq_ctrl);
        for (i = 0; (i != priv->rxqs_n); ++i)
                if ((*priv->rxqs)[i] == rxq) {
                        DEBUG("%p: removing RX queue %p from list",
                              (void *)priv->dev, (void *)rxq_ctrl);
                        (*priv->rxqs)[i] = NULL;
                        break;
                }
        rxq_cleanup(rxq_ctrl);
        rte_free(rxq_ctrl);
        priv_unlock(priv);
}
/**
 * DPDK callback for RX in secondary processes.
 *
 * This function configures all queues from primary process information
 * if necessary before reverting to the normal RX burst callback.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
                              uint16_t pkts_n)
{
        struct rxq *rxq = dpdk_rxq;
        struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
        struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
        struct priv *primary_priv;
        unsigned int index;

        if (priv == NULL)
                return 0;
        primary_priv =
                mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
        /* Look for queue index in both private structures. */
        for (index = 0; index != priv->rxqs_n; ++index)
                if (((*primary_priv->rxqs)[index] == rxq) ||
                    ((*priv->rxqs)[index] == rxq))
                        break;
        if (index == priv->rxqs_n)
                return 0;
        rxq = (*priv->rxqs)[index];
        return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
}
/**
 * Fill epoll fd list for rxq interrupts.
 *
 * @param priv
 *   Private structure.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
priv_intr_efd_enable(struct priv *priv)
{
        unsigned int i;
        unsigned int rxqs_n = priv->rxqs_n;
        unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
        struct rte_intr_handle *intr_handle = priv->dev->intr_handle;

        if (n == 0)
                return -EINVAL;
        if (n < rxqs_n) {
                WARN("number of RX queues (%u) exceeds the EAL maximum of"
                     " %u interrupt vectors, unable to support rxq interrupts",
                     rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
                return -EINVAL;
        }
        intr_handle->type = RTE_INTR_HANDLE_EXT;
        for (i = 0; i != n; ++i) {
                struct rxq *rxq = (*priv->rxqs)[i];
                struct rxq_ctrl *rxq_ctrl =
                        container_of(rxq, struct rxq_ctrl, rxq);
                int fd = rxq_ctrl->channel->fd;
                int flags;
                int rc;

                flags = fcntl(fd, F_GETFL);
                rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
                if (rc < 0) {
                        WARN("failed to change rxq interrupt file "
                             "descriptor %d for queue index %d", fd, i);
                        return -1;
                }
                intr_handle->efds[i] = fd;
        }
        intr_handle->nb_efd = n;
        return 0;
}
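
/*
 * Application-side sketch (illustrative, not part of this file): once the
 * epoll fds are registered above, an application can sleep until packets
 * arrive using the EAL rxq interrupt API:
 *
 *      struct rte_epoll_event ev;
 *
 *      rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
 *                                RTE_INTR_EVENT_ADD, NULL);
 *      rte_eth_dev_rx_intr_enable(port_id, queue_id);
 *      rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, timeout_ms);
 *      rte_eth_dev_rx_intr_disable(port_id, queue_id);
 */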
/**
 * Clean epoll fd list for rxq interrupts.
 *
 * @param priv
 *   Private structure.
 */
void
priv_intr_efd_disable(struct priv *priv)
{
        struct rte_intr_handle *intr_handle = priv->dev->intr_handle;

        rte_intr_free_epoll_fd(intr_handle);
}
/**
 * Create and initialize the interrupt vector array.
 *
 * @param priv
 *   Private structure.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
priv_create_intr_vec(struct priv *priv)
{
        unsigned int rxqs_n = priv->rxqs_n;
        unsigned int i;
        struct rte_intr_handle *intr_handle = priv->dev->intr_handle;

        if (rxqs_n == 0)
                return 0;
        intr_handle->intr_vec = (int *)
                rte_malloc("intr_vec", rxqs_n * sizeof(int), 0);
        if (intr_handle->intr_vec == NULL) {
                WARN("Failed to allocate memory for intr_vec,"
                     " rxq interrupts will not be supported");
                return -ENOMEM;
        }
        for (i = 0; i != rxqs_n; ++i) {
                /* 1:1 mapping between rxq and interrupt. */
                intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
        }
        return 0;
}
/**
 * Destroy the interrupt vector array.
 *
 * @param priv
 *   Private structure.
 */
void
priv_destroy_intr_vec(struct priv *priv)
{
        struct rte_intr_handle *intr_handle = priv->dev->intr_handle;

        rte_free(intr_handle->intr_vec);
}