4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #pragma GCC diagnostic ignored "-pedantic"
45 #include <infiniband/verbs.h>
47 #pragma GCC diagnostic error "-pedantic"
50 /* DPDK headers don't like -pedantic. */
52 #pragma GCC diagnostic ignored "-pedantic"
55 #include <rte_malloc.h>
56 #include <rte_ethdev.h>
57 #include <rte_common.h>
59 #pragma GCC diagnostic error "-pedantic"
63 #include "mlx5_rxtx.h"
64 #include "mlx5_utils.h"
65 #include "mlx5_autoconf.h"
66 #include "mlx5_defs.h"
68 /* Initialization data for hash RX queues. */
69 const struct hash_rxq_init hash_rxq_init[] = {
71 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
72 IBV_EXP_RX_HASH_DST_IPV4 |
73 IBV_EXP_RX_HASH_SRC_PORT_TCP |
74 IBV_EXP_RX_HASH_DST_PORT_TCP),
75 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
77 .flow_spec.tcp_udp = {
78 .type = IBV_EXP_FLOW_SPEC_TCP,
79 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
81 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
84 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
85 IBV_EXP_RX_HASH_DST_IPV4 |
86 IBV_EXP_RX_HASH_SRC_PORT_UDP |
87 IBV_EXP_RX_HASH_DST_PORT_UDP),
88 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
90 .flow_spec.tcp_udp = {
91 .type = IBV_EXP_FLOW_SPEC_UDP,
92 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
94 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
97 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
98 IBV_EXP_RX_HASH_DST_IPV4),
99 .dpdk_rss_hf = (ETH_RSS_IPV4 |
103 .type = IBV_EXP_FLOW_SPEC_IPV4,
104 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
106 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
109 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
110 IBV_EXP_RX_HASH_DST_IPV6 |
111 IBV_EXP_RX_HASH_SRC_PORT_TCP |
112 IBV_EXP_RX_HASH_DST_PORT_TCP),
113 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
115 .flow_spec.tcp_udp = {
116 .type = IBV_EXP_FLOW_SPEC_TCP,
117 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
119 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
122 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
123 IBV_EXP_RX_HASH_DST_IPV6 |
124 IBV_EXP_RX_HASH_SRC_PORT_UDP |
125 IBV_EXP_RX_HASH_DST_PORT_UDP),
126 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
128 .flow_spec.tcp_udp = {
129 .type = IBV_EXP_FLOW_SPEC_UDP,
130 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
132 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
135 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
136 IBV_EXP_RX_HASH_DST_IPV6),
137 .dpdk_rss_hf = (ETH_RSS_IPV6 |
141 .type = IBV_EXP_FLOW_SPEC_IPV6,
142 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
144 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
151 .type = IBV_EXP_FLOW_SPEC_ETH,
152 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
158 /* Number of entries in hash_rxq_init[]. */
159 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
161 /* Initialization data for hash RX queue indirection tables. */
162 static const struct ind_table_init ind_table_init[] = {
164 .max_size = -1u, /* Superseded by HW limitations. */
166 1 << HASH_RXQ_TCPV4 |
167 1 << HASH_RXQ_UDPV4 |
169 1 << HASH_RXQ_TCPV6 |
170 1 << HASH_RXQ_UDPV6 |
177 .hash_types = 1 << HASH_RXQ_ETH,
182 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
184 /* Default RSS hash key also used for ConnectX-3. */
185 uint8_t rss_hash_default_key[] = {
186 0x2c, 0xc6, 0x81, 0xd1,
187 0x5b, 0xdb, 0xf4, 0xf7,
188 0xfc, 0xa2, 0x83, 0x19,
189 0xdb, 0x1a, 0x3e, 0x94,
190 0x6b, 0x9e, 0x38, 0xd9,
191 0x2c, 0x9c, 0x03, 0xd1,
192 0xad, 0x99, 0x44, 0xa7,
193 0xd9, 0x56, 0x3d, 0x59,
194 0x06, 0x3c, 0x25, 0xf3,
195 0xfc, 0x1f, 0xdc, 0x2a,
198 /* Length of the default RSS hash key. */
199 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
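/*
 * Illustrative sketch (not part of the driver): an application may override
 * rss_hash_default_key[] by passing its own 40-byte Toeplitz key through the
 * generic DPDK configuration API. The port number, key bytes and hash field
 * selection below are hypothetical.
 */
static int __rte_unused
example_configure_rss_key(uint8_t port_id)
{
	static uint8_t app_rss_key[40] = { 0x6d, 0x5a, /* ...38 more bytes... */ };
	struct rte_eth_conf conf = {
		.rxmode = {
			.mq_mode = ETH_MQ_RX_RSS,
		},
		.rx_adv_conf = {
			.rss_conf = {
				.rss_key = app_rss_key,
				.rss_key_len = sizeof(app_rss_key),
				.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
			},
		},
	};

	/* Request 4 RX and 4 TX queues; queue setup happens separately. */
	return rte_eth_dev_configure(port_id, 4, 4, &conf);
}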
202 * Populate flow steering rule for a given hash RX queue type using
203 * information from hash_rxq_init[]. Nothing is written to flow_attr when
204 * flow_attr_size is not large enough, but the required size is still returned.
207 * Pointer to private structure.
208 * @param[out] flow_attr
209 * Pointer to flow attribute structure to fill. Note that the allocated
210 * area must be large enough to hold all flow specifications.
211 * @param flow_attr_size
212 * Entire size of flow_attr and trailing room for flow specifications.
214 * Hash RX queue type to use for flow steering rule.
217 * Total size of the flow attribute buffer. No errors are defined.
220 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
221 size_t flow_attr_size, enum hash_rxq_type type)
223 size_t offset = sizeof(*flow_attr);
224 const struct hash_rxq_init *init = &hash_rxq_init[type];
226 assert(priv != NULL);
227 assert((size_t)type < RTE_DIM(hash_rxq_init));
229 offset += init->flow_spec.hdr.size;
230 init = init->underlayer;
231 } while (init != NULL);
232 if (offset > flow_attr_size)
234 flow_attr_size = offset;
235 init = &hash_rxq_init[type];
236 *flow_attr = (struct ibv_exp_flow_attr){
237 .type = IBV_EXP_FLOW_ATTR_NORMAL,
238 /* Priorities < 3 are reserved for flow director. */
239 .priority = init->flow_priority + 3,
245 offset -= init->flow_spec.hdr.size;
246 memcpy((void *)((uintptr_t)flow_attr + offset),
248 init->flow_spec.hdr.size);
249 ++flow_attr->num_of_specs;
250 init = init->underlayer;
251 } while (init != NULL);
252 return flow_attr_size;
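/*
 * Illustrative sketch (not part of the driver): typical two-step use of
 * priv_flow_attr(). It assumes a NULL flow_attr is acceptable while the
 * provided size is too small, since nothing is written in that case.
 */
static size_t __rte_unused
example_priv_flow_attr_usage(struct priv *priv, enum hash_rxq_type type)
{
	size_t size = priv_flow_attr(priv, NULL, 0, type);
	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);

	if (attr == NULL)
		return 0;
	/* Second call fills attr and its trailing flow specifications. */
	priv_flow_attr(priv, attr, size, type);
	/* attr could now be passed to ibv_exp_create_flow(). */
	rte_free(attr);
	return size;
}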
256 * Convert hash type position in indirection table initializer to
257 * hash RX queue type.
260 * Indirection table initializer.
262 * Hash type position.
265 * Hash RX queue type.
267 static enum hash_rxq_type
268 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
270 enum hash_rxq_type type = 0;
272 assert(pos < table->hash_types_n);
274 if ((table->hash_types & (1 << type)) && (pos-- == 0))
282 * Filter out disabled hash RX queue types from ind_table_init[].
285 * Pointer to private structure.
290 * Number of table entries.
293 priv_make_ind_table_init(struct priv *priv,
294 struct ind_table_init (*table)[IND_TABLE_INIT_N])
299 unsigned int table_n = 0;
300 /* Mandatory to receive frames not handled by normal hash RX queues. */
301 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
303 rss_hf = priv->rss_hf;
304 /* Process other protocols only if more than one queue. */
305 if (priv->rxqs_n > 1)
306 for (i = 0; (i != hash_rxq_init_n); ++i)
307 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
308 hash_types_sup |= (1 << i);
310 /* Filter out entries whose protocols are not in the set. */
311 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
315 /* j is increased only if the table has valid protocols. */
317 (*table)[j] = ind_table_init[i];
318 (*table)[j].hash_types &= hash_types_sup;
319 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
320 if (((*table)[j].hash_types >> h) & 0x1)
322 (*table)[j].hash_types_n = nb;
332 * Initialize hash RX queues and indirection table.
335 * Pointer to private structure.
338 * 0 on success, errno value on failure.
341 priv_create_hash_rxqs(struct priv *priv)
343 struct ibv_exp_wq *wqs[priv->reta_idx_n];
344 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
345 unsigned int ind_tables_n =
346 priv_make_ind_table_init(priv, &ind_table_init);
347 unsigned int hash_rxqs_n = 0;
348 struct hash_rxq (*hash_rxqs)[] = NULL;
349 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
355 assert(priv->ind_tables == NULL);
356 assert(priv->ind_tables_n == 0);
357 assert(priv->hash_rxqs == NULL);
358 assert(priv->hash_rxqs_n == 0);
359 assert(priv->pd != NULL);
360 assert(priv->ctx != NULL);
361 if (priv->rxqs_n == 0)
363 assert(priv->rxqs != NULL);
364 if (ind_tables_n == 0) {
365 ERROR("all hash RX queue types have been filtered out,"
366 " indirection table cannot be created");
369 if (priv->rxqs_n & (priv->rxqs_n - 1)) {
370 INFO("%u RX queues are configured, consider rounding this"
371 " number to the next power of two for better balancing",
373 DEBUG("indirection table extended to assume %u WQs",
376 for (i = 0; (i != priv->reta_idx_n); ++i)
377 wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
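	/*
	 * Note: the same WQ may appear several times in wqs[] when there are
	 * more RETA entries than RX queues; the indirection tables created
	 * below simply spread traffic over these entries.
	 */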
378 /* Get number of hash RX queues to configure. */
379 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
380 hash_rxqs_n += ind_table_init[i].hash_types_n;
381 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
382 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
383 /* Create indirection tables. */
384 ind_tables = rte_calloc(__func__, ind_tables_n,
385 sizeof((*ind_tables)[0]), 0);
386 if (ind_tables == NULL) {
388 ERROR("cannot allocate indirection tables container: %s",
392 for (i = 0; (i != ind_tables_n); ++i) {
393 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
395 .log_ind_tbl_size = 0, /* Set below. */
399 unsigned int ind_tbl_size = ind_table_init[i].max_size;
400 struct ibv_exp_rwq_ind_table *ind_table;
402 if (priv->reta_idx_n < ind_tbl_size)
403 ind_tbl_size = priv->reta_idx_n;
404 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
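		/*
		 * log2above() rounds up, e.g. 6 RETA entries yield
		 * log_ind_tbl_size 3, i.e. a hardware table of 8 entries.
		 */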
406 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
408 if (ind_table != NULL) {
409 (*ind_tables)[i] = ind_table;
412 /* Not clear whether errno is set. */
413 err = (errno ? errno : EINVAL);
414 ERROR("RX indirection table creation failed with error %d: %s",
418 /* Allocate array that holds hash RX queues and related data. */
419 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
420 sizeof((*hash_rxqs)[0]), 0);
421 if (hash_rxqs == NULL) {
423 ERROR("cannot allocate hash RX queues container: %s",
427 for (i = 0, j = 0, k = 0;
428 ((i != hash_rxqs_n) && (j != ind_tables_n));
430 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
431 enum hash_rxq_type type =
432 hash_rxq_type_from_pos(&ind_table_init[j], k);
433 struct rte_eth_rss_conf *priv_rss_conf =
434 (*priv->rss_conf)[type];
435 struct ibv_exp_rx_hash_conf hash_conf = {
436 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
437 .rx_hash_key_len = (priv_rss_conf ?
438 priv_rss_conf->rss_key_len :
439 rss_hash_default_key_len),
440 .rx_hash_key = (priv_rss_conf ?
441 priv_rss_conf->rss_key :
442 rss_hash_default_key),
443 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
444 .rwq_ind_tbl = (*ind_tables)[j],
446 struct ibv_exp_qp_init_attr qp_init_attr = {
447 .max_inl_recv = 0, /* Currently not supported. */
448 .qp_type = IBV_QPT_RAW_PACKET,
449 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
450 IBV_EXP_QP_INIT_ATTR_RX_HASH),
452 .rx_hash_conf = &hash_conf,
453 .port_num = priv->port,
456 DEBUG("using indirection table %u for hash RX queue %u type %d",
458 *hash_rxq = (struct hash_rxq){
460 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
463 if (hash_rxq->qp == NULL) {
464 err = (errno ? errno : EINVAL);
465 ERROR("Hash RX QP creation failure: %s",
469 if (++k < ind_table_init[j].hash_types_n)
471 /* Switch to the next indirection table and reset hash RX
472 * queue type array index. */
476 priv->ind_tables = ind_tables;
477 priv->ind_tables_n = ind_tables_n;
478 priv->hash_rxqs = hash_rxqs;
479 priv->hash_rxqs_n = hash_rxqs_n;
483 if (hash_rxqs != NULL) {
484 for (i = 0; (i != hash_rxqs_n); ++i) {
485 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
489 claim_zero(ibv_destroy_qp(qp));
493 if (ind_tables != NULL) {
494 for (j = 0; (j != ind_tables_n); ++j) {
495 struct ibv_exp_rwq_ind_table *ind_table =
498 if (ind_table == NULL)
500 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
502 rte_free(ind_tables);
508 * Clean up hash RX queues and indirection table.
511 * Pointer to private structure.
514 priv_destroy_hash_rxqs(struct priv *priv)
518 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
519 if (priv->hash_rxqs_n == 0) {
520 assert(priv->hash_rxqs == NULL);
521 assert(priv->ind_tables == NULL);
524 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
525 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
528 assert(hash_rxq->priv == priv);
529 assert(hash_rxq->qp != NULL);
530 /* Also check that there are no remaining flows. */
531 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
533 (k != RTE_DIM(hash_rxq->special_flow[j]));
535 assert(hash_rxq->special_flow[j][k] == NULL);
536 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
537 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
538 assert(hash_rxq->mac_flow[j][k] == NULL);
539 claim_zero(ibv_destroy_qp(hash_rxq->qp));
541 priv->hash_rxqs_n = 0;
542 rte_free(priv->hash_rxqs);
543 priv->hash_rxqs = NULL;
544 for (i = 0; (i != priv->ind_tables_n); ++i) {
545 struct ibv_exp_rwq_ind_table *ind_table =
546 (*priv->ind_tables)[i];
548 assert(ind_table != NULL);
549 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
551 priv->ind_tables_n = 0;
552 rte_free(priv->ind_tables);
553 priv->ind_tables = NULL;
557 * Check whether a given flow type is allowed.
560 * Pointer to private structure.
562 * Flow type to check.
565 * Nonzero if the given flow type is allowed.
568 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
570 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
571 * has been requested. */
572 if (priv->promisc_req)
573 return type == HASH_RXQ_FLOW_TYPE_PROMISC;
575 case HASH_RXQ_FLOW_TYPE_PROMISC:
576 return !!priv->promisc_req;
577 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
578 return !!priv->allmulti_req;
579 case HASH_RXQ_FLOW_TYPE_BROADCAST:
580 case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
581 /* If allmulti is enabled, broadcast and ipv6multi
582 * are unnecessary. */
583 return !priv->allmulti_req;
584 case HASH_RXQ_FLOW_TYPE_MAC:
587 /* Unsupported flow type is not allowed. */
594 * Automatically enable/disable flows according to configuration.
600 * 0 on success, errno value on failure.
603 priv_rehash_flows(struct priv *priv)
607 for (i = 0; (i != RTE_DIM((*priv->hash_rxqs)[0].special_flow)); ++i)
608 if (!priv_allow_flow_type(priv, i)) {
609 priv_special_flow_disable(priv, i);
611 int ret = priv_special_flow_enable(priv, i);
616 if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
617 return priv_mac_addrs_enable(priv);
618 priv_mac_addrs_disable(priv);
623 * Allocate RX queue elements.
626 * Pointer to RX queue structure.
628 * Number of elements to allocate.
630 * If not NULL, fetch buffers from this array instead of allocating them
631 * with rte_pktmbuf_alloc().
634 * 0 on success, errno value on failure.
637 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
638 struct rte_mbuf **pool)
641 struct rxq_elt (*elts)[elts_n] =
642 rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
647 ERROR("%p: can't allocate packets array", (void *)rxq_ctrl);
651 /* For each WR (packet). */
652 for (i = 0; (i != elts_n); ++i) {
653 struct rxq_elt *elt = &(*elts)[i];
654 struct ibv_sge *sge = &(*elts)[i].sge;
655 struct rte_mbuf *buf;
660 rte_pktmbuf_reset(buf);
662 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
664 assert(pool == NULL);
665 ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
670 /* Headroom is reserved by rte_pktmbuf_alloc(). */
671 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
672 /* Buffer is supposed to be empty. */
673 assert(rte_pktmbuf_data_len(buf) == 0);
674 assert(rte_pktmbuf_pkt_len(buf) == 0);
675 /* sge->addr must be able to store a pointer. */
676 assert(sizeof(sge->addr) >= sizeof(uintptr_t));
677 /* SGE keeps its headroom. */
678 sge->addr = (uintptr_t)
679 ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
680 sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
681 sge->lkey = rxq_ctrl->mr->lkey;
682 /* Redundant check for tailroom. */
683 assert(sge->length == rte_pktmbuf_tailroom(buf));
685 DEBUG("%p: allocated and configured %u single-segment WRs",
686 (void *)rxq_ctrl, elts_n);
687 rxq_ctrl->rxq.elts_n = elts_n;
688 rxq_ctrl->rxq.elts_head = 0;
689 rxq_ctrl->rxq.elts = elts;
694 assert(pool == NULL);
695 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
696 struct rxq_elt *elt = &(*elts)[i];
697 struct rte_mbuf *buf = elt->buf;
700 rte_pktmbuf_free_seg(buf);
704 DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
710 * Free RX queue elements.
713 * Pointer to RX queue structure.
716 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
719 unsigned int elts_n = rxq_ctrl->rxq.elts_n;
720 struct rxq_elt (*elts)[elts_n] = rxq_ctrl->rxq.elts;
722 DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
723 rxq_ctrl->rxq.elts_n = 0;
724 rxq_ctrl->rxq.elts = NULL;
727 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
728 struct rxq_elt *elt = &(*elts)[i];
729 struct rte_mbuf *buf = elt->buf;
732 rte_pktmbuf_free_seg(buf);
738 * Clean up an RX queue.
740 * Destroy objects, free allocated memory and reset the structure for reuse.
743 * Pointer to RX queue structure.
746 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
748 struct ibv_exp_release_intf_params params;
750 DEBUG("cleaning up %p", (void *)rxq_ctrl);
751 rxq_free_elts(rxq_ctrl);
752 rxq_ctrl->rxq.poll = NULL;
753 rxq_ctrl->rxq.recv = NULL;
754 if (rxq_ctrl->if_wq != NULL) {
755 assert(rxq_ctrl->rxq.priv != NULL);
756 assert(rxq_ctrl->rxq.priv->ctx != NULL);
757 assert(rxq_ctrl->rxq.wq != NULL);
758 params = (struct ibv_exp_release_intf_params){
761 claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
765 if (rxq_ctrl->if_cq != NULL) {
766 assert(rxq_ctrl->rxq.priv != NULL);
767 assert(rxq_ctrl->rxq.priv->ctx != NULL);
768 assert(rxq_ctrl->rxq.cq != NULL);
769 params = (struct ibv_exp_release_intf_params){
772 claim_zero(ibv_exp_release_intf(rxq_ctrl->rxq.priv->ctx,
776 if (rxq_ctrl->rxq.wq != NULL)
777 claim_zero(ibv_exp_destroy_wq(rxq_ctrl->rxq.wq));
778 if (rxq_ctrl->rxq.cq != NULL)
779 claim_zero(ibv_destroy_cq(rxq_ctrl->rxq.cq));
780 if (rxq_ctrl->rd != NULL) {
781 struct ibv_exp_destroy_res_domain_attr attr = {
785 assert(rxq_ctrl->rxq.priv != NULL);
786 assert(rxq_ctrl->rxq.priv->ctx != NULL);
787 claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->rxq.priv->ctx,
791 if (rxq_ctrl->mr != NULL)
792 claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
793 memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
797 * Reconfigure an RX queue with new parameters.
799 * rxq_rehash() does not allocate mbufs; doing so from a thread other than a
800 * control thread may corrupt the pool.
801 * In case of failure, the queue is left untouched.
804 * Pointer to Ethernet device structure.
809 * 0 on success, errno value on failure.
812 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
814 struct priv *priv = rxq_ctrl->rxq.priv;
815 struct rxq_ctrl tmpl = *rxq_ctrl;
818 struct rte_mbuf **pool;
820 struct ibv_exp_wq_attr mod;
821 struct rxq_elt (*elts)[tmpl.rxq.elts_n];
824 DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq_ctrl);
825 /* Number of descriptors and mbufs currently allocated. */
826 desc_n = tmpl.rxq.elts_n;
828 /* Toggle RX checksum offload if hardware supports it. */
830 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
831 rxq_ctrl->rxq.csum = tmpl.rxq.csum;
833 if (priv->hw_csum_l2tun) {
834 tmpl.rxq.csum_l2tun =
835 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
836 rxq_ctrl->rxq.csum_l2tun = tmpl.rxq.csum_l2tun;
838 /* From now on, any failure will render the queue unusable.
839 * Reinitialize WQ. */
840 mod = (struct ibv_exp_wq_attr){
841 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
842 .wq_state = IBV_EXP_WQS_RESET,
844 err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
846 ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
851 pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
853 ERROR("%p: cannot allocate memory", (void *)dev);
856 /* Snatch mbufs from original queue. */
858 elts = rxq_ctrl->rxq.elts;
859 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
860 struct rxq_elt *elt = &(*elts)[i];
861 struct rte_mbuf *buf = elt->buf;
867 tmpl.rxq.elts = NULL;
868 assert(tmpl.rxq.elts == NULL);
869 err = rxq_alloc_elts(&tmpl, desc_n, pool);
871 ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
876 assert(tmpl.rxq.elts_n == desc_n);
878 /* Clean up original data. */
879 rxq_ctrl->rxq.elts_n = 0;
880 rte_free(rxq_ctrl->rxq.elts);
881 rxq_ctrl->rxq.elts = NULL;
882 /* Change queue state to ready. */
883 mod = (struct ibv_exp_wq_attr){
884 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
885 .wq_state = IBV_EXP_WQS_RDY,
887 err = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
889 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
890 (void *)dev, strerror(err));
894 assert(tmpl.if_wq != NULL);
895 elts = tmpl.rxq.elts;
896 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
897 err = tmpl.if_wq->recv_burst(
905 ERROR("%p: failed to post SGEs with error %d",
907 /* Set err explicitly: recv_burst() does not return a valid errno value. */
911 tmpl.rxq.recv = tmpl.if_wq->recv_burst;
919 * Configure an RX queue.
922 * Pointer to Ethernet device structure.
924 * Pointer to RX queue structure.
926 * Number of descriptors to configure in queue.
928 * NUMA socket on which memory must be allocated.
930 * Thresholds parameters.
932 * Memory pool for buffer allocations.
935 * 0 on success, errno value on failure.
938 rxq_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl, uint16_t desc,
939 unsigned int socket, const struct rte_eth_rxconf *conf,
940 struct rte_mempool *mp)
942 struct priv *priv = dev->data->dev_private;
943 struct rxq_ctrl tmpl = {
950 struct ibv_exp_wq_attr mod;
952 struct ibv_exp_query_intf_params params;
953 struct ibv_exp_cq_init_attr cq;
954 struct ibv_exp_res_domain_init_attr rd;
955 struct ibv_exp_wq_init_attr wq;
957 enum ibv_exp_query_intf_status status;
958 unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
959 struct rxq_elt (*elts)[desc];
962 unsigned int cq_size = desc;
964 (void)conf; /* Thresholds configuration (ignored). */
966 ERROR("%p: invalid number of RX descriptors", (void *)dev);
969 /* Toggle RX checksum offload if hardware supports it. */
971 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
972 if (priv->hw_csum_l2tun)
973 tmpl.rxq.csum_l2tun =
974 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
975 (void)mb_len; /* I'll be back! */
976 /* Use the entire RX mempool as the memory region. */
977 tmpl.mr = mlx5_mp2mr(priv->pd, mp);
978 if (tmpl.mr == NULL) {
980 ERROR("%p: MR creation failure: %s",
981 (void *)dev, strerror(ret));
984 attr.rd = (struct ibv_exp_res_domain_init_attr){
985 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
986 IBV_EXP_RES_DOMAIN_MSG_MODEL),
987 .thread_model = IBV_EXP_THREAD_SINGLE,
988 .msg_model = IBV_EXP_MSG_HIGH_BW,
990 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
991 if (tmpl.rd == NULL) {
993 ERROR("%p: RD creation failure: %s",
994 (void *)dev, strerror(ret));
997 attr.cq = (struct ibv_exp_cq_init_attr){
998 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
999 .res_domain = tmpl.rd,
1001 tmpl.rxq.cq = ibv_exp_create_cq(priv->ctx, cq_size, NULL, NULL, 0,
1003 if (tmpl.rxq.cq == NULL) {
1005 ERROR("%p: CQ creation failure: %s",
1006 (void *)dev, strerror(ret));
1009 DEBUG("priv->device_attr.max_qp_wr is %d",
1010 priv->device_attr.max_qp_wr);
1011 DEBUG("priv->device_attr.max_sge is %d",
1012 priv->device_attr.max_sge);
1013 /* Configure VLAN stripping. */
1014 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1015 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1016 attr.wq = (struct ibv_exp_wq_init_attr){
1017 .wq_context = NULL, /* Could be useful in the future. */
1018 .wq_type = IBV_EXP_WQT_RQ,
1019 /* Max number of outstanding WRs. */
1020 .max_recv_wr = ((priv->device_attr.max_qp_wr < (int)cq_size) ?
1021 priv->device_attr.max_qp_wr :
1023 /* Max number of scatter/gather elements in a WR. */
1028 IBV_EXP_CREATE_WQ_RES_DOMAIN |
1029 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1031 .res_domain = tmpl.rd,
1032 .vlan_offloads = (tmpl.rxq.vlan_strip ?
1033 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1036 /* By default, FCS (CRC) is stripped by hardware. */
1037 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1038 tmpl.rxq.crc_present = 0;
1039 } else if (priv->hw_fcs_strip) {
1040 /* Ask HW/Verbs to leave CRC in place when supported. */
1041 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1042 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1043 tmpl.rxq.crc_present = 1;
1045 WARN("%p: CRC stripping has been disabled but will still"
1046 " be performed by hardware, make sure MLNX_OFED and"
1047 " firmware are up to date",
1049 tmpl.rxq.crc_present = 0;
1051 DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1052 " incoming frames to hide it",
1054 tmpl.rxq.crc_present ? "disabled" : "enabled",
1055 tmpl.rxq.crc_present << 2);
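	/* crc_present << 2 is the 4-byte Ethernet FCS length when the CRC is
	 * kept by hardware, 0 when it is stripped. */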
1056 if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1057 ; /* Nothing else to do. */
1058 else if (priv->hw_padding) {
1059 INFO("%p: enabling packet padding on queue %p",
1060 (void *)dev, (void *)rxq_ctrl);
1061 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1062 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1064 WARN("%p: packet padding has been requested but is not"
1065 " supported, make sure MLNX_OFED and firmware are"
1069 tmpl.rxq.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1070 if (tmpl.rxq.wq == NULL) {
1071 ret = (errno ? errno : EINVAL);
1072 ERROR("%p: WQ creation failure: %s",
1073 (void *)dev, strerror(ret));
1076 ret = rxq_alloc_elts(&tmpl, desc, NULL);
1078 ERROR("%p: RXQ allocation failed: %s",
1079 (void *)dev, strerror(ret));
1083 tmpl.rxq.port_id = dev->data->port_id;
1084 DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1085 attr.params = (struct ibv_exp_query_intf_params){
1086 .intf_scope = IBV_EXP_INTF_GLOBAL,
1088 .intf = IBV_EXP_INTF_CQ,
1091 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1092 if (tmpl.if_cq == NULL) {
1093 ERROR("%p: CQ interface family query failed with status %d",
1094 (void *)dev, status);
1097 attr.params = (struct ibv_exp_query_intf_params){
1098 .intf_scope = IBV_EXP_INTF_GLOBAL,
1099 .intf = IBV_EXP_INTF_WQ,
1102 tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1103 if (tmpl.if_wq == NULL) {
1104 ERROR("%p: WQ interface family query failed with status %d",
1105 (void *)dev, status);
1108 /* Change queue state to ready. */
1109 mod = (struct ibv_exp_wq_attr){
1110 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1111 .wq_state = IBV_EXP_WQS_RDY,
1113 ret = ibv_exp_modify_wq(tmpl.rxq.wq, &mod);
1115 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1116 (void *)dev, strerror(ret));
1120 elts = tmpl.rxq.elts;
1121 for (i = 0; (i != RTE_DIM(*elts)); ++i) {
1122 ret = tmpl.if_wq->recv_burst(
1130 ERROR("%p: failed to post SGEs with error %d",
1132 /* Set ret explicitly: recv_burst() does not return a valid errno value. */
1136 /* Clean up rxq in case we're reinitializing it. */
1137 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1138 rxq_cleanup(rxq_ctrl);
1140 DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1142 /* Assign RX functions to the queue. */
1143 rxq_ctrl->rxq.poll = rxq_ctrl->if_cq->poll_length_flags_cvlan;
1144 rxq_ctrl->rxq.recv = rxq_ctrl->if_wq->recv_burst;
1153 * DPDK callback to configure an RX queue.
1156 * Pointer to Ethernet device structure.
1160 * Number of descriptors to configure in queue.
1162 * NUMA socket on which memory must be allocated.
1164 * Thresholds parameters.
1166 * Memory pool for buffer allocations.
1169 * 0 on success, negative errno value on failure.
1172 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1173 unsigned int socket, const struct rte_eth_rxconf *conf,
1174 struct rte_mempool *mp)
1176 struct priv *priv = dev->data->dev_private;
1177 struct rxq *rxq = (*priv->rxqs)[idx];
1178 struct rxq_ctrl *rxq_ctrl;
1181 if (mlx5_is_secondary())
1182 return -E_RTE_SECONDARY;
1185 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1186 DEBUG("%p: configuring queue %u for %u descriptors",
1187 (void *)dev, idx, desc);
1188 if (idx >= priv->rxqs_n) {
1189 ERROR("%p: queue index out of range (%u >= %u)",
1190 (void *)dev, idx, priv->rxqs_n);
1195 DEBUG("%p: reusing already allocated queue index %u (%p)",
1196 (void *)dev, idx, (void *)rxq);
1197 if (priv->started) {
1201 (*priv->rxqs)[idx] = NULL;
1202 rxq_cleanup(rxq_ctrl);
1204 rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl), 0,
1206 if (rxq_ctrl == NULL) {
1207 ERROR("%p: unable to allocate queue index %u",
1213 ret = rxq_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1217 rxq_ctrl->rxq.stats.idx = idx;
1218 DEBUG("%p: adding RX queue %p to list",
1219 (void *)dev, (void *)rxq_ctrl);
1220 (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1221 /* Update receive callback. */
1222 dev->rx_pkt_burst = mlx5_rx_burst;
1229 * DPDK callback to release an RX queue.
1232 * Generic RX queue pointer.
1235 mlx5_rx_queue_release(void *dpdk_rxq)
1237 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1238 struct rxq_ctrl *rxq_ctrl;
1242 if (mlx5_is_secondary())
1247 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1250 for (i = 0; (i != priv->rxqs_n); ++i)
1251 if ((*priv->rxqs)[i] == rxq) {
1252 DEBUG("%p: removing RX queue %p from list",
1253 (void *)priv->dev, (void *)rxq);
1254 (*priv->rxqs)[i] = NULL;
1257 rxq_cleanup(rxq_ctrl);
1263 * DPDK callback for RX in secondary processes.
1265 * This function configures all queues from primary process information
1266 * if necessary before reverting to the normal RX burst callback.
1269 * Generic pointer to RX queue structure.
1271 * Array to store received packets.
1273 * Maximum number of packets in array.
1276 * Number of packets successfully received (<= pkts_n).
1279 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1282 struct rxq *rxq = dpdk_rxq;
1283 struct priv *priv = mlx5_secondary_data_setup(rxq->priv);
1284 struct priv *primary_priv;
1290 mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1291 /* Look for queue index in both private structures. */
1292 for (index = 0; index != priv->rxqs_n; ++index)
1293 if (((*primary_priv->rxqs)[index] == rxq) ||
1294 ((*priv->rxqs)[index] == rxq))
1296 if (index == priv->rxqs_n)
1298 rxq = (*priv->rxqs)[index];
1299 return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
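/*
 * Illustrative sketch (not part of the driver): the generic ethdev calls an
 * application uses to reach mlx5_rx_queue_setup() and mlx5_rx_burst().
 * Device configuration and start are omitted for brevity; the port number,
 * descriptor count and mempool are hypothetical.
 */
static void __rte_unused
example_rx_path(uint8_t port_id, struct rte_mempool *mp)
{
	struct rte_mbuf *pkts[32];
	uint16_t nb;

	/* Resolves to mlx5_rx_queue_setup() for mlx5 ports. */
	if (rte_eth_rx_queue_setup(port_id, 0, 512,
				   rte_eth_dev_socket_id(port_id), NULL, mp))
		return;
	/* Once the port is started, this resolves to mlx5_rx_burst(). */
	nb = rte_eth_rx_burst(port_id, 0, pkts, RTE_DIM(pkts));
	while (nb--)
		rte_pktmbuf_free(pkts[nb]);
}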