1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2015 6WIND S.A.
3 * Copyright 2015 Mellanox Technologies, Ltd
16 #include <linux/rtnetlink.h>
19 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
21 #pragma GCC diagnostic ignored "-Wpedantic"
23 #include <infiniband/verbs.h>
25 #pragma GCC diagnostic error "-Wpedantic"
28 #include <rte_malloc.h>
29 #include <rte_ethdev_driver.h>
30 #include <rte_ethdev_pci.h>
32 #include <rte_bus_pci.h>
33 #include <rte_common.h>
34 #include <rte_config.h>
35 #include <rte_kvargs.h>
36 #include <rte_rwlock.h>
37 #include <rte_spinlock.h>
38 #include <rte_string_fns.h>
39 #include <rte_alarm.h>
42 #include "mlx5_utils.h"
43 #include "mlx5_rxtx.h"
44 #include "mlx5_autoconf.h"
45 #include "mlx5_defs.h"
46 #include "mlx5_glue.h"
48 #include "mlx5_flow.h"
50 /* Device parameter to enable RX completion queue compression. */
51 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
53 /* Device parameter to enable RX completion entry padding to 128B. */
54 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
56 /* Device parameter to enable padding Rx packet to cacheline size. */
57 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
59 /* Device parameter to enable Multi-Packet Rx queue. */
60 #define MLX5_RX_MPRQ_EN "mprq_en"
62 /* Device parameter to configure log 2 of the number of strides for MPRQ. */
63 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
65 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */
66 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
68 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
69 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
71 /* Device parameter to configure inline send. Deprecated, ignored. */
72 #define MLX5_TXQ_INLINE "txq_inline"
74 /* Device parameter to limit packet size to inline with ordinary SEND. */
75 #define MLX5_TXQ_INLINE_MAX "txq_inline_max"
77 /* Device parameter to configure minimal data size to inline. */
78 #define MLX5_TXQ_INLINE_MIN "txq_inline_min"
80 /* Device parameter to limit packet size to inline with Enhanced MPW. */
81 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
84 * Device parameter to configure the number of TX queues threshold for
85 * enabling inline send.
87 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
90 * Device parameter to configure the number of TX queues threshold for
91 * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
93 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
95 /* Device parameter to enable multi-packet send WQEs. */
96 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
99 * Device parameter to force doorbell register mapping
100 * to non-cached region eliminating the extra write memory barrier.
102 #define MLX5_TX_DB_NC "tx_db_nc"
105 * Device parameter to include 2 dsegs in the title WQEBB.
106 * Deprecated, ignored.
108 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
111 * Device parameter to limit the size of inlining packet.
112 * Deprecated, ignored.
114 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
117 * Device parameter to enable hardware Tx vector.
118 * Deprecated, ignored (no vectorized Tx routines anymore).
120 #define MLX5_TX_VEC_EN "tx_vec_en"
122 /* Device parameter to enable hardware Rx vector. */
123 #define MLX5_RX_VEC_EN "rx_vec_en"
125 /* Allow L3 VXLAN flow creation. */
126 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
128 /* Activate DV E-Switch flow steering. */
129 #define MLX5_DV_ESW_EN "dv_esw_en"
131 /* Activate DV flow steering. */
132 #define MLX5_DV_FLOW_EN "dv_flow_en"
134 /* Enable extensive flow metadata support. */
135 #define MLX5_DV_XMETA_EN "dv_xmeta_en"
137 /* Activate Netlink support in VF mode. */
138 #define MLX5_VF_NL_EN "vf_nl_en"
140 /* Enable extending memsegs when creating a MR. */
141 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
143 /* Select port representors to instantiate. */
144 #define MLX5_REPRESENTOR "representor"
146 /* Device parameter to configure the maximum number of dump files per queue. */
147 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
149 /* Configure timeout of LRO session (in microseconds). */
150 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
152 #ifndef HAVE_IBV_MLX5_MOD_MPW
153 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
154 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
157 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
158 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
161 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
163 /* Shared memory between primary and secondary processes. */
164 struct mlx5_shared_data *mlx5_shared_data;
166 /* Spinlock for mlx5_shared_data allocation. */
167 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
169 /* Process local data for secondary processes. */
170 static struct mlx5_local_data mlx5_local_data;
172 /** Driver-specific log messages type. */
175 /** Data associated with devices to spawn. */
/* One descriptor per IB port/representor discovered during probe;
 * NOTE(review): the closing brace of the struct is not visible in this
 * excerpt. */
176 struct mlx5_dev_spawn_data {
177 uint32_t ifindex; /**< Network interface index. */
178 uint32_t max_port; /**< IB device maximal port index. */
179 uint32_t ibv_port; /**< IB device physical port index. */
180 int pf_bond; /**< bonding device PF index. < 0 - no bonding */
181 struct mlx5_switch_info info; /**< Switch information. */
182 struct ibv_device *ibv_dev; /**< Associated IB device. */
183 struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
184 struct rte_pci_device *pci_dev; /**< Backend PCI device. */
187 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
188 static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;
190 #define MLX5_FLOW_MIN_ID_POOL_SIZE 512
191 #define MLX5_ID_GENERATION_ARRAY_FACTOR 16
193 #define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
196 * Allocate ID pool structure.
199 * Pointer to pool object, NULL value otherwise.
/*
 * Allocate and initialize a flow ID pool: the pool structure plus an
 * initial free array of MLX5_FLOW_MIN_ID_POOL_SIZE uint32_t slots.
 * curr == free_arr means the free array is empty; new IDs are then
 * minted from base_index (see mlx5_flow_id_get()).
 * NOTE(review): NULL-check braces and return statements fall in lines
 * missing from this excerpt.
 */
201 struct mlx5_flow_id_pool *
202 mlx5_flow_id_pool_alloc(void)
204 struct mlx5_flow_id_pool *pool;
207 pool = rte_zmalloc("id pool allocation", sizeof(*pool),
208 RTE_CACHE_LINE_SIZE);
210 DRV_LOG(ERR, "can't allocate id pool");
214 mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
215 RTE_CACHE_LINE_SIZE);
217 DRV_LOG(ERR, "can't allocate mem for id pool");
221 pool->free_arr = mem;
222 pool->curr = pool->free_arr;
223 pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
224 pool->base_index = 0;
232 * Release ID pool structure.
235 * Pointer to flow id pool object to free.
/* Free the pool's ID array; presumably the pool object itself is freed
 * on a trailing line not visible in this excerpt — TODO confirm. */
238 mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
240 rte_free(pool->free_arr);
248 * Pointer to flow id pool.
253 * 0 on success, error value otherwise.
/*
 * Obtain a flow ID: pop one from the free array when available,
 * otherwise mint a fresh ID by incrementing base_index. Fails (logs
 * "no free id") once base_index reaches UINT32_MAX.
 */
256 mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
258 if (pool->curr == pool->free_arr) {
259 if (pool->base_index == UINT32_MAX) {
261 DRV_LOG(ERR, "no free id");
264 *id = ++pool->base_index;
267 *id = *(--pool->curr);
275 * Pointer to flow id pool.
280 * 0 on success, error value otherwise.
/*
 * Return an ID to the pool's free array. When the array is full
 * (curr == last) it is grown by MLX5_ID_GENERATION_ARRAY_FACTOR via
 * malloc+memcpy+free; the push of `id` itself is on a line missing
 * from this excerpt.
 */
283 mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
289 if (pool->curr == pool->last) {
290 size = pool->curr - pool->free_arr;
291 size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
292 assert(size2 > size);
293 mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
295 DRV_LOG(ERR, "can't allocate mem for id pool");
299 memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
300 rte_free(pool->free_arr);
301 pool->free_arr = mem;
302 pool->curr = pool->free_arr + size;
303 pool->last = pool->free_arr + size2;
311 * Initialize the counters management structure.
314 * Pointer to mlx5_ibv_shared object to free
/* Initialize the counter-management lists: the global flow-counter list
 * and every per-container pool list. */
317 mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
321 TAILQ_INIT(&sh->cmng.flow_counters);
322 for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i)
323 TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
327 * Destroy all the resources allocated for a counter memory management.
330 * Pointer to the memory management structure.
/*
 * Destroy one counter-statistics memory-management object: unlink it
 * from its list, destroy the DevX object and deregister the umem.
 * The raw data buffer pointer is captured into `mem` first; its
 * rte_free falls on a line not visible in this excerpt.
 */
333 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
335 uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
337 LIST_REMOVE(mng, next);
338 claim_zero(mlx5_devx_cmd_destroy(mng->dm));
339 claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
344 * Close and release all the resources of the counters management.
347 * Pointer to mlx5_ibv_shared object to free.
/*
 * Release everything owned by the counters management: cancel the
 * pending query alarm, destroy every pool in every container (flow
 * actions and, for non-batch containers, per-counter dcs objects),
 * free the container arrays, destroy all stats memory managers, and
 * finally zero the whole cmng structure.
 * NOTE(review): loop braces and several claim_zero() openers fall in
 * lines missing from this excerpt.
 */
350 mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
352 struct mlx5_counter_stats_mem_mng *mng;
359 rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
360 if (rte_errno != EINPROGRESS)
364 for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
365 struct mlx5_flow_counter_pool *pool;
/* Odd containers are "batch" pools sharing one min_dcs object. */
366 uint32_t batch = !!(i % 2);
368 if (!sh->cmng.ccont[i].pools)
370 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
375 (mlx5_devx_cmd_destroy(pool->min_dcs));
377 for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
378 if (pool->counters_raw[j].action)
380 (mlx5_glue->destroy_flow_action
381 (pool->counters_raw[j].action));
382 if (!batch && pool->counters_raw[j].dcs)
383 claim_zero(mlx5_devx_cmd_destroy
384 (pool->counters_raw[j].dcs));
386 TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool,
389 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
391 rte_free(sh->cmng.ccont[i].pools);
393 mng = LIST_FIRST(&sh->cmng.mem_mngs);
395 mlx5_flow_destroy_counter_stat_mem_mng(mng);
396 mng = LIST_FIRST(&sh->cmng.mem_mngs);
398 memset(&sh->cmng, 0, sizeof(sh->cmng));
402 * Extract pdn of PD object using DV API.
405 * Pointer to the verbs PD object.
407 * Pointer to the PD object number variable.
410 * 0 on success, error value otherwise.
412 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
/*
 * Extract the PD number (pdn) from a verbs PD via the mlx5dv object
 * query API. Logs and returns an error when dv_init_obj() fails.
 * NOTE(review): the obj.pd.in assignment, *pdn store and return
 * statements fall in lines missing from this excerpt.
 */
414 mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
416 struct mlx5dv_obj obj;
417 struct mlx5dv_pd pd_info;
421 obj.pd.out = &pd_info;
422 ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
424 DRV_LOG(DEBUG, "Fail to get PD object info");
430 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
/*
 * Set MLX5_SHUT_UP_BF according to config->dbnc before the IB device
 * is opened (rdma-core samples the variable at open time). The prior
 * state of the variable is captured in `value` so that
 * mlx5_restore_doorbell_mapping_env() can put it back; presumably
 * `value` is returned — the return line is not visible in this excerpt.
 */
433 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
438 assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
439 /* Get environment variable to store. */
440 env = getenv(MLX5_SHUT_UP_BF);
441 value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
442 if (config->dbnc == MLX5_ARG_UNSET)
443 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
445 setenv(MLX5_SHUT_UP_BF, config->dbnc ? "1" : "0", 1);
/*
 * Restore MLX5_SHUT_UP_BF to the state saved by
 * mlx5_config_doorbell_mapping_env(); a no-op when dbnc was unset
 * (the variable was not modified in that case).
 */
450 mlx5_restore_doorbell_mapping_env(const struct mlx5_dev_config *config,
453 assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
454 if (config->dbnc == MLX5_ARG_UNSET)
456 /* Restore the original environment variable state. */
457 if (value == MLX5_ARG_UNSET)
458 unsetenv(MLX5_SHUT_UP_BF);
460 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
464 * Allocate shared IB device context. If there is multiport device the
465 * master and representors will share this context, if there is single
466 * port dedicated IB device, the context will be used by only given
467 * port due to unification.
469 * Routine first searches the context for the specified IB device name,
470 * if found the shared context assumed and reference counter is incremented.
471 * If no context found the new one is created and initialized with specified
472 * IB device context and parameters.
475 * Pointer to the IB device attributes (name, port, etc).
477 * Pointer to device configuration structure.
480 * Pointer to mlx5_ibv_shared object on success,
481 * otherwise NULL and rte_errno is set.
/*
 * Find-or-create the shared IB context for the device described by
 * `spawn`. Holds mlx5_ibv_list_mutex across the list lookup and the
 * whole creation path. On any failure the partially built resources
 * are unwound in reverse order (tis, td, pd, ctx, flow_id_pool).
 * NOTE(review): this excerpt is sparsely sampled — braces, goto
 * labels, returns and refcount handling between the visible lines are
 * missing.
 */
483 static struct mlx5_ibv_shared *
484 mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
485 const struct mlx5_dev_config *config)
487 struct mlx5_ibv_shared *sh;
491 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
492 struct mlx5_devx_tis_attr tis_attr = { 0 };
496 /* Secondary process should not create the shared context. */
497 assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
498 pthread_mutex_lock(&mlx5_ibv_list_mutex);
499 /* Search for IB context by device name. */
500 LIST_FOREACH(sh, &mlx5_ibv_list, next) {
501 if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
506 /* No device found, we have to create new shared context. */
507 assert(spawn->max_port);
/* Port array (mlx5_ibv_shared_port) is allocated inline after sh. */
508 sh = rte_zmalloc("ethdev shared ib context",
509 sizeof(struct mlx5_ibv_shared) +
511 sizeof(struct mlx5_ibv_shared_port),
512 RTE_CACHE_LINE_SIZE);
514 DRV_LOG(ERR, "shared context allocation failure");
519 * Configure environment variable "MLX5_BF_SHUT_UP"
520 * before the device creation. The rdma_core library
521 * checks the variable at device creation and
522 * stores the result internally.
524 dbmap_env = mlx5_config_doorbell_mapping_env(config);
525 /* Try to open IB device with DV first, then usual Verbs. */
527 sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
530 DRV_LOG(DEBUG, "DevX is supported");
531 /* The device is created, no need for environment. */
532 mlx5_restore_doorbell_mapping_env(config, dbmap_env);
534 /* The environment variable is still configured. */
535 sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
536 err = errno ? errno : ENODEV;
538 * The environment variable is not needed anymore,
539 * all device creation attempts are completed.
541 mlx5_restore_doorbell_mapping_env(config, dbmap_env);
545 DRV_LOG(DEBUG, "DevX is NOT supported");
547 err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
549 DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
553 sh->max_port = spawn->max_port;
554 strncpy(sh->ibdev_name, sh->ctx->device->name,
555 sizeof(sh->ibdev_name));
556 strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
557 sizeof(sh->ibdev_path));
558 pthread_mutex_init(&sh->intr_mutex, NULL);
560 * Setting port_id to max unallowed value means
561 * there is no interrupt subhandler installed for
562 * the given port index i.
564 for (i = 0; i < sh->max_port; i++) {
565 sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
566 sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
568 sh->pd = mlx5_glue->alloc_pd(sh->ctx);
569 if (sh->pd == NULL) {
570 DRV_LOG(ERR, "PD allocation failure");
574 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
576 err = mlx5_get_pdn(sh->pd, &sh->pdn);
578 DRV_LOG(ERR, "Fail to extract pdn from PD");
/* Transport domain + TIS back hairpin/Tx DevX objects. */
581 sh->td = mlx5_devx_cmd_create_td(sh->ctx);
583 DRV_LOG(ERR, "TD allocation failure");
587 tis_attr.transport_domain = sh->td->id;
588 sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
590 DRV_LOG(ERR, "TIS allocation failure");
595 sh->flow_id_pool = mlx5_flow_id_pool_alloc();
596 if (!sh->flow_id_pool) {
597 DRV_LOG(ERR, "can't create flow id pool");
601 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
603 * Once the device is added to the list of memory event
604 * callback, its global MR cache table cannot be expanded
605 * on the fly because of deadlock. If it overflows, lookup
606 * should be done by searching MR list linearly, which is slow.
608 * At this point the device is not added to the memory
609 * event list yet, context is just being created.
611 err = mlx5_mr_btree_init(&sh->mr.cache,
612 MLX5_MR_BTREE_CACHE_N * 2,
613 spawn->pci_dev->device.numa_node);
618 mlx5_flow_counters_mng_init(sh);
619 /* Add device to memory callback list. */
620 rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
621 LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
623 rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
624 /* Add context to the global device list. */
625 LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
627 pthread_mutex_unlock(&mlx5_ibv_list_mutex);
/* Error unwind: destroy in reverse creation order. */
630 pthread_mutex_unlock(&mlx5_ibv_list_mutex);
633 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
635 claim_zero(mlx5_devx_cmd_destroy(sh->td));
637 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
639 claim_zero(mlx5_glue->close_device(sh->ctx));
640 if (sh->flow_id_pool)
641 mlx5_flow_id_pool_release(sh->flow_id_pool);
649 * Free shared IB device context. Decrement counter and if zero free
650 * all allocated resources and close handles.
653 * Pointer to mlx5_ibv_shared object to free
/*
 * Drop a reference on a shared IB context and, when it reaches zero,
 * release everything it owns: memory-event registration, counters
 * management, interrupt handlers, PD/TIS/TD, the device context and
 * the flow ID pool. The global list mutex is held for the duration.
 * NOTE(review): refcount decrement, debug-only list check guards and
 * several braces fall in lines missing from this excerpt.
 */
656 mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
658 pthread_mutex_lock(&mlx5_ibv_list_mutex);
660 /* Check the object presence in the list. */
661 struct mlx5_ibv_shared *lctx;
663 LIST_FOREACH(lctx, &mlx5_ibv_list, next)
668 DRV_LOG(ERR, "Freeing non-existing shared IB context");
674 /* Secondary process should not free the shared context. */
675 assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
678 /* Release created Memory Regions. */
680 /* Remove from memory callback device list. */
681 rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
682 LIST_REMOVE(sh, mem_event_cb);
683 rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
684 /* Remove context from the global device list. */
685 LIST_REMOVE(sh, next);
687 * Ensure there is no async event handler installed.
688 * Only primary process handles async device events.
690 mlx5_flow_counters_mng_close(sh);
691 assert(!sh->intr_cnt);
693 mlx5_intr_callback_unregister
694 (&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
695 #ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
696 if (sh->devx_intr_cnt) {
697 if (sh->intr_handle_devx.fd)
698 rte_intr_callback_unregister(&sh->intr_handle_devx,
699 mlx5_dev_interrupt_handler_devx, sh);
701 mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
704 pthread_mutex_destroy(&sh->intr_mutex);
706 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
708 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
710 claim_zero(mlx5_devx_cmd_destroy(sh->td));
712 claim_zero(mlx5_glue->close_device(sh->ctx));
713 if (sh->flow_id_pool)
714 mlx5_flow_id_pool_release(sh->flow_id_pool);
717 pthread_mutex_unlock(&mlx5_ibv_list_mutex);
721 * Initialize DR related data within private structure.
722 * Routine checks the reference counter and does actual
723 * resources creation/initialization only if counter is zero.
726 * Pointer to the private device data structure.
729 * Zero on success, positive error code otherwise.
/*
 * Create the shared DR (direct rules) resources on first use: NIC Rx
 * and Tx domains, optionally the FDB domain plus E-Switch drop action
 * when dv_esw_en is set, the flow-table hash list, and the pop-VLAN
 * action. On any failure everything created so far is rolled back.
 * Compiled out (no-op) without HAVE_MLX5DV_DR.
 * NOTE(review): the dv_refcnt early-return and error-label lines are
 * missing from this excerpt.
 */
732 mlx5_alloc_shared_dr(struct mlx5_priv *priv)
734 #ifdef HAVE_MLX5DV_DR
735 struct mlx5_ibv_shared *sh = priv->sh;
738 char s[MLX5_HLIST_NAMESIZE];
742 /* Shared DV/DR structures is already initialized. */
747 /* Reference counter is zero, we should initialize structures. */
748 domain = mlx5_glue->dr_create_domain(sh->ctx,
749 MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
751 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
755 sh->rx_domain = domain;
756 domain = mlx5_glue->dr_create_domain(sh->ctx,
757 MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
759 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
763 pthread_mutex_init(&sh->dv_mutex, NULL);
764 sh->tx_domain = domain;
765 #ifdef HAVE_MLX5DV_DR_ESWITCH
766 if (priv->config.dv_esw_en) {
767 domain = mlx5_glue->dr_create_domain
768 (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
770 DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
774 sh->fdb_domain = domain;
775 sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
778 snprintf(s, sizeof(s) - 1, "%s_flow_table", priv->sh->ibdev_name);
779 sh->flow_tbls = mlx5_hlist_create(s,
780 MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
781 if (!sh->flow_tbls) {
782 DRV_LOG(ERR, "flow tables with hash creation failed.\n")
786 sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
792 /* Rollback the created objects. */
794 mlx5_glue->dr_destroy_domain(sh->rx_domain);
795 sh->rx_domain = NULL;
798 mlx5_glue->dr_destroy_domain(sh->tx_domain);
799 sh->tx_domain = NULL;
801 if (sh->fdb_domain) {
802 mlx5_glue->dr_destroy_domain(sh->fdb_domain);
803 sh->fdb_domain = NULL;
805 if (sh->esw_drop_action) {
806 mlx5_glue->destroy_flow_action(sh->esw_drop_action);
807 sh->esw_drop_action = NULL;
809 if (sh->pop_vlan_action) {
810 mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
811 sh->pop_vlan_action = NULL;
821 * Destroy DR related data within private structure.
824 * Pointer to the private device data structure.
/*
 * Mirror of mlx5_alloc_shared_dr(): decrement dv_refcnt and, when it
 * hits zero, destroy the flow-table hash list, the Rx/Tx (and FDB)
 * domains, the E-Switch drop and pop-VLAN actions, and the dv mutex.
 * No-op when DR was never initialized for this priv.
 */
827 mlx5_free_shared_dr(struct mlx5_priv *priv)
829 #ifdef HAVE_MLX5DV_DR
830 struct mlx5_ibv_shared *sh;
832 if (!priv->dr_shared)
837 assert(sh->dv_refcnt);
838 if (sh->dv_refcnt && --sh->dv_refcnt)
841 /* flow table entries should be handled properly before. */
842 mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
843 sh->flow_tbls = NULL;
846 mlx5_glue->dr_destroy_domain(sh->rx_domain);
847 sh->rx_domain = NULL;
850 mlx5_glue->dr_destroy_domain(sh->tx_domain);
851 sh->tx_domain = NULL;
853 #ifdef HAVE_MLX5DV_DR_ESWITCH
854 if (sh->fdb_domain) {
855 mlx5_glue->dr_destroy_domain(sh->fdb_domain);
856 sh->fdb_domain = NULL;
858 if (sh->esw_drop_action) {
859 mlx5_glue->destroy_flow_action(sh->esw_drop_action);
860 sh->esw_drop_action = NULL;
863 if (sh->pop_vlan_action) {
864 mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
865 sh->pop_vlan_action = NULL;
867 pthread_mutex_destroy(&sh->dv_mutex);
874 * Initialize shared data between primary and secondary process.
876 * A memzone is reserved by primary process and secondary processes attach to
880 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * One-time setup of the primary/secondary shared memzone. Primary
 * reserves and zeroes it; secondaries look it up and reset their
 * process-local data. Guarded by mlx5_shared_data_lock; idempotent
 * once mlx5_shared_data is non-NULL.
 */
883 mlx5_init_shared_data(void)
885 const struct rte_memzone *mz;
888 rte_spinlock_lock(&mlx5_shared_data_lock);
889 if (mlx5_shared_data == NULL) {
890 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
891 /* Allocate shared memory. */
892 mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
893 sizeof(*mlx5_shared_data),
897 "Cannot allocate mlx5 shared data");
901 mlx5_shared_data = mz->addr;
902 memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
903 rte_spinlock_init(&mlx5_shared_data->lock);
905 /* Lookup allocated shared memory. */
906 mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
909 "Cannot attach mlx5 shared data");
913 mlx5_shared_data = mz->addr;
914 memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
918 rte_spinlock_unlock(&mlx5_shared_data_lock);
923 * Retrieve integer value from environment variable.
926 * Environment variable name.
929 * Integer value, 0 if the variable is not set.
/* Read env var `name` as an integer; per the header comment above,
 * returns 0 when unset (conversion/return lines not visible here). */
932 mlx5_getenv_int(const char *name)
934 const char *val = getenv(name);
942 * Verbs callback to allocate a memory. This function should allocate the space
943 * according to the size provided residing inside a huge page.
944 * Please note that all allocation must respect the alignment from libmlx5
945 * (i.e. currently sysconf(_SC_PAGESIZE)).
948 * The size in bytes of the memory to allocate.
950 * A pointer to the callback data.
953 * Allocated buffer, NULL otherwise and rte_errno is set.
/*
 * Verbs allocation callback: page-aligned allocation on the NUMA
 * socket of the queue currently being created (taken from
 * priv->verbs_alloc_ctx), falling back to SOCKET_ID_ANY when no
 * Tx/Rx queue context is active.
 */
956 mlx5_alloc_verbs_buf(size_t size, void *data)
958 struct mlx5_priv *priv = data;
960 size_t alignment = sysconf(_SC_PAGESIZE);
961 unsigned int socket = SOCKET_ID_ANY;
963 if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
964 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
966 socket = ctrl->socket;
967 } else if (priv->verbs_alloc_ctx.type ==
968 MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
969 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
971 socket = ctrl->socket;
973 assert(data != NULL);
974 ret = rte_malloc_socket(__func__, size, alignment, socket);
981 * Verbs callback to free a memory.
984 * A pointer to the memory to free.
986 * A pointer to the callback data.
/* Verbs free callback paired with mlx5_alloc_verbs_buf(); the
 * rte_free(ptr) call falls on a line not visible in this excerpt. */
989 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
991 assert(data != NULL);
996 * DPDK callback to add udp tunnel port
999 * A pointer to eth_dev
1000 * @param[in] udp_tunnel
1001 * A pointer to udp tunnel
1004 * 0 on valid udp ports and tunnels, -ENOTSUP otherwise.
/*
 * Accept only the IANA default tunnel ports: 4789 for VXLAN and 4790
 * for VXLAN-GPE; anything else is rejected (per the header comment,
 * with -ENOTSUP).
 */
1007 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
1008 struct rte_eth_udp_tunnel *udp_tunnel)
1010 assert(udp_tunnel != NULL);
1011 if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
1012 udp_tunnel->udp_port == 4789)
1014 if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
1015 udp_tunnel->udp_port == 4790)
1021 * Initialize process private data structure.
1024 * Pointer to Ethernet device structure.
1027 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Allocate the per-process private area: the mlx5_proc_priv struct
 * followed inline by one UAR table slot (void *) per Tx queue, on the
 * device's NUMA node. Stores the result in dev->process_private.
 */
1030 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1032 struct mlx5_priv *priv = dev->data->dev_private;
1033 struct mlx5_proc_priv *ppriv;
1037 * UAR register table follows the process private structure. BlueFlame
1038 * registers for Tx queues are stored in the table.
1041 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1042 ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1043 RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1048 ppriv->uar_table_sz = ppriv_size;
1049 dev->process_private = ppriv;
1054 * Un-initialize process private data structure.
1057 * Pointer to Ethernet device structure.
/* Free the per-process private area and clear the pointer; safe to
 * call when none was allocated. */
1060 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1062 if (!dev->process_private)
1064 rte_free(dev->process_private);
1065 dev->process_private = NULL;
1069 * DPDK callback to close the device.
1071 * Destroy all queues and objects, free memory.
1074 * Pointer to Ethernet device structure.
/*
 * Full device teardown: stop interrupts and traffic, flush flows and
 * meters, neuter the burst functions, release all Rx/Tx queues,
 * per-process data, DR resources, RSS state and Netlink sockets, then
 * free the shared IB context last, run leak-verification checks, and
 * finally release the switch domain when no sibling port still uses it.
 * NOTE(review): several braces, DRV_LOG continuation lines and ret
 * checks fall in lines missing from this excerpt.
 */
1077 mlx5_dev_close(struct rte_eth_dev *dev)
1079 struct mlx5_priv *priv = dev->data->dev_private;
1083 DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1085 ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
1086 /* In case mlx5_dev_stop() has not been called. */
1087 mlx5_dev_interrupt_handler_uninstall(dev);
1088 mlx5_dev_interrupt_handler_devx_uninstall(dev);
1089 mlx5_traffic_disable(dev);
1090 mlx5_flow_flush(dev, NULL);
1091 mlx5_flow_meter_flush(dev, NULL);
1092 /* Prevent crashes when queues are still in use. */
1093 dev->rx_pkt_burst = removed_rx_burst;
1094 dev->tx_pkt_burst = removed_tx_burst;
1096 /* Disable datapath on secondary process. */
1097 mlx5_mp_req_stop_rxtx(dev);
1098 if (priv->rxqs != NULL) {
1099 /* XXX race condition if mlx5_rx_burst() is still running. */
1101 for (i = 0; (i != priv->rxqs_n); ++i)
1102 mlx5_rxq_release(dev, i);
1106 if (priv->txqs != NULL) {
1107 /* XXX race condition if mlx5_tx_burst() is still running. */
1109 for (i = 0; (i != priv->txqs_n); ++i)
1110 mlx5_txq_release(dev, i);
1114 mlx5_proc_priv_uninit(dev);
1115 if (priv->mreg_cp_tbl)
1116 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1117 mlx5_mprq_free_mp(dev);
1118 mlx5_free_shared_dr(priv);
1119 if (priv->rss_conf.rss_key != NULL)
1120 rte_free(priv->rss_conf.rss_key);
1121 if (priv->reta_idx != NULL)
1122 rte_free(priv->reta_idx);
1123 if (priv->config.vf)
1124 mlx5_nl_mac_addr_flush(dev);
1125 if (priv->nl_socket_route >= 0)
1126 close(priv->nl_socket_route);
1127 if (priv->nl_socket_rdma >= 0)
1128 close(priv->nl_socket_rdma);
1129 if (priv->vmwa_context)
1130 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1133 * Free the shared context in last turn, because the cleanup
1134 * routines above may use some shared fields, like
1135 * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
1136 * ifindex if Netlink fails.
1138 mlx5_free_shared_ibctx(priv->sh);
/* Leak checks: each verify() warns if driver objects remain. */
1141 ret = mlx5_hrxq_verify(dev);
1143 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1144 dev->data->port_id);
1145 ret = mlx5_ind_table_obj_verify(dev);
1147 DRV_LOG(WARNING, "port %u some indirection table still remain",
1148 dev->data->port_id);
1149 ret = mlx5_rxq_obj_verify(dev);
1151 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1152 dev->data->port_id);
1153 ret = mlx5_rxq_verify(dev);
1155 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1156 dev->data->port_id);
1157 ret = mlx5_txq_obj_verify(dev);
1159 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1160 dev->data->port_id);
1161 ret = mlx5_txq_verify(dev);
1163 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1164 dev->data->port_id);
1165 ret = mlx5_flow_verify(dev);
1167 DRV_LOG(WARNING, "port %u some flows still remain",
1168 dev->data->port_id);
1169 if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
/* Free the switch domain only if no other port shares it. */
1173 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1174 struct mlx5_priv *opriv =
1175 rte_eth_devices[port_id].data->dev_private;
1178 opriv->domain_id != priv->domain_id ||
1179 &rte_eth_devices[port_id] == dev)
1185 claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1187 memset(priv, 0, sizeof(*priv));
1188 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1190 * Reset mac_addrs to NULL such that it is not freed as part of
1191 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1192 * it is freed when dev_private is freed.
1194 dev->data->mac_addrs = NULL;
/* Default eth_dev callback table (full feature set; compare with the
 * secondary-process and isolated-mode tables below). */
1197 const struct eth_dev_ops mlx5_dev_ops = {
1198 .dev_configure = mlx5_dev_configure,
1199 .dev_start = mlx5_dev_start,
1200 .dev_stop = mlx5_dev_stop,
1201 .dev_set_link_down = mlx5_set_link_down,
1202 .dev_set_link_up = mlx5_set_link_up,
1203 .dev_close = mlx5_dev_close,
1204 .promiscuous_enable = mlx5_promiscuous_enable,
1205 .promiscuous_disable = mlx5_promiscuous_disable,
1206 .allmulticast_enable = mlx5_allmulticast_enable,
1207 .allmulticast_disable = mlx5_allmulticast_disable,
1208 .link_update = mlx5_link_update,
1209 .stats_get = mlx5_stats_get,
1210 .stats_reset = mlx5_stats_reset,
1211 .xstats_get = mlx5_xstats_get,
1212 .xstats_reset = mlx5_xstats_reset,
1213 .xstats_get_names = mlx5_xstats_get_names,
1214 .fw_version_get = mlx5_fw_version_get,
1215 .dev_infos_get = mlx5_dev_infos_get,
1216 .read_clock = mlx5_read_clock,
1217 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1218 .vlan_filter_set = mlx5_vlan_filter_set,
1219 .rx_queue_setup = mlx5_rx_queue_setup,
1220 .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1221 .tx_queue_setup = mlx5_tx_queue_setup,
1222 .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1223 .rx_queue_release = mlx5_rx_queue_release,
1224 .tx_queue_release = mlx5_tx_queue_release,
1225 .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1226 .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1227 .mac_addr_remove = mlx5_mac_addr_remove,
1228 .mac_addr_add = mlx5_mac_addr_add,
1229 .mac_addr_set = mlx5_mac_addr_set,
1230 .set_mc_addr_list = mlx5_set_mc_addr_list,
1231 .mtu_set = mlx5_dev_set_mtu,
1232 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1233 .vlan_offload_set = mlx5_vlan_offload_set,
1234 .reta_update = mlx5_dev_rss_reta_update,
1235 .reta_query = mlx5_dev_rss_reta_query,
1236 .rss_hash_update = mlx5_rss_hash_update,
1237 .rss_hash_conf_get = mlx5_rss_hash_conf_get,
1238 .filter_ctrl = mlx5_dev_filter_ctrl,
1239 .rx_descriptor_status = mlx5_rx_descriptor_status,
1240 .tx_descriptor_status = mlx5_tx_descriptor_status,
1241 .rx_queue_count = mlx5_rx_queue_count,
1242 .rx_queue_intr_enable = mlx5_rx_intr_enable,
1243 .rx_queue_intr_disable = mlx5_rx_intr_disable,
1244 .is_removed = mlx5_is_removed,
1245 .udp_tunnel_port_add = mlx5_udp_tunnel_port_add,
1246 .get_module_info = mlx5_get_module_info,
1247 .get_module_eeprom = mlx5_get_module_eeprom,
1248 .hairpin_cap_get = mlx5_hairpin_cap_get,
1249 .mtr_ops_get = mlx5_flow_meter_ops_get,
1252 /* Available operations from secondary process. */
/* Query-only subset: stats, device/module info and descriptor status;
 * no queue setup, start/stop or configuration callbacks. */
1253 static const struct eth_dev_ops mlx5_dev_sec_ops = {
1254 .stats_get = mlx5_stats_get,
1255 .stats_reset = mlx5_stats_reset,
1256 .xstats_get = mlx5_xstats_get,
1257 .xstats_reset = mlx5_xstats_reset,
1258 .xstats_get_names = mlx5_xstats_get_names,
1259 .fw_version_get = mlx5_fw_version_get,
1260 .dev_infos_get = mlx5_dev_infos_get,
1261 .rx_descriptor_status = mlx5_rx_descriptor_status,
1262 .tx_descriptor_status = mlx5_tx_descriptor_status,
1263 .get_module_info = mlx5_get_module_info,
1264 .get_module_eeprom = mlx5_get_module_eeprom,
1267 /* Available operations in flow isolated mode. */
/* Relative to mlx5_dev_ops this table omits RSS/RETA configuration,
 * rx_queue_count, read_clock and udp_tunnel_port_add callbacks. */
1268 const struct eth_dev_ops mlx5_dev_ops_isolate = {
1269 .dev_configure = mlx5_dev_configure,
1270 .dev_start = mlx5_dev_start,
1271 .dev_stop = mlx5_dev_stop,
1272 .dev_set_link_down = mlx5_set_link_down,
1273 .dev_set_link_up = mlx5_set_link_up,
1274 .dev_close = mlx5_dev_close,
1275 .promiscuous_enable = mlx5_promiscuous_enable,
1276 .promiscuous_disable = mlx5_promiscuous_disable,
1277 .allmulticast_enable = mlx5_allmulticast_enable,
1278 .allmulticast_disable = mlx5_allmulticast_disable,
1279 .link_update = mlx5_link_update,
1280 .stats_get = mlx5_stats_get,
1281 .stats_reset = mlx5_stats_reset,
1282 .xstats_get = mlx5_xstats_get,
1283 .xstats_reset = mlx5_xstats_reset,
1284 .xstats_get_names = mlx5_xstats_get_names,
1285 .fw_version_get = mlx5_fw_version_get,
1286 .dev_infos_get = mlx5_dev_infos_get,
1287 .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1288 .vlan_filter_set = mlx5_vlan_filter_set,
1289 .rx_queue_setup = mlx5_rx_queue_setup,
1290 .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1291 .tx_queue_setup = mlx5_tx_queue_setup,
1292 .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1293 .rx_queue_release = mlx5_rx_queue_release,
1294 .tx_queue_release = mlx5_tx_queue_release,
1295 .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1296 .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1297 .mac_addr_remove = mlx5_mac_addr_remove,
1298 .mac_addr_add = mlx5_mac_addr_add,
1299 .mac_addr_set = mlx5_mac_addr_set,
1300 .set_mc_addr_list = mlx5_set_mc_addr_list,
1301 .mtu_set = mlx5_dev_set_mtu,
1302 .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1303 .vlan_offload_set = mlx5_vlan_offload_set,
1304 .filter_ctrl = mlx5_dev_filter_ctrl,
1305 .rx_descriptor_status = mlx5_rx_descriptor_status,
1306 .tx_descriptor_status = mlx5_tx_descriptor_status,
1307 .rx_queue_intr_enable = mlx5_rx_intr_enable,
1308 .rx_queue_intr_disable = mlx5_rx_intr_disable,
1309 .is_removed = mlx5_is_removed,
1310 .get_module_info = mlx5_get_module_info,
1311 .get_module_eeprom = mlx5_get_module_eeprom,
1312 .hairpin_cap_get = mlx5_hairpin_cap_get,
1313 .mtr_ops_get = mlx5_flow_meter_ops_get,
1317 * Verify and store value for device argument.
1320 * Key argument to verify.
1322 * Value associated with key.
1327 * 0 on success, a negative errno value otherwise and rte_errno is set.
mlx5_args_check(const char *key, const char *val, void *opaque)
	struct mlx5_dev_config *config = opaque;
	/* No-op, port representors are processed in mlx5_dev_spawn(). */
	if (!strcmp(MLX5_REPRESENTOR, key))
	/* All other recognized devargs are numeric; base 0 accepts 0x/0 prefixes. */
	tmp = strtoul(val, NULL, 0);
		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
	/* Dispatch on the key; "!!" normalizes boolean flags to 0/1. */
	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
		config->cqe_comp = !!tmp;
	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
		config->cqe_pad = !!tmp;
	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
		config->hw_padding = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
		config->mprq.enabled = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
		config->mprq.stride_num_n = tmp;
	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
		config->mprq.max_memcpy_len = tmp;
	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
		config->mprq.min_rxqs_num = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
		/* Deprecated alias of txq_inline_max, kept for compatibility. */
		DRV_LOG(WARNING, "%s: deprecated parameter,"
				 " converted to txq_inline_max", key);
		config->txq_inline_max = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
		config->txq_inline_max = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
		config->txq_inline_min = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
		config->txq_inline_mpw = tmp;
	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
		config->txqs_inline = tmp;
	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
		/* Obsolete knob: accepted but has no effect. */
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
		config->mps = !!tmp;
	} else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
		config->dbnc = !!tmp;
	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
		/* Deprecated alias of txq_inline_mpw, kept for compatibility. */
		DRV_LOG(WARNING, "%s: deprecated parameter,"
				 " converted to txq_inline_mpw", key);
		config->txq_inline_mpw = tmp;
	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
		config->rx_vec_en = !!tmp;
	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
		config->l3_vxlan_en = !!tmp;
	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
		config->vf_nl_en = !!tmp;
	} else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
		config->dv_esw_en = !!tmp;
	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
		config->dv_flow_en = !!tmp;
	} else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
		/* Only the three defined metadata modes are accepted. */
		if (tmp != MLX5_XMETA_MODE_LEGACY &&
		    tmp != MLX5_XMETA_MODE_META16 &&
		    tmp != MLX5_XMETA_MODE_META32) {
			DRV_LOG(WARNING, "invalid extensive "
					 "metadata parameter");
		config->dv_xmeta_en = tmp;
	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
		config->mr_ext_memseg_en = !!tmp;
	} else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
		config->max_dump_files_num = tmp;
	} else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
		config->lro.timeout = tmp;
		/* Fallback for unrecognized keys. */
		DRV_LOG(WARNING, "%s: unknown parameter", key);
1420 * Parse device parameters.
1423 * Pointer to device configuration structure.
1425 * Device arguments structure.
1428 * 0 on success, a negative errno value otherwise and rte_errno is set.
mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
	/* Devarg keys recognized by this driver. */
	const char **params = (const char *[]){
		MLX5_RXQ_CQE_COMP_EN,
		MLX5_RXQ_CQE_PAD_EN,
		MLX5_RXQ_PKT_PAD_EN,
		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
		MLX5_TXQ_INLINE_MIN,
		MLX5_TXQ_INLINE_MAX,
		MLX5_TXQ_INLINE_MPW,
		MLX5_TXQS_MIN_INLINE,
		MLX5_TXQ_MPW_HDR_DSEG_EN,
		MLX5_TXQ_MAX_INLINE_LEN,
		MLX5_MR_EXT_MEMSEG_EN,
		MLX5_MAX_DUMP_FILES_NUM,
		MLX5_LRO_TIMEOUT_USEC,
	struct rte_kvargs *kvlist;
	/* No devargs given: keep the defaults untouched. */
	if (devargs == NULL)
	/* Following UGLY cast is done to pass checkpatch. */
	kvlist = rte_kvargs_parse(devargs->args, params);
	if (kvlist == NULL) {
	/* Process parameters. */
	for (i = 0; (params[i] != NULL); ++i) {
		if (rte_kvargs_count(kvlist, params[i])) {
			/* mlx5_args_check() stores the value into *config. */
			ret = rte_kvargs_process(kvlist, params[i],
						 mlx5_args_check, config);
				rte_kvargs_free(kvlist);
	rte_kvargs_free(kvlist);
1492 static struct rte_pci_driver mlx5_driver;
1495 * PMD global initialization.
1497 * Independent from individual device, this function initializes global
1498 * per-PMD data structures distinguishing primary and secondary processes.
* Hence, each initialization is called once per process.
1502 * 0 on success, a negative errno value otherwise and rte_errno is set.
mlx5_init_once(void)
	struct mlx5_shared_data *sd;
	struct mlx5_local_data *ld = &mlx5_local_data;
	/* Make sure the process-shared data area is available first. */
	if (mlx5_init_shared_data())
	sd = mlx5_shared_data;
	/* Serialize one-time init against concurrent processes/threads. */
	rte_spinlock_lock(&sd->lock);
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		/* Primary: set up memory-event hooks and the MP IPC service. */
		LIST_INIT(&sd->mem_event_cb_list);
		rte_rwlock_init(&sd->mem_event_rwlock);
		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
						mlx5_mr_mem_event_cb, NULL);
		ret = mlx5_mp_init_primary();
		sd->init_done = true;
	case RTE_PROC_SECONDARY:
		/* Secondary: attach to the primary's MP IPC service. */
		ret = mlx5_mp_init_secondary();
		++sd->secondary_cnt;
		ld->init_done = true;
	rte_spinlock_unlock(&sd->lock);
1547 * Configures the minimal amount of data to inline into WQE
1548 * while sending packets.
1550 * - the txq_inline_min has the maximal priority, if this
1551 * key is specified in devargs
1552 * - if DevX is enabled the inline mode is queried from the
1553 * device (HCA attributes and NIC vport context if needed).
1554 * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4LX
1555 * and none (0 bytes) for other NICs
1558 * Verbs device parameters (name, port, switch_info) to spawn.
1560 * Device configuration parameters.
mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
		    struct mlx5_dev_config *config)
	if (config->txq_inline_min != MLX5_ARG_UNSET) {
		/* Application defines size of inlined data explicitly. */
		switch (spawn->pci_dev->id.device_id) {
		case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
		case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
			/* ConnectX-4 requires at least the L2 header inline. */
			if (config->txq_inline_min <
				       (int)MLX5_INLINE_HSIZE_L2) {
					"txq_inline_mix aligned to minimal"
					" ConnectX-4 required value %d",
					(int)MLX5_INLINE_HSIZE_L2);
				config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
	if (config->hca_attr.eth_net_offloads) {
		/* We have DevX enabled, inline mode queried successfully. */
		switch (config->hca_attr.wqe_inline_mode) {
		case MLX5_CAP_INLINE_MODE_L2:
			/* outer L2 header must be inlined. */
			config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
		case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
			/* No inline data are required by NIC. */
			config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
			config->hw_vlan_insert =
				config->hca_attr.wqe_vlan_insert;
			DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
		case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
			/* inline mode is defined by NIC vport context. */
			if (!config->hca_attr.eth_virt)
			/* Translate the vport inline mode to a header size. */
			switch (config->hca_attr.vport_inline_mode) {
			case MLX5_INLINE_MODE_NONE:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_NONE;
			case MLX5_INLINE_MODE_L2:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L2;
			case MLX5_INLINE_MODE_IP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L3;
			case MLX5_INLINE_MODE_TCP_UDP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_L4;
			case MLX5_INLINE_MODE_INNER_L2:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L2;
			case MLX5_INLINE_MODE_INNER_IP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L3;
			case MLX5_INLINE_MODE_INNER_TCP_UDP:
				config->txq_inline_min =
					MLX5_INLINE_HSIZE_INNER_L4;
	/*
	 * We get here if we are unable to deduce
	 * inline data size with DevX. Try PCI ID
	 * to determine old NICs.
	 */
	switch (spawn->pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
		/* ConnectX-4 family: L2 inline required, no WQE VLAN insert. */
		config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
		config->hw_vlan_insert = 0;
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
		/*
		 * These NICs support VLAN insertion from WQE and
		 * report the wqe_vlan_insert flag. But there is the bug
		 * and PFC control may be broken, so disable feature.
		 */
		config->hw_vlan_insert = 0;
		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
		config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
	DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
1667 * Configures the metadata mask fields in the shared context.
1670 * Pointer to Ethernet device.
1673 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
1675 struct mlx5_priv *priv = dev->data->dev_private;
1676 struct mlx5_ibv_shared *sh = priv->sh;
1677 uint32_t meta, mark, reg_c0;
1679 reg_c0 = ~priv->vport_meta_mask;
1680 switch (priv->config.dv_xmeta_en) {
1681 case MLX5_XMETA_MODE_LEGACY:
1683 mark = MLX5_FLOW_MARK_MASK;
1685 case MLX5_XMETA_MODE_META16:
1686 meta = reg_c0 >> rte_bsf32(reg_c0);
1687 mark = MLX5_FLOW_MARK_MASK;
1689 case MLX5_XMETA_MODE_META32:
1691 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
1699 if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
1700 DRV_LOG(WARNING, "metadata MARK mask mismatche %08X:%08X",
1701 sh->dv_mark_mask, mark);
1703 sh->dv_mark_mask = mark;
1704 if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
1705 DRV_LOG(WARNING, "metadata META mask mismatche %08X:%08X",
1706 sh->dv_meta_mask, meta);
1708 sh->dv_meta_mask = meta;
1709 if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
1710 DRV_LOG(WARNING, "metadata reg_c0 mask mismatche %08X:%08X",
1711 sh->dv_meta_mask, reg_c0);
1713 sh->dv_regc0_mask = reg_c0;
1714 DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
1715 DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
1716 DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
1717 DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
1721 * Allocate page of door-bells and register it using DevX API.
1724 * Pointer to Ethernet device.
1727 * Pointer to new page on success, NULL otherwise.
static struct mlx5_devx_dbr_page *
mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_devx_dbr_page *page;
	/* Allocate space for door-bell page and management data. */
	/* Zeroed, cache-aligned, on the device's NUMA node. */
	page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
				 RTE_CACHE_LINE_SIZE, dev->device->numa_node);
		DRV_LOG(ERR, "port %u cannot allocate dbr page",
			dev->data->port_id);
	/* Register allocated memory with DevX so HW can access the records. */
	page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
					      MLX5_DBR_PAGE_SIZE, 0);
		DRV_LOG(ERR, "port %u cannot umem reg dbr page",
			dev->data->port_id);
1756 * Find the next available door-bell, allocate new page if needed.
1759 * Pointer to Ethernet device.
1760 * @param [out] dbr_page
1761 * Door-bell page containing the page data.
1764 * Door-bell address offset on success, a negative error value otherwise.
1767 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
1769 struct mlx5_priv *priv = dev->data->dev_private;
1770 struct mlx5_devx_dbr_page *page = NULL;
1773 LIST_FOREACH(page, &priv->dbrpgs, next)
1774 if (page->dbr_count < MLX5_DBR_PER_PAGE)
1776 if (!page) { /* No page with free door-bell exists. */
1777 page = mlx5_alloc_dbr_page(dev);
1778 if (!page) /* Failed to allocate new page. */
1780 LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
1782 /* Loop to find bitmap part with clear bit. */
1784 i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
1787 /* Find the first clear bit. */
1788 j = rte_bsf64(~page->dbr_bitmap[i]);
1789 assert(i < (MLX5_DBR_PER_PAGE / 64));
1790 page->dbr_bitmap[i] |= (1 << j);
1793 return (((i * 64) + j) * sizeof(uint64_t));
1797 * Release a door-bell record.
1800 * Pointer to Ethernet device.
1801 * @param [in] umem_id
1802 * UMEM ID of page containing the door-bell record to release.
1803 * @param [in] offset
1804 * Offset of door-bell record in page.
1807 * 0 on success, a negative error value otherwise.
1810 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
1812 struct mlx5_priv *priv = dev->data->dev_private;
1813 struct mlx5_devx_dbr_page *page = NULL;
1816 LIST_FOREACH(page, &priv->dbrpgs, next)
1817 /* Find the page this address belongs to. */
1818 if (page->umem->umem_id == umem_id)
1823 if (!page->dbr_count) {
1824 /* Page not used, free it and remove from list. */
1825 LIST_REMOVE(page, next);
1827 ret = -mlx5_glue->devx_umem_dereg(page->umem);
1830 /* Mark in bitmap that this door-bell is not in use. */
1831 offset /= MLX5_DBR_SIZE;
1832 int i = offset / 64;
1833 int j = offset % 64;
1835 page->dbr_bitmap[i] &= ~(1 << j);
1841 * Check sibling device configurations.
1843 * Sibling devices sharing the Infiniband device context
1844 * should have compatible configurations. This regards
1845 * representors and bonding slaves.
1848 * Private device descriptor.
1850 * Configuration of the device is going to be created.
1853 * 0 on success, EINVAL otherwise
mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
			      struct mlx5_dev_config *config)
	struct mlx5_ibv_shared *sh = priv->sh;
	struct mlx5_dev_config *sh_conf = NULL;
	/* Nothing to compare for the single/first device. */
	if (sh->refcnt == 1)
	/* Find the device with shared context. */
	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
		struct mlx5_priv *opriv =
			rte_eth_devices[port_id].data->dev_private;
		if (opriv && opriv != priv && opriv->sh == sh) {
			/* Use the first sibling's config as the reference. */
			sh_conf = &opriv->config;
	/* Flow engine selection must agree across all sibling ports. */
	if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
		DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
			" for shared %s context", sh->ibdev_name);
	/* Extensive metadata mode must agree as well. */
	if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
		DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
			" for shared %s context", sh->ibdev_name);
1894 * Spawn an Ethernet device from Verbs information.
1897 * Backing DPDK device.
1899 * Verbs device parameters (name, port, switch_info) to spawn.
1901 * Device configuration parameters.
1904 * A valid Ethernet device object on success, NULL otherwise and rte_errno
1905 * is set. The following errors are defined:
1907 * EBUSY: device is not supposed to be spawned.
1908 * EEXIST: device is already spawned
1910 static struct rte_eth_dev *
1911 mlx5_dev_spawn(struct rte_device *dpdk_dev,
1912 struct mlx5_dev_spawn_data *spawn,
1913 struct mlx5_dev_config config)
1915 const struct mlx5_switch_info *switch_info = &spawn->info;
1916 struct mlx5_ibv_shared *sh = NULL;
1917 struct ibv_port_attr port_attr;
1918 struct mlx5dv_context dv_attr = { .comp_mask = 0 };
1919 struct rte_eth_dev *eth_dev = NULL;
1920 struct mlx5_priv *priv = NULL;
1922 unsigned int hw_padding = 0;
1924 unsigned int cqe_comp;
1925 unsigned int cqe_pad = 0;
1926 unsigned int tunnel_en = 0;
1927 unsigned int mpls_en = 0;
1928 unsigned int swp = 0;
1929 unsigned int mprq = 0;
1930 unsigned int mprq_min_stride_size_n = 0;
1931 unsigned int mprq_max_stride_size_n = 0;
1932 unsigned int mprq_min_stride_num_n = 0;
1933 unsigned int mprq_max_stride_num_n = 0;
1934 struct rte_ether_addr mac;
1935 char name[RTE_ETH_NAME_MAX_LEN];
1936 int own_domain_id = 0;
1939 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
1940 struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
1943 /* Determine if this port representor is supposed to be spawned. */
1944 if (switch_info->representor && dpdk_dev->devargs) {
1945 struct rte_eth_devargs eth_da;
1947 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da);
1950 DRV_LOG(ERR, "failed to process device arguments: %s",
1951 strerror(rte_errno));
1954 for (i = 0; i < eth_da.nb_representor_ports; ++i)
1955 if (eth_da.representor_ports[i] ==
1956 (uint16_t)switch_info->port_name)
1958 if (i == eth_da.nb_representor_ports) {
1963 /* Build device name. */
1964 if (spawn->pf_bond < 0) {
1965 /* Single device. */
1966 if (!switch_info->representor)
1967 strlcpy(name, dpdk_dev->name, sizeof(name));
1969 snprintf(name, sizeof(name), "%s_representor_%u",
1970 dpdk_dev->name, switch_info->port_name);
1972 /* Bonding device. */
1973 if (!switch_info->representor)
1974 snprintf(name, sizeof(name), "%s_%s",
1975 dpdk_dev->name, spawn->ibv_dev->name);
1977 snprintf(name, sizeof(name), "%s_%s_representor_%u",
1978 dpdk_dev->name, spawn->ibv_dev->name,
1979 switch_info->port_name);
1981 /* check if the device is already spawned */
1982 if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
1986 DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
1987 if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1988 eth_dev = rte_eth_dev_attach_secondary(name);
1989 if (eth_dev == NULL) {
1990 DRV_LOG(ERR, "can not attach rte ethdev");
1994 eth_dev->device = dpdk_dev;
1995 eth_dev->dev_ops = &mlx5_dev_sec_ops;
1996 err = mlx5_proc_priv_init(eth_dev);
1999 /* Receive command fd from primary process */
2000 err = mlx5_mp_req_verbs_cmd_fd(eth_dev);
2003 /* Remap UAR for Tx queues. */
2004 err = mlx5_tx_uar_init_secondary(eth_dev, err);
2008 * Ethdev pointer is still required as input since
2009 * the primary device is not accessible from the
2010 * secondary process.
2012 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
2013 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
2017 * Some parameters ("tx_db_nc" in particularly) are needed in
2018 * advance to create dv/verbs device context. We proceed the
2019 * devargs here to get ones, and later proceed devargs again
2020 * to override some hardware settings.
2022 err = mlx5_args(&config, dpdk_dev->devargs);
2025 DRV_LOG(ERR, "failed to process device arguments: %s",
2026 strerror(rte_errno));
2029 sh = mlx5_alloc_shared_ibctx(spawn, &config);
2032 config.devx = sh->devx;
2033 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
2034 config.dest_tir = 1;
2036 #ifdef HAVE_IBV_MLX5_MOD_SWP
2037 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
2040 * Multi-packet send is supported by ConnectX-4 Lx PF as well
2041 * as all ConnectX-5 devices.
2043 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2044 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
2046 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2047 dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
2049 mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
2050 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
2051 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
2052 DRV_LOG(DEBUG, "enhanced MPW is supported");
2053 mps = MLX5_MPW_ENHANCED;
2055 DRV_LOG(DEBUG, "MPW is supported");
2059 DRV_LOG(DEBUG, "MPW isn't supported");
2060 mps = MLX5_MPW_DISABLED;
2062 #ifdef HAVE_IBV_MLX5_MOD_SWP
2063 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
2064 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
2065 DRV_LOG(DEBUG, "SWP support: %u", swp);
2068 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2069 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
2070 struct mlx5dv_striding_rq_caps mprq_caps =
2071 dv_attr.striding_rq_caps;
2073 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
2074 mprq_caps.min_single_stride_log_num_of_bytes);
2075 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
2076 mprq_caps.max_single_stride_log_num_of_bytes);
2077 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
2078 mprq_caps.min_single_wqe_log_num_of_strides);
2079 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
2080 mprq_caps.max_single_wqe_log_num_of_strides);
2081 DRV_LOG(DEBUG, "\tsupported_qpts: %d",
2082 mprq_caps.supported_qpts);
2083 DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
2085 mprq_min_stride_size_n =
2086 mprq_caps.min_single_stride_log_num_of_bytes;
2087 mprq_max_stride_size_n =
2088 mprq_caps.max_single_stride_log_num_of_bytes;
2089 mprq_min_stride_num_n =
2090 mprq_caps.min_single_wqe_log_num_of_strides;
2091 mprq_max_stride_num_n =
2092 mprq_caps.max_single_wqe_log_num_of_strides;
2093 config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
2094 mprq_min_stride_num_n);
2097 if (RTE_CACHE_LINE_SIZE == 128 &&
2098 !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
2102 config.cqe_comp = cqe_comp;
2103 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
2104 /* Whether device supports 128B Rx CQE padding. */
2105 cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
2106 (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
2108 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2109 if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
2110 tunnel_en = ((dv_attr.tunnel_offloads_caps &
2111 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
2112 (dv_attr.tunnel_offloads_caps &
2113 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
2115 DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
2116 tunnel_en ? "" : "not ");
2119 "tunnel offloading disabled due to old OFED/rdma-core version");
2121 config.tunnel_en = tunnel_en;
2122 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
2123 mpls_en = ((dv_attr.tunnel_offloads_caps &
2124 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
2125 (dv_attr.tunnel_offloads_caps &
2126 MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
2127 DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
2128 mpls_en ? "" : "not ");
2130 DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
2131 " old OFED/rdma-core version or firmware configuration");
2133 config.mpls_en = mpls_en;
2134 /* Check port status. */
2135 err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
2137 DRV_LOG(ERR, "port query failed: %s", strerror(err));
2140 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
2141 DRV_LOG(ERR, "port is not configured in Ethernet mode");
2145 if (port_attr.state != IBV_PORT_ACTIVE)
2146 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
2147 mlx5_glue->port_state_str(port_attr.state),
2149 /* Allocate private eth device data. */
2150 priv = rte_zmalloc("ethdev private structure",
2152 RTE_CACHE_LINE_SIZE);
2154 DRV_LOG(ERR, "priv allocation failure");
2159 priv->ibv_port = spawn->ibv_port;
2160 priv->pci_dev = spawn->pci_dev;
2161 priv->mtu = RTE_ETHER_MTU;
2163 /* Initialize UAR access locks for 32bit implementations. */
2164 rte_spinlock_init(&priv->uar_lock_cq);
2165 for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
2166 rte_spinlock_init(&priv->uar_lock[i]);
2168 /* Some internal functions rely on Netlink sockets, open them now. */
2169 priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
2170 priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
2172 priv->representor = !!switch_info->representor;
2173 priv->master = !!switch_info->master;
2174 priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
2175 priv->vport_meta_tag = 0;
2176 priv->vport_meta_mask = 0;
2177 priv->pf_bond = spawn->pf_bond;
2178 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2180 * The DevX port query API is implemented. E-Switch may use
2181 * either vport or reg_c[0] metadata register to match on
2182 * vport index. The engaged part of metadata register is
2185 if (switch_info->representor || switch_info->master) {
2186 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
2187 MLX5DV_DEVX_PORT_MATCH_REG_C_0;
2188 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
2192 "can't query devx port %d on device %s",
2193 spawn->ibv_port, spawn->ibv_dev->name);
2194 devx_port.comp_mask = 0;
2197 if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
2198 priv->vport_meta_tag = devx_port.reg_c_0.value;
2199 priv->vport_meta_mask = devx_port.reg_c_0.mask;
2200 if (!priv->vport_meta_mask) {
2201 DRV_LOG(ERR, "vport zero mask for port %d"
2202 " on bonding device %s",
2203 spawn->ibv_port, spawn->ibv_dev->name);
2207 if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
2208 DRV_LOG(ERR, "invalid vport tag for port %d"
2209 " on bonding device %s",
2210 spawn->ibv_port, spawn->ibv_dev->name);
2214 } else if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
2215 priv->vport_id = devx_port.vport_num;
2216 } else if (spawn->pf_bond >= 0) {
2217 DRV_LOG(ERR, "can't deduce vport index for port %d"
2218 " on bonding device %s",
2219 spawn->ibv_port, spawn->ibv_dev->name);
2223 /* Suppose vport index in compatible way. */
2224 priv->vport_id = switch_info->representor ?
2225 switch_info->port_name + 1 : -1;
2229 * Kernel/rdma_core support single E-Switch per PF configurations
2230 * only and vport_id field contains the vport index for
2231 * associated VF, which is deduced from representor port name.
2232 * For example, let's have the IB device port 10, it has
2233 * attached network device eth0, which has port name attribute
2234 * pf0vf2, we can deduce the VF number as 2, and set vport index
2235 * as 3 (2+1). This assigning schema should be changed if the
2236 * multiple E-Switch instances per PF configurations or/and PCI
2237 * subfunctions are added.
2239 priv->vport_id = switch_info->representor ?
2240 switch_info->port_name + 1 : -1;
2242 /* representor_id field keeps the unmodified VF index. */
2243 priv->representor_id = switch_info->representor ?
2244 switch_info->port_name : -1;
2246 * Look for sibling devices in order to reuse their switch domain
2247 * if any, otherwise allocate one.
2249 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2250 const struct mlx5_priv *opriv =
2251 rte_eth_devices[port_id].data->dev_private;
2254 opriv->sh != priv->sh ||
2256 RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
2258 priv->domain_id = opriv->domain_id;
2261 if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
2262 err = rte_eth_switch_domain_alloc(&priv->domain_id);
2265 DRV_LOG(ERR, "unable to allocate switch domain: %s",
2266 strerror(rte_errno));
2271 /* Override some values set by hardware configuration. */
2272 mlx5_args(&config, dpdk_dev->devargs);
2273 err = mlx5_dev_check_sibling_config(priv, &config);
2276 config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
2277 IBV_DEVICE_RAW_IP_CSUM);
2278 DRV_LOG(DEBUG, "checksum offloading is %ssupported",
2279 (config.hw_csum ? "" : "not "));
2280 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
2281 !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
2282 DRV_LOG(DEBUG, "counters are not supported");
2284 #ifndef HAVE_IBV_FLOW_DV_SUPPORT
2285 if (config.dv_flow_en) {
2286 DRV_LOG(WARNING, "DV flow is not supported");
2287 config.dv_flow_en = 0;
2290 config.ind_table_max_size =
2291 sh->device_attr.rss_caps.max_rwq_indirection_table_size;
2293 * Remove this check once DPDK supports larger/variable
2294 * indirection tables.
2296 if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
2297 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
2298 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
2299 config.ind_table_max_size);
2300 config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
2301 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
2302 DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
2303 (config.hw_vlan_strip ? "" : "not "));
2304 config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
2305 IBV_RAW_PACKET_CAP_SCATTER_FCS);
2306 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
2307 (config.hw_fcs_strip ? "" : "not "));
2308 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
2309 hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
2310 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
2311 hw_padding = !!(sh->device_attr.device_cap_flags_ex &
2312 IBV_DEVICE_PCI_WRITE_END_PADDING);
2314 if (config.hw_padding && !hw_padding) {
2315 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
2316 config.hw_padding = 0;
2317 } else if (config.hw_padding) {
2318 DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
2320 config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
2321 (sh->device_attr.tso_caps.supported_qpts &
2322 (1 << IBV_QPT_RAW_PACKET)));
2324 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
2326 * MPW is disabled by default, while the Enhanced MPW is enabled
2329 if (config.mps == MLX5_ARG_UNSET)
2330 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
2333 config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
2334 DRV_LOG(INFO, "%sMPS is %s",
2335 config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
2336 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
2337 if (config.cqe_comp && !cqe_comp) {
2338 DRV_LOG(WARNING, "Rx CQE compression isn't supported");
2339 config.cqe_comp = 0;
2341 if (config.cqe_pad && !cqe_pad) {
2342 DRV_LOG(WARNING, "Rx CQE padding isn't supported");
2344 } else if (config.cqe_pad) {
2345 DRV_LOG(INFO, "Rx CQE padding is enabled");
2348 priv->counter_fallback = 0;
2349 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
2354 if (!config.hca_attr.flow_counters_dump)
2355 priv->counter_fallback = 1;
2356 #ifndef HAVE_IBV_DEVX_ASYNC
2357 priv->counter_fallback = 1;
2359 if (priv->counter_fallback)
2360 DRV_LOG(INFO, "Use fall-back DV counter management");
2361 /* Check for LRO support. */
2362 if (config.dest_tir && config.hca_attr.lro_cap &&
2363 config.dv_flow_en) {
2364 /* TBD check tunnel lro caps. */
2365 config.lro.supported = config.hca_attr.lro_cap;
2366 DRV_LOG(DEBUG, "Device supports LRO");
2368 * If LRO timeout is not configured by application,
2369 * use the minimal supported value.
2371 if (!config.lro.timeout)
2372 config.lro.timeout =
2373 config.hca_attr.lro_timer_supported_periods[0];
2374 DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
2375 config.lro.timeout);
2377 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
2378 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
2379 config.dv_flow_en) {
2380 uint8_t reg_c_mask =
2381 config.hca_attr.qos.flow_meter_reg_c_ids;
2383 * Meter needs two REG_C's for color match and pre-sfx
2384 * flow match. Here get the REG_C for color match.
2385 * REG_C_0 and REG_C_1 is reserved for metadata feature.
2388 if (__builtin_popcount(reg_c_mask) < 1) {
2390 DRV_LOG(WARNING, "No available register for"
2393 priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
2396 DRV_LOG(DEBUG, "The REG_C meter uses is %d",
2397 priv->mtr_color_reg);
2402 if (config.mprq.enabled && mprq) {
2403 if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
2404 config.mprq.stride_num_n < mprq_min_stride_num_n) {
2405 config.mprq.stride_num_n =
2406 RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
2407 mprq_min_stride_num_n);
2409 "the number of strides"
2410 " for Multi-Packet RQ is out of range,"
2411 " setting default value (%u)",
2412 1 << config.mprq.stride_num_n);
2414 config.mprq.min_stride_size_n = mprq_min_stride_size_n;
2415 config.mprq.max_stride_size_n = mprq_max_stride_size_n;
2416 } else if (config.mprq.enabled && !mprq) {
2417 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
2418 config.mprq.enabled = 0;
2420 if (config.max_dump_files_num == 0)
2421 config.max_dump_files_num = 128;
2422 eth_dev = rte_eth_dev_allocate(name);
2423 if (eth_dev == NULL) {
2424 DRV_LOG(ERR, "can not allocate rte ethdev");
2428 /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
2429 eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
2430 if (priv->representor) {
2431 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
2432 eth_dev->data->representor_id = priv->representor_id;
2435 * Store associated network device interface index. This index
2436 * is permanent throughout the lifetime of device. So, we may store
2437 * the ifindex here and use the cached value further.
2439 assert(spawn->ifindex);
2440 priv->if_index = spawn->ifindex;
2441 eth_dev->data->dev_private = priv;
2442 priv->dev_data = eth_dev->data;
2443 eth_dev->data->mac_addrs = priv->mac;
2444 eth_dev->device = dpdk_dev;
2445 /* Configure the first MAC address by default. */
2446 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
2448 "port %u cannot get MAC address, is mlx5_en"
2449 " loaded? (errno: %s)",
2450 eth_dev->data->port_id, strerror(rte_errno));
2455 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
2456 eth_dev->data->port_id,
2457 mac.addr_bytes[0], mac.addr_bytes[1],
2458 mac.addr_bytes[2], mac.addr_bytes[3],
2459 mac.addr_bytes[4], mac.addr_bytes[5]);
2462 char ifname[IF_NAMESIZE];
2464 if (mlx5_get_ifname(eth_dev, &ifname) == 0)
2465 DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
2466 eth_dev->data->port_id, ifname);
2468 DRV_LOG(DEBUG, "port %u ifname is unknown",
2469 eth_dev->data->port_id);
2472 /* Get actual MTU if possible. */
2473 err = mlx5_get_mtu(eth_dev, &priv->mtu);
2478 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
2480 /* Initialize burst functions to prevent crashes before link-up. */
2481 eth_dev->rx_pkt_burst = removed_rx_burst;
2482 eth_dev->tx_pkt_burst = removed_tx_burst;
2483 eth_dev->dev_ops = &mlx5_dev_ops;
2484 /* Register MAC address. */
2485 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
2486 if (config.vf && config.vf_nl_en)
2487 mlx5_nl_mac_addr_sync(eth_dev);
2488 TAILQ_INIT(&priv->flows);
2489 TAILQ_INIT(&priv->ctrl_flows);
2490 TAILQ_INIT(&priv->flow_meters);
2491 TAILQ_INIT(&priv->flow_meter_profiles);
2492 /* Hint libmlx5 to use PMD allocator for data plane resources */
2493 struct mlx5dv_ctx_allocators alctr = {
2494 .alloc = &mlx5_alloc_verbs_buf,
2495 .free = &mlx5_free_verbs_buf,
2498 mlx5_glue->dv_set_context_attr(sh->ctx,
2499 MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
2500 (void *)((uintptr_t)&alctr));
2501 /* Bring Ethernet device up. */
2502 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
2503 eth_dev->data->port_id);
2504 mlx5_set_link_up(eth_dev);
2506 * Even though the interrupt handler is not installed yet,
2507 * interrupts will still trigger on the async_fd from
2508 * Verbs context returned by ibv_open_device().
2510 mlx5_link_update(eth_dev, 0);
2511 #ifdef HAVE_MLX5DV_DR_ESWITCH
2512 if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
2513 (switch_info->representor || switch_info->master)))
2514 config.dv_esw_en = 0;
2516 config.dv_esw_en = 0;
2518 /* Detect minimal data bytes to inline. */
2519 mlx5_set_min_inline(spawn, &config);
2520 /* Store device configuration on private structure. */
2521 priv->config = config;
2522 /* Create context for virtual machine VLAN workaround. */
2523 priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
2524 if (config.dv_flow_en) {
2525 err = mlx5_alloc_shared_dr(priv);
2528 priv->qrss_id_pool = mlx5_flow_id_pool_alloc();
2529 if (!priv->qrss_id_pool) {
2530 DRV_LOG(ERR, "can't create flow id pool");
2535 /* Supported Verbs flow priority number detection. */
2536 err = mlx5_flow_discover_priorities(eth_dev);
2541 priv->config.flow_prio = err;
2542 if (!priv->config.dv_esw_en &&
2543 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2544 DRV_LOG(WARNING, "metadata mode %u is not supported "
2545 "(no E-Switch)", priv->config.dv_xmeta_en);
2546 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
2548 mlx5_set_metadata_mask(eth_dev);
2549 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2550 !priv->sh->dv_regc0_mask) {
2551 DRV_LOG(ERR, "metadata mode %u is not supported "
2552 "(no metadata reg_c[0] is available)",
2553 priv->config.dv_xmeta_en);
2557 /* Query availability of metadata reg_c's. */
2558 err = mlx5_flow_discover_mreg_c(eth_dev);
2563 if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
2565 "port %u extensive metadata register is not supported",
2566 eth_dev->data->port_id);
2567 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2568 DRV_LOG(ERR, "metadata mode %u is not supported "
2569 "(no metadata registers available)",
2570 priv->config.dv_xmeta_en);
2575 if (priv->config.dv_flow_en &&
2576 priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2577 mlx5_flow_ext_mreg_supported(eth_dev) &&
2578 priv->sh->dv_regc0_mask) {
2579 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
2580 MLX5_FLOW_MREG_HTABLE_SZ);
2581 if (!priv->mreg_cp_tbl) {
2589 if (priv->mreg_cp_tbl)
2590 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
2592 mlx5_free_shared_dr(priv);
2593 if (priv->nl_socket_route >= 0)
2594 close(priv->nl_socket_route);
2595 if (priv->nl_socket_rdma >= 0)
2596 close(priv->nl_socket_rdma);
2597 if (priv->vmwa_context)
2598 mlx5_vlan_vmwa_exit(priv->vmwa_context);
2599 if (priv->qrss_id_pool)
2600 mlx5_flow_id_pool_release(priv->qrss_id_pool);
2602 claim_zero(rte_eth_switch_domain_free(priv->domain_id));
2604 if (eth_dev != NULL)
2605 eth_dev->data->dev_private = NULL;
2607 if (eth_dev != NULL) {
2608 /* mac_addrs must not be freed alone because part of dev_private */
2609 eth_dev->data->mac_addrs = NULL;
2610 rte_eth_dev_release_port(eth_dev);
2613 mlx5_free_shared_ibctx(sh);
2620 * Comparison callback to sort device data.
2622 * This is meant to be used with qsort().
2625 * @param a: Pointer to pointer to first data object.
2627 * @param b: Pointer to pointer to second data object.
2630 * @return 0 if both objects are equal, less than 0 if the first argument is less
2631 * than the second, greater than 0 otherwise.
2634 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
2636 const struct mlx5_switch_info *si_a =
2637 &((const struct mlx5_dev_spawn_data *)a)->info;
2638 const struct mlx5_switch_info *si_b =
2639 &((const struct mlx5_dev_spawn_data *)b)->info;
2642 /* Master device first. */
2643 ret = si_b->master - si_a->master;
2646 /* Then representor devices. */
2647 ret = si_b->representor - si_a->representor;
2650 /* Unidentified devices come last in no specific order. */
2651 if (!si_a->representor)
2653 /* Order representors by name. */
2654 return si_a->port_name - si_b->port_name;
2658 * Match PCI information for possible slaves of bonding device.
2660 * @param[in] ibv_dev
2661 * Pointer to Infiniband device structure.
2662 * @param[in] pci_dev
2663 * Pointer to PCI device structure to match PCI address.
2664 * @param[in] nl_rdma
2665 * Netlink RDMA group socket handle.
2668 * negative value if no bonding device found, otherwise
2669 * positive index of slave PF in bonding.
2672 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
2673 const struct rte_pci_device *pci_dev,
2676 char ifname[IF_NAMESIZE + 1];
2677 unsigned int ifindex;
2683 * Try to get master device name. If something goes
2684 * wrong suppose the lack of kernel support and no
2689 if (!strstr(ibv_dev->name, "bond"))
2691 np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
2695 * The Master device might not be on the predefined
2696 * port (not on port index 1, it is not garanted),
2697 * we have to scan all Infiniband device port and
2700 for (i = 1; i <= np; ++i) {
2701 /* Check whether Infiniband port is populated. */
2702 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
2705 if (!if_indextoname(ifindex, ifname))
2707 /* Try to read bonding slave names from sysfs. */
2709 "/sys/class/net/%s/master/bonding/slaves", ifname);
2710 file = fopen(slaves, "r");
2716 /* Use safe format to check maximal buffer length. */
2717 assert(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
2718 while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
2719 char tmp_str[IF_NAMESIZE + 32];
2720 struct rte_pci_addr pci_addr;
2721 struct mlx5_switch_info info;
2723 /* Process slave interface names in the loop. */
2724 snprintf(tmp_str, sizeof(tmp_str),
2725 "/sys/class/net/%s", ifname);
2726 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
2727 DRV_LOG(WARNING, "can not get PCI address"
2728 " for netdev \"%s\"", ifname);
2731 if (pci_dev->addr.domain != pci_addr.domain ||
2732 pci_dev->addr.bus != pci_addr.bus ||
2733 pci_dev->addr.devid != pci_addr.devid ||
2734 pci_dev->addr.function != pci_addr.function)
2736 /* Slave interface PCI address match found. */
2738 snprintf(tmp_str, sizeof(tmp_str),
2739 "/sys/class/net/%s/phys_port_name", ifname);
2740 file = fopen(tmp_str, "rb");
2743 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
2744 if (fscanf(file, "%32s", tmp_str) == 1)
2745 mlx5_translate_port_name(tmp_str, &info);
2746 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
2747 info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
2748 pf = info.port_name;
2757 * DPDK callback to register a PCI device.
2759 * This function spawns Ethernet devices out of a given PCI device.
2761 * @param[in] pci_drv
2762 * PCI driver structure (mlx5_driver).
2763 * @param[in] pci_dev
2764 * PCI device information.
2767 * 0 on success, a negative errno value otherwise and rte_errno is set.
2770 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
2771 struct rte_pci_device *pci_dev)
2773 struct ibv_device **ibv_list;
2775 * Number of found IB Devices matching with requested PCI BDF.
2776 * nd != 1 means there are multiple IB devices over the same
2777 * PCI device and we have representors and master.
2779 unsigned int nd = 0;
2781 * Number of found IB device Ports. nd = 1 and np = 1..n means
2782 * we have the single multiport IB device, and there may be
2783 * representors attached to some of found ports.
2785 unsigned int np = 0;
2787 * Number of DPDK ethernet devices to Spawn - either over
2788 * multiple IB devices or multiple ports of single IB device.
2789 * Actually this is the number of iterations to spawn.
2791 unsigned int ns = 0;
2794 * < 0 - no bonding device (single one)
2795 * >= 0 - bonding device (value is slave PF index)
2798 struct mlx5_dev_spawn_data *list = NULL;
2799 struct mlx5_dev_config dev_config;
2802 ret = mlx5_init_once();
2804 DRV_LOG(ERR, "unable to init PMD global data: %s",
2805 strerror(rte_errno));
2808 assert(pci_drv == &mlx5_driver);
2810 ibv_list = mlx5_glue->get_device_list(&ret);
2812 rte_errno = errno ? errno : ENOSYS;
2813 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
2817 * First scan the list of all Infiniband devices to find
2818 * matching ones, gathering into the list.
2820 struct ibv_device *ibv_match[ret + 1];
2821 int nl_route = mlx5_nl_init(NETLINK_ROUTE);
2822 int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
2826 struct rte_pci_addr pci_addr;
2828 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
2829 bd = mlx5_device_bond_pci_match
2830 (ibv_list[ret], pci_dev, nl_rdma);
2833 * Bonding device detected. Only one match is allowed,
2834 * the bonding is supported over multi-port IB device,
2835 * there should be no matches on representor PCI
2836 * functions or non VF LAG bonding devices with
2837 * specified address.
2841 "multiple PCI match on bonding device"
2842 "\"%s\" found", ibv_list[ret]->name);
2847 DRV_LOG(INFO, "PCI information matches for"
2848 " slave %d bonding device \"%s\"",
2849 bd, ibv_list[ret]->name);
2850 ibv_match[nd++] = ibv_list[ret];
2853 if (mlx5_dev_to_pci_addr
2854 (ibv_list[ret]->ibdev_path, &pci_addr))
2856 if (pci_dev->addr.domain != pci_addr.domain ||
2857 pci_dev->addr.bus != pci_addr.bus ||
2858 pci_dev->addr.devid != pci_addr.devid ||
2859 pci_dev->addr.function != pci_addr.function)
2861 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
2862 ibv_list[ret]->name);
2863 ibv_match[nd++] = ibv_list[ret];
2865 ibv_match[nd] = NULL;
2867 /* No device matches, just complain and bail out. */
2869 "no Verbs device matches PCI device " PCI_PRI_FMT ","
2870 " are kernel drivers loaded?",
2871 pci_dev->addr.domain, pci_dev->addr.bus,
2872 pci_dev->addr.devid, pci_dev->addr.function);
2879 * Found single matching device may have multiple ports.
2880 * Each port may be representor, we have to check the port
2881 * number and check the representors existence.
2884 np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
2886 DRV_LOG(WARNING, "can not get IB device \"%s\""
2887 " ports number", ibv_match[0]->name);
2888 if (bd >= 0 && !np) {
2889 DRV_LOG(ERR, "can not get ports"
2890 " for bonding device");
2896 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
2899 * This may happen if there is VF LAG kernel support and
2900 * application is compiled with older rdma_core library.
2903 "No kernel/verbs support for VF LAG bonding found.");
2904 rte_errno = ENOTSUP;
2910 * Now we can determine the maximal
2911 * amount of devices to be spawned.
2913 list = rte_zmalloc("device spawn data",
2914 sizeof(struct mlx5_dev_spawn_data) *
2916 RTE_CACHE_LINE_SIZE);
2918 DRV_LOG(ERR, "spawn data array allocation failure");
2923 if (bd >= 0 || np > 1) {
2925 * Single IB device with multiple ports found,
2926 * it may be E-Switch master device and representors.
2927 * We have to perform identification trough the ports.
2929 assert(nl_rdma >= 0);
2933 for (i = 1; i <= np; ++i) {
2934 list[ns].max_port = np;
2935 list[ns].ibv_port = i;
2936 list[ns].ibv_dev = ibv_match[0];
2937 list[ns].eth_dev = NULL;
2938 list[ns].pci_dev = pci_dev;
2939 list[ns].pf_bond = bd;
2940 list[ns].ifindex = mlx5_nl_ifindex
2941 (nl_rdma, list[ns].ibv_dev->name, i);
2942 if (!list[ns].ifindex) {
2944 * No network interface index found for the
2945 * specified port, it means there is no
2946 * representor on this port. It's OK,
2947 * there can be disabled ports, for example
2948 * if sriov_numvfs < sriov_totalvfs.
2954 ret = mlx5_nl_switch_info
2958 if (ret || (!list[ns].info.representor &&
2959 !list[ns].info.master)) {
2961 * We failed to recognize representors with
2962 * Netlink, let's try to perform the task
2965 ret = mlx5_sysfs_switch_info
2969 if (!ret && bd >= 0) {
2970 switch (list[ns].info.name_type) {
2971 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
2972 if (list[ns].info.port_name == bd)
2975 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
2976 if (list[ns].info.pf_num == bd)
2984 if (!ret && (list[ns].info.representor ^
2985 list[ns].info.master))
2990 "unable to recognize master/representors"
2991 " on the IB device with multiple ports");
2998 * The existence of several matching entries (nd > 1) means
2999 * port representors have been instantiated. No existing Verbs
3000 * call nor sysfs entries can tell them apart, this can only
3001 * be done through Netlink calls assuming kernel drivers are
3002 * recent enough to support them.
3004 * In the event of identification failure through Netlink,
3005 * try again through sysfs, then:
3007 * 1. A single IB device matches (nd == 1) with single
3008 * port (np=0/1) and is not a representor, assume
3009 * no switch support.
3011 * 2. Otherwise no safe assumptions can be made;
3012 * complain louder and bail out.
3015 for (i = 0; i != nd; ++i) {
3016 memset(&list[ns].info, 0, sizeof(list[ns].info));
3017 list[ns].max_port = 1;
3018 list[ns].ibv_port = 1;
3019 list[ns].ibv_dev = ibv_match[i];
3020 list[ns].eth_dev = NULL;
3021 list[ns].pci_dev = pci_dev;
3022 list[ns].pf_bond = -1;
3023 list[ns].ifindex = 0;
3025 list[ns].ifindex = mlx5_nl_ifindex
3026 (nl_rdma, list[ns].ibv_dev->name, 1);
3027 if (!list[ns].ifindex) {
3028 char ifname[IF_NAMESIZE];
3031 * Netlink failed, it may happen with old
3032 * ib_core kernel driver (before 4.16).
3033 * We can assume there is old driver because
3034 * here we are processing single ports IB
3035 * devices. Let's try sysfs to retrieve
3036 * the ifindex. The method works for
3037 * master device only.
3041 * Multiple devices found, assume
3042 * representors, can not distinguish
3043 * master/representor and retrieve
3044 * ifindex via sysfs.
3048 ret = mlx5_get_master_ifname
3049 (ibv_match[i]->ibdev_path, &ifname);
3052 if_nametoindex(ifname);
3053 if (!list[ns].ifindex) {
3055 * No network interface index found
3056 * for the specified device, it means
3057 * there it is neither representor
3065 ret = mlx5_nl_switch_info
3069 if (ret || (!list[ns].info.representor &&
3070 !list[ns].info.master)) {
3072 * We failed to recognize representors with
3073 * Netlink, let's try to perform the task
3076 ret = mlx5_sysfs_switch_info
3080 if (!ret && (list[ns].info.representor ^
3081 list[ns].info.master)) {
3083 } else if ((nd == 1) &&
3084 !list[ns].info.representor &&
3085 !list[ns].info.master) {
3087 * Single IB device with
3088 * one physical port and
3089 * attached network device.
3090 * May be SRIOV is not enabled
3091 * or there is no representors.
3093 DRV_LOG(INFO, "no E-Switch support detected");
3100 "unable to recognize master/representors"
3101 " on the multiple IB devices");
3109 * Sort list to probe devices in natural order for users convenience
3110 * (i.e. master first, then representors from lowest to highest ID).
3112 qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
3113 /* Default configuration. */
3114 dev_config = (struct mlx5_dev_config){
3116 .mps = MLX5_ARG_UNSET,
3117 .dbnc = MLX5_ARG_UNSET,
3119 .txq_inline_max = MLX5_ARG_UNSET,
3120 .txq_inline_min = MLX5_ARG_UNSET,
3121 .txq_inline_mpw = MLX5_ARG_UNSET,
3122 .txqs_inline = MLX5_ARG_UNSET,
3124 .mr_ext_memseg_en = 1,
3126 .enabled = 0, /* Disabled by default. */
3127 .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N,
3128 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
3129 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
3133 /* Device specific configuration. */
3134 switch (pci_dev->id.device_id) {
3135 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
3136 case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
3137 case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
3138 case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
3139 case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
3140 case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
3141 case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
3147 for (i = 0; i != ns; ++i) {
3150 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
3153 if (!list[i].eth_dev) {
3154 if (rte_errno != EBUSY && rte_errno != EEXIST)
3156 /* Device is disabled or already spawned. Ignore it. */
3159 restore = list[i].eth_dev->data->dev_flags;
3160 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
3161 /* Restore non-PCI flags cleared by the above call. */
3162 list[i].eth_dev->data->dev_flags |= restore;
3163 mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev);
3164 rte_eth_dev_probing_finish(list[i].eth_dev);
3168 "probe of PCI device " PCI_PRI_FMT " aborted after"
3169 " encountering an error: %s",
3170 pci_dev->addr.domain, pci_dev->addr.bus,
3171 pci_dev->addr.devid, pci_dev->addr.function,
3172 strerror(rte_errno));
3176 if (!list[i].eth_dev)
3178 mlx5_dev_close(list[i].eth_dev);
3179 /* mac_addrs must not be freed because in dev_private */
3180 list[i].eth_dev->data->mac_addrs = NULL;
3181 claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
3183 /* Restore original error. */
3190 * Do the routine cleanup:
3191 * - close opened Netlink sockets
3192 * - free allocated spawn data array
3193 * - free the Infiniband device list
3202 mlx5_glue->free_device_list(ibv_list);
3207 * Look for the ethernet device belonging to mlx5 driver.
3209 * @param[in] port_id
3210 * port_id to start looking for device.
3211 * @param[in] pci_dev
3212 * Pointer to the hint PCI device. When device is being probed
3213 * the its siblings (master and preceding representors might
3214 * not have assigned driver yet (because the mlx5_pci_probe()
3215 * is not completed yet, for this case match on hint PCI
3216 * device may be used to detect sibling device.
3219 * port_id of found device, RTE_MAX_ETHPORT if not found.
3222 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
3224 while (port_id < RTE_MAX_ETHPORTS) {
3225 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
3227 if (dev->state != RTE_ETH_DEV_UNUSED &&
3229 (dev->device == &pci_dev->device ||
3230 (dev->device->driver &&
3231 dev->device->driver->name &&
3232 !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
3236 if (port_id >= RTE_MAX_ETHPORTS)
3237 return RTE_MAX_ETHPORTS;
3242 * DPDK callback to remove a PCI device.
3244 * This function removes all Ethernet devices belong to a given PCI device.
3246 * @param[in] pci_dev
3247 * Pointer to the PCI device.
3250 * 0 on success, the function cannot fail.
3253 mlx5_pci_remove(struct rte_pci_device *pci_dev)
3257 RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device)
3258 rte_eth_dev_close(port_id);
3262 static const struct rte_pci_id mlx5_pci_id_map[] = {
3264 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3265 PCI_DEVICE_ID_MELLANOX_CONNECTX4)
3268 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3269 PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
3272 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3273 PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
3276 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3277 PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
3280 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3281 PCI_DEVICE_ID_MELLANOX_CONNECTX5)
3284 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3285 PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
3288 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3289 PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
3292 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3293 PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
3296 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3297 PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
3300 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3301 PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
3304 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3305 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
3308 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3309 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
3312 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3313 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
3316 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3317 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
3324 static struct rte_pci_driver mlx5_driver = {
3326 .name = MLX5_DRIVER_NAME
3328 .id_table = mlx5_pci_id_map,
3329 .probe = mlx5_pci_probe,
3330 .remove = mlx5_pci_remove,
3331 .dma_map = mlx5_dma_map,
3332 .dma_unmap = mlx5_dma_unmap,
3333 .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
3334 RTE_PCI_DRV_PROBE_AGAIN,
3337 #ifdef RTE_IBVERBS_LINK_DLOPEN
3340 * Suffix RTE_EAL_PMD_PATH with "-glue".
3342 * This function performs a sanity check on RTE_EAL_PMD_PATH before
3343 * suffixing its last component.
3346 * Output buffer, should be large enough otherwise NULL is returned.
3351 * Pointer to @p buf or @p NULL in case suffix cannot be appended.
3354 mlx5_glue_path(char *buf, size_t size)
3356 static const char *const bad[] = { "/", ".", "..", NULL };
3357 const char *path = RTE_EAL_PMD_PATH;
3358 size_t len = strlen(path);
3362 while (len && path[len - 1] == '/')
3364 for (off = len; off && path[off - 1] != '/'; --off)
3366 for (i = 0; bad[i]; ++i)
3367 if (!strncmp(path + off, bad[i], (int)(len - off)))
3369 i = snprintf(buf, size, "%.*s-glue", (int)len, path);
3370 if (i == -1 || (size_t)i >= size)
3375 "unable to append \"-glue\" to last component of"
3376 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
3377 " please re-configure DPDK");
3382 * Initialization routine for run-time dependency on rdma-core.
3385 mlx5_glue_init(void)
3387 char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
3388 const char *path[] = {
3390 * A basic security check is necessary before trusting
3391 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
3393 (geteuid() == getuid() && getegid() == getgid() ?
3394 getenv("MLX5_GLUE_PATH") : NULL),
3396 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
3397 * variant, otherwise let dlopen() look up libraries on its
3400 (*RTE_EAL_PMD_PATH ?
3401 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
3404 void *handle = NULL;
3408 while (!handle && i != RTE_DIM(path)) {
3417 end = strpbrk(path[i], ":;");
3419 end = path[i] + strlen(path[i]);
3420 len = end - path[i];
3425 ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
3427 (!len || *(end - 1) == '/') ? "" : "/");
3430 if (sizeof(name) != (size_t)ret + 1)
3432 DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"",
3434 handle = dlopen(name, RTLD_LAZY);
3445 DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg);
3448 sym = dlsym(handle, "mlx5_glue");
3449 if (!sym || !*sym) {
3453 DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg);
3462 "cannot initialize PMD due to missing run-time dependency on"
3463 " rdma-core libraries (libibverbs, libmlx5)");
3470 * Driver initialization routine.
3472 RTE_INIT(rte_mlx5_pmd_init)
3474 /* Initialize driver log type. */
3475 mlx5_logtype = rte_log_register("pmd.net.mlx5");
3476 if (mlx5_logtype >= 0)
3477 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
3479 /* Build the static tables for Verbs conversion. */
3480 mlx5_set_ptype_table();
3481 mlx5_set_cksum_table();
3482 mlx5_set_swp_types_table();
3484 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
3485 * huge pages. Calling ibv_fork_init() during init allows
3486 * applications to use fork() safely for purposes other than
3487 * using this PMD, which is not supported in forked processes.
3489 setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
3490 /* Match the size of Rx completion entry to the size of a cacheline. */
3491 if (RTE_CACHE_LINE_SIZE == 128)
3492 setenv("MLX5_CQE_SIZE", "128", 0);
3494 * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to
3495 * cleanup all the Verbs resources even when the device was removed.
3497 setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
3498 #ifdef RTE_IBVERBS_LINK_DLOPEN
3499 if (mlx5_glue_init())
3504 /* Glue structure must not contain any NULL pointers. */
3508 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
3509 assert(((const void *const *)mlx5_glue)[i]);
3512 if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
3514 "rdma-core glue \"%s\" mismatch: \"%s\" is required",
3515 mlx5_glue->version, MLX5_GLUE_VERSION);
3518 mlx5_glue->fork_init();
3519 rte_pci_register(&mlx5_driver);
3522 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
3523 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
3524 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");