/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */
#include <linux/rtnetlink.h>

/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>

#include "mlx5_defs.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"
#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif
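
/*
 * Note: the fallback definitions above only keep the build working with
 * older rdma-core releases that do not expose these mlx5dv flags; the
 * values are assumed to match the upstream rdma-core definitions.
 */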

/**
 * Get ibv device name. Given an ibv_context pointer - return a
 * pointer to the corresponding device name.
 *
 * @param[in] ctx
 *   Pointer to ibv context.
 *
 * @return
 *   Pointer to device name if ctx is valid, NULL otherwise.
 */
const char *
mlx5_os_get_ctx_device_name(void *ctx)
{
	if (!ctx)
		return NULL;
	return ((struct ibv_context *)ctx)->device->name;
}

/**
 * Get ibv device path name. Given an ibv_context pointer - return a
 * pointer to the corresponding device path name.
 *
 * @param[in] ctx
 *   Pointer to ibv context.
 *
 * @return
 *   Pointer to device path name if ctx is valid, NULL otherwise.
 */
const char *
mlx5_os_get_ctx_device_path(void *ctx)
{
	if (!ctx)
		return NULL;
	return ((struct ibv_context *)ctx)->device->ibdev_path;
}

/**
 * Get umem id. Given a pointer to umem object of type
 * 'struct mlx5dv_devx_umem *' - return its id.
 *
 * @param[in] umem
 *   Pointer to umem object.
 *
 * @return
 *   The umem id if umem is valid, 0 otherwise.
 */
uint32_t
mlx5_os_get_umem_id(void *umem)
{
	if (!umem)
		return 0;
	return ((struct mlx5dv_devx_umem *)umem)->umem_id;
}
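
/*
 * The accessors above exist so that OS-independent code can handle
 * Verbs/DevX objects as opaque "void *" handles without including the
 * ibverbs/mlx5dv headers; only this OS-specific file performs the casts.
 */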

/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5
 * device attributes from the glue out parameter.
 *
 * @param[in] ctx
 *   Pointer to ibv context.
 *
 * @param[out] device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, non zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
	int err;
	struct ibv_device_attr_ex attr_ex;

	memset(device_attr, 0, sizeof(*device_attr));
	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
	if (err)
		return err;
	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
	device_attr->max_sge = attr_ex.orig_attr.max_sge;
	device_attr->max_cq = attr_ex.orig_attr.max_cq;
	device_attr->max_qp = attr_ex.orig_attr.max_qp;
	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
	device_attr->max_rwq_indirection_table_size =
		attr_ex.rss_caps.max_rwq_indirection_table_size;
	device_attr->max_tso = attr_ex.tso_caps.max_tso;
	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

	struct mlx5dv_context dv_attr = { .comp_mask = 0 };

	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
	if (err)
		return err;
	device_attr->flags = dv_attr.flags;
	device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
	device_attr->sw_parsing_offloads =
		dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
	device_attr->min_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
	device_attr->max_single_stride_log_num_of_bytes =
		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
	device_attr->min_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
	device_attr->max_single_wqe_log_num_of_strides =
		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
	device_attr->stride_supported_qpts =
		dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif

	return err;
}
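
/*
 * A minimal usage sketch (illustrative only, not part of the probe flow):
 * query the attributes once and cache them in the shared context, which is
 * where the spawn code below expects to find them (sh->device_attr).
 */
static int __rte_unused
example_cache_dev_attr(struct mlx5_dev_ctx_shared *sh)
{
	int err = mlx5_os_get_dev_attr(sh->ctx, &sh->device_attr);

	if (err)
		DRV_LOG(ERR, "failed to query device attributes: %s",
			strerror(err));
	return err;
}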

/**
 * Verbs callback to allocate memory. This function allocates a buffer of
 * the provided size, residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	rte_free(ptr);
}
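
/*
 * Illustrative sketch: this is how the two callbacks above are handed over
 * to libmlx5 (the same registration is performed inline in mlx5_dev_spawn()
 * below); "sh" and "priv" are assumed to be valid objects of this PMD.
 */
static void __rte_unused
example_register_pmd_allocator(struct mlx5_dev_ctx_shared *sh,
			       struct mlx5_priv *priv)
{
	struct mlx5dv_ctx_allocators alctr = {
		.alloc = &mlx5_alloc_verbs_buf,
		.free = &mlx5_free_verbs_buf,
		.data = priv,
	};

	/* Hint libmlx5 to use the PMD allocator for data plane resources. */
	mlx5_glue->dv_set_context_attr(sh->ctx,
				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
				       (void *)((uintptr_t)&alctr));
}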

/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only if counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	struct mlx5dv_dr_domain *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return err;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
		if (sh->fdb_domain)
			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
	}
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* Tags must be destroyed together with the flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}

/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* Tags must be destroyed together with the flows beforehand. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
	       struct mlx5_dev_spawn_data *spawn,
	       struct mlx5_dev_config config)
{
	const struct mlx5_switch_info *switch_info = &spawn->info;
	struct mlx5_dev_ctx_shared *sh = NULL;
	struct ibv_port_attr port_attr;
	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
	struct rte_eth_dev *eth_dev = NULL;
	struct mlx5_priv *priv = NULL;
	int err = 0;
	unsigned int hw_padding = 0;
	unsigned int mps;
	unsigned int cqe_comp;
	unsigned int cqe_pad = 0;
	unsigned int tunnel_en = 0;
	unsigned int mpls_en = 0;
	unsigned int swp = 0;
	unsigned int mprq = 0;
	unsigned int mprq_min_stride_size_n = 0;
	unsigned int mprq_max_stride_size_n = 0;
	unsigned int mprq_min_stride_num_n = 0;
	unsigned int mprq_max_stride_num_n = 0;
	struct rte_ether_addr mac;
	char name[RTE_ETH_NAME_MAX_LEN];
	int own_domain_id = 0;
	uint16_t port_id;
	unsigned int i;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
	struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
#endif

	/* Determine if this port representor is supposed to be spawned. */
	if (switch_info->representor && dpdk_dev->devargs) {
		struct rte_eth_devargs eth_da;

		err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
		if (err) {
			rte_errno = -err;
			DRV_LOG(ERR, "failed to process device arguments: %s",
				strerror(rte_errno));
			return NULL;
		}
		for (i = 0; i < eth_da.nb_representor_ports; ++i)
			if (eth_da.representor_ports[i] ==
			    (uint16_t)switch_info->port_name)
				break;
		if (i == eth_da.nb_representor_ports) {
			rte_errno = EBUSY;
			return NULL;
		}
	}
	/* Build device name. */
	if (spawn->pf_bond < 0) {
		/* Single device. */
		if (!switch_info->representor)
			strlcpy(name, dpdk_dev->name, sizeof(name));
		else
			snprintf(name, sizeof(name), "%s_representor_%u",
				 dpdk_dev->name, switch_info->port_name);
	} else {
		/* Bonding device. */
		if (!switch_info->representor)
			snprintf(name, sizeof(name), "%s_%s",
				 dpdk_dev->name, spawn->ibv_dev->name);
		else
			snprintf(name, sizeof(name), "%s_%s_representor_%u",
				 dpdk_dev->name, spawn->ibv_dev->name,
				 switch_info->port_name);
	}
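	/*
	 * Illustrative name examples (the PCI/IB names are hypothetical):
	 * "0000:82:00.0" for a plain PF, "0000:82:00.0_representor_2" for
	 * representor #2, and "0000:82:00.0_mlx5_bond_0_representor_2" when
	 * the PF is a bonding slave.
	 */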
	/* check if the device is already spawned */
	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
		rte_errno = EEXIST;
		return NULL;
	}
	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		struct mlx5_mp_id mp_id;

		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			DRV_LOG(ERR, "can not attach rte ethdev");
			rte_errno = ENOMEM;
			return NULL;
		}
		eth_dev->device = dpdk_dev;
		eth_dev->dev_ops = &mlx5_dev_sec_ops;
		err = mlx5_proc_priv_init(eth_dev);
		if (err)
			return NULL;
		mp_id.port_id = eth_dev->data->port_id;
		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
		/* Receive command fd from primary process */
		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
		if (err < 0)
			goto err_secondary;
		/* Remap UAR for Tx queues. */
		err = mlx5_tx_uar_init_secondary(eth_dev, err);
		if (err)
			goto err_secondary;
		/*
		 * Ethdev pointer is still required as input since
		 * the primary device is not accessible from the
		 * secondary process.
		 */
		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
		return eth_dev;
err_secondary:
		mlx5_dev_close(eth_dev);
		return NULL;
	}
	/*
	 * Some parameters ("tx_db_nc" in particular) are needed in
	 * advance to create the dv/verbs device context. We process the
	 * devargs here to get them, and process devargs again later
	 * to override some hardware settings.
	 */
	err = mlx5_args(&config, dpdk_dev->devargs);
	if (err) {
		err = rte_errno;
		DRV_LOG(ERR, "failed to process device arguments: %s",
			strerror(rte_errno));
		goto error;
	}
	sh = mlx5_alloc_shared_ibctx(spawn, &config);
	if (!sh)
		return NULL;
	config.devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
	config.dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
	/*
	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
	 * as all ConnectX-5 devices.
	 */
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_MPW_ALLOWED;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
			DRV_LOG(DEBUG, "enhanced MPW is supported");
			mps = MLX5_MPW_ENHANCED;
		} else {
			DRV_LOG(DEBUG, "MPW is supported");
			mps = MLX5_MPW;
		}
	} else {
		DRV_LOG(DEBUG, "MPW isn't supported");
		mps = MLX5_MPW_DISABLED;
	}
#ifdef HAVE_IBV_MLX5_MOD_SWP
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
	DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
	config.swp = !!swp;
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
		struct mlx5dv_striding_rq_caps mprq_caps =
			dv_attr.striding_rq_caps;

		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
			mprq_caps.min_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
			mprq_caps.max_single_stride_log_num_of_bytes);
		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
			mprq_caps.min_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
			mprq_caps.max_single_wqe_log_num_of_strides);
		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
			mprq_caps.supported_qpts);
		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
		mprq = 1;
		mprq_min_stride_size_n =
			mprq_caps.min_single_stride_log_num_of_bytes;
		mprq_max_stride_size_n =
			mprq_caps.max_single_stride_log_num_of_bytes;
		mprq_min_stride_num_n =
			mprq_caps.min_single_wqe_log_num_of_strides;
		mprq_max_stride_num_n =
			mprq_caps.max_single_wqe_log_num_of_strides;
	}
#endif
	if (RTE_CACHE_LINE_SIZE == 128 &&
	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
		cqe_comp = 0;
	else
		cqe_comp = 1;
	config.cqe_comp = cqe_comp;
#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
	/* Whether device supports 128B Rx CQE padding. */
	cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
		  (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
#endif
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
		tunnel_en = ((dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
			     (dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
			     (dv_attr.tunnel_offloads_caps &
			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
	}
	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
		tunnel_en ? "" : "not ");
#else
	DRV_LOG(WARNING,
		"tunnel offloading disabled due to old OFED/rdma-core version");
#endif
	config.tunnel_en = tunnel_en;
#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
	mpls_en = ((dv_attr.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
		   (dv_attr.tunnel_offloads_caps &
		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
		mpls_en ? "" : "not ");
#else
	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
		" old OFED/rdma-core version or firmware configuration");
#endif
	config.mpls_en = mpls_en;
	/* Check port status. */
	err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
	if (err) {
		DRV_LOG(ERR, "port query failed: %s", strerror(err));
		goto error;
	}
	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
		DRV_LOG(ERR, "port is not configured in Ethernet mode");
		err = EINVAL;
		goto error;
	}
	if (port_attr.state != IBV_PORT_ACTIVE)
		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
			mlx5_glue->port_state_str(port_attr.state),
			port_attr.state);
	/* Allocate private eth device data. */
	priv = rte_zmalloc("ethdev private structure",
			   sizeof(*priv),
			   RTE_CACHE_LINE_SIZE);
	if (!priv) {
		DRV_LOG(ERR, "priv allocation failure");
		err = ENOMEM;
		goto error;
	}
	priv->sh = sh;
	priv->ibv_port = spawn->ibv_port;
	priv->pci_dev = spawn->pci_dev;
	priv->mtu = RTE_ETHER_MTU;
	priv->mp_id.port_id = port_id;
	strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
#ifndef RTE_ARCH_64
	/* Initialize UAR access locks for 32bit implementations. */
	rte_spinlock_init(&priv->uar_lock_cq);
	for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
		rte_spinlock_init(&priv->uar_lock[i]);
#endif
	/* Some internal functions rely on Netlink sockets, open them now. */
	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
	priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
	priv->representor = !!switch_info->representor;
	priv->master = !!switch_info->master;
	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
	priv->vport_meta_tag = 0;
	priv->vport_meta_mask = 0;
	priv->pf_bond = spawn->pf_bond;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
	/*
	 * The DevX port query API is implemented. E-Switch may use
	 * either vport or reg_c[0] metadata register to match on
	 * vport index. The engaged part of metadata register is
	 * defined by mask.
	 */
	if (switch_info->representor || switch_info->master) {
		devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
				      MLX5DV_DEVX_PORT_MATCH_REG_C_0;
		err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
						 &devx_port);
		if (err) {
			DRV_LOG(WARNING,
				"can't query devx port %d on device %s",
				spawn->ibv_port, spawn->ibv_dev->name);
			devx_port.comp_mask = 0;
		}
	}
	if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
		priv->vport_meta_tag = devx_port.reg_c_0.value;
		priv->vport_meta_mask = devx_port.reg_c_0.mask;
		if (!priv->vport_meta_mask) {
			DRV_LOG(ERR, "vport zero mask for port %d"
				" on bonding device %s",
				spawn->ibv_port, spawn->ibv_dev->name);
			err = ENOTSUP;
			goto error;
		}
		if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
			DRV_LOG(ERR, "invalid vport tag for port %d"
				" on bonding device %s",
				spawn->ibv_port, spawn->ibv_dev->name);
			err = ENOTSUP;
			goto error;
		}
	}
	if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
		priv->vport_id = devx_port.vport_num;
	} else if (spawn->pf_bond >= 0) {
		DRV_LOG(ERR, "can't deduce vport index for port %d"
			" on bonding device %s",
			spawn->ibv_port, spawn->ibv_dev->name);
		err = ENOTSUP;
		goto error;
	} else {
		/* Deduce the vport index in a compatible way. */
		priv->vport_id = switch_info->representor ?
				 switch_info->port_name + 1 : -1;
	}
#else
	/*
	 * Kernel/rdma_core supports single E-Switch per PF configurations
	 * only and the vport_id field contains the vport index for the
	 * associated VF, which is deduced from the representor port name.
	 * For example, suppose IB device port 10 has the attached network
	 * device eth0 with port name attribute pf0vf2; the VF number is
	 * then 2 and the vport index is 3 (2 + 1). This assignment scheme
	 * should be changed if multiple E-Switch instances per PF and/or
	 * PCI subfunctions are added.
	 */
	priv->vport_id = switch_info->representor ?
			 switch_info->port_name + 1 : -1;
#endif
	/* representor_id field keeps the unmodified VF index. */
	priv->representor_id = switch_info->representor ?
			       switch_info->port_name : -1;
	/*
	 * Look for sibling devices in order to reuse their switch domain
	 * if any, otherwise allocate one.
	 */
	MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
		const struct mlx5_priv *opriv =
			rte_eth_devices[port_id].data->dev_private;

		if (!opriv ||
		    opriv->sh != priv->sh ||
		    opriv->domain_id ==
		    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
			continue;
		priv->domain_id = opriv->domain_id;
		break;
	}
	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		err = rte_eth_switch_domain_alloc(&priv->domain_id);
		if (err) {
			err = rte_errno;
			DRV_LOG(ERR, "unable to allocate switch domain: %s",
				strerror(rte_errno));
			goto error;
		}
		own_domain_id = 1;
	}
	/* Override some values set by hardware configuration. */
	mlx5_args(&config, dpdk_dev->devargs);
	err = mlx5_dev_check_sibling_config(priv, &config);
	if (err)
		goto error;
	config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
			    IBV_DEVICE_RAW_IP_CSUM);
	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
		(config.hw_csum ? "" : "not "));
#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
	!defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
	DRV_LOG(DEBUG, "counters are not supported");
#endif
#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
	if (config.dv_flow_en) {
		DRV_LOG(WARNING, "DV flow is not supported");
		config.dv_flow_en = 0;
	}
#endif
	config.ind_table_max_size =
		sh->device_attr.max_rwq_indirection_table_size;
	/*
	 * Remove this check once DPDK supports larger/variable
	 * indirection tables.
	 */
	if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
		config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
		config.ind_table_max_size);
	config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
		(config.hw_vlan_strip ? "" : "not "));
	config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
		(config.hw_fcs_strip ? "" : "not "));
#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
			IBV_DEVICE_PCI_WRITE_END_PADDING);
#endif
	if (config.hw_padding && !hw_padding) {
		DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
		config.hw_padding = 0;
	} else if (config.hw_padding) {
		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
	}
	config.tso = (sh->device_attr.max_tso > 0 &&
		      (sh->device_attr.tso_supported_qpts &
		       (1 << IBV_QPT_RAW_PACKET)));
	if (config.tso)
		config.tso_max_payload_sz = sh->device_attr.max_tso;
	/*
	 * MPW is disabled by default, while the Enhanced MPW is enabled
	 * by default.
	 */
	if (config.mps == MLX5_ARG_UNSET)
		config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
							  MLX5_MPW_DISABLED;
	else
		config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
	DRV_LOG(INFO, "%sMPS is %s",
		config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
		config.mps == MLX5_MPW ? "legacy " : "",
		config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
	if (config.cqe_comp && !cqe_comp) {
		DRV_LOG(WARNING, "Rx CQE compression isn't supported");
		config.cqe_comp = 0;
	}
	if (config.cqe_pad && !cqe_pad) {
		DRV_LOG(WARNING, "Rx CQE padding isn't supported");
		config.cqe_pad = 0;
	} else if (config.cqe_pad) {
		DRV_LOG(INFO, "Rx CQE padding is enabled");
	}
	if (config.devx) {
		priv->counter_fallback = 0;
		err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
		if (err) {
			err = -err;
			goto error;
		}
		if (!config.hca_attr.flow_counters_dump)
			priv->counter_fallback = 1;
#ifndef HAVE_IBV_DEVX_ASYNC
		priv->counter_fallback = 1;
#endif
		if (priv->counter_fallback)
			DRV_LOG(INFO, "Use fall-back DV counter management");
		/* Check for LRO support. */
		if (config.dest_tir && config.hca_attr.lro_cap &&
		    config.dv_flow_en) {
			/* TBD check tunnel lro caps. */
			config.lro.supported = config.hca_attr.lro_cap;
			DRV_LOG(DEBUG, "Device supports LRO");
			/*
			 * If LRO timeout is not configured by application,
			 * use the minimal supported value.
			 */
			if (!config.lro.timeout)
				config.lro.timeout =
				config.hca_attr.lro_timer_supported_periods[0];
			DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
				config.lro.timeout);
		}
#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
		if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
		    config.dv_flow_en) {
			uint8_t reg_c_mask =
				config.hca_attr.qos.flow_meter_reg_c_ids;
			/*
			 * Meter needs two REG_C's for color match and pre-sfx
			 * flow match. Here get the REG_C for color match.
			 * REG_C_0 and REG_C_1 are reserved for metadata feature.
			 */
			reg_c_mask &= 0xfc;
			if (__builtin_popcount(reg_c_mask) < 1) {
				priv->mtr_en = 0;
				DRV_LOG(WARNING, "No available register for"
					" meter.");
			} else {
				priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
						      REG_C_0;
				priv->mtr_en = 1;
				priv->mtr_reg_share =
					config.hca_attr.qos.flow_meter_reg_share;
				DRV_LOG(DEBUG, "The REG_C used by the meter is %d",
					priv->mtr_color_reg);
			}
		}
#endif
	}
	if (config.mprq.enabled && mprq) {
		if (config.mprq.stride_num_n &&
		    (config.mprq.stride_num_n > mprq_max_stride_num_n ||
		     config.mprq.stride_num_n < mprq_min_stride_num_n)) {
			config.mprq.stride_num_n =
				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
						mprq_min_stride_num_n),
					mprq_max_stride_num_n);
			DRV_LOG(WARNING,
				"the number of strides"
				" for Multi-Packet RQ is out of range,"
				" setting default value (%u)",
				1 << config.mprq.stride_num_n);
		}
		if (config.mprq.stride_size_n &&
		    (config.mprq.stride_size_n > mprq_max_stride_size_n ||
		     config.mprq.stride_size_n < mprq_min_stride_size_n)) {
			config.mprq.stride_size_n =
				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
						mprq_min_stride_size_n),
					mprq_max_stride_size_n);
			DRV_LOG(WARNING,
				"the size of a stride"
				" for Multi-Packet RQ is out of range,"
				" setting default value (%u)",
				1 << config.mprq.stride_size_n);
		}
		config.mprq.min_stride_size_n = mprq_min_stride_size_n;
		config.mprq.max_stride_size_n = mprq_max_stride_size_n;
	} else if (config.mprq.enabled && !mprq) {
		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
		config.mprq.enabled = 0;
	}
	if (config.max_dump_files_num == 0)
		config.max_dump_files_num = 128;
	eth_dev = rte_eth_dev_allocate(name);
	if (eth_dev == NULL) {
		DRV_LOG(ERR, "can not allocate rte ethdev");
		err = ENOMEM;
		goto error;
	}
	/* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
	eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
	if (priv->representor) {
		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
		eth_dev->data->representor_id = priv->representor_id;
	}
	/*
	 * Store associated network device interface index. This index
	 * is permanent throughout the lifetime of the device, so we may
	 * store the ifindex here and use the cached value further on.
	 */
	MLX5_ASSERT(spawn->ifindex);
	priv->if_index = spawn->ifindex;
	eth_dev->data->dev_private = priv;
	priv->dev_data = eth_dev->data;
	eth_dev->data->mac_addrs = priv->mac;
	eth_dev->device = dpdk_dev;
	/* Configure the first MAC address by default. */
	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
		DRV_LOG(ERR,
			"port %u cannot get MAC address, is mlx5_en"
			" loaded? (errno: %s)",
			eth_dev->data->port_id, strerror(rte_errno));
		err = ENODEV;
		goto error;
	}
	DRV_LOG(INFO,
		"port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
		eth_dev->data->port_id,
		mac.addr_bytes[0], mac.addr_bytes[1],
		mac.addr_bytes[2], mac.addr_bytes[3],
		mac.addr_bytes[4], mac.addr_bytes[5]);
#ifdef RTE_LIBRTE_MLX5_DEBUG
	{
		char ifname[IF_NAMESIZE];

		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
				eth_dev->data->port_id, ifname);
		else
			DRV_LOG(DEBUG, "port %u ifname is unknown",
				eth_dev->data->port_id);
	}
#endif
	/* Get actual MTU if possible. */
	err = mlx5_get_mtu(eth_dev, &priv->mtu);
	if (err) {
		err = rte_errno;
		goto error;
	}
	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
		priv->mtu);
	/* Initialize burst functions to prevent crashes before link-up. */
	eth_dev->rx_pkt_burst = removed_rx_burst;
	eth_dev->tx_pkt_burst = removed_tx_burst;
	eth_dev->dev_ops = &mlx5_dev_ops;
	/* Register MAC address. */
	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
	if (config.vf && config.vf_nl_en)
		mlx5_nl_mac_addr_sync(priv->nl_socket_route,
				      mlx5_ifindex(eth_dev),
				      eth_dev->data->mac_addrs,
				      MLX5_MAX_MAC_ADDRESSES);
	priv->ctrl_flows = 0;
	TAILQ_INIT(&priv->flow_meters);
	TAILQ_INIT(&priv->flow_meter_profiles);
	/* Hint libmlx5 to use PMD allocator for data plane resources. */
	struct mlx5dv_ctx_allocators alctr = {
		.alloc = &mlx5_alloc_verbs_buf,
		.free = &mlx5_free_verbs_buf,
		.data = priv,
	};
	mlx5_glue->dv_set_context_attr(sh->ctx,
				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
				       (void *)((uintptr_t)&alctr));
	/* Bring Ethernet device up. */
	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
		eth_dev->data->port_id);
	mlx5_set_link_up(eth_dev);
	/*
	 * Even though the interrupt handler is not installed yet,
	 * interrupts will still trigger on the async_fd from
	 * Verbs context returned by ibv_open_device().
	 */
	mlx5_link_update(eth_dev, 0);
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
	      (switch_info->representor || switch_info->master)))
		config.dv_esw_en = 0;
#else
	config.dv_esw_en = 0;
#endif
	/* Detect minimal data bytes to inline. */
	mlx5_set_min_inline(spawn, &config);
	/* Store device configuration on private structure. */
	priv->config = config;
	/* Create context for virtual machine VLAN workaround. */
	priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
	if (config.dv_flow_en) {
		err = mlx5_alloc_shared_dr(priv);
		if (err)
			goto error;
		/*
		 * RSS id is shared with meter flow id. Meter flow id can only
		 * use the 24 MSB of the register.
		 */
		priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
					     MLX5_MTR_COLOR_BITS);
		if (!priv->qrss_id_pool) {
			DRV_LOG(ERR, "can't create flow id pool");
			err = ENOMEM;
			goto error;
		}
	}
	/* Supported Verbs flow priority number detection. */
	err = mlx5_flow_discover_priorities(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	priv->config.flow_prio = err;
	if (!priv->config.dv_esw_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
		DRV_LOG(WARNING, "metadata mode %u is not supported "
			"(no E-Switch)", priv->config.dv_xmeta_en);
		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
	}
	mlx5_set_metadata_mask(eth_dev);
	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    !priv->sh->dv_regc0_mask) {
		DRV_LOG(ERR, "metadata mode %u is not supported "
			"(no metadata reg_c[0] is available)",
			priv->config.dv_xmeta_en);
		err = ENOTSUP;
		goto error;
	}
	/*
	 * Allocate the buffer for flow creation, just once.
	 * The allocation must be done before any flow is created.
	 */
	mlx5_flow_alloc_intermediate(eth_dev);
	/* Query availability of metadata reg_c's. */
	err = mlx5_flow_discover_mreg_c(eth_dev);
	if (err < 0) {
		err = -err;
		goto error;
	}
	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
		DRV_LOG(DEBUG,
			"port %u extensive metadata register is not supported",
			eth_dev->data->port_id);
		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
			DRV_LOG(ERR, "metadata mode %u is not supported "
				"(no metadata registers available)",
				priv->config.dv_xmeta_en);
			err = ENOTSUP;
			goto error;
		}
	}
	if (priv->config.dv_flow_en &&
	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
	    mlx5_flow_ext_mreg_supported(eth_dev) &&
	    priv->sh->dv_regc0_mask) {
		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
						      MLX5_FLOW_MREG_HTABLE_SZ);
		if (!priv->mreg_cp_tbl) {
			err = ENOMEM;
			goto error;
		}
	}
	return eth_dev;
error:
	if (priv) {
		if (priv->mreg_cp_tbl)
			mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
		if (priv->sh)
			mlx5_os_free_shared_dr(priv);
		if (priv->nl_socket_route >= 0)
			close(priv->nl_socket_route);
		if (priv->nl_socket_rdma >= 0)
			close(priv->nl_socket_rdma);
		if (priv->vmwa_context)
			mlx5_vlan_vmwa_exit(priv->vmwa_context);
		if (priv->qrss_id_pool)
			mlx5_flow_id_pool_release(priv->qrss_id_pool);
		if (own_domain_id)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
		rte_free(priv);
		if (eth_dev != NULL)
			eth_dev->data->dev_private = NULL;
	}
	if (eth_dev != NULL) {
		/*
		 * mac_addrs must not be freed alone because it is part of
		 * dev_private.
		 */
		eth_dev->data->mac_addrs = NULL;
		rte_eth_dev_release_port(eth_dev);
	}
	if (sh)
		mlx5_free_shared_ibctx(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Comparison callback to sort device data.
 *
 * This is meant to be used with qsort().
 *
 * @param a[in]
 *   Pointer to pointer to first data object.
 * @param b[in]
 *   Pointer to pointer to second data object.
 *
 * @return
 *   0 if both objects are equal, less than 0 if the first argument is less
 *   than the second, greater than 0 otherwise.
 */
static int
mlx5_dev_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_switch_info *si_a =
		&((const struct mlx5_dev_spawn_data *)a)->info;
	const struct mlx5_switch_info *si_b =
		&((const struct mlx5_dev_spawn_data *)b)->info;
	int ret;

	/* Master device first. */
	ret = si_b->master - si_a->master;
	if (ret)
		return ret;
	/* Then representor devices. */
	ret = si_b->representor - si_a->representor;
	if (ret)
		return ret;
	/* Unidentified devices come last in no specific order. */
	if (!si_a->representor)
		return 0;
	/* Order representors by name. */
	return si_a->port_name - si_b->port_name;
}
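
/*
 * Usage note: the probe path below sorts the spawn list with this callback,
 * i.e. qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp), so the
 * master device is probed first and representors follow in ID order.
 */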

/**
 * Match PCI information for possible slaves of bonding device.
 *
 * @param[in] ibv_dev
 *   Pointer to Infiniband device structure.
 * @param[in] pci_dev
 *   Pointer to PCI device structure to match PCI address.
 * @param[in] nl_rdma
 *   Netlink RDMA group socket handle.
 *
 * @return
 *   Negative value if no bonding device is found, otherwise the
 *   non-negative index of the slave PF in the bonding.
 */
static int
mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
			   const struct rte_pci_device *pci_dev,
			   int nl_rdma)
{
	char ifname[IF_NAMESIZE + 1];
	unsigned int ifindex;
	unsigned int np, i;
	FILE *file = NULL;
	int pf = -1;

	/*
	 * Try to get master device name. If something goes
	 * wrong suppose the lack of kernel support and no
	 * bonding devices.
	 */
	if (nl_rdma < 0)
		return -1;
	if (!strstr(ibv_dev->name, "bond"))
		return -1;
	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
	if (!np)
		return -1;
	/*
	 * The Master device might not be on the predefined
	 * port (not on port index 1, it is not guaranteed),
	 * we have to scan all Infiniband device ports and
	 * find master.
	 */
	for (i = 1; i <= np; ++i) {
		/* Check whether Infiniband port is populated. */
		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
		if (!ifindex)
			continue;
		if (!if_indextoname(ifindex, ifname))
			continue;
		/* Try to read bonding slave names from sysfs. */
		MKSTR(slaves,
		      "/sys/class/net/%s/master/bonding/slaves", ifname);
		file = fopen(slaves, "r");
		if (file)
			break;
	}
	if (!file)
		return -1;
	/* Use safe format to check maximal buffer length. */
	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
	while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
		char tmp_str[IF_NAMESIZE + 32];
		struct rte_pci_addr pci_addr;
		struct mlx5_switch_info info;

		/* Process slave interface names in the loop. */
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s", ifname);
		if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
			DRV_LOG(WARNING, "can not get PCI address"
				" for netdev \"%s\"", ifname);
			continue;
		}
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		/* Slave interface PCI address match found. */
		fclose(file);
		snprintf(tmp_str, sizeof(tmp_str),
			 "/sys/class/net/%s/phys_port_name", ifname);
		file = fopen(tmp_str, "rb");
		if (!file)
			break;
		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
		if (fscanf(file, "%32s", tmp_str) == 1)
			mlx5_translate_port_name(tmp_str, &info);
		if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
		    info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
			pf = info.port_name;
		break;
	}
	if (file)
		fclose(file);
	return pf;
}

/**
 * DPDK callback to register a PCI device.
 *
 * This function spawns Ethernet devices out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		  struct rte_pci_device *pci_dev)
{
	struct ibv_device **ibv_list;
	/*
	 * Number of found IB Devices matching with requested PCI BDF.
	 * nd != 1 means there are multiple IB devices over the same
	 * PCI device and we have representors and master.
	 */
	unsigned int nd = 0;
	/*
	 * Number of found IB device Ports. nd = 1 and np = 1..n means
	 * we have the single multiport IB device, and there may be
	 * representors attached to some of the found ports.
	 */
	unsigned int np = 0;
	/*
	 * Number of DPDK ethernet devices to spawn - either over
	 * multiple IB devices or multiple ports of a single IB device.
	 * Actually this is the number of iterations to spawn.
	 */
	unsigned int ns = 0;
	/*
	 * Bonding device
	 *   < 0 - no bonding device (single one)
	 *  >= 0 - bonding device (value is slave PF index)
	 */
	int bd = -1;
	struct mlx5_dev_spawn_data *list = NULL;
	struct mlx5_dev_config dev_config;
	int ret;

	if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
		DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
			" driver.");
		return 1;
	}
	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
		mlx5_pmd_socket_init();
	ret = mlx5_init_once();
	if (ret) {
		DRV_LOG(ERR, "unable to init PMD global data: %s",
			strerror(rte_errno));
		return -rte_errno;
	}
	MLX5_ASSERT(pci_drv == &mlx5_driver);
	errno = 0;
	ibv_list = mlx5_glue->get_device_list(&ret);
	if (ibv_list == NULL) {
		rte_errno = errno ? errno : ENOSYS;
		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	/*
	 * First scan the list of all Infiniband devices to find
	 * matching ones, gathering into the list.
	 */
	struct ibv_device *ibv_match[ret + 1];
	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
	unsigned int i;

	while (ret-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
		bd = mlx5_device_bond_pci_match
				(ibv_list[ret], pci_dev, nl_rdma);
		if (bd >= 0) {
			/*
			 * Bonding device detected. Only one match is allowed,
			 * the bonding is supported over multi-port IB device,
			 * there should be no matches on representor PCI
			 * functions or non VF LAG bonding devices with
			 * specified address.
			 */
			if (nd) {
				DRV_LOG(ERR,
					"multiple PCI match on bonding device"
					" \"%s\" found", ibv_list[ret]->name);
				rte_errno = ENOENT;
				ret = -rte_errno;
				goto exit;
			}
			DRV_LOG(INFO, "PCI information matches for"
				" slave %d bonding device \"%s\"",
				bd, ibv_list[ret]->name);
			ibv_match[nd++] = ibv_list[ret];
			break;
		}
		if (mlx5_dev_to_pci_addr
			(ibv_list[ret]->ibdev_path, &pci_addr))
			continue;
		if (pci_dev->addr.domain != pci_addr.domain ||
		    pci_dev->addr.bus != pci_addr.bus ||
		    pci_dev->addr.devid != pci_addr.devid ||
		    pci_dev->addr.function != pci_addr.function)
			continue;
		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
			ibv_list[ret]->name);
		ibv_match[nd++] = ibv_list[ret];
	}
	ibv_match[nd] = NULL;
	if (!nd) {
		/* No device matches, just complain and bail out. */
		DRV_LOG(WARNING,
			"no Verbs device matches PCI device " PCI_PRI_FMT ","
			" are kernel drivers loaded?",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function);
		rte_errno = ENOENT;
		ret = -rte_errno;
		goto exit;
	}
	if (nd == 1) {
		/*
		 * The single matching device found may have multiple ports.
		 * Each port may be a representor, so we have to check the
		 * port number and the representors' existence.
		 */
		if (nl_rdma >= 0)
			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
		if (!np)
			DRV_LOG(WARNING, "can not get IB device \"%s\""
				" ports number", ibv_match[0]->name);
		if (bd >= 0 && !np) {
			DRV_LOG(ERR, "can not get ports"
				" for bonding device");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
#ifndef HAVE_MLX5DV_DR_DEVX_PORT
	if (bd >= 0) {
		/*
		 * This may happen if there is VF LAG kernel support and
		 * application is compiled with older rdma_core library.
		 */
		DRV_LOG(ERR,
			"No kernel/verbs support for VF LAG bonding found.");
		rte_errno = ENOTSUP;
		ret = -rte_errno;
		goto exit;
	}
#endif
	/*
	 * Now we can determine the maximal
	 * amount of devices to be spawned.
	 */
	list = rte_zmalloc("device spawn data",
			   sizeof(struct mlx5_dev_spawn_data) *
			   (np ? np : nd),
			   RTE_CACHE_LINE_SIZE);
	if (!list) {
		DRV_LOG(ERR, "spawn data array allocation failure");
		rte_errno = ENOMEM;
		ret = -rte_errno;
		goto exit;
	}
	if (bd >= 0 || np > 1) {
		/*
		 * Single IB device with multiple ports found,
		 * it may be E-Switch master device and representors.
		 * We have to perform identification through the ports.
		 */
		MLX5_ASSERT(nl_rdma >= 0);
		MLX5_ASSERT(ns == 0);
		MLX5_ASSERT(nd == 1);
		MLX5_ASSERT(np);
		for (i = 1; i <= np; ++i) {
			list[ns].max_port = np;
			list[ns].ibv_port = i;
			list[ns].ibv_dev = ibv_match[0];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = bd;
			list[ns].ifindex = mlx5_nl_ifindex
				(nl_rdma, list[ns].ibv_dev->name, i);
			if (!list[ns].ifindex) {
				/*
				 * No network interface index found for the
				 * specified port, it means there is no
				 * representor on this port. It's OK,
				 * there can be disabled ports, for example
				 * if sriov_numvfs < sriov_totalvfs.
				 */
				continue;
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
						(nl_route,
						 list[ns].ifindex,
						 &list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && bd >= 0) {
				switch (list[ns].info.name_type) {
				case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
					if (list[ns].info.port_name == bd)
						ns++;
					break;
				case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
					if (list[ns].info.pf_num == bd)
						ns++;
					break;
				default:
					break;
				}
				continue;
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master))
				ns++;
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the IB device with multiple ports");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	} else {
		/*
		 * The existence of several matching entries (nd > 1) means
		 * port representors have been instantiated. No existing Verbs
		 * call nor sysfs entries can tell them apart, this can only
		 * be done through Netlink calls assuming kernel drivers are
		 * recent enough to support them.
		 *
		 * In the event of identification failure through Netlink,
		 * try again through sysfs, then:
		 *
		 * 1. A single IB device matches (nd == 1) with single
		 *    port (np=0/1) and is not a representor, assume
		 *    no switch support.
		 *
		 * 2. Otherwise no safe assumptions can be made;
		 *    complain louder and bail out.
		 */
		for (i = 0; i != nd; ++i) {
			memset(&list[ns].info, 0, sizeof(list[ns].info));
			list[ns].max_port = 1;
			list[ns].ibv_port = 1;
			list[ns].ibv_dev = ibv_match[i];
			list[ns].eth_dev = NULL;
			list[ns].pci_dev = pci_dev;
			list[ns].pf_bond = -1;
			list[ns].ifindex = 0;
			if (nl_rdma >= 0)
				list[ns].ifindex = mlx5_nl_ifindex
					(nl_rdma, list[ns].ibv_dev->name, 1);
			if (!list[ns].ifindex) {
				char ifname[IF_NAMESIZE];

				/*
				 * Netlink failed, it may happen with old
				 * ib_core kernel driver (before 4.16).
				 * We may assume the driver is old because
				 * here we are processing single-port IB
				 * devices. Let's try sysfs to retrieve
				 * the ifindex. The method works for
				 * master device only.
				 */
				if (nd > 1) {
					/*
					 * Multiple devices found, assume
					 * representors, can not distinguish
					 * master/representor and retrieve
					 * ifindex via sysfs.
					 */
					continue;
				}
				ret = mlx5_get_master_ifname
					(ibv_match[i]->ibdev_path, &ifname);
				if (!ret)
					list[ns].ifindex =
						if_nametoindex(ifname);
				if (!list[ns].ifindex) {
					/*
					 * No network interface index found
					 * for the specified device, it means
					 * it is neither a representor
					 * nor a master.
					 */
					continue;
				}
			}
			ret = -1;
			if (nl_route >= 0)
				ret = mlx5_nl_switch_info
						(nl_route,
						 list[ns].ifindex,
						 &list[ns].info);
			if (ret || (!list[ns].info.representor &&
				    !list[ns].info.master)) {
				/*
				 * We failed to recognize representors with
				 * Netlink, let's try to perform the task
				 * with sysfs.
				 */
				ret = mlx5_sysfs_switch_info
						(list[ns].ifindex,
						 &list[ns].info);
			}
			if (!ret && (list[ns].info.representor ^
				     list[ns].info.master)) {
				ns++;
			} else if ((nd == 1) &&
				   !list[ns].info.representor &&
				   !list[ns].info.master) {
				/*
				 * Single IB device with
				 * one physical port and
				 * attached network device.
				 * Maybe SR-IOV is not enabled
				 * or there are no representors.
				 */
				DRV_LOG(INFO, "no E-Switch support detected");
				ns++;
				break;
			}
		}
		if (!ns) {
			DRV_LOG(ERR,
				"unable to recognize master/representors"
				" on the multiple IB devices");
			rte_errno = ENOENT;
			ret = -rte_errno;
			goto exit;
		}
	}
	MLX5_ASSERT(ns);
	/*
	 * Sort list to probe devices in natural order for users' convenience
	 * (i.e. master first, then representors from lowest to highest ID).
	 */
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
	/* Default configuration. */
	dev_config = (struct mlx5_dev_config){
		.hw_padding = 0,
		.mps = MLX5_ARG_UNSET,
		.dbnc = MLX5_ARG_UNSET,
		.rx_vec_en = 1,
		.txq_inline_max = MLX5_ARG_UNSET,
		.txq_inline_min = MLX5_ARG_UNSET,
		.txq_inline_mpw = MLX5_ARG_UNSET,
		.txqs_inline = MLX5_ARG_UNSET,
		.vf_nl_en = 1,
		.mr_ext_memseg_en = 1,
		.mprq = {
			.enabled = 0, /* Disabled by default. */
			.stride_num_n = 0,
			.stride_size_n = 0,
			.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
		},
		.dv_esw_en = 1,
		.dv_flow_en = 1,
		.log_hp_size = MLX5_ARG_UNSET,
	};
	/* Device specific configuration. */
	switch (pci_dev->id.device_id) {
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
	case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
		dev_config.vf = 1;
		break;
	default:
		break;
	}
	for (i = 0; i != ns; ++i) {
		uint32_t restore;

		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
						 &list[i],
						 dev_config);
		if (!list[i].eth_dev) {
			if (rte_errno != EBUSY && rte_errno != EEXIST)
				break;
			/* Device is disabled or already spawned. Ignore it. */
			continue;
		}
		restore = list[i].eth_dev->data->dev_flags;
		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
		/* Restore non-PCI flags cleared by the above call. */
		list[i].eth_dev->data->dev_flags |= restore;
		rte_eth_dev_probing_finish(list[i].eth_dev);
	}
	if (i != ns) {
		DRV_LOG(ERR,
			"probe of PCI device " PCI_PRI_FMT " aborted after"
			" encountering an error: %s",
			pci_dev->addr.domain, pci_dev->addr.bus,
			pci_dev->addr.devid, pci_dev->addr.function,
			strerror(rte_errno));
		ret = -rte_errno;
		/* Roll back. */
		while (i--) {
			if (!list[i].eth_dev)
				continue;
			mlx5_dev_close(list[i].eth_dev);
			/* mac_addrs must not be freed because in dev_private */
			list[i].eth_dev->data->mac_addrs = NULL;
			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
		}
		/* Restore original error. */
		rte_errno = -ret;
	} else {
		ret = 0;
	}
exit:
	/*
	 * Do the routine cleanup:
	 * - close opened Netlink sockets
	 * - free allocated spawn data array
	 * - free the Infiniband device list
	 */
	if (nl_rdma >= 0)
		close(nl_rdma);
	if (nl_route >= 0)
		close(nl_route);
	if (list)
		rte_free(list);
	MLX5_ASSERT(ibv_list);
	mlx5_glue->free_device_list(ibv_list);
	return ret;
}

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Get environment variable to store. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}
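
/*
 * Usage note: the function above pairs with
 * mlx5_restore_doorbell_mapping_env() below around the device creation,
 * as done in mlx5_os_open_device():
 *	dbmap_env = mlx5_config_doorbell_mapping_env(config);
 *	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
 *	mlx5_restore_doorbell_mapping_env(dbmap_env);
 */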

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}

/**
 * Extract pdn of PD object using DV API.
 *
 * @param[in] pd
 *   Pointer to the verbs PD object.
 * @param[out] pdn
 *   Pointer to the PD object number variable.
 *
 * @return
 *   0 on success, error value otherwise.
 */
int
mlx5_os_get_pdn(void *pd, uint32_t *pdn)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get PD object info");
		return ret;
	}
	*pdn = pd_info.pdn;
	return 0;
#else
	(void)pd;
	(void)pdn;
	return -ENOTSUP;
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
}
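
/*
 * A minimal usage sketch (illustrative; it assumes the shared context
 * keeps its Verbs protection domain in sh->pd, as this PMD does):
 */
static int __rte_unused
example_query_pdn(struct mlx5_dev_ctx_shared *sh, uint32_t *pdn)
{
	int ret = mlx5_os_get_pdn(sh->pd, pdn);

	if (ret)
		DRV_LOG(ERR, "failed to extract pdn from PD");
	return ret;
}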

/**
 * Function API to open IB device.
 *
 * This function calls the Linux glue APIs to open a device.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[out] config
 *   Pointer to device configuration structure.
 * @param[out] sh
 *   Pointer to shared context structure.
 *
 * @return
 *   0 on success, a positive error value otherwise.
 */
int
mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
		    const struct mlx5_dev_config *config,
		    struct mlx5_dev_ctx_shared *sh)
{
	int dbmap_env;
	int err = 0;

	/*
	 * Configure environment variable "MLX5_SHUT_UP_BF"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			return err;
		DRV_LOG(DEBUG, "DevX is NOT supported");
		err = 0;
	}
	return err;
}

/**
 * Install shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
{
	int ret;
	int flags;

	sh->intr_handle.fd = -1;
	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
		    F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO, "failed to change file descriptor async event"
			" queue");
	} else {
		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle,
					mlx5_dev_interrupt_handler, sh)) {
			DRV_LOG(INFO, "Failed to install the shared interrupt.");
			sh->intr_handle.fd = -1;
		}
	}
	if (sh->devx) {
#ifdef HAVE_IBV_DEVX_ASYNC
		sh->intr_handle_devx.fd = -1;
		sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
		if (!sh->devx_comp) {
			DRV_LOG(INFO, "failed to allocate devx_comp.");
			return;
		}
		flags = fcntl(sh->devx_comp->fd, F_GETFL);
		ret = fcntl(sh->devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
		if (ret) {
			DRV_LOG(INFO, "failed to change file descriptor"
				" devx comp");
			return;
		}
		sh->intr_handle_devx.fd = sh->devx_comp->fd;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle_devx,
					mlx5_dev_interrupt_handler_devx, sh)) {
			DRV_LOG(INFO, "Failed to install the devx shared"
				" interrupt.");
			sh->intr_handle_devx.fd = -1;
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
}

/**
 * Uninstall shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
void
mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
{
	if (sh->intr_handle.fd >= 0)
		mlx5_intr_callback_unregister(&sh->intr_handle,
					      mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_IBV_DEVX_ASYNC
	if (sh->intr_handle_devx.fd >= 0)
		rte_intr_callback_unregister(&sh->intr_handle_devx,
					     mlx5_dev_interrupt_handler_devx,
					     sh);
	if (sh->devx_comp)
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
#endif
}