net/mlx5: reduce PCI dependency
drivers/net/mlx5/linux/mlx5_os.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/ethtool.h>
#include <fcntl.h>

#include <rte_malloc.h>
#include <ethdev_driver.h>
#include <ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>
#include <rte_eal_paging.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>
#include <mlx5_malloc.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_common_os.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rx.h"
#include "mlx5_tx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"
#include "mlx5_verbs.h"
#include "mlx5_nl.h"
#include "mlx5_devx.h"

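/*
 * Fallback definitions: older rdma-core releases may not expose these
 * mlx5dv context flags, so define them here to keep the checks below
 * compilable.
 */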
#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/* rte flow indexed pool configuration. */
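/*
 * Note: judging by the .type names, the three entries below appear to back
 * control flows, user rte_flow rules (the only pool with a per-core cache),
 * and Rx metadata copy (MCP) flows, in that order.
 */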
static struct mlx5_indexed_pool_config icfg[] = {
        {
                .size = sizeof(struct rte_flow),
                .trunk_size = 64,
                .need_lock = 1,
                .release_mem_en = 0,
                .malloc = mlx5_malloc,
                .free = mlx5_free,
                .per_core_cache = 0,
                .type = "ctl_flow_ipool",
        },
        {
                .size = sizeof(struct rte_flow),
                .trunk_size = 64,
                .grow_trunk = 3,
                .grow_shift = 2,
                .need_lock = 1,
                .release_mem_en = 0,
                .malloc = mlx5_malloc,
                .free = mlx5_free,
                .per_core_cache = 1 << 14,
                .type = "rte_flow_ipool",
        },
        {
                .size = sizeof(struct rte_flow),
                .trunk_size = 64,
                .grow_trunk = 3,
                .grow_shift = 2,
                .need_lock = 1,
                .release_mem_en = 0,
                .malloc = mlx5_malloc,
                .free = mlx5_free,
                .per_core_cache = 0,
                .type = "mcp_flow_ipool",
        },
};

/**
 * Set the completion channel file descriptor to be non-blocking.
 *
 * @param[in] fd
 *   The file descriptor (representing the interrupt) used in this channel.
 *
 * @return
 *   0 on successfully setting the fd to non-blocking, non-zero otherwise.
 */
int
mlx5_os_set_nonblock_channel_fd(int fd)
{
        int flags;

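        /* Read the current descriptor flags and OR in O_NONBLOCK. */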
        flags = fcntl(fd, F_GETFL);
        return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}

/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with an out parameter of type 'struct ibv_device_attr_ex *'. The mlx5
 * device attributes are then filled in from the glue out parameter.
 *
 * @param ctx
 *   Pointer to ibv context.
 *
 * @param device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, non-zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
        int err;
        struct ibv_device_attr_ex attr_ex;
        memset(device_attr, 0, sizeof(*device_attr));
        err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
        if (err)
                return err;

        device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
        device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
        device_attr->max_sge = attr_ex.orig_attr.max_sge;
        device_attr->max_cq = attr_ex.orig_attr.max_cq;
        device_attr->max_cqe = attr_ex.orig_attr.max_cqe;
        device_attr->max_mr = attr_ex.orig_attr.max_mr;
        device_attr->max_pd = attr_ex.orig_attr.max_pd;
        device_attr->max_qp = attr_ex.orig_attr.max_qp;
        device_attr->max_srq = attr_ex.orig_attr.max_srq;
        device_attr->max_srq_wr = attr_ex.orig_attr.max_srq_wr;
        device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
        device_attr->max_rwq_indirection_table_size =
                attr_ex.rss_caps.max_rwq_indirection_table_size;
        device_attr->max_tso = attr_ex.tso_caps.max_tso;
        device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

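        /* Additionally query the mlx5dv (direct Verbs) attributes. */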
        struct mlx5dv_context dv_attr = { .comp_mask = 0 };
        err = mlx5_glue->dv_query_device(ctx, &dv_attr);
        if (err)
                return err;

        device_attr->flags = dv_attr.flags;
        device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
        device_attr->sw_parsing_offloads =
                dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
        device_attr->min_single_stride_log_num_of_bytes =
                dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
        device_attr->max_single_stride_log_num_of_bytes =
                dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
        device_attr->min_single_wqe_log_num_of_strides =
                dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
        device_attr->max_single_wqe_log_num_of_strides =
                dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
        device_attr->stride_supported_qpts =
                dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif
        strlcpy(device_attr->fw_ver, attr_ex.orig_attr.fw_ver,
                sizeof(device_attr->fw_ver));

        return err;
}

/**
 * Verbs callback to allocate memory. This function should allocate the space
 * according to the size provided, residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently rte_mem_page_size()).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
        struct mlx5_dev_ctx_shared *sh = data;
        void *ret;
        size_t alignment = rte_mem_page_size();
        if (alignment == (size_t)-1) {
                DRV_LOG(ERR, "Failed to get mem page size");
                rte_errno = ENOMEM;
                return NULL;
        }

        MLX5_ASSERT(data != NULL);
        ret = mlx5_malloc(0, size, alignment, sh->numa_node);
        if (!ret && size)
                rte_errno = ENOMEM;
        return ret;
}

/**
 * Detect whether the misc5 match parameter is supported.
 *
 * @param[in] priv
 *   Device private data pointer
 */
#ifdef HAVE_MLX5DV_DR
static void
__mlx5_discovery_misc5_cap(struct mlx5_priv *priv)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
        /* Dummy VxLAN matcher to detect rdma-core misc5 cap
         * Case: IPv4--->UDP--->VxLAN--->vni
         */
        void *tbl;
        struct mlx5_flow_dv_match_params matcher_mask;
        void *match_m;
        void *matcher;
        void *headers_m;
        void *misc5_m;
        uint32_t *tunnel_header_m;
        struct mlx5dv_flow_matcher_attr dv_attr;

        memset(&matcher_mask, 0, sizeof(matcher_mask));
        matcher_mask.size = sizeof(matcher_mask.buf);
        match_m = matcher_mask.buf;
        headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers);
        misc5_m = MLX5_ADDR_OF(fte_match_param,
                               match_m, misc_parameters_5);
        tunnel_header_m = (uint32_t *)
                                MLX5_ADDR_OF(fte_match_set_misc5,
                                misc5_m, tunnel_header_1);
        MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
        MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 4);
        MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff);
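        /* The VxLAN VNI is 24 bits wide, hence the 24-bit mask below. */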
        *tunnel_header_m = 0xffffff;

        tbl = mlx5_glue->dr_create_flow_tbl(priv->sh->rx_domain, 1);
        if (!tbl) {
                DRV_LOG(INFO, "No SW steering support");
                return;
        }
        dv_attr.type = IBV_FLOW_ATTR_NORMAL;
        dv_attr.match_mask = (void *)&matcher_mask;
        dv_attr.match_criteria_enable =
                        (1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT) |
                        (1 << MLX5_MATCH_CRITERIA_ENABLE_MISC5_BIT);
        dv_attr.priority = 3;
#ifdef HAVE_MLX5DV_DR_ESWITCH
        void *misc2_m;
        if (priv->config.dv_esw_en) {
                /* FDB enabled reg_c_0 */
                dv_attr.match_criteria_enable |=
                                (1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT);
                misc2_m = MLX5_ADDR_OF(fte_match_param,
                                       match_m, misc_parameters_2);
                MLX5_SET(fte_match_set_misc2, misc2_m,
                         metadata_reg_c_0, 0xffff);
        }
#endif
        matcher = mlx5_glue->dv_create_flow_matcher(priv->sh->ctx,
                                                    &dv_attr, tbl);
        if (matcher) {
                priv->sh->misc5_cap = 1;
                mlx5_glue->dv_destroy_flow_matcher(matcher);
        }
        mlx5_glue->dr_destroy_flow_tbl(tbl);
#else
        RTE_SET_USED(priv);
#endif
}
#endif

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
        MLX5_ASSERT(data != NULL);
        mlx5_free(ptr);
}

/**
 * Initialize DR related data within private structure.
 * Routine checks the reference counter and does actual
 * resources creation/initialization only for the first reference.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
        struct mlx5_dev_ctx_shared *sh = priv->sh;
        char s[MLX5_NAME_SIZE] __rte_unused;
        int err;

        MLX5_ASSERT(sh && sh->refcnt);
        if (sh->refcnt > 1)
                return 0;
        err = mlx5_alloc_table_hash_list(priv);
        if (err)
                goto error;
        /* The resources below are only valid with DV support. */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
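        /* Each list below takes create/match/remove/clone/clone-free callbacks. */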
        /* Init port id action list. */
        snprintf(s, sizeof(s), "%s_port_id_action_list", sh->ibdev_name);
        sh->port_id_action_list = mlx5_list_create(s, sh, true,
                                                   flow_dv_port_id_create_cb,
                                                   flow_dv_port_id_match_cb,
                                                   flow_dv_port_id_remove_cb,
                                                   flow_dv_port_id_clone_cb,
                                                 flow_dv_port_id_clone_free_cb);
        if (!sh->port_id_action_list)
                goto error;
        /* Init push vlan action list. */
        snprintf(s, sizeof(s), "%s_push_vlan_action_list", sh->ibdev_name);
        sh->push_vlan_action_list = mlx5_list_create(s, sh, true,
                                                    flow_dv_push_vlan_create_cb,
                                                    flow_dv_push_vlan_match_cb,
                                                    flow_dv_push_vlan_remove_cb,
                                                    flow_dv_push_vlan_clone_cb,
                                               flow_dv_push_vlan_clone_free_cb);
        if (!sh->push_vlan_action_list)
                goto error;
        /* Init sample action list. */
        snprintf(s, sizeof(s), "%s_sample_action_list", sh->ibdev_name);
        sh->sample_action_list = mlx5_list_create(s, sh, true,
                                                  flow_dv_sample_create_cb,
                                                  flow_dv_sample_match_cb,
                                                  flow_dv_sample_remove_cb,
                                                  flow_dv_sample_clone_cb,
                                                  flow_dv_sample_clone_free_cb);
        if (!sh->sample_action_list)
                goto error;
        /* Init dest array action list. */
        snprintf(s, sizeof(s), "%s_dest_array_list", sh->ibdev_name);
        sh->dest_array_list = mlx5_list_create(s, sh, true,
                                               flow_dv_dest_array_create_cb,
                                               flow_dv_dest_array_match_cb,
                                               flow_dv_dest_array_remove_cb,
                                               flow_dv_dest_array_clone_cb,
                                              flow_dv_dest_array_clone_free_cb);
        if (!sh->dest_array_list)
                goto error;
#endif
#ifdef HAVE_MLX5DV_DR
        void *domain;

        /* This is the first reference, we should initialize structures. */
        domain = mlx5_glue->dr_create_domain(sh->ctx,
                                             MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
        if (!domain) {
                DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
                err = errno;
                goto error;
        }
        sh->rx_domain = domain;
        domain = mlx5_glue->dr_create_domain(sh->ctx,
                                             MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
        if (!domain) {
                DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
                err = errno;
                goto error;
        }
        sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
        if (priv->config.dv_esw_en) {
                domain = mlx5_glue->dr_create_domain
                        (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
                if (!domain) {
                        DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
                        err = errno;
                        goto error;
                }
                sh->fdb_domain = domain;
        }
        /*
         * The drop action is just a dummy placeholder in rdma-core. It
         * does not belong to any domain, has no attributes, and can be
         * shared by the entire device.
         */
        sh->dr_drop_action = mlx5_glue->dr_create_flow_action_drop();
        if (!sh->dr_drop_action) {
                DRV_LOG(ERR, "FDB mlx5dv_dr_create_flow_action_drop failed");
                err = errno;
                goto error;
        }
#endif
        if (!sh->tunnel_hub && priv->config.dv_miss_info)
                err = mlx5_alloc_tunnel_hub(sh);
        if (err) {
                DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
                goto error;
        }
        if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
                mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
                mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
                if (sh->fdb_domain)
                        mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
        }
        sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
        if (!priv->config.allow_duplicate_pattern) {
#ifndef HAVE_MLX5_DR_ALLOW_DUPLICATE
                DRV_LOG(WARNING, "Disallow duplicate pattern is not supported - maybe old rdma-core version?");
#endif
                mlx5_glue->dr_allow_duplicate_rules(sh->rx_domain, 0);
                mlx5_glue->dr_allow_duplicate_rules(sh->tx_domain, 0);
                if (sh->fdb_domain)
                        mlx5_glue->dr_allow_duplicate_rules(sh->fdb_domain, 0);
        }

        __mlx5_discovery_misc5_cap(priv);
#endif /* HAVE_MLX5DV_DR */
        sh->default_miss_action =
                        mlx5_glue->dr_create_flow_action_default_miss();
        if (!sh->default_miss_action)
                DRV_LOG(WARNING, "Default miss action is not supported.");
        return 0;
error:
        /* Rollback the created objects. */
        if (sh->rx_domain) {
                mlx5_glue->dr_destroy_domain(sh->rx_domain);
                sh->rx_domain = NULL;
        }
        if (sh->tx_domain) {
                mlx5_glue->dr_destroy_domain(sh->tx_domain);
                sh->tx_domain = NULL;
        }
        if (sh->fdb_domain) {
                mlx5_glue->dr_destroy_domain(sh->fdb_domain);
                sh->fdb_domain = NULL;
        }
        if (sh->dr_drop_action) {
                mlx5_glue->destroy_flow_action(sh->dr_drop_action);
                sh->dr_drop_action = NULL;
        }
        if (sh->pop_vlan_action) {
                mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
                sh->pop_vlan_action = NULL;
        }
        if (sh->encaps_decaps) {
                mlx5_hlist_destroy(sh->encaps_decaps);
                sh->encaps_decaps = NULL;
        }
        if (sh->modify_cmds) {
                mlx5_hlist_destroy(sh->modify_cmds);
                sh->modify_cmds = NULL;
        }
        if (sh->tag_table) {
                /* Tags should have been destroyed together with their flows already. */
                mlx5_hlist_destroy(sh->tag_table);
                sh->tag_table = NULL;
        }
        if (sh->tunnel_hub) {
                mlx5_release_tunnel_hub(sh, priv->dev_port);
                sh->tunnel_hub = NULL;
        }
        mlx5_free_table_hash_list(priv);
        if (sh->port_id_action_list) {
                mlx5_list_destroy(sh->port_id_action_list);
                sh->port_id_action_list = NULL;
        }
        if (sh->push_vlan_action_list) {
                mlx5_list_destroy(sh->push_vlan_action_list);
                sh->push_vlan_action_list = NULL;
        }
        if (sh->sample_action_list) {
                mlx5_list_destroy(sh->sample_action_list);
                sh->sample_action_list = NULL;
        }
        if (sh->dest_array_list) {
                mlx5_list_destroy(sh->dest_array_list);
                sh->dest_array_list = NULL;
        }
        return err;
}

/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
        struct mlx5_dev_ctx_shared *sh = priv->sh;

        MLX5_ASSERT(sh && sh->refcnt);
        if (sh->refcnt > 1)
                return;
#ifdef HAVE_MLX5DV_DR
        if (sh->rx_domain) {
                mlx5_glue->dr_destroy_domain(sh->rx_domain);
                sh->rx_domain = NULL;
        }
        if (sh->tx_domain) {
                mlx5_glue->dr_destroy_domain(sh->tx_domain);
                sh->tx_domain = NULL;
        }
#ifdef HAVE_MLX5DV_DR_ESWITCH
        if (sh->fdb_domain) {
                mlx5_glue->dr_destroy_domain(sh->fdb_domain);
                sh->fdb_domain = NULL;
        }
        if (sh->dr_drop_action) {
                mlx5_glue->destroy_flow_action(sh->dr_drop_action);
                sh->dr_drop_action = NULL;
        }
#endif
        if (sh->pop_vlan_action) {
                mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
                sh->pop_vlan_action = NULL;
        }
#endif /* HAVE_MLX5DV_DR */
        if (sh->default_miss_action)
                mlx5_glue->destroy_flow_action
                                (sh->default_miss_action);
        if (sh->encaps_decaps) {
                mlx5_hlist_destroy(sh->encaps_decaps);
                sh->encaps_decaps = NULL;
        }
        if (sh->modify_cmds) {
                mlx5_hlist_destroy(sh->modify_cmds);
                sh->modify_cmds = NULL;
        }
        if (sh->tag_table) {
                /* Tags should have been destroyed together with their flows already. */
                mlx5_hlist_destroy(sh->tag_table);
                sh->tag_table = NULL;
        }
        if (sh->tunnel_hub) {
                mlx5_release_tunnel_hub(sh, priv->dev_port);
                sh->tunnel_hub = NULL;
        }
        mlx5_free_table_hash_list(priv);
        if (sh->port_id_action_list) {
                mlx5_list_destroy(sh->port_id_action_list);
                sh->port_id_action_list = NULL;
        }
        if (sh->push_vlan_action_list) {
                mlx5_list_destroy(sh->push_vlan_action_list);
                sh->push_vlan_action_list = NULL;
        }
        if (sh->sample_action_list) {
                mlx5_list_destroy(sh->sample_action_list);
                sh->sample_action_list = NULL;
        }
        if (sh->dest_array_list) {
                mlx5_list_destroy(sh->dest_array_list);
                sh->dest_array_list = NULL;
        }
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by primary process and secondary processes attach to
 * the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
        const struct rte_memzone *mz;
        int ret = 0;

        rte_spinlock_lock(&mlx5_shared_data_lock);
        if (mlx5_shared_data == NULL) {
                if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
                        /* Allocate shared memory. */
                        mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
                                                 sizeof(*mlx5_shared_data),
                                                 SOCKET_ID_ANY, 0);
                        if (mz == NULL) {
                                DRV_LOG(ERR,
                                        "Cannot allocate mlx5 shared data");
                                ret = -rte_errno;
                                goto error;
                        }
                        mlx5_shared_data = mz->addr;
                        memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
                        rte_spinlock_init(&mlx5_shared_data->lock);
                } else {
                        /* Lookup allocated shared memory. */
                        mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
                        if (mz == NULL) {
                                DRV_LOG(ERR,
                                        "Cannot attach mlx5 shared data");
                                ret = -rte_errno;
                                goto error;
                        }
                        mlx5_shared_data = mz->addr;
                        memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
                }
        }
error:
        rte_spinlock_unlock(&mlx5_shared_data_lock);
        return ret;
}

/**
 * PMD global initialization.
 *
 * Independent from individual device, this function initializes global
 * per-PMD data structures distinguishing primary and secondary processes.
 * Hence, each initialization is called once per process.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_once(void)
{
        struct mlx5_shared_data *sd;
        struct mlx5_local_data *ld = &mlx5_local_data;
        int ret = 0;

        if (mlx5_init_shared_data())
                return -rte_errno;
        sd = mlx5_shared_data;
        MLX5_ASSERT(sd);
        rte_spinlock_lock(&sd->lock);
        switch (rte_eal_process_type()) {
        case RTE_PROC_PRIMARY:
                if (sd->init_done)
                        break;
                LIST_INIT(&sd->mem_event_cb_list);
                rte_rwlock_init(&sd->mem_event_rwlock);
                rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
                                                mlx5_mr_mem_event_cb, NULL);
                ret = mlx5_mp_init_primary(MLX5_MP_NAME,
                                           mlx5_mp_os_primary_handle);
                if (ret)
                        goto out;
                sd->init_done = true;
                break;
        case RTE_PROC_SECONDARY:
                if (ld->init_done)
                        break;
                ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
                                             mlx5_mp_os_secondary_handle);
                if (ret)
                        goto out;
                ++sd->secondary_cnt;
                ld->init_done = true;
                break;
        default:
                break;
        }
out:
        rte_spinlock_unlock(&sd->lock);
        return ret;
}

/**
 * Create the Tx queue DevX/Verbs object.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param idx
 *   Queue index in DPDK Tx queue array.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
{
        struct mlx5_priv *priv = dev->data->dev_private;
        struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
        struct mlx5_txq_ctrl *txq_ctrl =
                        container_of(txq_data, struct mlx5_txq_ctrl, txq);

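        /* Hairpin queues are always created through DevX. */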
        if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
                return mlx5_txq_devx_obj_new(dev, idx);
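        /*
         * With DevX UAR offset support, DevX Tx queues can be used unless
         * E-Switch is enabled; otherwise fall back to the Verbs object.
         */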
#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
        if (!priv->config.dv_esw_en)
                return mlx5_txq_devx_obj_new(dev, idx);
#endif
        return mlx5_txq_ibv_obj_new(dev, idx);
}

/**
 * Release a Tx DevX/Verbs queue object.
 *
 * @param txq_obj
 *   DevX/Verbs Tx queue object.
 */
static void
mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
{
        if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
                mlx5_txq_devx_obj_release(txq_obj);
                return;
        }
#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
        if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
                mlx5_txq_devx_obj_release(txq_obj);
                return;
        }
#endif
        mlx5_txq_ibv_obj_release(txq_obj);
}

/**
 * Detect and configure the DV flow counter mode.
 *
 * @param dev
 *   Pointer to rte_eth_dev structure.
 */
static void
mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
        struct mlx5_priv *priv = dev->data->dev_private;
        struct mlx5_dev_ctx_shared *sh = priv->sh;
        bool fallback;

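        /*
         * Counter management falls back to the Verbs path when DevX async
         * queries or the required counter capabilities are unavailable.
         */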
#ifndef HAVE_IBV_DEVX_ASYNC
        fallback = true;
#else
        fallback = false;
        if (!priv->config.devx || !priv->config.dv_flow_en ||
            !priv->config.hca_attr.flow_counters_dump ||
            !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
            (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
                fallback = true;
#endif
        if (fallback)
                DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
                        "counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
                        priv->config.hca_attr.flow_counters_dump,
                        priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
        /* Initialize fallback mode only on the port that initializes sh. */
        if (sh->refcnt == 1)
                sh->cmng.counter_fallback = fallback;
        else if (fallback != sh->cmng.counter_fallback)
                DRV_LOG(WARNING, "Port %d in sh has a different fallback mode "
                        "than the others:%d.", PORT_ID(priv), fallback);
#endif
}

static void
mlx5_queue_counter_id_prepare(struct rte_eth_dev *dev)
{
        struct mlx5_priv *priv = dev->data->dev_private;
        void *ctx = priv->sh->ctx;

        priv->q_counters = mlx5_devx_cmd_queue_counter_alloc(ctx);
        if (!priv->q_counters) {
                struct ibv_cq *cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
                struct ibv_wq *wq;

                DRV_LOG(DEBUG, "Port %d queue counter object cannot be created "
                        "by DevX - fall-back to use the kernel driver global "
                        "queue counter.", dev->data->port_id);
                /* Create WQ by kernel and query its queue counter ID. */
                if (cq) {
                        wq = mlx5_glue->create_wq(ctx,
                                                  &(struct ibv_wq_init_attr){
                                                    .wq_type = IBV_WQT_RQ,
                                                    .max_wr = 1,
                                                    .max_sge = 1,
                                                    .pd = priv->sh->pd,
                                                    .cq = cq,
                                                });
                        if (wq) {
                                /* Counter is assigned only on RDY state. */
                                int ret = mlx5_glue->modify_wq(wq,
                                                 &(struct ibv_wq_attr){
                                                 .attr_mask = IBV_WQ_ATTR_STATE,
                                                 .wq_state = IBV_WQS_RDY,
                                                });

                                if (ret == 0)
                                        mlx5_devx_cmd_wq_query(wq,
                                                         &priv->counter_set_id);
                                claim_zero(mlx5_glue->destroy_wq(wq));
                        }
                        claim_zero(mlx5_glue->destroy_cq(cq));
                }
        } else {
                priv->counter_set_id = priv->q_counters->id;
        }
        if (priv->counter_set_id == 0)
                DRV_LOG(INFO, "Part of the port %d statistics will not be "
                        "available.", dev->data->port_id);
}

/**
 * Check if representor spawn info matches devargs.
 *
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param eth_da
 *   Device devargs to probe.
 *
 * @return
 *   Match result.
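 *
 *   For example, with devargs "representor=vf[0-2]" only the VF
 *   representors 0, 1 and 2 of the matching PF are accepted.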
 */
static bool
mlx5_representor_match(struct mlx5_dev_spawn_data *spawn,
                       struct rte_eth_devargs *eth_da)
{
        struct mlx5_switch_info *switch_info = &spawn->info;
        unsigned int p, f;
        uint16_t id;
        uint16_t repr_id = mlx5_representor_id_encode(switch_info,
                                                      eth_da->type);

        switch (eth_da->type) {
        case RTE_ETH_REPRESENTOR_SF:
                if (!(spawn->info.port_name == -1 &&
                      switch_info->name_type ==
                                MLX5_PHYS_PORT_NAME_TYPE_PFHPF) &&
                    switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFSF) {
                        rte_errno = EBUSY;
                        return false;
                }
                break;
        case RTE_ETH_REPRESENTOR_VF:
                /* Allow HPF representor index -1 as an exception. */
                if (!(spawn->info.port_name == -1 &&
                      switch_info->name_type ==
                                MLX5_PHYS_PORT_NAME_TYPE_PFHPF) &&
                    switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFVF) {
                        rte_errno = EBUSY;
                        return false;
                }
                break;
        case RTE_ETH_REPRESENTOR_NONE:
                rte_errno = EBUSY;
                return false;
        default:
                rte_errno = ENOTSUP;
                DRV_LOG(ERR, "unsupported representor type");
                return false;
        }
        /* Check representor ID: */
        for (p = 0; p < eth_da->nb_ports; ++p) {
                if (spawn->pf_bond < 0) {
                        /* For non-LAG mode, allow and ignore pf. */
                        switch_info->pf_num = eth_da->ports[p];
                        repr_id = mlx5_representor_id_encode(switch_info,
                                                             eth_da->type);
                }
                for (f = 0; f < eth_da->nb_representor_ports; ++f) {
                        id = MLX5_REPRESENTOR_ID
                                (eth_da->ports[p], eth_da->type,
                                 eth_da->representor_ports[f]);
                        if (repr_id == id)
                                return true;
                }
        }
        rte_errno = EBUSY;
        return false;
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 * @param eth_da
 *   Device arguments.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
               struct mlx5_dev_spawn_data *spawn,
               struct mlx5_dev_config *config,
               struct rte_eth_devargs *eth_da)
{
        const struct mlx5_switch_info *switch_info = &spawn->info;
        struct mlx5_dev_ctx_shared *sh = NULL;
        struct ibv_port_attr port_attr;
        struct mlx5dv_context dv_attr = { .comp_mask = 0 };
        struct rte_eth_dev *eth_dev = NULL;
        struct mlx5_priv *priv = NULL;
        int err = 0;
        unsigned int hw_padding = 0;
        unsigned int mps;
        unsigned int tunnel_en = 0;
        unsigned int mpls_en = 0;
        unsigned int swp = 0;
        unsigned int mprq = 0;
        unsigned int mprq_min_stride_size_n = 0;
        unsigned int mprq_max_stride_size_n = 0;
        unsigned int mprq_min_stride_num_n = 0;
        unsigned int mprq_max_stride_num_n = 0;
        struct rte_ether_addr mac;
        char name[RTE_ETH_NAME_MAX_LEN];
        int own_domain_id = 0;
        uint16_t port_id;
        struct mlx5_port_info vport_info = { .query_flags = 0 };
        int i;

        /* Determine if this port representor is supposed to be spawned. */
        if (switch_info->representor && dpdk_dev->devargs &&
            !mlx5_representor_match(spawn, eth_da))
                return NULL;
        /* Build device name. */
        if (spawn->pf_bond < 0) {
                /* Single device. */
                if (!switch_info->representor)
                        strlcpy(name, dpdk_dev->name, sizeof(name));
                else
                        err = snprintf(name, sizeof(name), "%s_representor_%s%u",
                                 dpdk_dev->name,
                                 switch_info->name_type ==
                                 MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
                                 switch_info->port_name);
        } else {
                /* Bonding device. */
                if (!switch_info->representor) {
                        err = snprintf(name, sizeof(name), "%s_%s",
                                 dpdk_dev->name,
                                 mlx5_os_get_dev_device_name(spawn->phys_dev));
                } else {
                        err = snprintf(name, sizeof(name), "%s_%s_representor_c%dpf%d%s%u",
                                dpdk_dev->name,
                                mlx5_os_get_dev_device_name(spawn->phys_dev),
                                switch_info->ctrl_num,
                                switch_info->pf_num,
                                switch_info->name_type ==
                                MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
                                switch_info->port_name);
                }
        }
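        /*
         * For illustration (hypothetical names): a single PCI device
         * "0000:08:00.0" with VF representor 2 gets the name
         * "0000:08:00.0_representor_vf2" from the format above; bonding
         * devices additionally embed the IB device name and a
         * c<ctrl>pf<pf> prefix.
         */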
        if (err >= (int)sizeof(name))
                DRV_LOG(WARNING, "device name overflow %s", name);
        /* check if the device is already spawned */
        if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
                rte_errno = EEXIST;
                return NULL;
        }
        DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                struct mlx5_mp_id mp_id;

                eth_dev = rte_eth_dev_attach_secondary(name);
                if (eth_dev == NULL) {
                        DRV_LOG(ERR, "can not attach rte ethdev");
                        rte_errno = ENOMEM;
                        return NULL;
                }
                eth_dev->device = dpdk_dev;
                eth_dev->dev_ops = &mlx5_dev_sec_ops;
                eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
                eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
                err = mlx5_proc_priv_init(eth_dev);
                if (err)
                        return NULL;
                mp_id.port_id = eth_dev->data->port_id;
                strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
                /* Receive command fd from primary process */
                err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
                if (err < 0)
                        goto err_secondary;
                /* Remap UAR for Tx queues. */
                err = mlx5_tx_uar_init_secondary(eth_dev, err);
                if (err)
                        goto err_secondary;
                /*
                 * Ethdev pointer is still required as input since
                 * the primary device is not accessible from the
                 * secondary process.
                 */
                eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
                eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
                return eth_dev;
err_secondary:
                mlx5_dev_close(eth_dev);
                return NULL;
        }
        /*
         * Some parameters ("tx_db_nc" in particular) are needed in
         * advance to create the dv/verbs device context. We process the
         * devargs here to get those, and later process the devargs again
         * to override some hardware settings.
         */
        err = mlx5_args(config, dpdk_dev->devargs);
        if (err) {
                err = rte_errno;
                DRV_LOG(ERR, "failed to process device arguments: %s",
                        strerror(rte_errno));
                goto error;
        }
        if (config->dv_miss_info) {
                if (switch_info->master || switch_info->representor)
                        config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
        }
        mlx5_malloc_mem_select(config->sys_mem_en);
        sh = mlx5_alloc_shared_dev_ctx(spawn, config);
        if (!sh)
                return NULL;
        config->devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
        config->dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
        /*
         * Multi-packet send is supported by ConnectX-4 Lx PF as well
         * as all ConnectX-5 devices.
         */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
        mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
        if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
                if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
                        DRV_LOG(DEBUG, "enhanced MPW is supported");
                        mps = MLX5_MPW_ENHANCED;
                } else {
                        DRV_LOG(DEBUG, "MPW is supported");
                        mps = MLX5_MPW;
                }
        } else {
                DRV_LOG(DEBUG, "MPW isn't supported");
                mps = MLX5_MPW_DISABLED;
        }
#ifdef HAVE_IBV_MLX5_MOD_SWP
        if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
                swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
        DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
        config->swp = !!swp;
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
        if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
                struct mlx5dv_striding_rq_caps mprq_caps =
                        dv_attr.striding_rq_caps;

                DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
                        mprq_caps.min_single_stride_log_num_of_bytes);
                DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
                        mprq_caps.max_single_stride_log_num_of_bytes);
                DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
                        mprq_caps.min_single_wqe_log_num_of_strides);
                DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
                        mprq_caps.max_single_wqe_log_num_of_strides);
                DRV_LOG(DEBUG, "\tsupported_qpts: %d",
                        mprq_caps.supported_qpts);
                DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
                mprq = 1;
                mprq_min_stride_size_n =
                        mprq_caps.min_single_stride_log_num_of_bytes;
                mprq_max_stride_size_n =
                        mprq_caps.max_single_stride_log_num_of_bytes;
                mprq_min_stride_num_n =
                        mprq_caps.min_single_wqe_log_num_of_strides;
                mprq_max_stride_num_n =
                        mprq_caps.max_single_wqe_log_num_of_strides;
        }
#endif
        /* Rx CQE compression is enabled by default. */
        config->cqe_comp = 1;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
                tunnel_en = ((dv_attr.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
                             (dv_attr.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
                             (dv_attr.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
        }
        DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
                tunnel_en ? "" : "not ");
#else
        DRV_LOG(WARNING,
                "tunnel offloading disabled due to old OFED/rdma-core version");
#endif
        config->tunnel_en = tunnel_en;
#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
        mpls_en = ((dv_attr.tunnel_offloads_caps &
                    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
                   (dv_attr.tunnel_offloads_caps &
                    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
        DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
                mpls_en ? "" : "not ");
#else
        DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
                " old OFED/rdma-core version or firmware configuration");
#endif
        config->mpls_en = mpls_en;
        /* Check port status. */
        err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
        if (err) {
                DRV_LOG(ERR, "port query failed: %s", strerror(err));
                goto error;
        }
        if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
                DRV_LOG(ERR, "port is not configured in Ethernet mode");
                err = EINVAL;
                goto error;
        }
        if (port_attr.state != IBV_PORT_ACTIVE)
                DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
                        mlx5_glue->port_state_str(port_attr.state),
                        port_attr.state);
        /* Allocate private eth device data. */
        priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE,
                           sizeof(*priv),
                           RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
        if (priv == NULL) {
                DRV_LOG(ERR, "priv allocation failure");
                err = ENOMEM;
                goto error;
        }
        priv->sh = sh;
        priv->dev_port = spawn->phys_port;
        priv->pci_dev = spawn->pci_dev;
        priv->mtu = RTE_ETHER_MTU;
        /* Some internal functions rely on Netlink sockets, open them now. */
        priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
        priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
        priv->representor = !!switch_info->representor;
        priv->master = !!switch_info->master;
        priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
        priv->vport_meta_tag = 0;
        priv->vport_meta_mask = 0;
        priv->pf_bond = spawn->pf_bond;
        /*
         * If we have E-Switch we should determine the vport attributes.
         * E-Switch may use either source vport field or reg_c[0] metadata
         * register to match on vport index. The engaged part of metadata
         * register is defined by mask.
         */
        if (switch_info->representor || switch_info->master) {
                err = mlx5_glue->devx_port_query(sh->ctx,
                                                 spawn->phys_port,
                                                 &vport_info);
                if (err) {
                        DRV_LOG(WARNING,
                                "can't query devx port %d on device %s",
                                spawn->phys_port,
                                mlx5_os_get_dev_device_name(spawn->phys_dev));
                        vport_info.query_flags = 0;
                }
        }
        if (vport_info.query_flags & MLX5_PORT_QUERY_REG_C0) {
                priv->vport_meta_tag = vport_info.vport_meta_tag;
                priv->vport_meta_mask = vport_info.vport_meta_mask;
                if (!priv->vport_meta_mask) {
                        DRV_LOG(ERR, "vport zero mask for port %d"
                                     " on bonding device %s",
                                     spawn->phys_port,
                                     mlx5_os_get_dev_device_name
                                                        (spawn->phys_dev));
                        err = ENOTSUP;
                        goto error;
                }
                if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
                        DRV_LOG(ERR, "invalid vport tag for port %d"
                                     " on bonding device %s",
                                     spawn->phys_port,
                                     mlx5_os_get_dev_device_name
                                                        (spawn->phys_dev));
                        err = ENOTSUP;
                        goto error;
                }
        }
        if (vport_info.query_flags & MLX5_PORT_QUERY_VPORT) {
                priv->vport_id = vport_info.vport_id;
        } else if (spawn->pf_bond >= 0 &&
                   (switch_info->representor || switch_info->master)) {
                DRV_LOG(ERR, "can't deduce vport index for port %d"
                             " on bonding device %s",
                             spawn->phys_port,
                             mlx5_os_get_dev_device_name(spawn->phys_dev));
                err = ENOTSUP;
                goto error;
        } else {
                /*
                 * Derive the vport index in a compatible way. Kernel/rdma_core
                 * supports single E-Switch per PF configurations only and the
                 * vport_id field contains the vport index for the associated
                 * VF, which is deduced from the representor port name.
                 * For example, for the IB device port 10 with an attached
                 * network device eth0 whose port name attribute is pf0vf2,
                 * we can deduce the VF number as 2 and set the vport index
                 * as 3 (2+1). This assignment scheme should be changed if
                 * multiple E-Switch instances per PF configurations and/or
                 * PCI subfunctions are added.
                 */
1246                 priv->vport_id = switch_info->representor ?
1247                                  switch_info->port_name + 1 : -1;
1248         }
1249         priv->representor_id = mlx5_representor_id_encode(switch_info,
1250                                                           eth_da->type);
1251         /*
1252          * Look for sibling devices in order to reuse their switch domain
1253          * if any, otherwise allocate one.
1254          */
1255         MLX5_ETH_FOREACH_DEV(port_id, NULL) {
1256                 const struct mlx5_priv *opriv =
1257                         rte_eth_devices[port_id].data->dev_private;
1258
1259                 if (!opriv ||
1260                     opriv->sh != priv->sh ||
1261                         opriv->domain_id ==
1262                         RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
1263                         continue;
1264                 priv->domain_id = opriv->domain_id;
1265                 break;
1266         }
1267         if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1268                 err = rte_eth_switch_domain_alloc(&priv->domain_id);
1269                 if (err) {
1270                         err = rte_errno;
1271                         DRV_LOG(ERR, "unable to allocate switch domain: %s",
1272                                 strerror(rte_errno));
1273                         goto error;
1274                 }
1275                 own_domain_id = 1;
1276         }
1277         /* Override some values set by hardware configuration. */
1278         mlx5_args(config, dpdk_dev->devargs);
1279         err = mlx5_dev_check_sibling_config(priv, config);
1280         if (err)
1281                 goto error;
1282         config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
1283                             IBV_DEVICE_RAW_IP_CSUM);
1284         DRV_LOG(DEBUG, "checksum offloading is %ssupported",
1285                 (config->hw_csum ? "" : "not "));
1286 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
1287         !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
1288         DRV_LOG(DEBUG, "counters are not supported");
1289 #endif
1290 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
1291         if (config->dv_flow_en) {
1292                 DRV_LOG(WARNING, "DV flow is not supported");
1293                 config->dv_flow_en = 0;
1294         }
1295 #endif
1296         config->ind_table_max_size =
1297                 sh->device_attr.max_rwq_indirection_table_size;
1298         /*
1299          * Remove this check once DPDK supports larger/variable
1300          * indirection tables.
1301          */
1302         if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
1303                 config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
1304         DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
1305                 config->ind_table_max_size);
1306         config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
1307                                   IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
1308         DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
1309                 (config->hw_vlan_strip ? "" : "not "));
1310         config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
1311                                  IBV_RAW_PACKET_CAP_SCATTER_FCS);
1312 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
1313         hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
1314 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
1315         hw_padding = !!(sh->device_attr.device_cap_flags_ex &
1316                         IBV_DEVICE_PCI_WRITE_END_PADDING);
1317 #endif
1318         if (config->hw_padding && !hw_padding) {
1319                 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
1320                 config->hw_padding = 0;
1321         } else if (config->hw_padding) {
1322                 DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
1323         }
1324         config->tso = (sh->device_attr.max_tso > 0 &&
1325                       (sh->device_attr.tso_supported_qpts &
1326                        (1 << IBV_QPT_RAW_PACKET)));
1327         if (config->tso)
1328                 config->tso_max_payload_sz = sh->device_attr.max_tso;
1329         /*
1330          * MPW is disabled by default, while the Enhanced MPW is enabled
1331          * by default.
1332          */
1333         if (config->mps == MLX5_ARG_UNSET)
1334                 config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
1335                                                           MLX5_MPW_DISABLED;
1336         else
1337                 config->mps = config->mps ? mps : MLX5_MPW_DISABLED;
1338         DRV_LOG(INFO, "%sMPS is %s",
1339                 config->mps == MLX5_MPW_ENHANCED ? "enhanced " :
1340                 config->mps == MLX5_MPW ? "legacy " : "",
1341                 config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
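        /*
         * A sketch of the resolution above, under the assumption that
         * the device reports mps == MLX5_MPW_ENHANCED:
         *
         *   devargs "mps"     resulting config->mps
         *   MLX5_ARG_UNSET    MLX5_MPW_ENHANCED
         *   0 (disabled)      MLX5_MPW_DISABLED
         *   nonzero           MLX5_MPW_ENHANCED (the device mode)
         *
         * With the device reporting MLX5_MPW_DISABLED, any request
         * resolves to MLX5_MPW_DISABLED.
         */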
1342         if (config->devx) {
1343                 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
1344                 if (err) {
1345                         err = -err;
1346                         goto error;
1347                 }
1348                 /* Check relaxed ordering support. */
1349                 if (!haswell_broadwell_cpu) {
1350                         sh->cmng.relaxed_ordering_write =
1351                                 config->hca_attr.relaxed_ordering_write;
1352                         sh->cmng.relaxed_ordering_read =
1353                                 config->hca_attr.relaxed_ordering_read;
1354                 } else {
1355                         sh->cmng.relaxed_ordering_read = 0;
1356                         sh->cmng.relaxed_ordering_write = 0;
1357                 }
1358                 sh->rq_ts_format = config->hca_attr.rq_ts_format;
1359                 sh->sq_ts_format = config->hca_attr.sq_ts_format;
1360                 sh->qp_ts_format = config->hca_attr.qp_ts_format;
1361                 /* Check for LRO support. */
1362                 if (config->dest_tir && config->hca_attr.lro_cap &&
1363                     config->dv_flow_en) {
1364                         /* TBD: check tunnel LRO caps. */
1365                         config->lro.supported = config->hca_attr.lro_cap;
1366                         DRV_LOG(DEBUG, "Device supports LRO");
1367                         /*
1368                          * If LRO timeout is not configured by application,
1369                          * use the minimal supported value.
1370                          */
1371                         if (!config->lro.timeout)
1372                                 config->lro.timeout =
1373                                 config->hca_attr.lro_timer_supported_periods[0];
1374                         DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
1375                                 config->lro.timeout);
1376                         DRV_LOG(DEBUG, "LRO minimal size of TCP segment "
1377                                 "required for coalescing is %d bytes",
1378                                 config->hca_attr.lro_min_mss_size);
1379                 }
1380 #if defined(HAVE_MLX5DV_DR) && \
1381         (defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) || \
1382          defined(HAVE_MLX5_DR_CREATE_ACTION_ASO))
1383                 if (config->hca_attr.qos.sup &&
1384                     config->hca_attr.qos.flow_meter_old &&
1385                     config->dv_flow_en) {
1386                         uint8_t reg_c_mask =
1387                                 config->hca_attr.qos.flow_meter_reg_c_ids;
1388                         /*
1389                          * Meter needs two REG_C's: one for color match and
1390                          * one for pre-suffix flow match. Here get the REG_C
1391                          * for color match. REG_C_0 and REG_C_1 are reserved
1392                          * for the metadata feature.
1392                          */
1393                         reg_c_mask &= 0xfc;
1394                         if (__builtin_popcount(reg_c_mask) < 1) {
1395                                 priv->mtr_en = 0;
1396                                 DRV_LOG(WARNING, "No available register for"
1397                                         " meter.");
1398                         } else {
1399                                 /*
1400                                  * The meter color register is used by the
1401                                  * flow-hit feature as well.
1402                                  * The flow-hit feature must use REG_C_3,
1403                                  * so prefer REG_C_3 if it is available.
1404                                  */
1405                                 if (reg_c_mask & (1 << (REG_C_3 - REG_C_0)))
1406                                         priv->mtr_color_reg = REG_C_3;
1407                                 else
1408                                         priv->mtr_color_reg = ffs(reg_c_mask)
1409                                                               - 1 + REG_C_0;
1410                                 priv->mtr_en = 1;
1411                                 priv->mtr_reg_share =
1412                                       config->hca_attr.qos.flow_meter;
1413                                 DRV_LOG(DEBUG, "The REG_C used by the meter is %d",
1414                                         priv->mtr_color_reg);
1415                         }
1416                 }
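                /*
                 * A worked example of the selection above with a purely
                 * hypothetical capability mask: reg_c_ids = 0xf4 gives
                 * reg_c_mask = 0xf4 & 0xfc = 0xf4 (REG_C_2 and
                 * REG_C_4..REG_C_7 available). Bit (REG_C_3 - REG_C_0)
                 * is clear, so ffs(0xf4) - 1 + REG_C_0 selects REG_C_2
                 * as the color register; were REG_C_3 available, it
                 * would be preferred so that the register could be
                 * shared with the flow-hit feature.
                 */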
1417                 if (config->hca_attr.qos.sup &&
1418                         config->hca_attr.qos.flow_meter_aso_sup) {
1419                         uint32_t log_obj_size =
1420                                 rte_log2_u32(MLX5_ASO_MTRS_PER_POOL >> 1);
1421                         if (log_obj_size >=
1422                         config->hca_attr.qos.log_meter_aso_granularity &&
1423                         log_obj_size <=
1424                         config->hca_attr.qos.log_meter_aso_max_alloc)
1425                                 sh->meter_aso_en = 1;
1426                 }
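                /*
                 * The arithmetic above, assuming hypothetically that
                 * MLX5_ASO_MTRS_PER_POOL is 4096: log_obj_size =
                 * rte_log2_u32(4096 >> 1) = 11, and ASO metering is
                 * enabled only if 11 lies within the firmware-reported
                 * [log_meter_aso_granularity, log_meter_aso_max_alloc]
                 * range.
                 */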
1427                 if (priv->mtr_en) {
1428                         err = mlx5_aso_flow_mtrs_mng_init(priv->sh);
1429                         if (err) {
1430                                 err = -err;
1431                                 goto error;
1432                         }
1433                 }
1434                 if (config->hca_attr.flow.tunnel_header_0_1)
1435                         sh->tunnel_header_0_1 = 1;
1436 #endif
1437 #ifdef HAVE_MLX5_DR_CREATE_ACTION_ASO
1438                 if (config->hca_attr.flow_hit_aso &&
1439                     priv->mtr_color_reg == REG_C_3) {
1440                         sh->flow_hit_aso_en = 1;
1441                         err = mlx5_flow_aso_age_mng_init(sh);
1442                         if (err) {
1443                                 err = -err;
1444                                 goto error;
1445                         }
1446                         DRV_LOG(DEBUG, "Flow Hit ASO is supported.");
1447                 }
1448 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO */
1449 #if defined(HAVE_MLX5_DR_CREATE_ACTION_ASO) && \
1450         defined(HAVE_MLX5_DR_ACTION_ASO_CT)
1451                 if (config->hca_attr.ct_offload &&
1452                     priv->mtr_color_reg == REG_C_3) {
1453                         err = mlx5_flow_aso_ct_mng_init(sh);
1454                         if (err) {
1455                                 err = -err;
1456                                 goto error;
1457                         }
1458                         DRV_LOG(DEBUG, "CT ASO is supported.");
1459                         sh->ct_aso_en = 1;
1460                 }
1461 #endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO && HAVE_MLX5_DR_ACTION_ASO_CT */
1462 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE)
1463                 if (config->hca_attr.log_max_ft_sampler_num > 0 &&
1464                     config->dv_flow_en) {
1465                         priv->sampler_en = 1;
1466                         DRV_LOG(DEBUG, "Sampler enabled!");
1467                 } else {
1468                         priv->sampler_en = 0;
1469                         if (!config->hca_attr.log_max_ft_sampler_num)
1470                                 DRV_LOG(WARNING,
1471                                         "No available register for sampler.");
1472                         else
1473                                 DRV_LOG(DEBUG, "DV flow is not supported!");
1474                 }
1475 #endif
1476         }
1477         if (config->cqe_comp && RTE_CACHE_LINE_SIZE == 128 &&
1478             !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) {
1479                 DRV_LOG(WARNING, "Rx CQE 128B compression is not supported");
1480                 config->cqe_comp = 0;
1481         }
1482         if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX &&
1483             (!config->devx || !config->hca_attr.mini_cqe_resp_flow_tag)) {
1484                 DRV_LOG(WARNING, "Flow Tag CQE compression"
1485                                  " format isn't supported.");
1486                 config->cqe_comp = 0;
1487         }
1488         if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_L34H_STRIDX &&
1489             (!config->devx || !config->hca_attr.mini_cqe_resp_l3_l4_tag)) {
1490                 DRV_LOG(WARNING, "L3/L4 Header CQE compression"
1491                                  " format isn't supported.");
1492                 config->cqe_comp = 0;
1493         }
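        /*
         * To summarize the three checks above: 128B CQE compression is
         * dropped on 128B cache-line systems lacking the CQE_128B_COMP
         * capability, while the Flow Tag and L3/L4 header mini-CQE
         * formats each additionally require DevX and the corresponding
         * mini_cqe_resp_* HCA capability.
         */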
1494         DRV_LOG(DEBUG, "Rx CQE compression is %ssupported",
1495                         config->cqe_comp ? "" : "not ");
1496         if (config->tx_pp) {
1497                 DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
1498                         config->hca_attr.dev_freq_khz);
1499                 DRV_LOG(DEBUG, "Packet pacing is %ssupported",
1500                         config->hca_attr.qos.packet_pacing ? "" : "not ");
1501                 DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
1502                         config->hca_attr.cross_channel ? "" : "not ");
1503                 DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
1504                         config->hca_attr.wqe_index_ignore ? "" : "not ");
1505                 DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
1506                         config->hca_attr.non_wire_sq ? "" : "not ");
1507                 DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
1508                         config->hca_attr.log_max_static_sq_wq ? "" : "not ",
1509                         config->hca_attr.log_max_static_sq_wq);
1510                 DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
1511                         config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
1512                 if (!config->devx) {
1513                         DRV_LOG(ERR, "DevX is required for packet pacing");
1514                         err = ENODEV;
1515                         goto error;
1516                 }
1517                 if (!config->hca_attr.qos.packet_pacing) {
1518                         DRV_LOG(ERR, "Packet pacing is not supported");
1519                         err = ENODEV;
1520                         goto error;
1521                 }
1522                 if (!config->hca_attr.cross_channel) {
1523                         DRV_LOG(ERR, "Cross channel operations are"
1524                                      " required for packet pacing");
1525                         err = ENODEV;
1526                         goto error;
1527                 }
1528                 if (!config->hca_attr.wqe_index_ignore) {
1529                         DRV_LOG(ERR, "WQE index ignore feature is"
1530                                      " required for packet pacing");
1531                         err = ENODEV;
1532                         goto error;
1533                 }
1534                 if (!config->hca_attr.non_wire_sq) {
1535                         DRV_LOG(ERR, "Non-wire SQ feature is"
1536                                      " required for packet pacing");
1537                         err = ENODEV;
1538                         goto error;
1539                 }
1540                 if (!config->hca_attr.log_max_static_sq_wq) {
1541                         DRV_LOG(ERR, "Static WQE SQ feature is"
1542                                      " required for packet pacing");
1543                         err = ENODEV;
1544                         goto error;
1545                 }
1546                 if (!config->hca_attr.qos.wqe_rate_pp) {
1547                         DRV_LOG(ERR, "WQE rate mode is required"
1548                                      " for packet pacing");
1549                         err = ENODEV;
1550                         goto error;
1551                 }
1552 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
1553                 DRV_LOG(ERR, "DevX does not provide UAR offset,"
1554                              " can't create queues for packet pacing");
1555                 err = ENODEV;
1556                 goto error;
1557 #endif
1558         }
1559         if (config->devx) {
1560                 uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];
1561
1562                 err = config->hca_attr.access_register_user ?
1563                         mlx5_devx_cmd_register_read
1564                                 (sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
1565                                 reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
1566                 if (!err) {
1567                         uint32_t ts_mode;
1568
1569                         /* MTUTC register is read successfully. */
1570                         ts_mode = MLX5_GET(register_mtutc, reg,
1571                                            time_stamp_mode);
1572                         if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
1573                                 config->rt_timestamp = 1;
1574                 } else {
1575                         /* Kernel does not support register reading. */
1576                         if (config->hca_attr.dev_freq_khz ==
1577                                                  (NS_PER_S / MS_PER_S))
1578                                 config->rt_timestamp = 1;
1579                 }
1580         }
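        /*
         * The fallback above relies on simple arithmetic: NS_PER_S /
         * MS_PER_S = 1000000000 / 1000 = 1000000 kHz, i.e. a 1 GHz
         * free-running counter. Such a device counts one tick per
         * nanosecond, so real-time timestamp mode can be assumed even
         * when the MTUTC register cannot be read.
         */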
1581         /*
1582          * If the HW has a bug combining tunnel packet decapsulation with
1583          * FCS scatter, and decapsulation is needed, clear the hw_fcs_strip
1584          * bit. Then the DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore.
1585          */
1586         if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
1587                 config->hw_fcs_strip = 0;
1588         DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
1589                 (config->hw_fcs_strip ? "" : "not "));
1590         if (config->mprq.enabled && mprq) {
1591                 if (config->mprq.stride_num_n &&
1592                     (config->mprq.stride_num_n > mprq_max_stride_num_n ||
1593                      config->mprq.stride_num_n < mprq_min_stride_num_n)) {
1594                         config->mprq.stride_num_n =
1595                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
1596                                                 mprq_min_stride_num_n),
1597                                         mprq_max_stride_num_n);
1598                         DRV_LOG(WARNING,
1599                                 "the number of strides"
1600                                 " for Multi-Packet RQ is out of range,"
1601                                 " setting default value (%u)",
1602                                 1 << config->mprq.stride_num_n);
1603                 }
1604                 if (config->mprq.stride_size_n &&
1605                     (config->mprq.stride_size_n > mprq_max_stride_size_n ||
1606                      config->mprq.stride_size_n < mprq_min_stride_size_n)) {
1607                         config->mprq.stride_size_n =
1608                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
1609                                                 mprq_min_stride_size_n),
1610                                         mprq_max_stride_size_n);
1611                         DRV_LOG(WARNING,
1612                                 "the size of a stride"
1613                                 " for Multi-Packet RQ is out of range,"
1614                                 " setting default value (%u)",
1615                                 1 << config->mprq.stride_size_n);
1616                 }
1617                 config->mprq.min_stride_size_n = mprq_min_stride_size_n;
1618                 config->mprq.max_stride_size_n = mprq_max_stride_size_n;
1619         } else if (config->mprq.enabled && !mprq) {
1620                 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
1621                 config->mprq.enabled = 0;
1622         }
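        /*
         * A worked example of the clamping above with hypothetical
         * device limits mprq_min_stride_num_n = 3 and
         * mprq_max_stride_num_n = 16: a devargs request of
         * stride_num_n = 20 is out of range and is replaced by
         * RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 3), 16), and the
         * resulting stride count is logged as 1 << stride_num_n.
         */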
1623         if (config->max_dump_files_num == 0)
1624                 config->max_dump_files_num = 128;
1625         eth_dev = rte_eth_dev_allocate(name);
1626         if (eth_dev == NULL) {
1627                 DRV_LOG(ERR, "can not allocate rte ethdev");
1628                 err = ENOMEM;
1629                 goto error;
1630         }
1631         if (priv->representor) {
1632                 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
1633                 eth_dev->data->representor_id = priv->representor_id;
1634         }
1635         priv->mp_id.port_id = eth_dev->data->port_id;
1636         strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
1637         /*
1638          * Store the associated network device interface index. This index
1639          * is permanent throughout the lifetime of the device, so we may
1640          * store the ifindex here and use the cached value from now on.
1641          */
1642         MLX5_ASSERT(spawn->ifindex);
1643         priv->if_index = spawn->ifindex;
1644         eth_dev->data->dev_private = priv;
1645         priv->dev_data = eth_dev->data;
1646         eth_dev->data->mac_addrs = priv->mac;
1647         eth_dev->device = dpdk_dev;
1648         eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1649         /* Configure the first MAC address by default. */
1650         if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
1651                 DRV_LOG(ERR,
1652                         "port %u cannot get MAC address, is mlx5_en"
1653                         " loaded? (errno: %s)",
1654                         eth_dev->data->port_id, strerror(rte_errno));
1655                 err = ENODEV;
1656                 goto error;
1657         }
1658         DRV_LOG(INFO,
1659                 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
1660                 eth_dev->data->port_id,
1661                 mac.addr_bytes[0], mac.addr_bytes[1],
1662                 mac.addr_bytes[2], mac.addr_bytes[3],
1663                 mac.addr_bytes[4], mac.addr_bytes[5]);
1664 #ifdef RTE_LIBRTE_MLX5_DEBUG
1665         {
1666                 char ifname[MLX5_NAMESIZE];
1667
1668                 if (mlx5_get_ifname(eth_dev, &ifname) == 0)
1669                         DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
1670                                 eth_dev->data->port_id, ifname);
1671                 else
1672                         DRV_LOG(DEBUG, "port %u ifname is unknown",
1673                                 eth_dev->data->port_id);
1674         }
1675 #endif
1676         /* Get actual MTU if possible. */
1677         err = mlx5_get_mtu(eth_dev, &priv->mtu);
1678         if (err) {
1679                 err = rte_errno;
1680                 goto error;
1681         }
1682         DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
1683                 priv->mtu);
1684         /* Initialize burst functions to prevent crashes before link-up. */
1685         eth_dev->rx_pkt_burst = removed_rx_burst;
1686         eth_dev->tx_pkt_burst = removed_tx_burst;
1687         eth_dev->dev_ops = &mlx5_dev_ops;
1688         eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
1689         eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
1690         eth_dev->rx_queue_count = mlx5_rx_queue_count;
1691         /* Register MAC address. */
1692         claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
1693         if (config->vf && config->vf_nl_en)
1694                 mlx5_nl_mac_addr_sync(priv->nl_socket_route,
1695                                       mlx5_ifindex(eth_dev),
1696                                       eth_dev->data->mac_addrs,
1697                                       MLX5_MAX_MAC_ADDRESSES);
1698         priv->ctrl_flows = 0;
1699         rte_spinlock_init(&priv->flow_list_lock);
1700         TAILQ_INIT(&priv->flow_meters);
1701         priv->mtr_profile_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_PTR);
1702         if (!priv->mtr_profile_tbl)
1703                 goto error;
1704         /* Hint libmlx5 to use the PMD allocator for data plane resources. */
1705         mlx5_glue->dv_set_context_attr(sh->ctx,
1706                         MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
1707                         (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
1708                                 .alloc = &mlx5_alloc_verbs_buf,
1709                                 .free = &mlx5_free_verbs_buf,
1710                                 .data = sh,
1711                         }));
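        /*
         * The compound literal above registers PMD-side callbacks for
         * rdma-core buffer management. For reference, the callbacks are
         * expected to match the allocator hooks declared by rdma-core's
         * mlx5dv.h, roughly:
         *
         *   void *(*alloc)(size_t size, void *priv_data);
         *   void (*free)(void *ptr, void *priv_data);
         *
         * with .data handed back as priv_data on every call.
         */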
1712         /* Bring Ethernet device up. */
1713         DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
1714                 eth_dev->data->port_id);
1715         mlx5_set_link_up(eth_dev);
1716         /*
1717          * Even though the interrupt handler is not installed yet,
1718          * interrupts will still trigger on the async_fd from
1719          * Verbs context returned by ibv_open_device().
1720          */
1721         mlx5_link_update(eth_dev, 0);
1722 #ifdef HAVE_MLX5DV_DR_ESWITCH
1723         if (!(config->hca_attr.eswitch_manager && config->dv_flow_en &&
1724               (switch_info->representor || switch_info->master)))
1725                 config->dv_esw_en = 0;
1726 #else
1727         config->dv_esw_en = 0;
1728 #endif
1729         /* Detect minimal data bytes to inline. */
1730         mlx5_set_min_inline(spawn, config);
1731         /* Store device configuration on private structure. */
1732         priv->config = *config;
1733         for (i = 0; i < MLX5_FLOW_TYPE_MAXI; i++) {
1734                 icfg[i].release_mem_en = !!config->reclaim_mode;
1735                 if (config->reclaim_mode)
1736                         icfg[i].per_core_cache = 0;
1737                 priv->flows[i] = mlx5_ipool_create(&icfg[i]);
1738                 if (!priv->flows[i])
1739                         goto error;
1740         }
1741         /* Create context for virtual machine VLAN workaround. */
1742         priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
1743         if (config->dv_flow_en) {
1744                 err = mlx5_alloc_shared_dr(priv);
1745                 if (err)
1746                         goto error;
1747         }
1748         if (config->devx && config->dv_flow_en && config->dest_tir) {
1749                 priv->obj_ops = devx_obj_ops;
1750                 priv->obj_ops.drop_action_create =
1751                                                 ibv_obj_ops.drop_action_create;
1752                 priv->obj_ops.drop_action_destroy =
1753                                                 ibv_obj_ops.drop_action_destroy;
1754 #ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
1755                 priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
1756 #else
1757                 if (config->dv_esw_en)
1758                         priv->obj_ops.txq_obj_modify =
1759                                                 ibv_obj_ops.txq_obj_modify;
1760 #endif
1761                 /* Use specific wrappers for Tx object. */
1762                 priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
1763                 priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
1764                 mlx5_queue_counter_id_prepare(eth_dev);
1765                 priv->obj_ops.lb_dummy_queue_create =
1766                                         mlx5_rxq_ibv_obj_dummy_lb_create;
1767                 priv->obj_ops.lb_dummy_queue_release =
1768                                         mlx5_rxq_ibv_obj_dummy_lb_release;
1769         } else {
1770                 priv->obj_ops = ibv_obj_ops;
1771         }
1772         priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev);
1773         if (!priv->drop_queue.hrxq)
1774                 goto error;
1775         /* Supported Verbs flow priority number detection. */
1776         err = mlx5_flow_discover_priorities(eth_dev);
1777         if (err < 0) {
1778                 err = -err;
1779                 goto error;
1780         }
1781         priv->config.flow_prio = err;
1782         if (!priv->config.dv_esw_en &&
1783             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
1784                 DRV_LOG(WARNING, "metadata mode %u is not supported "
1785                                  "(no E-Switch)", priv->config.dv_xmeta_en);
1786                 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
1787         }
1788         mlx5_set_metadata_mask(eth_dev);
1789         if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
1790             !priv->sh->dv_regc0_mask) {
1791                 DRV_LOG(ERR, "metadata mode %u is not supported "
1792                              "(no metadata reg_c[0] is available)",
1793                              priv->config.dv_xmeta_en);
1794                         err = ENOTSUP;
1795                         goto error;
1796         }
1797         priv->hrxqs = mlx5_list_create("hrxq", eth_dev, true,
1798                                        mlx5_hrxq_create_cb,
1799                                        mlx5_hrxq_match_cb,
1800                                        mlx5_hrxq_remove_cb,
1801                                        mlx5_hrxq_clone_cb,
1802                                        mlx5_hrxq_clone_free_cb);
1803         if (!priv->hrxqs)
1804                 goto error;
1805         rte_rwlock_init(&priv->ind_tbls_lock);
1806         /* Query availability of metadata reg_c's. */
1807         err = mlx5_flow_discover_mreg_c(eth_dev);
1808         if (err < 0) {
1809                 err = -err;
1810                 goto error;
1811         }
1812         if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
1813                 DRV_LOG(DEBUG,
1814                         "port %u extensive metadata register is not supported",
1815                         eth_dev->data->port_id);
1816                 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
1817                         DRV_LOG(ERR, "metadata mode %u is not supported "
1818                                      "(no metadata registers available)",
1819                                      priv->config.dv_xmeta_en);
1820                         err = ENOTSUP;
1821                         goto error;
1822                 }
1823         }
1824         if (priv->config.dv_flow_en &&
1825             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
1826             mlx5_flow_ext_mreg_supported(eth_dev) &&
1827             priv->sh->dv_regc0_mask) {
1828                 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
1829                                                       MLX5_FLOW_MREG_HTABLE_SZ,
1830                                                       false, true, eth_dev,
1831                                                       flow_dv_mreg_create_cb,
1832                                                       flow_dv_mreg_match_cb,
1833                                                       flow_dv_mreg_remove_cb,
1834                                                       flow_dv_mreg_clone_cb,
1835                                                     flow_dv_mreg_clone_free_cb);
1836                 if (!priv->mreg_cp_tbl) {
1837                         err = ENOMEM;
1838                         goto error;
1839                 }
1840         }
1841         rte_spinlock_init(&priv->shared_act_sl);
1842         mlx5_flow_counter_mode_config(eth_dev);
1843         if (priv->config.dv_flow_en)
1844                 eth_dev->data->dev_flags |= RTE_ETH_DEV_FLOW_OPS_THREAD_SAFE;
1845         return eth_dev;
1846 error:
1847         if (priv) {
1848                 if (priv->mreg_cp_tbl)
1849                         mlx5_hlist_destroy(priv->mreg_cp_tbl);
1850                 if (priv->sh)
1851                         mlx5_os_free_shared_dr(priv);
1852                 if (priv->nl_socket_route >= 0)
1853                         close(priv->nl_socket_route);
1854                 if (priv->nl_socket_rdma >= 0)
1855                         close(priv->nl_socket_rdma);
1856                 if (priv->vmwa_context)
1857                         mlx5_vlan_vmwa_exit(priv->vmwa_context);
1858                 if (eth_dev && priv->drop_queue.hrxq)
1859                         mlx5_drop_action_destroy(eth_dev);
1860                 if (priv->mtr_profile_tbl)
1861                         mlx5_l3t_destroy(priv->mtr_profile_tbl);
1862                 if (own_domain_id)
1863                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1864                 if (priv->hrxqs)
1865                         mlx5_list_destroy(priv->hrxqs);
1866                 mlx5_free(priv);
1867                 if (eth_dev != NULL)
1868                         eth_dev->data->dev_private = NULL;
1869         }
1870         if (eth_dev != NULL) {
1871                 /* mac_addrs must not be freed alone because it is
1872                  * part of dev_private.
1873                  */
1874                 eth_dev->data->mac_addrs = NULL;
1875                 rte_eth_dev_release_port(eth_dev);
1876         }
1877         if (sh)
1878                 mlx5_free_shared_dev_ctx(sh);
1879         MLX5_ASSERT(err > 0);
1880         rte_errno = err;
1881         return NULL;
1882 }
1883
1884 /**
1885  * Comparison callback to sort device data.
1886  *
1887  * This is meant to be used with qsort().
1888  *
1889  * @param[in] a
1890  *   Pointer to pointer to first data object.
1891  * @param[in] b
1892  *   Pointer to pointer to second data object.
1893  *
1894  * @return
1895  *   0 if both objects are equal, less than 0 if the first argument is less
1896  *   than the second, greater than 0 otherwise.
1897  */
1898 static int
1899 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
1900 {
1901         const struct mlx5_switch_info *si_a =
1902                 &((const struct mlx5_dev_spawn_data *)a)->info;
1903         const struct mlx5_switch_info *si_b =
1904                 &((const struct mlx5_dev_spawn_data *)b)->info;
1905         int ret;
1906
1907         /* Master device first. */
1908         ret = si_b->master - si_a->master;
1909         if (ret)
1910                 return ret;
1911         /* Then representor devices. */
1912         ret = si_b->representor - si_a->representor;
1913         if (ret)
1914                 return ret;
1915         /* Unidentified devices come last in no specific order. */
1916         if (!si_a->representor)
1917                 return 0;
1918         /* Order representors by name. */
1919         return si_a->port_name - si_b->port_name;
1920 }
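/*
 * For example (hypothetical input), spawn data describing
 * { representor "pf0vf2", master, representor "pf0vf0" } sorts to
 * { master, "pf0vf0", "pf0vf2" }: the master first, then representors
 * in ascending port_name order, with unidentified entries left last
 * in no particular order.
 */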
1921
1922 /**
1923  * Match PCI information for possible slaves of a bonding device.
1924  *
1925  * @param[in] ibv_dev
1926  *   Pointer to Infiniband device structure.
1927  * @param[in] pci_dev
1928  *   Pointer to primary PCI address structure to match.
1929  * @param[in] nl_rdma
1930  *   Netlink RDMA group socket handle.
1931  * @param[in] owner
1932  *   Representor owner PF index.
1933  * @param[out] bond_info
1934  *   Pointer to bonding information.
1935  *
1936  * @return
1937  *   negative value if no bonding device is found, otherwise
1938  *   the non-negative index of the slave PF in bonding.
1939  */
1940 static int
1941 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
1942                            const struct rte_pci_addr *pci_dev,
1943                            int nl_rdma, uint16_t owner,
1944                            struct mlx5_bond_info *bond_info)
1945 {
1946         char ifname[IF_NAMESIZE + 1];
1947         unsigned int ifindex;
1948         unsigned int np, i;
1949         FILE *bond_file = NULL, *file;
1950         int pf = -1;
1951         int ret;
1952
1953         /*
1954          * Try to get the master device name. If something goes
1955          * wrong, assume there is no kernel support and no
1956          * bonding devices.
1957          */
1958         memset(bond_info, 0, sizeof(*bond_info));
1959         if (nl_rdma < 0)
1960                 return -1;
1961         if (!strstr(ibv_dev->name, "bond"))
1962                 return -1;
1963         np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
1964         if (!np)
1965                 return -1;
1966         /*
1967          * The master device might not be on the predefined
1968          * port (port index 1 is not guaranteed), so we have
1969          * to scan all Infiniband device ports and find the
1970          * master.
1971          */
1972         for (i = 1; i <= np; ++i) {
1973                 /* Check whether Infiniband port is populated. */
1974                 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
1975                 if (!ifindex)
1976                         continue;
1977                 if (!if_indextoname(ifindex, ifname))
1978                         continue;
1979                 /* Try to read bonding slave names from sysfs. */
1980                 MKSTR(slaves,
1981                       "/sys/class/net/%s/master/bonding/slaves", ifname);
1982                 bond_file = fopen(slaves, "r");
1983                 if (bond_file)
1984                         break;
1985         }
1986         if (!bond_file)
1987                 return -1;
1988         /* Use safe format to check maximal buffer length. */
1989         MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
1990         while (fscanf(bond_file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
1991                 char tmp_str[IF_NAMESIZE + 32];
1992                 struct rte_pci_addr pci_addr;
1993                 struct mlx5_switch_info info;
1994
1995                 /* Process slave interface names in the loop. */
1996                 snprintf(tmp_str, sizeof(tmp_str),
1997                          "/sys/class/net/%s", ifname);
1998                 if (mlx5_get_pci_addr(tmp_str, &pci_addr)) {
1999                         DRV_LOG(WARNING, "can not get PCI address"
2000                                          " for netdev \"%s\"", ifname);
2001                         continue;
2002                 }
2003                 /* Slave interface PCI address retrieved successfully. */
2004                 snprintf(tmp_str, sizeof(tmp_str),
2005                          "/sys/class/net/%s/phys_port_name", ifname);
2006                 file = fopen(tmp_str, "rb");
2007                 if (!file)
2008                         break;
2009                 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
2010                 if (fscanf(file, "%32s", tmp_str) == 1)
2011                         mlx5_translate_port_name(tmp_str, &info);
2012                 fclose(file);
2013                 /* Only process PF ports. */
2014                 if (info.name_type != MLX5_PHYS_PORT_NAME_TYPE_LEGACY &&
2015                     info.name_type != MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
2016                         continue;
2017                 /* Check the maximal number of bonding members. */
2018                 if (info.port_name >= MLX5_BOND_MAX_PORTS) {
2019                         DRV_LOG(WARNING, "bonding index out of range, "
2020                                 "please increase MLX5_BOND_MAX_PORTS: %s",
2021                                 tmp_str);
2022                         break;
2023                 }
2024                 /* Match PCI address, allows BDF0+pfx or BDFx+pfx. */
2025                 if (pci_dev->domain == pci_addr.domain &&
2026                     pci_dev->bus == pci_addr.bus &&
2027                     pci_dev->devid == pci_addr.devid &&
2028                     ((pci_dev->function == 0 &&
2029                       pci_dev->function + owner == pci_addr.function) ||
2030                      (pci_dev->function == owner &&
2031                       pci_addr.function == owner)))
2032                         pf = info.port_name;
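                /*
                 * A hypothetical example of the match above with
                 * owner = 1: primary PCI address 0000:83:00.0 matches
                 * a slave at 0000:83:00.1 through the "BDF0 + owner"
                 * branch, while primary address 0000:83:00.1 matches
                 * that slave directly through the "BDFx" branch.
                 */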
2033                 /* Get ifindex. */
2034                 snprintf(tmp_str, sizeof(tmp_str),
2035                          "/sys/class/net/%s/ifindex", ifname);
2036                 file = fopen(tmp_str, "rb");
2037                 if (!file)
2038                         break;
2039                 ret = fscanf(file, "%u", &ifindex);
2040                 fclose(file);
2041                 if (ret != 1)
2042                         break;
2043                 /* Save bonding info. */
2044                 strncpy(bond_info->ports[info.port_name].ifname, ifname,
2045                         sizeof(bond_info->ports[0].ifname));
2046                 bond_info->ports[info.port_name].pci_addr = pci_addr;
2047                 bond_info->ports[info.port_name].ifindex = ifindex;
2048                 bond_info->n_port++;
2049         }
2050         if (pf >= 0) {
2051                 /* Get bond interface info */
2052                 ret = mlx5_sysfs_bond_info(ifindex, &bond_info->ifindex,
2053                                            bond_info->ifname);
2054                 if (ret)
2055                         DRV_LOG(ERR, "unable to get bond info: %s",
2056                                 strerror(rte_errno));
2057                 else
2058                         DRV_LOG(INFO, "PF device %u, bond device %u(%s)",
2059                                 ifindex, bond_info->ifindex, bond_info->ifname);
2060         }
2061         return pf;
2062 }
2063
2064 /**
2065  * Register a PCI device within bonding.
2066  *
2067  * This function spawns Ethernet devices out of a given PCI device and
2068  * bonding owner PF index.
2069  *
2070  * @param[in] pci_dev
2071  *   PCI device information.
2072  * @param[in] req_eth_da
2073  *   Requested ethdev device argument.
2074  * @param[in] owner_id
2075  *   Requested owner PF port ID within bonding device, default to 0.
2076  *
2077  * @return
2078  *   0 on success, a negative errno value otherwise and rte_errno is set.
2079  */
2080 static int
2081 mlx5_os_pci_probe_pf(struct rte_pci_device *pci_dev,
2082                      struct rte_eth_devargs *req_eth_da,
2083                      uint16_t owner_id)
2084 {
2085         struct ibv_device **ibv_list;
2086         /*
2087          * Number of found IB devices matching the requested PCI BDF.
2088          * nd != 1 means there are multiple IB devices over the same
2089          * PCI device and we have representors and a master.
2090          */
2091         unsigned int nd = 0;
2092         /*
2093          * Number of found IB device ports. nd = 1 and np = 1..n means
2094          * we have a single multiport IB device, and there may be
2095          * representors attached to some of the found ports.
2096          */
2097         unsigned int np = 0;
2098         /*
2099          * Number of DPDK Ethernet devices to spawn, either over
2100          * multiple IB devices or over multiple ports of a single IB
2101          * device. In effect this is the number of spawn iterations.
2102          */
2103         unsigned int ns = 0;
2104         /*
2105          * Bonding device
2106          *   < 0 - no bonding device (single one)
2107          *  >= 0 - bonding device (value is slave PF index)
2108          */
2109         int bd = -1;
2110         struct mlx5_dev_spawn_data *list = NULL;
2111         struct mlx5_dev_config dev_config;
2112         unsigned int dev_config_vf;
2113         struct rte_eth_devargs eth_da = *req_eth_da;
2114         struct rte_pci_addr owner_pci = pci_dev->addr; /* Owner PF. */
2115         struct mlx5_bond_info bond_info;
2116         int ret = -1;
2117
2118         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
2119                 mlx5_pmd_socket_init();
2120         ret = mlx5_init_once();
2121         if (ret) {
2122                 DRV_LOG(ERR, "unable to init PMD global data: %s",
2123                         strerror(rte_errno));
2124                 return -rte_errno;
2125         }
2126         errno = 0;
2127         ibv_list = mlx5_glue->get_device_list(&ret);
2128         if (!ibv_list) {
2129                 rte_errno = errno ? errno : ENOSYS;
2130                 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
2131                 return -rte_errno;
2132         }
2133         /*
2134          * First scan the list of all Infiniband devices to find
2135          * matching ones, gathering into the list.
2136          */
2137         struct ibv_device *ibv_match[ret + 1];
2138         int nl_route = mlx5_nl_init(NETLINK_ROUTE);
2139         int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
2140         unsigned int i;
2141
2142         while (ret-- > 0) {
2143                 struct rte_pci_addr pci_addr;
2144
2145                 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
2146                 bd = mlx5_device_bond_pci_match
2147                                 (ibv_list[ret], &owner_pci, nl_rdma, owner_id,
2148                                  &bond_info);
2149                 if (bd >= 0) {
2150                         /*
2151                          * Bonding device detected. Only one match is allowed;
2152                          * bonding is supported over a multi-port IB device and
2153                          * there should be no matches on representor PCI
2154                          * functions or non-VF-LAG bonding devices with the
2155                          * specified address.
2156                          */
2157                         if (nd) {
2158                                 DRV_LOG(ERR,
2159                                         "multiple PCI match on bonding device"
2160                                         " \"%s\" found", ibv_list[ret]->name);
2161                                 rte_errno = ENOENT;
2162                                 ret = -rte_errno;
2163                                 goto exit;
2164                         }
2165                         /* Amend owner pci address if owner PF ID specified. */
2166                         if (eth_da.nb_representor_ports)
2167                                 owner_pci.function += owner_id;
2168                         DRV_LOG(INFO, "PCI information matches for"
2169                                       " slave %d bonding device \"%s\"",
2170                                       bd, ibv_list[ret]->name);
2171                         ibv_match[nd++] = ibv_list[ret];
2172                         break;
2173                 } else {
2174                         /* Bonding device not found. */
2175                         if (mlx5_get_pci_addr(ibv_list[ret]->ibdev_path,
2176                                               &pci_addr))
2177                                 continue;
2178                         if (owner_pci.domain != pci_addr.domain ||
2179                             owner_pci.bus != pci_addr.bus ||
2180                             owner_pci.devid != pci_addr.devid ||
2181                             owner_pci.function != pci_addr.function)
2182                                 continue;
2183                         DRV_LOG(INFO, "PCI information matches for device \"%s\"",
2184                                 ibv_list[ret]->name);
2185                         ibv_match[nd++] = ibv_list[ret];
2186                 }
2187         }
2188         ibv_match[nd] = NULL;
2189         if (!nd) {
2190                 /* No device matches, just complain and bail out. */
2191                 DRV_LOG(WARNING,
2192                         "no Verbs device matches PCI device " PCI_PRI_FMT ","
2193                         " are kernel drivers loaded?",
2194                         owner_pci.domain, owner_pci.bus,
2195                         owner_pci.devid, owner_pci.function);
2196                 rte_errno = ENOENT;
2197                 ret = -rte_errno;
2198                 goto exit;
2199         }
2200         if (nd == 1) {
2201                 /*
2202                  * The single matching device found may have multiple ports.
2203                  * Each port may be a representor, so we have to check the
2204                  * port number and check for representors' existence.
2205                  */
2206                 if (nl_rdma >= 0)
2207                         np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
2208                 if (!np)
2209                         DRV_LOG(WARNING, "can not get the number of ports"
2210                                          " for IB device \"%s\"", ibv_match[0]->name);
2211                 if (bd >= 0 && !np) {
2212                         DRV_LOG(ERR, "can not get ports"
2213                                      " for bonding device");
2214                         rte_errno = ENOENT;
2215                         ret = -rte_errno;
2216                         goto exit;
2217                 }
2218         }
2219 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
2220         if (bd >= 0) {
2221                 /*
2222                  * This may happen if there is VF LAG kernel support and
2223                  * the application is compiled with an older rdma-core library.
2224                  */
2225                 DRV_LOG(ERR,
2226                         "No kernel/verbs support for VF LAG bonding found.");
2227                 rte_errno = ENOTSUP;
2228                 ret = -rte_errno;
2229                 goto exit;
2230         }
2231 #endif
2232         /*
2233          * Now we can determine the maximal
2234          * number of devices to be spawned.
2235          */
2236         list = mlx5_malloc(MLX5_MEM_ZERO,
2237                            sizeof(struct mlx5_dev_spawn_data) *
2238                            (np ? np : nd),
2239                            RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
2240         if (!list) {
2241                 DRV_LOG(ERR, "spawn data array allocation failure");
2242                 rte_errno = ENOMEM;
2243                 ret = -rte_errno;
2244                 goto exit;
2245         }
2246         if (bd >= 0 || np > 1) {
2247                 /*
2248                  * Single IB device with multiple ports found;
2249                  * it may be an E-Switch master device with representors.
2250                  * We have to perform identification through the ports.
2251                  */
2252                 MLX5_ASSERT(nl_rdma >= 0);
2253                 MLX5_ASSERT(ns == 0);
2254                 MLX5_ASSERT(nd == 1);
2255                 MLX5_ASSERT(np);
2256                 for (i = 1; i <= np; ++i) {
2257                         list[ns].bond_info = &bond_info;
2258                         list[ns].max_port = np;
2259                         list[ns].phys_port = i;
2260                         list[ns].phys_dev = ibv_match[0];
2261                         list[ns].eth_dev = NULL;
2262                         list[ns].pci_dev = pci_dev;
2263                         list[ns].pf_bond = bd;
2264                         list[ns].ifindex = mlx5_nl_ifindex
2265                                 (nl_rdma,
2266                                 mlx5_os_get_dev_device_name
2267                                                 (list[ns].phys_dev), i);
2268                         if (!list[ns].ifindex) {
2269                                 /*
2270                                  * No network interface index found for the
2271                                  * specified port, which means there is no
2272                                  * representor on this port. That is OK:
2273                                  * ports can be disabled, for example
2274                                  * if sriov_numvfs < sriov_totalvfs.
2275                                  */
2276                                 continue;
2277                         }
2278                         ret = -1;
2279                         if (nl_route >= 0)
2280                                 ret = mlx5_nl_switch_info
2281                                                (nl_route,
2282                                                 list[ns].ifindex,
2283                                                 &list[ns].info);
2284                         if (ret || (!list[ns].info.representor &&
2285                                     !list[ns].info.master)) {
2286                                 /*
2287                                  * We failed to recognize representors with
2288                                  * Netlink, let's try to perform the task
2289                                  * with sysfs.
2290                                  */
2291                                 ret = mlx5_sysfs_switch_info
2292                                                 (list[ns].ifindex,
2293                                                  &list[ns].info);
2294                         }
2295 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2296                         if (!ret && bd >= 0) {
2297                                 switch (list[ns].info.name_type) {
2298                                 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
2299                                         if (list[ns].info.port_name == bd)
2300                                                 ns++;
2301                                         break;
2302                                 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
2303                                         /* Fallthrough */
2304                                 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
2305                                         /* Fallthrough */
2306                                 case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
2307                                         if (list[ns].info.pf_num == bd)
2308                                                 ns++;
2309                                         break;
2310                                 default:
2311                                         break;
2312                                 }
2313                                 continue;
2314                         }
2315 #endif
2316                         if (!ret && (list[ns].info.representor ^
2317                                      list[ns].info.master))
2318                                 ns++;
2319                 }
2320                 if (!ns) {
2321                         DRV_LOG(ERR,
2322                                 "unable to recognize master/representors"
2323                                 " on the IB device with multiple ports");
2324                         rte_errno = ENOENT;
2325                         ret = -rte_errno;
2326                         goto exit;
2327                 }
2328         } else {
2329                 /*
2330                  * The existence of several matching entries (nd > 1) means
2331                  * port representors have been instantiated. No existing Verbs
2332          * call nor sysfs entries can tell them apart; this can only
2333                  * be done through Netlink calls assuming kernel drivers are
2334                  * recent enough to support them.
2335                  *
2336                  * In the event of identification failure through Netlink,
2337                  * try again through sysfs, then:
2338                  *
2339                  * 1. If a single IB device matches (nd == 1) with a single
2340                  *    port (np = 0/1) and is not a representor, assume
2341                  *    no switch support.
2342                  *
2343                  * 2. Otherwise no safe assumptions can be made;
2344                  *    complain louder and bail out.
2345                  */
2346                 for (i = 0; i != nd; ++i) {
2347                         memset(&list[ns].info, 0, sizeof(list[ns].info));
2348                         list[ns].bond_info = NULL;
2349                         list[ns].max_port = 1;
2350                         list[ns].phys_port = 1;
2351                         list[ns].phys_dev = ibv_match[i];
2352                         list[ns].eth_dev = NULL;
2353                         list[ns].pci_dev = pci_dev;
2354                         list[ns].pf_bond = -1;
2355                         list[ns].ifindex = 0;
2356                         if (nl_rdma >= 0)
2357                                 list[ns].ifindex = mlx5_nl_ifindex
2358                                 (nl_rdma,
2359                                 mlx5_os_get_dev_device_name
2360                                                 (list[ns].phys_dev), 1);
2361                         if (!list[ns].ifindex) {
2362                                 char ifname[IF_NAMESIZE];
2363
2364                                 /*
2365                                  * Netlink failed; this may happen with an
2366                                  * old ib_core kernel driver (before 4.16).
2367                                  * We can assume the driver is old because
2368                                  * we are processing single-port IB
2369                                  * devices here. Let's try sysfs to retrieve
2370                                  * the ifindex. This method works for the
2371                                  * master device only.
2372                                  */
2373                                 if (nd > 1) {
2374                                         /*
2375                                          * Multiple devices found: assume
2376                                          * representors; we can neither
2377                                          * distinguish master/representor
2378                                          * nor retrieve the ifindex via sysfs.
2379                                          */
2380                                         continue;
2381                                 }
2382                                 ret = mlx5_get_ifname_sysfs
2383                                         (ibv_match[i]->ibdev_path, ifname);
2384                                 if (!ret)
2385                                         list[ns].ifindex =
2386                                                 if_nametoindex(ifname);
2387                                 if (!list[ns].ifindex) {
2388                                         /*
2389                                          * No network interface index found
2390                                          * for the specified device, which
2391                                          * means it is neither a representor
2392                                          * nor the master.
2393                                          */
2394                                         continue;
2395                                 }
2396                         }
2397                         ret = -1;
2398                         if (nl_route >= 0)
2399                                 ret = mlx5_nl_switch_info
2400                                                (nl_route,
2401                                                 list[ns].ifindex,
2402                                                 &list[ns].info);
2403                         if (ret || (!list[ns].info.representor &&
2404                                     !list[ns].info.master)) {
2405                                 /*
2406                                  * We failed to recognize representors with
2407                                  * Netlink, let's try to perform the task
2408                                  * with sysfs.
2409                                  */
2410                                 ret = mlx5_sysfs_switch_info
2411                                                 (list[ns].ifindex,
2412                                                  &list[ns].info);
2413                         }
2414                         if (!ret && (list[ns].info.representor ^
2415                                      list[ns].info.master)) {
2416                                 ns++;
2417                         } else if ((nd == 1) &&
2418                                    !list[ns].info.representor &&
2419                                    !list[ns].info.master) {
2420                                 /*
2421                                  * Single IB device with
2422                                  * one physical port and an
2423                                  * attached network device.
2424                                  * Maybe SR-IOV is not enabled
2425                                  * or there are no representors.
2426                                  */
2427                                 DRV_LOG(INFO, "no E-Switch support detected");
2428                                 ns++;
2429                                 break;
2430                         }
2431                 }
2432                 if (!ns) {
2433                         DRV_LOG(ERR,
2434                                 "unable to recognize master/representors"
2435                                 " on multiple IB devices");
2436                         rte_errno = ENOENT;
2437                         ret = -rte_errno;
2438                         goto exit;
2439                 }
2440                 /*
2441                  * New kernels may add the switch_id attribute even when
2442                  * there is no E-Switch, so we may wrongly recognize the
2443                  * only device as master. Override this if a single device
2444                  * with a single port and the new device name format is
2445                  * present.
2446                  */
2447                 if (nd == 1 &&
2448                     list[0].info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) {
2449                         list[0].info.master = 0;
2450                         list[0].info.representor = 0;
2451                 }
2452         }
2453         MLX5_ASSERT(ns);
2454         /*
2455          * Sort list to probe devices in natural order for the user's convenience
2456          * (i.e. master first, then representors from lowest to highest ID).
2457          */
2458         qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
2459         /* Device specific configuration. */
2460         switch (pci_dev->id.device_id) {
2461         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
2462         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
2463         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
2464         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
2465         case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
2466         case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
2467         case PCI_DEVICE_ID_MELLANOX_CONNECTXVF:
2468                 dev_config_vf = 1;
2469                 break;
2470         default:
2471                 dev_config_vf = 0;
2472                 break;
2473         }
2474         if (eth_da.type != RTE_ETH_REPRESENTOR_NONE) {
2475                 /* Set devargs default values. */
2476                 if (eth_da.nb_mh_controllers == 0) {
2477                         eth_da.nb_mh_controllers = 1;
2478                         eth_da.mh_controllers[0] = 0;
2479                 }
2480                 if (eth_da.nb_ports == 0 && ns > 0) {
2481                         if (list[0].pf_bond >= 0 && list[0].info.representor)
2482                                 DRV_LOG(WARNING, "Representor on a bonding device should use the pf#vf# syntax: %s",
2483                                         pci_dev->device.devargs->args);
2484                         eth_da.nb_ports = 1;
2485                         eth_da.ports[0] = list[0].info.pf_num;
2486                 }
2487                 if (eth_da.nb_representor_ports == 0) {
2488                         eth_da.nb_representor_ports = 1;
2489                         eth_da.representor_ports[0] = 0;
2490                 }
2491         }
2492         for (i = 0; i != ns; ++i) {
2493                 uint32_t restore;
2494
2495                 /* Default configuration. */
2496                 memset(&dev_config, 0, sizeof(struct mlx5_dev_config));
2497                 dev_config.vf = dev_config_vf;
2498                 dev_config.mps = MLX5_ARG_UNSET;
2499                 dev_config.dbnc = MLX5_ARG_UNSET;
2500                 dev_config.rx_vec_en = 1;
2501                 dev_config.txq_inline_max = MLX5_ARG_UNSET;
2502                 dev_config.txq_inline_min = MLX5_ARG_UNSET;
2503                 dev_config.txq_inline_mpw = MLX5_ARG_UNSET;
2504                 dev_config.txqs_inline = MLX5_ARG_UNSET;
2505                 dev_config.vf_nl_en = 1;
2506                 dev_config.mr_ext_memseg_en = 1;
2507                 dev_config.mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
2508                 dev_config.mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
2509                 dev_config.dv_esw_en = 1;
2510                 dev_config.dv_flow_en = 1;
2511                 dev_config.decap_en = 1;
2512                 dev_config.log_hp_size = MLX5_ARG_UNSET;
2513                 dev_config.allow_duplicate_pattern = 1;
2514                 list[i].numa_node = pci_dev->device.numa_node;
2515                 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
2516                                                  &list[i],
2517                                                  &dev_config,
2518                                                  &eth_da);
2519                 if (!list[i].eth_dev) {
2520                         if (rte_errno != EBUSY && rte_errno != EEXIST)
2521                                 break;
2522                         /* Device is disabled or already spawned. Ignore it. */
2523                         continue;
2524                 }
2525                 restore = list[i].eth_dev->data->dev_flags;
2526                 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
2527                 /* Restore non-PCI flags cleared by the above call. */
2528                 list[i].eth_dev->data->dev_flags |= restore;
2529                 rte_eth_dev_probing_finish(list[i].eth_dev);
2530         }
2531         if (i != ns) {
2532                 DRV_LOG(ERR,
2533                         "probe of PCI device " PCI_PRI_FMT " aborted after"
2534                         " encountering an error: %s",
2535                         owner_pci.domain, owner_pci.bus,
2536                         owner_pci.devid, owner_pci.function,
2537                         strerror(rte_errno));
2538                 ret = -rte_errno;
2539                 /* Roll back. */
2540                 while (i--) {
2541                         if (!list[i].eth_dev)
2542                                 continue;
2543                         mlx5_dev_close(list[i].eth_dev);
2544                         /* mac_addrs must not be freed; it is part of dev_private. */
2545                         list[i].eth_dev->data->mac_addrs = NULL;
2546                         claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
2547                 }
2548                 /* Restore original error. */
2549                 rte_errno = -ret;
2550         } else {
2551                 ret = 0;
2552         }
2553 exit:
2554         /*
2555          * Do the routine cleanup:
2556          * - close opened Netlink sockets
2557          * - free allocated spawn data array
2558          * - free the Infiniband device list
2559          */
2560         if (nl_rdma >= 0)
2561                 close(nl_rdma);
2562         if (nl_route >= 0)
2563                 close(nl_route);
2564         if (list)
2565                 mlx5_free(list);
2566         MLX5_ASSERT(ibv_list);
2567         mlx5_glue->free_device_list(ibv_list);
2568         return ret;
2569 }
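
/*
 * Editor's sketch (an assumption, not the PMD's code): a comparator
 * showing the ordering the qsort() above is documented to produce --
 * master first, then representors from lowest to highest port ID. The
 * real mlx5_dev_spawn_data_cmp() lives elsewhere in this file and may
 * also key on bonding fields; treat the field choices as illustrative.
 */
static int
example_spawn_data_cmp(const void *a, const void *b)
{
	const struct mlx5_dev_spawn_data *da = a;
	const struct mlx5_dev_spawn_data *db = b;

	/* A master port sorts before any representor. */
	if (da->info.master != db->info.master)
		return da->info.master ? -1 : 1;
	/* Representors sort by ascending port ID. */
	if (da->info.port_name < db->info.port_name)
		return -1;
	return da->info.port_name > db->info.port_name;
}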
2570
2571 /**
2572  * DPDK callback to register a PCI device.
2573  *
2574  * This function spawns Ethernet devices out of a given PCI device.
2575  *
2576  * @param[in] pci_drv
2577  *   PCI driver structure (mlx5_driver).
2578  * @param[in] pci_dev
2579  *   PCI device information.
2580  *
2581  * @return
2582  *   0 on success, a negative errno value otherwise and rte_errno is set.
2583  */
2584 int
2585 mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
2586                   struct rte_pci_device *pci_dev)
2587 {
2588         struct rte_eth_devargs eth_da = { .type = RTE_ETH_REPRESENTOR_NONE };
2589         int ret = 0;
2590         uint16_t p;
2591
2592         if (pci_dev->device.devargs) {
2593                 /* Parse representor information from device argument. */
2594                 if (pci_dev->device.devargs->cls_str)
2595                         ret = rte_eth_devargs_parse
2596                                 (pci_dev->device.devargs->cls_str, &eth_da);
2597                 if (ret) {
2598                         DRV_LOG(ERR, "failed to parse device arguments: %s",
2599                                 pci_dev->device.devargs->cls_str);
2600                         return -rte_errno;
2601                 }
2602                 if (eth_da.type == RTE_ETH_REPRESENTOR_NONE) {
2603                         /* Support legacy device argument */
2604                         ret = rte_eth_devargs_parse
2605                                 (pci_dev->device.devargs->args, &eth_da);
2606                         if (ret) {
2607                                 DRV_LOG(ERR, "failed to parse device arguments: %s",
2608                                         pci_dev->device.devargs->args);
2609                                 return -rte_errno;
2610                         }
2611                 }
2612         }
2613
2614         if (eth_da.nb_ports > 0) {
2615                 /* Iterate over all ports if the devargs pf is a range: "pf[0-1]vf[...]". */
2616                 for (p = 0; p < eth_da.nb_ports; p++)
2617                         ret = mlx5_os_pci_probe_pf(pci_dev, &eth_da,
2618                                                    eth_da.ports[p]);
2619         } else {
2620                 ret = mlx5_os_pci_probe_pf(pci_dev, &eth_da, 0);
2621         }
2622         return ret;
2623 }
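
/*
 * Editor's sketch: parsing the representor devargs consumed above in
 * isolation. rte_eth_devargs_parse() fills ports[]/representor_ports[]
 * from strings such as "representor=pf[0-1]vf[0-2]"; the sample string
 * and the helper name here are illustrative assumptions only.
 */
static int
example_parse_representor_devargs(void)
{
	struct rte_eth_devargs da = { .type = RTE_ETH_REPRESENTOR_NONE };
	uint16_t p, r;
	int ret;

	ret = rte_eth_devargs_parse("representor=pf[0-1]vf[0-2]", &da);
	if (ret)
		return ret;
	/* Each (pf, vf) pair would be probed via mlx5_os_pci_probe_pf(). */
	for (p = 0; p < da.nb_ports; p++)
		for (r = 0; r < da.nb_representor_ports; r++)
			DRV_LOG(DEBUG, "pf %u vf %u",
				da.ports[p], da.representor_ports[r]);
	return 0;
}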
2624
2625 static int
2626 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
2627 {
2628         char *env;
2629         int value;
2630
2631         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
2632         /* Save the environment variable value so it can be restored later. */
2633         env = getenv(MLX5_SHUT_UP_BF);
2634         value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
2635         if (config->dbnc == MLX5_ARG_UNSET)
2636                 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
2637         else
2638                 setenv(MLX5_SHUT_UP_BF,
2639                        config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
2640         return value;
2641 }
2642
2643 static void
2644 mlx5_restore_doorbell_mapping_env(int value)
2645 {
2646         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
2647         /* Restore the original environment variable state. */
2648         if (value == MLX5_ARG_UNSET)
2649                 unsetenv(MLX5_SHUT_UP_BF);
2650         else
2651                 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
2652 }
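
/*
 * Editor's sketch of the contract between the two helpers above: every
 * mlx5_config_doorbell_mapping_env() call must be paired with
 * mlx5_restore_doorbell_mapping_env() once device creation attempts are
 * done, so MLX5_SHUT_UP_BF is left exactly as the user set it. The
 * helper name below is hypothetical.
 */
static void *
example_open_with_dbnc(const struct mlx5_dev_config *config,
		       struct ibv_device *ibv_dev)
{
	int saved = mlx5_config_doorbell_mapping_env(config);
	void *ctx = mlx5_glue->dv_open_device(ibv_dev);

	/* rdma-core has sampled the variable at open; restore it now. */
	mlx5_restore_doorbell_mapping_env(saved);
	return ctx;
}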
2653
2654 /**
2655  * Extract pdn of PD object using DV API.
2656  *
2657  * @param[in] pd
2658  *   Pointer to the verbs PD object.
2659  * @param[out] pdn
2660  *   Pointer to the PD object number variable.
2661  *
2662  * @return
2663  *   0 on success, error value otherwise.
2664  */
2665 int
2666 mlx5_os_get_pdn(void *pd, uint32_t *pdn)
2667 {
2668 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
2669         struct mlx5dv_obj obj;
2670         struct mlx5dv_pd pd_info;
2671         int ret = 0;
2672
2673         obj.pd.in = pd;
2674         obj.pd.out = &pd_info;
2675         ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
2676         if (ret) {
2677                 DRV_LOG(DEBUG, "Failed to get PD object info");
2678                 return ret;
2679         }
2680         *pdn = pd_info.pdn;
2681         return 0;
2682 #else
2683         (void)pd;
2684         (void)pdn;
2685         return -ENOTSUP;
2686 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
2687 }
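
/*
 * Editor's usage sketch for mlx5_os_get_pdn(): allocate a protection
 * domain and extract the PD number that DevX commands consume. The
 * helper name is hypothetical and error handling is kept minimal.
 */
static int
example_query_pdn(struct ibv_context *ctx)
{
	struct ibv_pd *pd = mlx5_glue->alloc_pd(ctx);
	uint32_t pdn = 0;
	int ret;

	if (pd == NULL)
		return -ENOMEM;
	ret = mlx5_os_get_pdn(pd, &pdn);
	if (ret == 0)
		DRV_LOG(DEBUG, "PD number: %u", pdn);
	claim_zero(mlx5_glue->dealloc_pd(pd));
	return ret;
}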
2688
2689 /**
2690  * Function API to open IB device.
2691  *
2692  * This function calls the Linux glue APIs to open a device.
2693  *
2694  * @param[in] spawn
2695  *   Pointer to the IB device attributes (name, port, etc).
2696  * @param[out] config
2697  *   Pointer to device configuration structure.
2698  * @param[out] sh
2699  *   Pointer to shared context structure.
2700  *
2701  * @return
2702  *   0 on success, a positive error value otherwise.
2703  */
2704 int
2705 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
2706                      const struct mlx5_dev_config *config,
2707                      struct mlx5_dev_ctx_shared *sh)
2708 {
2709         int dbmap_env;
2710         int err = 0;
2711
2712         pthread_mutex_init(&sh->txpp.mutex, NULL);
2713         /*
2714          * Configure environment variable "MLX5_SHUT_UP_BF"
2715          * before the device creation. The rdma_core library
2716          * checks the variable at device creation and
2717          * stores the result internally.
2718          */
2719         dbmap_env = mlx5_config_doorbell_mapping_env(config);
2720         /* Try to open IB device with DV first, then usual Verbs. */
2721         errno = 0;
2722         sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
2723         if (sh->ctx) {
2724                 sh->devx = 1;
2725                 DRV_LOG(DEBUG, "DevX is supported");
2726                 /* The device is created, no need for environment. */
2727                 mlx5_restore_doorbell_mapping_env(dbmap_env);
2728         } else {
2729                 /* The environment variable is still configured. */
2730                 sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
2731                 err = errno ? errno : ENODEV;
2732                 /*
2733                  * The environment variable is not needed anymore,
2734                  * all device creation attempts are completed.
2735                  */
2736                 mlx5_restore_doorbell_mapping_env(dbmap_env);
2737                 if (!sh->ctx)
2738                         return err;
2739                 DRV_LOG(DEBUG, "DevX is NOT supported");
2740                 err = 0;
2741         }
2742         if (!err && sh->ctx) {
2743                 /* Hint libmlx5 to use PMD allocator for data plane resources */
2744                 mlx5_glue->dv_set_context_attr(sh->ctx,
2745                         MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
2746                         (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
2747                                 .alloc = &mlx5_alloc_verbs_buf,
2748                                 .free = &mlx5_free_verbs_buf,
2749                                 .data = sh,
2750                         }));
2751         }
2752         return err;
2753 }
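
/*
 * Editor's sketch of the callback signatures expected by
 * MLX5DV_CTX_ATTR_BUF_ALLOCATORS (per rdma-core's mlx5dv.h); the PMD's
 * mlx5_alloc_verbs_buf()/mlx5_free_verbs_buf() match these. The bodies
 * below are illustrative assumptions, not the PMD's implementation.
 */
static void *
example_verbs_alloc(size_t size, void *priv_data)
{
	/* priv_data is the "data" member registered above (sh). */
	(void)priv_data;
	return mlx5_malloc(MLX5_MEM_RTE, size, rte_mem_page_size(),
			   SOCKET_ID_ANY);
}

static void
example_verbs_free(void *ptr, void *priv_data)
{
	(void)priv_data;
	mlx5_free(ptr);
}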
2754
2755 /**
2756  * Install shared asynchronous device events handler.
2757  * This function is implemented to support event sharing
2758  * between multiple ports of single IB device.
2759  *
2760  * @param sh
2761  *   Pointer to mlx5_dev_ctx_shared object.
2762  */
2763 void
2764 mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
2765 {
2766         int ret;
2767         int flags;
2768
2769         sh->intr_handle.fd = -1;
2770         flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
2771         ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
2772                     F_SETFL, flags | O_NONBLOCK);
2773         if (ret) {
2774                 DRV_LOG(INFO, "failed to set the async event queue file"
2775                         " descriptor to non-blocking");
2776         } else {
2777                 sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
2778                 sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
2779                 if (rte_intr_callback_register(&sh->intr_handle,
2780                                         mlx5_dev_interrupt_handler, sh)) {
2781                         DRV_LOG(INFO, "Failed to install the shared interrupt handler.");
2782                         sh->intr_handle.fd = -1;
2783                 }
2784         }
2785         if (sh->devx) {
2786 #ifdef HAVE_IBV_DEVX_ASYNC
2787                 sh->intr_handle_devx.fd = -1;
2788                 sh->devx_comp =
2789                         (void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
2790                 struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
2791                 if (!devx_comp) {
2792                         DRV_LOG(INFO, "failed to allocate devx_comp.");
2793                         return;
2794                 }
2795                 flags = fcntl(devx_comp->fd, F_GETFL);
2796                 ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
2797                 if (ret) {
2798                         DRV_LOG(INFO, "failed to set the DevX completion"
2799                                 " file descriptor to non-blocking");
2800                         return;
2801                 }
2802                 sh->intr_handle_devx.fd = devx_comp->fd;
2803                 sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
2804                 if (rte_intr_callback_register(&sh->intr_handle_devx,
2805                                         mlx5_dev_interrupt_handler_devx, sh)) {
2806                         DRV_LOG(INFO, "Failed to install the DevX shared"
2807                                 " interrupt handler.");
2808                         sh->intr_handle_devx.fd = -1;
2809                 }
2810 #endif /* HAVE_IBV_DEVX_ASYNC */
2811         }
2812 }
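
/*
 * Editor's sketch: the F_GETFL/F_SETFL sequence used twice above,
 * factored into a hypothetical helper. Returns 0 on success, a
 * negative errno value otherwise.
 */
static int
example_set_fd_nonblock(int fd)
{
	int flags = fcntl(fd, F_GETFL);

	if (flags == -1)
		return -errno;
	if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1)
		return -errno;
	return 0;
}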
2813
2814 /**
2815  * Uninstall shared asynchronous device events handler.
2816  * This function is implemented to support event sharing
2817  * between multiple ports of single IB device.
2818  *
2819  * @param sh
2820  *   Pointer to mlx5_dev_ctx_shared object.
2821  */
2822 void
2823 mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
2824 {
2825         if (sh->intr_handle.fd >= 0)
2826                 mlx5_intr_callback_unregister(&sh->intr_handle,
2827                                               mlx5_dev_interrupt_handler, sh);
2828 #ifdef HAVE_IBV_DEVX_ASYNC
2829         if (sh->intr_handle_devx.fd >= 0)
2830                 rte_intr_callback_unregister(&sh->intr_handle_devx,
2831                                   mlx5_dev_interrupt_handler_devx, sh);
2832         if (sh->devx_comp)
2833                 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
2834 #endif
2835 }
2836
2837 /**
2838  * Read statistics by a named counter.
2839  *
2840  * @param[in] priv
2841  *   Pointer to the private device data structure.
2842  * @param[in] ctr_name
2843  *   Pointer to the name of the statistic counter to read
2844  * @param[out] stat
2845  *   Pointer to read statistic value.
2846  * @return
2847  *   0 on success and stat is valid, 1 if it failed to read the value
2848  *   and rte_errno is set.
2849  *
2850  */
2851 int
2852 mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
2853                       uint64_t *stat)
2854 {
2855         int fd;
2856
2857         if (priv->sh) {
2858                 if (priv->q_counters != NULL &&
2859                     strcmp(ctr_name, "out_of_buffer") == 0)
2860                         return mlx5_devx_cmd_queue_counter_query
2861                                         (priv->q_counters, 0, (uint32_t *)stat);
2862                 MKSTR(path, "%s/ports/%d/hw_counters/%s",
2863                       priv->sh->ibdev_path,
2864                       priv->dev_port,
2865                       ctr_name);
2866                 fd = open(path, O_RDONLY);
2867                 /*
2868                  * In switchdev mode the file location is not per port
2869                  * but rather <ibdev_path>/hw_counters/<file_name>.
2870                  */
2871                 if (fd == -1) {
2872                         MKSTR(path1, "%s/hw_counters/%s",
2873                               priv->sh->ibdev_path,
2874                               ctr_name);
2875                         fd = open(path1, O_RDONLY);
2876                 }
2877                 if (fd != -1) {
2878                         char buf[21] = {'\0'};
2879                         ssize_t n = read(fd, buf, sizeof(buf));
2880
2881                         close(fd);
2882                         if (n != -1) {
2883                                 *stat = strtoull(buf, NULL, 10);
2884                                 return 0;
2885                         }
2886                 }
2887         }
2888         *stat = 0;
2889         return 1;
2890 }
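
/*
 * Editor's usage sketch for mlx5_os_read_dev_stat(): query the
 * "out_of_buffer" counter, which takes the DevX queue-counter fast path
 * above when available and falls back to sysfs otherwise. The helper
 * name is hypothetical.
 */
static uint64_t
example_out_of_buffer(struct mlx5_priv *priv)
{
	uint64_t stat = 0;

	if (mlx5_os_read_dev_stat(priv, "out_of_buffer", &stat))
		DRV_LOG(WARNING, "cannot read the out_of_buffer counter");
	return stat;
}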
2891
2892 /**
2893  * Set the reg_mr and dereg_mr callbacks.
2894  *
2895  * @param[out] reg_mr_cb
2896  *   Pointer to the reg_mr function.
2897  * @param[out] dereg_mr_cb
2898  *   Pointer to the dereg_mr function.
2899  *
2900  */
2901 void
2902 mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
2903                       mlx5_dereg_mr_t *dereg_mr_cb)
2904 {
2905         *reg_mr_cb = mlx5_mr_verbs_ops.reg_mr;
2906         *dereg_mr_cb = mlx5_mr_verbs_ops.dereg_mr;
2907 }
2908
2909 /**
2910  * Remove a MAC address from the device
2911  *
2912  * @param dev
2913  *   Pointer to Ethernet device structure.
2914  * @param index
2915  *   MAC address index.
2916  */
2917 void
2918 mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
2919 {
2920         struct mlx5_priv *priv = dev->data->dev_private;
2921         const int vf = priv->config.vf;
2922
2923         if (vf)
2924                 mlx5_nl_mac_addr_remove(priv->nl_socket_route,
2925                                         mlx5_ifindex(dev), priv->mac_own,
2926                                         &dev->data->mac_addrs[index], index);
2927 }
2928
2929 /**
2930  * Add a MAC address to the device
2931  *
2932  * @param dev
2933  *   Pointer to Ethernet device structure.
2934  * @param mac_addr
2935  *   MAC address to register.
2936  * @param index
2937  *   MAC address index.
2938  *
2939  * @return
2940  *   0 on success, a negative errno value otherwise
2941  */
2942 int
2943 mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
2944                      uint32_t index)
2945 {
2946         struct mlx5_priv *priv = dev->data->dev_private;
2947         const int vf = priv->config.vf;
2948         int ret = 0;
2949
2950         if (vf)
2951                 ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
2952                                            mlx5_ifindex(dev), priv->mac_own,
2953                                            mac, index);
2954         return ret;
2955 }
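
/*
 * Editor's usage sketch: the application-facing ethdev call that, for a
 * VF on this PMD, ends up in mlx5_os_mac_addr_add() via the netlink
 * path. The port ID, address, and helper name are assumptions.
 */
static int
example_add_mac(uint16_t port_id)
{
	struct rte_ether_addr mac = {
		.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
	};

	/* Pool 0 is the default; the call fails if the MAC table is full. */
	return rte_eth_dev_mac_addr_add(port_id, &mac, 0);
}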
2956
2957 /**
2958  * Modify a VF MAC address
2959  *
2960  * @param priv
2961  *   Pointer to device private data.
2962  * @param mac_addr
2963  *   MAC address to modify into.
2964  * @param iface_idx
2965  *   Net device interface index
2966  * @param vf_index
2967  *   VF index
2968  *
2969  * @return
2970  *   0 on success, a negative errno value otherwise
2971  */
2972 int
2973 mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
2974                            unsigned int iface_idx,
2975                            struct rte_ether_addr *mac_addr,
2976                            int vf_index)
2977 {
2978         return mlx5_nl_vf_mac_addr_modify
2979                 (priv->nl_socket_route, iface_idx, mac_addr, vf_index);
2980 }
2981
2982 /**
2983  * Set device promiscuous mode
2984  *
2985  * @param dev
2986  *   Pointer to Ethernet device structure.
2987  * @param enable
2988  *   0 - promiscuous is disabled, otherwise - enabled
2989  *
2990  * @return
2991  *   0 on success, a negative error value otherwise
2992  */
2993 int
2994 mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
2995 {
2996         struct mlx5_priv *priv = dev->data->dev_private;
2997
2998         return mlx5_nl_promisc(priv->nl_socket_route,
2999                                mlx5_ifindex(dev), !!enable);
3000 }
3001
3002 /**
3003  * Set device all-multicast mode
3004  *
3005  * @param dev
3006  *   Pointer to Ethernet device structure.
3007  * @param enable
3008  *   0 - all-multicast is disabled, otherwise - enabled
3009  *
3010  * @return
3011  *   0 on success, a negative error value otherwise
3012  */
3013 int
3014 mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
3015 {
3016         struct mlx5_priv *priv = dev->data->dev_private;
3017
3018         return mlx5_nl_allmulti(priv->nl_socket_route,
3019                                 mlx5_ifindex(dev), !!enable);
3020 }
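
/*
 * Editor's usage sketch: rte_eth_allmulticast_enable()/_disable() are
 * the ethdev entry points that reach mlx5_os_set_allmulti() through the
 * PMD's dev_ops. The helper name and port ID are assumptions.
 */
static int
example_toggle_allmulti(uint16_t port_id, int on)
{
	return on ? rte_eth_allmulticast_enable(port_id) :
		    rte_eth_allmulticast_disable(port_id);
}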
3021
3022 /**
3023  * Flush device MAC addresses
3024  *
3025  * @param dev
3026  *   Pointer to Ethernet device structure.
3027  *
3028  */
3029 void
3030 mlx5_os_mac_addr_flush(struct rte_eth_dev *dev)
3031 {
3032         struct mlx5_priv *priv = dev->data->dev_private;
3033
3034         mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
3035                                dev->data->mac_addrs,
3036                                MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
3037 }