net/mlx5: add Linux-specific file with getter functions
dpdk.git: drivers/net/mlx5/mlx5.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable RX completion entry padding to 128B. */
#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"

/* Device parameter to enable padding Rx packet to cacheline size. */
#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to configure log 2 of the stride size for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. Deprecated, ignored. */
#define MLX5_TXQ_INLINE "txq_inline"

/* Device parameter to limit packet size to inline with ordinary SEND. */
#define MLX5_TXQ_INLINE_MAX "txq_inline_max"

/* Device parameter to configure minimal data size to inline. */
#define MLX5_TXQ_INLINE_MIN "txq_inline_min"

/* Device parameter to limit packet size to inline with Enhanced MPW. */
#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"

/*
 * Device parameter to configure the TX queue number threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/*
 * Device parameter to configure the TX queue number threshold for
 * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
 */
#define MLX5_TXQS_MAX_VEC "txqs_max_vec"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/*
 * Device parameter to force doorbell register mapping
 * to the non-cached region, eliminating the extra write memory barrier.
 */
#define MLX5_TX_DB_NC "tx_db_nc"

/*
 * Device parameter to include 2 dsegs in the title WQEBB.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/*
 * Device parameter to limit the size of the inlined packet.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/*
 * Device parameter to enable hardware Tx vector.
 * Deprecated, ignored (no vectorized Tx routines anymore).
 */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate DV E-Switch flow steering. */
#define MLX5_DV_ESW_EN "dv_esw_en"

/* Activate DV flow steering. */
#define MLX5_DV_FLOW_EN "dv_flow_en"

/* Enable extensive flow metadata support. */
#define MLX5_DV_XMETA_EN "dv_xmeta_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

/* Enable extending memsegs when creating a MR. */
#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"

/* Select port representors to instantiate. */
#define MLX5_REPRESENTOR "representor"

/* Device parameter to configure the maximum number of dump files per queue. */
#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"

/* Configure timeout of LRO session (in microseconds). */
#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"

/*
 * Device parameter to configure the total data buffer size for a single
 * hairpin queue (logarithm value).
 */
#define MLX5_HP_BUF_SIZE "hp_buf_log_sz"

/* Flow memory reclaim mode. */
#define MLX5_RECLAIM_MEM "reclaim_mem_mode"

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/** Driver-specific log messages type. */
int mlx5_logtype;

/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
	uint32_t ifindex; /**< Network interface index. */
	uint32_t max_port; /**< IB device maximal port index. */
	uint32_t ibv_port; /**< IB device physical port index. */
	int pf_bond; /**< bonding device PF index. < 0 - no bonding */
	struct mlx5_switch_info info; /**< Switch information. */
	struct ibv_device *ibv_dev; /**< Associated IB device. */
	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
};

static LIST_HEAD(, mlx5_dev_ctx_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;

static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	{
		.size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_encap_decap_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_push_vlan_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_dv_tag_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_tag_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_port_id_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_tbl_data_entry),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_jump_ipool",
	},
#endif
	{
		.size = sizeof(struct mlx5_flow_meter),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_meter_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_mreg_copy_resource),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_mcp_ipool",
	},
	{
		.size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_hrxq_ipool",
	},
	{
		.size = sizeof(struct mlx5_flow_handle),
		.trunk_size = 64,
		.grow_trunk = 3,
		.grow_shift = 2,
		.need_lock = 0,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "mlx5_flow_handle_ipool",
	},
	{
		.size = sizeof(struct rte_flow),
		.trunk_size = 4096,
		.need_lock = 1,
		.release_mem_en = 1,
		.malloc = rte_malloc_socket,
		.free = rte_free,
		.type = "rte_flow_ipool",
	},
};
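
/*
 * Illustrative note (not part of the driver): with the trunk parameters
 * above, an indexed pool grows geometrically. Assuming the mlx5_utils
 * semantics where grow_shift is the left shift applied per trunk for up
 * to grow_trunk steps, a config of trunk_size = 64, grow_trunk = 3 and
 * grow_shift = 2 yields trunk sizes on the order of:
 *
 *	trunk 0: 64 entries
 *	trunk 1: 64 << 2 = 256 entries
 *	trunk 2: 64 << 4 = 1024 entries
 *	later trunks: growth capped after grow_trunk steps
 *
 * so small deployments stay compact while large ones avoid frequent
 * reallocation. The exact growth law is implemented by
 * mlx5_ipool_create() in mlx5_utils.c.
 */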

#define MLX5_FLOW_MIN_ID_POOL_SIZE 512
#define MLX5_ID_GENERATION_ARRAY_FACTOR 16

#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

/**
 * Allocate ID pool structure.
 *
 * @param[in] max_id
 *   The maximum ID that can be allocated from the pool.
 *
 * @return
 *   Pointer to pool object, NULL value otherwise.
 */
struct mlx5_flow_id_pool *
mlx5_flow_id_pool_alloc(uint32_t max_id)
{
	struct mlx5_flow_id_pool *pool;
	void *mem;

	pool = rte_zmalloc("id pool allocation", sizeof(*pool),
			   RTE_CACHE_LINE_SIZE);
	if (!pool) {
		DRV_LOG(ERR, "can't allocate id pool");
		rte_errno = ENOMEM;
		return NULL;
	}
	mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
			  RTE_CACHE_LINE_SIZE);
	if (!mem) {
		DRV_LOG(ERR, "can't allocate mem for id pool");
		rte_errno = ENOMEM;
		goto error;
	}
	pool->free_arr = mem;
	pool->curr = pool->free_arr;
	pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
	pool->base_index = 0;
	pool->max_id = max_id;
	return pool;
error:
	rte_free(pool);
	return NULL;
}

/**
 * Release ID pool structure.
 *
 * @param[in] pool
 *   Pointer to flow id pool object to free.
 */
void
mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
{
	rte_free(pool->free_arr);
	rte_free(pool);
}

/**
 * Generate ID.
 *
 * @param[in] pool
 *   Pointer to flow id pool.
 * @param[out] id
 *   The generated ID.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
{
	if (pool->curr == pool->free_arr) {
		if (pool->base_index == pool->max_id) {
			rte_errno = ENOMEM;
			DRV_LOG(ERR, "no free id");
			return -rte_errno;
		}
		*id = ++pool->base_index;
		return 0;
	}
	*id = *(--pool->curr);
	return 0;
}

/**
 * Release ID.
 *
 * @param[in] pool
 *   Pointer to flow id pool.
 * @param[in] id
 *   The ID to release back to the pool.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
{
	uint32_t size;
	uint32_t size2;
	void *mem;

	if (pool->curr == pool->last) {
		size = pool->curr - pool->free_arr;
		size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
		MLX5_ASSERT(size2 > size);
		mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
		if (!mem) {
			DRV_LOG(ERR, "can't allocate mem for id pool");
			rte_errno = ENOMEM;
			return -rte_errno;
		}
		memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
		rte_free(pool->free_arr);
		pool->free_arr = mem;
		pool->curr = pool->free_arr + size;
		pool->last = pool->free_arr + size2;
	}
	*pool->curr = id;
	pool->curr++;
	return 0;
}

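/*
 * Illustrative usage sketch (not part of the driver): a typical ID pool
 * lifecycle as used by flow-related code. Error handling is elided and
 * the bound of 1000 below is an arbitrary example, not a driver constant.
 *
 *	struct mlx5_flow_id_pool *pool = mlx5_flow_id_pool_alloc(1000);
 *	uint32_t id;
 *
 *	if (pool && !mlx5_flow_id_get(pool, &id)) {
 *		... use id ...
 *		mlx5_flow_id_release(pool, id);
 *	}
 *	mlx5_flow_id_pool_release(pool);
 *
 * Fresh IDs are minted from base_index until max_id is reached; released
 * IDs are pushed onto free_arr and handed back first on subsequent gets.
 */
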
/**
 * Initialize the shared aging list information per port.
 *
 * @param[in] sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
static void
mlx5_flow_aging_init(struct mlx5_dev_ctx_shared *sh)
{
	uint32_t i;
	struct mlx5_age_info *age_info;

	for (i = 0; i < sh->max_port; i++) {
		age_info = &sh->port[i].age_info;
		age_info->flags = 0;
		TAILQ_INIT(&age_info->aged_counters);
		rte_spinlock_init(&age_info->aged_sl);
		MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
	}
}

/**
 * Initialize the counters management structure.
 *
 * @param[in] sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
static void
mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
{
	int i;

	memset(&sh->cmng, 0, sizeof(sh->cmng));
	TAILQ_INIT(&sh->cmng.flow_counters);
	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
	}
}

/**
 * Destroy all the resources allocated for counter memory management.
 *
 * @param[in] mng
 *   Pointer to the memory management structure.
 */
static void
mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
{
	uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;

	LIST_REMOVE(mng, next);
	claim_zero(mlx5_devx_cmd_destroy(mng->dm));
	claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
	rte_free(mem);
}

/**
 * Close and release all the resources of the counters management.
 *
 * @param[in] sh
 *   Pointer to mlx5_dev_ctx_shared object to free.
 */
static void
mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
{
	struct mlx5_counter_stats_mem_mng *mng;
	int i;
	int j;
	int retries = 1024;

	rte_errno = 0;
	while (--retries) {
		rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
		if (rte_errno != EINPROGRESS)
			break;
		rte_pause();
	}
	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
		struct mlx5_flow_counter_pool *pool;
		uint32_t batch = !!(i > 1);

		if (!sh->cmng.ccont[i].pools)
			continue;
		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		while (pool) {
			if (batch && pool->min_dcs)
				claim_zero(mlx5_devx_cmd_destroy
							       (pool->min_dcs));
			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
				if (MLX5_POOL_GET_CNT(pool, j)->action)
					claim_zero
					 (mlx5_glue->destroy_flow_action
					  (MLX5_POOL_GET_CNT
					  (pool, j)->action));
				if (!batch && MLX5_GET_POOL_CNT_EXT
				    (pool, j)->dcs)
					claim_zero(mlx5_devx_cmd_destroy
						   (MLX5_GET_POOL_CNT_EXT
						    (pool, j)->dcs));
			}
			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
			rte_free(pool);
			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		}
		rte_free(sh->cmng.ccont[i].pools);
	}
	mng = LIST_FIRST(&sh->cmng.mem_mngs);
	while (mng) {
		mlx5_flow_destroy_counter_stat_mem_mng(mng);
		mng = LIST_FIRST(&sh->cmng.mem_mngs);
	}
	memset(&sh->cmng, 0, sizeof(sh->cmng));
}

/**
 * Initialize the flow resources' indexed mempool.
 *
 * @param[in] sh
 *   Pointer to mlx5_dev_ctx_shared object.
 * @param[in] config
 *   Pointer to user dev config.
 */
static void
mlx5_flow_ipool_create(struct mlx5_dev_ctx_shared *sh,
		       const struct mlx5_dev_config *config __rte_unused)
{
	uint8_t i;

#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	/*
	 * Even when DV support is compiled in, the user may choose the
	 * Verbs mode; in that case the mlx5 flow handle size differs
	 * from the DV one and must be set to
	 * MLX5_FLOW_HANDLE_VERBS_SIZE.
	 */
	if (!config->dv_flow_en)
		mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size =
					MLX5_FLOW_HANDLE_VERBS_SIZE;
#endif
	for (i = 0; i < MLX5_IPOOL_MAX; ++i) {
		if (config->reclaim_mode)
			mlx5_ipool_cfg[i].release_mem_en = 1;
		sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]);
	}
}

/**
 * Release the flow resources' indexed mempool.
 *
 * @param[in] sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
static void
mlx5_flow_ipool_destroy(struct mlx5_dev_ctx_shared *sh)
{
	uint8_t i;

	for (i = 0; i < MLX5_IPOOL_MAX; ++i)
		mlx5_ipool_destroy(sh->ipool[i]);
}

/**
 * Extract pdn of PD object using DV API.
 *
 * @param[in] pd
 *   Pointer to the verbs PD object.
 * @param[out] pdn
 *   Pointer to the PD object number variable.
 *
 * @return
 *   0 on success, error value otherwise.
 */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
static int
mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
{
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get PD object info");
		return ret;
	}
	*pdn = pd_info.pdn;
	return 0;
}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Get environment variable to store. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}

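/*
 * Illustrative usage sketch (not part of the driver): the two helpers
 * above are meant to bracket device creation, since rdma-core samples
 * the MLX5_SHUT_UP_BF variable only at device open time:
 *
 *	int dbmap_env = mlx5_config_doorbell_mapping_env(config);
 *	ctx = mlx5_glue->dv_open_device(ibv_dev);
 *	mlx5_restore_doorbell_mapping_env(dbmap_env);
 *
 * This is exactly the pattern mlx5_alloc_shared_ibctx() follows below.
 */
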
/**
 * Install the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
static void
mlx5_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
{
	int ret;
	int flags;

	sh->intr_handle.fd = -1;
	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
		    F_SETFL, flags | O_NONBLOCK);
	if (ret) {
		DRV_LOG(INFO, "failed to change file descriptor of the async"
			" event queue");
	} else {
		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle,
					mlx5_dev_interrupt_handler, sh)) {
			DRV_LOG(INFO, "Failed to install the shared interrupt.");
			sh->intr_handle.fd = -1;
		}
	}
	if (sh->devx) {
#ifdef HAVE_IBV_DEVX_ASYNC
		sh->intr_handle_devx.fd = -1;
		sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
		if (!sh->devx_comp) {
			DRV_LOG(INFO, "failed to allocate devx_comp.");
			return;
		}
		flags = fcntl(sh->devx_comp->fd, F_GETFL);
		ret = fcntl(sh->devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
		if (ret) {
			DRV_LOG(INFO, "failed to change file descriptor of the"
				" devx comp");
			return;
		}
		sh->intr_handle_devx.fd = sh->devx_comp->fd;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
		if (rte_intr_callback_register(&sh->intr_handle_devx,
					mlx5_dev_interrupt_handler_devx, sh)) {
			DRV_LOG(INFO, "Failed to install the devx shared"
				" interrupt.");
			sh->intr_handle_devx.fd = -1;
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
}

/**
 * Uninstall the shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of a single IB device.
 *
 * @param sh
 *   Pointer to mlx5_dev_ctx_shared object.
 */
static void
mlx5_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
{
	if (sh->intr_handle.fd >= 0)
		mlx5_intr_callback_unregister(&sh->intr_handle,
					      mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_IBV_DEVX_ASYNC
	if (sh->intr_handle_devx.fd >= 0)
		rte_intr_callback_unregister(&sh->intr_handle_devx,
				  mlx5_dev_interrupt_handler_devx, sh);
	if (sh->devx_comp)
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
#endif
}

/**
 * Allocate the shared IB device context. If there is a multiport device,
 * the master and representors will share this context; if there is a
 * single-port dedicated IB device, the context will be used by only the
 * given port due to unification.
 *
 * The routine first searches for the context by the specified IB device
 * name; if found, the shared context is assumed and its reference counter
 * is incremented. If no context is found, a new one is created and
 * initialized with the specified IB device context and parameters.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[in] config
 *   Pointer to device configuration structure.
 *
 * @return
 *   Pointer to mlx5_dev_ctx_shared object on success,
 *   otherwise NULL and rte_errno is set.
 */
static struct mlx5_dev_ctx_shared *
mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
			const struct mlx5_dev_config *config)
{
	struct mlx5_dev_ctx_shared *sh;
	int dbmap_env;
	int err = 0;
	uint32_t i;
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5_devx_tis_attr tis_attr = { 0 };
#endif

	MLX5_ASSERT(spawn);
	/* Secondary process should not create the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
	/* Search for IB context by device name. */
	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
			sh->refcnt++;
			goto exit;
		}
	}
	/* No device found, we have to create new shared context. */
	MLX5_ASSERT(spawn->max_port);
	sh = rte_zmalloc("ethdev shared ib context",
			 sizeof(struct mlx5_dev_ctx_shared) +
			 spawn->max_port *
			 sizeof(struct mlx5_ibv_shared_port),
			 RTE_CACHE_LINE_SIZE);
	if (!sh) {
		DRV_LOG(ERR, "shared context allocation failure");
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * Configure the environment variable "MLX5_SHUT_UP_BF"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			goto error;
		DRV_LOG(DEBUG, "DevX is NOT supported");
	}
	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
	if (err) {
		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
		goto error;
	}
	sh->refcnt = 1;
	sh->max_port = spawn->max_port;
	strncpy(sh->ibdev_name, mlx5_os_get_ctx_device_name(sh->ctx),
		sizeof(sh->ibdev_name) - 1);
	strncpy(sh->ibdev_path, mlx5_os_get_ctx_device_path(sh->ctx),
		sizeof(sh->ibdev_path) - 1);
	/*
	 * Setting port_id to the maximum disallowed value means
	 * there is no interrupt subhandler installed for
	 * the given port index i.
	 */
	for (i = 0; i < sh->max_port; i++) {
		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
		sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
	}
	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
	if (sh->pd == NULL) {
		DRV_LOG(ERR, "PD allocation failure");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	if (sh->devx) {
		err = mlx5_get_pdn(sh->pd, &sh->pdn);
		if (err) {
			DRV_LOG(ERR, "Failed to extract pdn from PD");
			goto error;
		}
		sh->td = mlx5_devx_cmd_create_td(sh->ctx);
		if (!sh->td) {
			DRV_LOG(ERR, "TD allocation failure");
			err = ENOMEM;
			goto error;
		}
		tis_attr.transport_domain = sh->td->id;
		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
		if (!sh->tis) {
			DRV_LOG(ERR, "TIS allocation failure");
			err = ENOMEM;
			goto error;
		}
	}
	sh->flow_id_pool = mlx5_flow_id_pool_alloc
					((1 << HAIRPIN_FLOW_ID_BITS) - 1);
	if (!sh->flow_id_pool) {
		DRV_LOG(ERR, "can't create flow id pool");
		err = ENOMEM;
		goto error;
	}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
	/*
	 * Once the device is added to the list of memory event
	 * callback, its global MR cache table cannot be expanded
	 * on the fly because of deadlock. If it overflows, lookup
	 * should be done by searching MR list linearly, which is slow.
	 *
	 * At this point the device is not added to the memory
	 * event list yet, context is just being created.
	 */
	err = mlx5_mr_btree_init(&sh->share_cache.cache,
				 MLX5_MR_BTREE_CACHE_N * 2,
				 spawn->pci_dev->device.numa_node);
	if (err) {
		err = rte_errno;
		goto error;
	}
	mlx5_dev_shared_handler_install(sh);
	mlx5_flow_aging_init(sh);
	mlx5_flow_counters_mng_init(sh);
	mlx5_flow_ipool_create(sh, config);
	/* Add device to memory callback list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
			 sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Add context to the global device list. */
	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	return sh;
error:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	MLX5_ASSERT(sh);
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Free the shared IB device context. Decrement the reference counter
 * and, if it reaches zero, free all allocated resources and close handles.
 *
 * @param[in] sh
 *   Pointer to mlx5_dev_ctx_shared object to free.
 */
static void
mlx5_free_shared_ibctx(struct mlx5_dev_ctx_shared *sh)
{
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
#ifdef RTE_LIBRTE_MLX5_DEBUG
	/* Check the object presence in the list. */
	struct mlx5_dev_ctx_shared *lctx;

	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
		if (lctx == sh)
			break;
	MLX5_ASSERT(lctx);
	if (lctx != sh) {
		DRV_LOG(ERR, "Freeing non-existing shared IB context");
		goto exit;
	}
#endif
	MLX5_ASSERT(sh);
	MLX5_ASSERT(sh->refcnt);
	/* Secondary process should not free the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (--sh->refcnt)
		goto exit;
	/* Remove from memory callback device list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_REMOVE(sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Release created Memory Regions. */
	mlx5_mr_release_cache(&sh->share_cache);
	/* Remove context from the global device list. */
	LIST_REMOVE(sh, next);
	/*
	 * Ensure there is no async event handler installed.
	 * Only the primary process handles async device events.
	 */
	mlx5_flow_counters_mng_close(sh);
	mlx5_flow_ipool_destroy(sh);
	mlx5_dev_shared_handler_uninstall(sh);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
}

/**
 * Destroy table hash list and all the root entries per domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	struct mlx5_flow_tbl_data_entry *tbl_data;
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_hlist_entry *pos;

	if (!sh->flow_tbls)
		return;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 0;
	table_key.domain = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
}

/**
 * Initialize the flow table hash list and create the root table entries
 * for each domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	MLX5_ASSERT(sh);
	snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
	sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
	if (!sh->flow_tbls) {
		DRV_LOG(ERR, "flow tables hash list creation failed.");
		err = ENOMEM;
		return err;
	}
#ifndef HAVE_MLX5DV_DR
	/*
	 * When there is no DR support, the zero tables must still be
	 * created because DV expects to see them even if they cannot
	 * be created by RDMA-CORE.
	 */
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
							  sizeof(*tbl_data), 0);

	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 0;
	table_key.domain = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	return err;
error:
	mlx5_free_table_hash_list(priv);
#endif /* HAVE_MLX5DV_DR */
	return err;
}

/**
 * Initialize DR related data within the private structure.
 * The routine checks the reference counter and does actual
 * resource creation/initialization only if the counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	void *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
		if (sh->fdb_domain)
			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
	}
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* Tags should have been destroyed with the flows already. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}

/**
 * Destroy DR related data within the private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_dev_ctx_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* Tags should have been destroyed with the flows already. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by the primary process and secondary processes
 * attach to the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}

/**
 * Retrieve integer value from environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

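/*
 * Illustrative usage sketch (not part of the driver): callers treat a
 * missing variable and an explicit "0" identically, e.g.
 *
 *	if (mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
 *		... enable padding ...
 *
 * MLX5_PMD_ENABLE_PADDING is an environment variable known to the mlx5
 * PMD; note that any non-numeric value parses as 0 via atoi().
 */
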
/**
 * Verbs callback to allocate memory. This function should allocate the
 * space according to the size provided, residing inside a huge page.
 * Please note that all allocations must respect the alignment from
 * libmlx5 (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	rte_free(ptr);
}

/**
 * DPDK callback to add a UDP tunnel port.
 *
 * @param[in] dev
 *   A pointer to eth_dev.
 * @param[in] udp_tunnel
 *   A pointer to the UDP tunnel.
 *
 * @return
 *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
 */
int
mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
			 struct rte_eth_udp_tunnel *udp_tunnel)
{
	MLX5_ASSERT(udp_tunnel != NULL);
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
	    udp_tunnel->udp_port == 4789)
		return 0;
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
	    udp_tunnel->udp_port == 4790)
		return 0;
	return -ENOTSUP;
}

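/*
 * Illustrative usage sketch (not part of the driver): an application
 * reaches the callback above through the generic ethdev API. Only the
 * standard IANA ports are accepted (4789 for VXLAN, 4790 for VXLAN-GPE):
 *
 *	struct rte_eth_udp_tunnel tunnel = {
 *		.udp_port = 4789,
 *		.prot_type = RTE_TUNNEL_TYPE_VXLAN,
 *	};
 *
 *	if (rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel) == 0)
 *		... VXLAN classification applies on that port ...
 *
 * Any other port number makes this PMD return -ENOTSUP.
 */
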
/**
 * Initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_proc_priv_init(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_proc_priv *ppriv;
	size_t ppriv_size;

	/*
	 * UAR register table follows the process private structure. BlueFlame
	 * registers for Tx queues are stored in the table.
	 */
	ppriv_size =
		sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
	if (!ppriv) {
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	ppriv->uar_table_sz = ppriv_size;
	dev->process_private = ppriv;
	return 0;
}

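/*
 * Illustrative layout note (not part of the driver): the allocation in
 * mlx5_proc_priv_init() is a single block, with one UAR/BlueFlame
 * register pointer slot per Tx queue trailing the fixed header:
 *
 *	+--------------------------+  <- dev->process_private
 *	| struct mlx5_proc_priv    |
 *	+--------------------------+
 *	| void *uar_table[0]       |  Tx queue 0
 *	| void *uar_table[1]       |  Tx queue 1
 *	| ...                      |  ... up to priv->txqs_n - 1
 *	+--------------------------+
 *
 * With, e.g., 4 Tx queues, ppriv_size is
 * sizeof(struct mlx5_proc_priv) + 4 * sizeof(void *). Each process
 * (primary or secondary) keeps its own copy, since UAR mappings are
 * per-process.
 */
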
/**
 * Un-initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
	if (!dev->process_private)
		return;
	rte_free(dev->process_private);
	dev->process_private = NULL;
}

1491 /**
1492  * DPDK callback to close the device.
1493  *
1494  * Destroy all queues and objects, free memory.
1495  *
1496  * @param dev
1497  *   Pointer to Ethernet device structure.
1498  */
1499 static void
1500 mlx5_dev_close(struct rte_eth_dev *dev)
1501 {
1502         struct mlx5_priv *priv = dev->data->dev_private;
1503         unsigned int i;
1504         int ret;
1505
1506         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1507                 /* Check if process_private released. */
1508                 if (!dev->process_private)
1509                         return;
1510                 mlx5_tx_uar_uninit_secondary(dev);
1511                 mlx5_proc_priv_uninit(dev);
1512                 rte_eth_dev_release_port(dev);
1513                 return;
1514         }
1515         if (!priv->sh)
1516                 return;
1517         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1518                 dev->data->port_id,
1519                 ((priv->sh->ctx != NULL) ?
1520                 mlx5_os_get_ctx_device_name(priv->sh->ctx) : ""));
1521         /*
1522          * If default mreg copy action is removed at the stop stage,
1523          * the search will return none and nothing will be done anymore.
1524          */
1525         mlx5_flow_stop_default(dev);
1526         mlx5_traffic_disable(dev);
1527         /*
1528          * If all the flows are already flushed in the device stop stage,
1529          * then this will return directly without any action.
1530          */
1531         mlx5_flow_list_flush(dev, &priv->flows, true);
1532         mlx5_flow_meter_flush(dev, NULL);
1533         /* Free the intermediate buffers for flow creation. */
1534         mlx5_flow_free_intermediate(dev);
1535         /* Prevent crashes when queues are still in use. */
1536         dev->rx_pkt_burst = removed_rx_burst;
1537         dev->tx_pkt_burst = removed_tx_burst;
1538         rte_wmb();
1539         /* Disable datapath on secondary process. */
1540         mlx5_mp_req_stop_rxtx(dev);
1541         if (priv->rxqs != NULL) {
1542                 /* XXX race condition if mlx5_rx_burst() is still running. */
1543                 usleep(1000);
1544                 for (i = 0; (i != priv->rxqs_n); ++i)
1545                         mlx5_rxq_release(dev, i);
1546                 priv->rxqs_n = 0;
1547                 priv->rxqs = NULL;
1548         }
1549         if (priv->txqs != NULL) {
1550                 /* XXX race condition if mlx5_tx_burst() is still running. */
1551                 usleep(1000);
1552                 for (i = 0; (i != priv->txqs_n); ++i)
1553                         mlx5_txq_release(dev, i);
1554                 priv->txqs_n = 0;
1555                 priv->txqs = NULL;
1556         }
1557         mlx5_proc_priv_uninit(dev);
1558         if (priv->mreg_cp_tbl)
1559                 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1560         mlx5_mprq_free_mp(dev);
1561         mlx5_free_shared_dr(priv);
1562         if (priv->rss_conf.rss_key != NULL)
1563                 rte_free(priv->rss_conf.rss_key);
1564         if (priv->reta_idx != NULL)
1565                 rte_free(priv->reta_idx);
1566         if (priv->config.vf)
1567                 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1568                                        dev->data->mac_addrs,
1569                                        MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1570         if (priv->nl_socket_route >= 0)
1571                 close(priv->nl_socket_route);
1572         if (priv->nl_socket_rdma >= 0)
1573                 close(priv->nl_socket_rdma);
1574         if (priv->vmwa_context)
1575                 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1576         ret = mlx5_hrxq_verify(dev);
1577         if (ret)
1578                 DRV_LOG(WARNING, "port %u some hash Rx queues still remain",
1579                         dev->data->port_id);
1580         ret = mlx5_ind_table_obj_verify(dev);
1581         if (ret)
1582                 DRV_LOG(WARNING, "port %u some indirection tables still remain",
1583                         dev->data->port_id);
1584         ret = mlx5_rxq_obj_verify(dev);
1585         if (ret)
1586                 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1587                         dev->data->port_id);
1588         ret = mlx5_rxq_verify(dev);
1589         if (ret)
1590                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1591                         dev->data->port_id);
1592         ret = mlx5_txq_obj_verify(dev);
1593         if (ret)
1594                 DRV_LOG(WARNING, "port %u some Verbs Tx queues still remain",
1595                         dev->data->port_id);
1596         ret = mlx5_txq_verify(dev);
1597         if (ret)
1598                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1599                         dev->data->port_id);
1600         ret = mlx5_flow_verify(dev);
1601         if (ret)
1602                 DRV_LOG(WARNING, "port %u some flows still remain",
1603                         dev->data->port_id);
1604         /*
1605          * Free the shared context last, because the cleanup
1606          * routines above may still use some shared fields; e.g.
1607          * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
1608          * the ifindex if Netlink fails.
1609          */
1610         mlx5_free_shared_ibctx(priv->sh);
1611         if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1612                 unsigned int c = 0;
1613                 uint16_t port_id;
1614
1615                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1616                         struct mlx5_priv *opriv =
1617                                 rte_eth_devices[port_id].data->dev_private;
1618
1619                         if (!opriv ||
1620                             opriv->domain_id != priv->domain_id ||
1621                             &rte_eth_devices[port_id] == dev)
1622                                 continue;
1623                         ++c;
1624                         break;
1625                 }
1626                 if (!c)
1627                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1628         }
1629         memset(priv, 0, sizeof(*priv));
1630         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1631         /*
1632          * Reset mac_addrs to NULL such that it is not freed as part of
1633          * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1634          * it is freed when dev_private is freed.
1635          */
1636         dev->data->mac_addrs = NULL;
1637 }
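
/*
 * Usage sketch (illustrative): this is the .dev_close callback, so an
 * application reaches it through the generic ethdev API, typically as:
 *
 *   rte_eth_dev_stop(port_id);
 *   rte_eth_dev_close(port_id);
 */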
1638
1639 const struct eth_dev_ops mlx5_dev_ops = {
1640         .dev_configure = mlx5_dev_configure,
1641         .dev_start = mlx5_dev_start,
1642         .dev_stop = mlx5_dev_stop,
1643         .dev_set_link_down = mlx5_set_link_down,
1644         .dev_set_link_up = mlx5_set_link_up,
1645         .dev_close = mlx5_dev_close,
1646         .promiscuous_enable = mlx5_promiscuous_enable,
1647         .promiscuous_disable = mlx5_promiscuous_disable,
1648         .allmulticast_enable = mlx5_allmulticast_enable,
1649         .allmulticast_disable = mlx5_allmulticast_disable,
1650         .link_update = mlx5_link_update,
1651         .stats_get = mlx5_stats_get,
1652         .stats_reset = mlx5_stats_reset,
1653         .xstats_get = mlx5_xstats_get,
1654         .xstats_reset = mlx5_xstats_reset,
1655         .xstats_get_names = mlx5_xstats_get_names,
1656         .fw_version_get = mlx5_fw_version_get,
1657         .dev_infos_get = mlx5_dev_infos_get,
1658         .read_clock = mlx5_read_clock,
1659         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1660         .vlan_filter_set = mlx5_vlan_filter_set,
1661         .rx_queue_setup = mlx5_rx_queue_setup,
1662         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1663         .tx_queue_setup = mlx5_tx_queue_setup,
1664         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1665         .rx_queue_release = mlx5_rx_queue_release,
1666         .tx_queue_release = mlx5_tx_queue_release,
1667         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1668         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1669         .mac_addr_remove = mlx5_mac_addr_remove,
1670         .mac_addr_add = mlx5_mac_addr_add,
1671         .mac_addr_set = mlx5_mac_addr_set,
1672         .set_mc_addr_list = mlx5_set_mc_addr_list,
1673         .mtu_set = mlx5_dev_set_mtu,
1674         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1675         .vlan_offload_set = mlx5_vlan_offload_set,
1676         .reta_update = mlx5_dev_rss_reta_update,
1677         .reta_query = mlx5_dev_rss_reta_query,
1678         .rss_hash_update = mlx5_rss_hash_update,
1679         .rss_hash_conf_get = mlx5_rss_hash_conf_get,
1680         .filter_ctrl = mlx5_dev_filter_ctrl,
1681         .rx_descriptor_status = mlx5_rx_descriptor_status,
1682         .tx_descriptor_status = mlx5_tx_descriptor_status,
1683         .rxq_info_get = mlx5_rxq_info_get,
1684         .txq_info_get = mlx5_txq_info_get,
1685         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1686         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1687         .rx_queue_count = mlx5_rx_queue_count,
1688         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1689         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1690         .is_removed = mlx5_is_removed,
1691         .udp_tunnel_port_add  = mlx5_udp_tunnel_port_add,
1692         .get_module_info = mlx5_get_module_info,
1693         .get_module_eeprom = mlx5_get_module_eeprom,
1694         .hairpin_cap_get = mlx5_hairpin_cap_get,
1695         .mtr_ops_get = mlx5_flow_meter_ops_get,
1696 };
1697
1698 /* Available operations from secondary process. */
1699 static const struct eth_dev_ops mlx5_dev_sec_ops = {
1700         .stats_get = mlx5_stats_get,
1701         .stats_reset = mlx5_stats_reset,
1702         .xstats_get = mlx5_xstats_get,
1703         .xstats_reset = mlx5_xstats_reset,
1704         .xstats_get_names = mlx5_xstats_get_names,
1705         .fw_version_get = mlx5_fw_version_get,
1706         .dev_infos_get = mlx5_dev_infos_get,
1707         .rx_descriptor_status = mlx5_rx_descriptor_status,
1708         .tx_descriptor_status = mlx5_tx_descriptor_status,
1709         .rxq_info_get = mlx5_rxq_info_get,
1710         .txq_info_get = mlx5_txq_info_get,
1711         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1712         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1713         .get_module_info = mlx5_get_module_info,
1714         .get_module_eeprom = mlx5_get_module_eeprom,
1715 };
1716
1717 /* Available operations in flow isolated mode. */
1718 const struct eth_dev_ops mlx5_dev_ops_isolate = {
1719         .dev_configure = mlx5_dev_configure,
1720         .dev_start = mlx5_dev_start,
1721         .dev_stop = mlx5_dev_stop,
1722         .dev_set_link_down = mlx5_set_link_down,
1723         .dev_set_link_up = mlx5_set_link_up,
1724         .dev_close = mlx5_dev_close,
1725         .promiscuous_enable = mlx5_promiscuous_enable,
1726         .promiscuous_disable = mlx5_promiscuous_disable,
1727         .allmulticast_enable = mlx5_allmulticast_enable,
1728         .allmulticast_disable = mlx5_allmulticast_disable,
1729         .link_update = mlx5_link_update,
1730         .stats_get = mlx5_stats_get,
1731         .stats_reset = mlx5_stats_reset,
1732         .xstats_get = mlx5_xstats_get,
1733         .xstats_reset = mlx5_xstats_reset,
1734         .xstats_get_names = mlx5_xstats_get_names,
1735         .fw_version_get = mlx5_fw_version_get,
1736         .dev_infos_get = mlx5_dev_infos_get,
1737         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1738         .vlan_filter_set = mlx5_vlan_filter_set,
1739         .rx_queue_setup = mlx5_rx_queue_setup,
1740         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1741         .tx_queue_setup = mlx5_tx_queue_setup,
1742         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1743         .rx_queue_release = mlx5_rx_queue_release,
1744         .tx_queue_release = mlx5_tx_queue_release,
1745         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1746         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1747         .mac_addr_remove = mlx5_mac_addr_remove,
1748         .mac_addr_add = mlx5_mac_addr_add,
1749         .mac_addr_set = mlx5_mac_addr_set,
1750         .set_mc_addr_list = mlx5_set_mc_addr_list,
1751         .mtu_set = mlx5_dev_set_mtu,
1752         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1753         .vlan_offload_set = mlx5_vlan_offload_set,
1754         .filter_ctrl = mlx5_dev_filter_ctrl,
1755         .rx_descriptor_status = mlx5_rx_descriptor_status,
1756         .tx_descriptor_status = mlx5_tx_descriptor_status,
1757         .rxq_info_get = mlx5_rxq_info_get,
1758         .txq_info_get = mlx5_txq_info_get,
1759         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1760         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1761         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1762         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1763         .is_removed = mlx5_is_removed,
1764         .get_module_info = mlx5_get_module_info,
1765         .get_module_eeprom = mlx5_get_module_eeprom,
1766         .hairpin_cap_get = mlx5_hairpin_cap_get,
1767         .mtr_ops_get = mlx5_flow_meter_ops_get,
1768 };
1769
1770 /**
1771  * Verify and store value for device argument.
1772  *
1773  * @param[in] key
1774  *   Key argument to verify.
1775  * @param[in] val
1776  *   Value associated with key.
1777  * @param opaque
1778  *   User data.
1779  *
1780  * @return
1781  *   0 on success, a negative errno value otherwise and rte_errno is set.
1782  */
1783 static int
1784 mlx5_args_check(const char *key, const char *val, void *opaque)
1785 {
1786         struct mlx5_dev_config *config = opaque;
1787         unsigned long tmp;
1788
1789         /* No-op, port representors are processed in mlx5_dev_spawn(). */
1790         if (!strcmp(MLX5_REPRESENTOR, key))
1791                 return 0;
1792         errno = 0;
1793         tmp = strtoul(val, NULL, 0);
1794         if (errno) {
1795                 rte_errno = errno;
1796                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1797                 return -rte_errno;
1798         }
1799         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1800                 config->cqe_comp = !!tmp;
1801         } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1802                 config->cqe_pad = !!tmp;
1803         } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1804                 config->hw_padding = !!tmp;
1805         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1806                 config->mprq.enabled = !!tmp;
1807         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1808                 config->mprq.stride_num_n = tmp;
1809         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1810                 config->mprq.stride_size_n = tmp;
1811         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1812                 config->mprq.max_memcpy_len = tmp;
1813         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1814                 config->mprq.min_rxqs_num = tmp;
1815         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1816                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1817                                  " converted to txq_inline_max", key);
1818                 config->txq_inline_max = tmp;
1819         } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1820                 config->txq_inline_max = tmp;
1821         } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1822                 config->txq_inline_min = tmp;
1823         } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1824                 config->txq_inline_mpw = tmp;
1825         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1826                 config->txqs_inline = tmp;
1827         } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1828                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1829         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1830                 config->mps = !!tmp;
1831         } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1832                 if (tmp != MLX5_TXDB_CACHED &&
1833                     tmp != MLX5_TXDB_NCACHED &&
1834                     tmp != MLX5_TXDB_HEURISTIC) {
1835                         DRV_LOG(ERR, "invalid Tx doorbell "
1836                                      "mapping parameter");
1837                         rte_errno = EINVAL;
1838                         return -rte_errno;
1839                 }
1840                 config->dbnc = tmp;
1841         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1842                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1843         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1844                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1845                                  " converted to txq_inline_mpw", key);
1846                 config->txq_inline_mpw = tmp;
1847         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1848                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1849         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1850                 config->rx_vec_en = !!tmp;
1851         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1852                 config->l3_vxlan_en = !!tmp;
1853         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1854                 config->vf_nl_en = !!tmp;
1855         } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1856                 config->dv_esw_en = !!tmp;
1857         } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1858                 config->dv_flow_en = !!tmp;
1859         } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1860                 if (tmp != MLX5_XMETA_MODE_LEGACY &&
1861                     tmp != MLX5_XMETA_MODE_META16 &&
1862                     tmp != MLX5_XMETA_MODE_META32) {
1863                         DRV_LOG(ERR, "invalid extensive "
1864                                      "metadata parameter");
1865                         rte_errno = EINVAL;
1866                         return -rte_errno;
1867                 }
1868                 config->dv_xmeta_en = tmp;
1869         } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1870                 config->mr_ext_memseg_en = !!tmp;
1871         } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1872                 config->max_dump_files_num = tmp;
1873         } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1874                 config->lro.timeout = tmp;
1875         } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1876                 DRV_LOG(DEBUG, "class argument is %s.", val);
1877         } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1878                 config->log_hp_size = tmp;
1879         } else if (strcmp(MLX5_RECLAIM_MEM, key) == 0) {
1880                 if (tmp != MLX5_RCM_NONE &&
1881                     tmp != MLX5_RCM_LIGHT &&
1882                     tmp != MLX5_RCM_AGGR) {
1883                         DRV_LOG(ERR, "Unrecognized %s: \"%s\"", key, val);
1884                         rte_errno = EINVAL;
1885                         return -rte_errno;
1886                 }
1887                 config->reclaim_mode = tmp;
1888         } else {
1889                 DRV_LOG(WARNING, "%s: unknown parameter", key);
1890                 rte_errno = EINVAL;
1891                 return -rte_errno;
1892         }
1893         return 0;
1894 }
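
/*
 * Devargs example (illustrative): parameters arrive as "key=value"
 * pairs appended to the PCI device specification, e.g.
 *
 *   -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=256
 *
 * Each pair is routed through mlx5_args_check() above, so an unknown
 * key or a non-integer value fails the probe with rte_errno = EINVAL.
 */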
1895
1896 /**
1897  * Parse device parameters.
1898  *
1899  * @param config
1900  *   Pointer to device configuration structure.
1901  * @param devargs
1902  *   Device arguments structure.
1903  *
1904  * @return
1905  *   0 on success, a negative errno value otherwise and rte_errno is set.
1906  */
1907 static int
1908 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1909 {
1910         const char **params = (const char *[]){
1911                 MLX5_RXQ_CQE_COMP_EN,
1912                 MLX5_RXQ_CQE_PAD_EN,
1913                 MLX5_RXQ_PKT_PAD_EN,
1914                 MLX5_RX_MPRQ_EN,
1915                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1916                 MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1917                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1918                 MLX5_RXQS_MIN_MPRQ,
1919                 MLX5_TXQ_INLINE,
1920                 MLX5_TXQ_INLINE_MIN,
1921                 MLX5_TXQ_INLINE_MAX,
1922                 MLX5_TXQ_INLINE_MPW,
1923                 MLX5_TXQS_MIN_INLINE,
1924                 MLX5_TXQS_MAX_VEC,
1925                 MLX5_TXQ_MPW_EN,
1926                 MLX5_TXQ_MPW_HDR_DSEG_EN,
1927                 MLX5_TXQ_MAX_INLINE_LEN,
1928                 MLX5_TX_DB_NC,
1929                 MLX5_TX_VEC_EN,
1930                 MLX5_RX_VEC_EN,
1931                 MLX5_L3_VXLAN_EN,
1932                 MLX5_VF_NL_EN,
1933                 MLX5_DV_ESW_EN,
1934                 MLX5_DV_FLOW_EN,
1935                 MLX5_DV_XMETA_EN,
1936                 MLX5_MR_EXT_MEMSEG_EN,
1937                 MLX5_REPRESENTOR,
1938                 MLX5_MAX_DUMP_FILES_NUM,
1939                 MLX5_LRO_TIMEOUT_USEC,
1940                 MLX5_CLASS_ARG_NAME,
1941                 MLX5_HP_BUF_SIZE,
1942                 MLX5_RECLAIM_MEM,
1943                 NULL,
1944         };
1945         struct rte_kvargs *kvlist;
1946         int ret = 0;
1947         int i;
1948
1949         if (devargs == NULL)
1950                 return 0;
1951         /* The UGLY cast in the params initializer passes checkpatch. */
1952         kvlist = rte_kvargs_parse(devargs->args, params);
1953         if (kvlist == NULL) {
1954                 rte_errno = EINVAL;
1955                 return -rte_errno;
1956         }
1957         /* Process parameters. */
1958         for (i = 0; (params[i] != NULL); ++i) {
1959                 if (rte_kvargs_count(kvlist, params[i])) {
1960                         ret = rte_kvargs_process(kvlist, params[i],
1961                                                  mlx5_args_check, config);
1962                         if (ret) {
1963                                 rte_errno = EINVAL;
1964                                 rte_kvargs_free(kvlist);
1965                                 return -rte_errno;
1966                         }
1967                 }
1968         }
1969         rte_kvargs_free(kvlist);
1970         return 0;
1971 }
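
/*
 * Processing sketch (illustrative): rte_kvargs_parse() splits the raw
 * "k1=v1,k2=v2" devargs string into a key/value list restricted to the
 * params[] table above; rte_kvargs_process() then invokes
 * mlx5_args_check() once per occurrence of each known key.
 */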
1972
1973 static struct rte_pci_driver mlx5_driver;
1974
1975 /**
1976  * PMD global initialization.
1977  *
1978  * Independent of any individual device, this function initializes global
1979  * per-PMD data structures, distinguishing primary and secondary processes.
1980  * Hence, the initialization is performed once per process.
1981  *
1982  * @return
1983  *   0 on success, a negative errno value otherwise and rte_errno is set.
1984  */
1985 static int
1986 mlx5_init_once(void)
1987 {
1988         struct mlx5_shared_data *sd;
1989         struct mlx5_local_data *ld = &mlx5_local_data;
1990         int ret = 0;
1991
1992         if (mlx5_init_shared_data())
1993                 return -rte_errno;
1994         sd = mlx5_shared_data;
1995         MLX5_ASSERT(sd);
1996         rte_spinlock_lock(&sd->lock);
1997         switch (rte_eal_process_type()) {
1998         case RTE_PROC_PRIMARY:
1999                 if (sd->init_done)
2000                         break;
2001                 LIST_INIT(&sd->mem_event_cb_list);
2002                 rte_rwlock_init(&sd->mem_event_rwlock);
2003                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
2004                                                 mlx5_mr_mem_event_cb, NULL);
2005                 ret = mlx5_mp_init_primary(MLX5_MP_NAME,
2006                                            mlx5_mp_primary_handle);
2007                 if (ret)
2008                         goto out;
2009                 sd->init_done = true;
2010                 break;
2011         case RTE_PROC_SECONDARY:
2012                 if (ld->init_done)
2013                         break;
2014                 ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
2015                                              mlx5_mp_secondary_handle);
2016                 if (ret)
2017                         goto out;
2018                 ++sd->secondary_cnt;
2019                 ld->init_done = true;
2020                 break;
2021         default:
2022                 break;
2023         }
2024 out:
2025         rte_spinlock_unlock(&sd->lock);
2026         return ret;
2027 }
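
/*
 * Note (illustrative): the primary process owns the shared data and the
 * multi-process service, while each secondary only attaches and counts
 * itself in sd->secondary_cnt. The init_done flags make the call
 * idempotent, so any probe path may invoke mlx5_init_once() safely.
 */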
2028
2029 /**
2030  * Configures the minimal amount of data to inline into a WQE
2031  * while sending packets.
2032  *
2033  * - txq_inline_min has the highest priority, if this
2034  *   key is specified in devargs;
2035  * - if DevX is enabled, the inline mode is queried from the
2036  *   device (HCA attributes and NIC vport context if needed);
2037  * - otherwise, L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
2038  *   and none (0 bytes) for other NICs.
2039  *
2040  * @param spawn
2041  *   Verbs device parameters (name, port, switch_info) to spawn.
2042  * @param config
2043  *   Device configuration parameters.
2044  */
2045 static void
2046 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
2047                     struct mlx5_dev_config *config)
2048 {
2049         if (config->txq_inline_min != MLX5_ARG_UNSET) {
2050                 /* Application defines size of inlined data explicitly. */
2051                 switch (spawn->pci_dev->id.device_id) {
2052                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
2053                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
2054                         if (config->txq_inline_min <
2055                                        (int)MLX5_INLINE_HSIZE_L2) {
2056                                 DRV_LOG(DEBUG,
2057                                         "txq_inline_min aligned to minimal"
2058                                         " ConnectX-4 required value %d",
2059                                         (int)MLX5_INLINE_HSIZE_L2);
2060                                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
2061                         }
2062                         break;
2063                 }
2064                 goto exit;
2065         }
2066         if (config->hca_attr.eth_net_offloads) {
2067                 /* We have DevX enabled, inline mode queried successfully. */
2068                 switch (config->hca_attr.wqe_inline_mode) {
2069                 case MLX5_CAP_INLINE_MODE_L2:
2070                         /* outer L2 header must be inlined. */
2071                         config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
2072                         goto exit;
2073                 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
2074                         /* No inline data are required by NIC. */
2075                         config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2076                         config->hw_vlan_insert =
2077                                 config->hca_attr.wqe_vlan_insert;
2078                         DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
2079                         goto exit;
2080                 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
2081                         /* inline mode is defined by NIC vport context. */
2082                         if (!config->hca_attr.eth_virt)
2083                                 break;
2084                         switch (config->hca_attr.vport_inline_mode) {
2085                         case MLX5_INLINE_MODE_NONE:
2086                                 config->txq_inline_min =
2087                                         MLX5_INLINE_HSIZE_NONE;
2088                                 goto exit;
2089                         case MLX5_INLINE_MODE_L2:
2090                                 config->txq_inline_min =
2091                                         MLX5_INLINE_HSIZE_L2;
2092                                 goto exit;
2093                         case MLX5_INLINE_MODE_IP:
2094                                 config->txq_inline_min =
2095                                         MLX5_INLINE_HSIZE_L3;
2096                                 goto exit;
2097                         case MLX5_INLINE_MODE_TCP_UDP:
2098                                 config->txq_inline_min =
2099                                         MLX5_INLINE_HSIZE_L4;
2100                                 goto exit;
2101                         case MLX5_INLINE_MODE_INNER_L2:
2102                                 config->txq_inline_min =
2103                                         MLX5_INLINE_HSIZE_INNER_L2;
2104                                 goto exit;
2105                         case MLX5_INLINE_MODE_INNER_IP:
2106                                 config->txq_inline_min =
2107                                         MLX5_INLINE_HSIZE_INNER_L3;
2108                                 goto exit;
2109                         case MLX5_INLINE_MODE_INNER_TCP_UDP:
2110                                 config->txq_inline_min =
2111                                         MLX5_INLINE_HSIZE_INNER_L4;
2112                                 goto exit;
2113                         }
2114                 }
2115         }
2116         /*
2117          * We get here if we are unable to deduce the
2118          * inline data size with DevX. Try the PCI ID
2119          * to recognize older NICs.
2120          */
2121         switch (spawn->pci_dev->id.device_id) {
2122         case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
2123         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
2124         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
2125         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
2126                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
2127                 config->hw_vlan_insert = 0;
2128                 break;
2129         case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
2130         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
2131         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
2132         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
2133                 /*
2134                  * These NICs support VLAN insertion from the WQE and
2135                  * report the wqe_vlan_insert flag. However, there is a bug
2136                  * that may break PFC control, so the feature is disabled.
2137                  */
2138                 config->hw_vlan_insert = 0;
2139                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2140                 break;
2141         default:
2142                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2143                 break;
2144         }
2145 exit:
2146         DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
2147 }
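
/*
 * Devargs example (illustrative): forcing the minimal inline size
 * overrides everything queried above, e.g.
 *
 *   -w 0000:03:00.0,txq_inline_min=18
 *
 * pins the value to the L2 header size (MLX5_INLINE_HSIZE_L2).
 */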
2148
2149 /**
2150  * Configures the metadata mask fields in the shared context.
2151  *
2152  * @param [in] dev
2153  *   Pointer to Ethernet device.
2154  */
2155 static void
2156 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
2157 {
2158         struct mlx5_priv *priv = dev->data->dev_private;
2159         struct mlx5_dev_ctx_shared *sh = priv->sh;
2160         uint32_t meta, mark, reg_c0;
2161
2162         reg_c0 = ~priv->vport_meta_mask;
2163         switch (priv->config.dv_xmeta_en) {
2164         case MLX5_XMETA_MODE_LEGACY:
2165                 meta = UINT32_MAX;
2166                 mark = MLX5_FLOW_MARK_MASK;
2167                 break;
2168         case MLX5_XMETA_MODE_META16:
2169                 meta = reg_c0 >> rte_bsf32(reg_c0);
2170                 mark = MLX5_FLOW_MARK_MASK;
2171                 break;
2172         case MLX5_XMETA_MODE_META32:
2173                 meta = UINT32_MAX;
2174                 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
2175                 break;
2176         default:
2177                 meta = 0;
2178                 mark = 0;
2179                 MLX5_ASSERT(false);
2180                 break;
2181         }
2182         if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
2183                 DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
2184                                  sh->dv_mark_mask, mark);
2185         else
2186                 sh->dv_mark_mask = mark;
2187         if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
2188                 DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
2189                                  sh->dv_meta_mask, meta);
2190         else
2191                 sh->dv_meta_mask = meta;
2192         if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
2193                 DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
2194                                  sh->dv_regc0_mask, reg_c0);
2195         else
2196                 sh->dv_regc0_mask = reg_c0;
2197         DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
2198         DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
2199         DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
2200         DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
2201 }
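
/*
 * Worked example (illustrative): with priv->vport_meta_mask equal to
 * 0xFFFF0000 the E-Switch owns the upper half of reg_c0, leaving
 * reg_c0 = 0x0000FFFF for metadata. In META16 mode
 * rte_bsf32(reg_c0) = 0, so meta = 0x0000FFFF: 16 bits remain for the
 * application META item while MARK keeps MLX5_FLOW_MARK_MASK.
 */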
2202
2203 /**
2204  * Allocate a page of door-bells and register it using the DevX API.
2205  *
2206  * @param [in] dev
2207  *   Pointer to Ethernet device.
2208  *
2209  * @return
2210  *   Pointer to new page on success, NULL otherwise.
2211  */
2212 static struct mlx5_devx_dbr_page *
2213 mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
2214 {
2215         struct mlx5_priv *priv = dev->data->dev_private;
2216         struct mlx5_devx_dbr_page *page;
2217
2218         /* Allocate space for door-bell page and management data. */
2219         page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
2220                                  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
2221         if (!page) {
2222                 DRV_LOG(ERR, "port %u cannot allocate dbr page",
2223                         dev->data->port_id);
2224                 return NULL;
2225         }
2226         /* Register allocated memory. */
2227         page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
2228                                               MLX5_DBR_PAGE_SIZE, 0);
2229         if (!page->umem) {
2230                 DRV_LOG(ERR, "port %u cannot umem reg dbr page",
2231                         dev->data->port_id);
2232                 rte_free(page);
2233                 return NULL;
2234         }
2235         return page;
2236 }
2237
2238 /**
2239  * Find the next available door-bell, allocate a new page if needed.
2240  *
2241  * @param [in] dev
2242  *   Pointer to Ethernet device.
2243  * @param [out] dbr_page
2244  *   Location for the pointer to the door-bell page holding the record.
2245  *
2246  * @return
2247  *   Door-bell address offset on success, a negative error value otherwise.
2248  */
2249 int64_t
2250 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
2251 {
2252         struct mlx5_priv *priv = dev->data->dev_private;
2253         struct mlx5_devx_dbr_page *page = NULL;
2254         uint32_t i, j;
2255
2256         LIST_FOREACH(page, &priv->dbrpgs, next)
2257                 if (page->dbr_count < MLX5_DBR_PER_PAGE)
2258                         break;
2259         if (!page) { /* No page with free door-bell exists. */
2260                 page = mlx5_alloc_dbr_page(dev);
2261                 if (!page) /* Failed to allocate new page. */
2262                         return (-1);
2263                 LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
2264         }
2265         /* Loop to find bitmap part with clear bit. */
2266         for (i = 0;
2267              i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
2268              i++)
2269                 ; /* Empty. */
2270         /* Find the first clear bit. */
2271         MLX5_ASSERT(i < MLX5_DBR_BITMAP_SIZE);
2272         j = rte_bsf64(~page->dbr_bitmap[i]);
2273         page->dbr_bitmap[i] |= (UINT64_C(1) << j);
2274         page->dbr_count++;
2275         *dbr_page = page;
2276         return (((i * 64) + j) * sizeof(uint64_t));
2277 }
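
/*
 * Worked example (illustrative, assuming MLX5_DBR_SIZE is 8 bytes):
 * the first clear bit found at i = 1, j = 1 denotes door-bell record
 * number 65, so the returned byte offset is (1 * 64 + 1) * 8 = 520.
 * mlx5_release_dbr() below reverses the mapping:
 * 520 / MLX5_DBR_SIZE = 65, i = 65 / 64 = 1, j = 65 % 64 = 1.
 */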
2278
2279 /**
2280  * Release a door-bell record.
2281  *
2282  * @param [in] dev
2283  *   Pointer to Ethernet device.
2284  * @param [in] umem_id
2285  *   UMEM ID of page containing the door-bell record to release.
2286  * @param [in] offset
2287  *   Offset of door-bell record in page.
2288  *
2289  * @return
2290  *   0 on success, a negative error value otherwise.
2291  */
2292 int32_t
2293 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
2294 {
2295         struct mlx5_priv *priv = dev->data->dev_private;
2296         struct mlx5_devx_dbr_page *page = NULL;
2297         int ret = 0;
2298
2299         LIST_FOREACH(page, &priv->dbrpgs, next)
2300                 /* Find the page this address belongs to. */
2301                 if (page->umem->umem_id == umem_id)
2302                         break;
2303         if (!page)
2304                 return -EINVAL;
2305         page->dbr_count--;
2306         if (!page->dbr_count) {
2307                 /* Page not used, free it and remove from list. */
2308                 LIST_REMOVE(page, next);
2309                 if (page->umem)
2310                         ret = -mlx5_glue->devx_umem_dereg(page->umem);
2311                 rte_free(page);
2312         } else {
2313                 /* Mark in bitmap that this door-bell is not in use. */
2314                 offset /= MLX5_DBR_SIZE;
2315                 int i = offset / 64;
2316                 int j = offset % 64;
2317
2318                 page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
2319         }
2320         return ret;
2321 }
2322
2323 int
2324 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
2325 {
2326         static const char *const dynf_names[] = {
2327                 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
2328                 RTE_MBUF_DYNFLAG_METADATA_NAME
2329         };
2330         unsigned int i;
2331
2332         if (n < RTE_DIM(dynf_names))
2333                 return -ENOMEM;
2334         for (i = 0; i < RTE_DIM(dynf_names); i++) {
2335                 if (names[i] == NULL)
2336                         return -EINVAL;
2337                 strcpy(names[i], dynf_names[i]);
2338         }
2339         return RTE_DIM(dynf_names);
2340 }
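
/*
 * Usage sketch (illustrative): callers pass an array of writable
 * buffers, each large enough to hold a dynamic flag name, e.g.:
 *
 *   char buf[2][RTE_MBUF_DYN_NAMESIZE];
 *   char *names[2] = { buf[0], buf[1] };
 *   int nb = rte_pmd_mlx5_get_dyn_flag_names(names, 2);
 *   // nb is 2 on success, -ENOMEM if the array is too small.
 */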
2341
2342 /**
2343  * Check sibling device configurations.
2344  *
2345  * Sibling devices sharing the Infiniband device context
2346  * should have compatible configurations. This applies to
2347  * representors and bonding slaves.
2348  *
2349  * @param priv
2350  *   Private device descriptor.
2351  * @param config
2352  *   Configuration of the device to be created.
2353  *
2354  * @return
2355  *   0 on success, EINVAL otherwise.
2356  */
2357 static int
2358 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
2359                               struct mlx5_dev_config *config)
2360 {
2361         struct mlx5_dev_ctx_shared *sh = priv->sh;
2362         struct mlx5_dev_config *sh_conf = NULL;
2363         uint16_t port_id;
2364
2365         MLX5_ASSERT(sh);
2366         /* Nothing to compare for the single/first device. */
2367         if (sh->refcnt == 1)
2368                 return 0;
2369         /* Find the device with shared context. */
2370         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2371                 struct mlx5_priv *opriv =
2372                         rte_eth_devices[port_id].data->dev_private;
2373
2374                 if (opriv && opriv != priv && opriv->sh == sh) {
2375                         sh_conf = &opriv->config;
2376                         break;
2377                 }
2378         }
2379         if (!sh_conf)
2380                 return 0;
2381         if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
2382                 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
2383                              " for shared %s context", sh->ibdev_name);
2384                 rte_errno = EINVAL;
2385                 return rte_errno;
2386         }
2387         if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
2388                 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
2389                              " for shared %s context", sh->ibdev_name);
2390                 rte_errno = EINVAL;
2391                 return rte_errno;
2392         }
2393         return 0;
2394 }
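
/*
 * Rationale (sketch): flows and metadata registers live in the shared
 * context, so ports on one IB device cannot mix flow engines
 * (dv_flow_en) or metadata register layouts (dv_xmeta_en).
 */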
2395 /**
2396  * Spawn an Ethernet device from Verbs information.
2397  *
2398  * @param dpdk_dev
2399  *   Backing DPDK device.
2400  * @param spawn
2401  *   Verbs device parameters (name, port, switch_info) to spawn.
2402  * @param config
2403  *   Device configuration parameters.
2404  *
2405  * @return
2406  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
2407  *   is set. The following errors are defined:
2408  *
2409  *   EBUSY: device is not supposed to be spawned.
2410  *   EEXIST: device is already spawned.
2411  */
2412 static struct rte_eth_dev *
2413 mlx5_dev_spawn(struct rte_device *dpdk_dev,
2414                struct mlx5_dev_spawn_data *spawn,
2415                struct mlx5_dev_config config)
2416 {
2417         const struct mlx5_switch_info *switch_info = &spawn->info;
2418         struct mlx5_dev_ctx_shared *sh = NULL;
2419         struct ibv_port_attr port_attr;
2420         struct mlx5dv_context dv_attr = { .comp_mask = 0 };
2421         struct rte_eth_dev *eth_dev = NULL;
2422         struct mlx5_priv *priv = NULL;
2423         int err = 0;
2424         unsigned int hw_padding = 0;
2425         unsigned int mps;
2426         unsigned int cqe_comp;
2427         unsigned int cqe_pad = 0;
2428         unsigned int tunnel_en = 0;
2429         unsigned int mpls_en = 0;
2430         unsigned int swp = 0;
2431         unsigned int mprq = 0;
2432         unsigned int mprq_min_stride_size_n = 0;
2433         unsigned int mprq_max_stride_size_n = 0;
2434         unsigned int mprq_min_stride_num_n = 0;
2435         unsigned int mprq_max_stride_num_n = 0;
2436         struct rte_ether_addr mac;
2437         char name[RTE_ETH_NAME_MAX_LEN];
2438         int own_domain_id = 0;
2439         uint16_t port_id;
2440         unsigned int i;
2441 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2442         struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
2443 #endif
2444
2445         /* Determine if this port representor is supposed to be spawned. */
2446         if (switch_info->representor && dpdk_dev->devargs) {
2447                 struct rte_eth_devargs eth_da;
2448
2449                 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
2450                 if (err) {
2451                         rte_errno = -err;
2452                         DRV_LOG(ERR, "failed to process device arguments: %s",
2453                                 strerror(rte_errno));
2454                         return NULL;
2455                 }
2456                 for (i = 0; i < eth_da.nb_representor_ports; ++i)
2457                         if (eth_da.representor_ports[i] ==
2458                             (uint16_t)switch_info->port_name)
2459                                 break;
2460                 if (i == eth_da.nb_representor_ports) {
2461                         rte_errno = EBUSY;
2462                         return NULL;
2463                 }
2464         }
2465         /* Build device name. */
2466         if (spawn->pf_bond < 0) {
2467                 /* Single device. */
2468                 if (!switch_info->representor)
2469                         strlcpy(name, dpdk_dev->name, sizeof(name));
2470                 else
2471                         snprintf(name, sizeof(name), "%s_representor_%u",
2472                                  dpdk_dev->name, switch_info->port_name);
2473         } else {
2474                 /* Bonding device. */
2475                 if (!switch_info->representor)
2476                         snprintf(name, sizeof(name), "%s_%s",
2477                                  dpdk_dev->name, spawn->ibv_dev->name);
2478                 else
2479                         snprintf(name, sizeof(name), "%s_%s_representor_%u",
2480                                  dpdk_dev->name, spawn->ibv_dev->name,
2481                                  switch_info->port_name);
2482         }
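        /*
         * Naming examples (illustrative), following the format strings
         * above: a plain PF becomes "0000:03:00.0", its representor 2
         * "0000:03:00.0_representor_2", and a bonding device splices in
         * the IB device name, e.g. "0000:03:00.0_mlx5_bond_0".
         */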
2483         /* Check if the device is already spawned. */
2484         if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
2485                 rte_errno = EEXIST;
2486                 return NULL;
2487         }
2488         DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
2489         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
2490                 struct mlx5_mp_id mp_id;
2491
2492                 eth_dev = rte_eth_dev_attach_secondary(name);
2493                 if (eth_dev == NULL) {
2494                         DRV_LOG(ERR, "can not attach rte ethdev");
2495                         rte_errno = ENOMEM;
2496                         return NULL;
2497                 }
2498                 eth_dev->device = dpdk_dev;
2499                 eth_dev->dev_ops = &mlx5_dev_sec_ops;
2500                 err = mlx5_proc_priv_init(eth_dev);
2501                 if (err)
2502                         return NULL;
2503                 mp_id.port_id = eth_dev->data->port_id;
2504                 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2505                 /* Receive command fd from primary process */
2506                 err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
2507                 if (err < 0)
2508                         goto err_secondary;
2509                 /* Remap UAR for Tx queues. */
2510                 err = mlx5_tx_uar_init_secondary(eth_dev, err);
2511                 if (err)
2512                         goto err_secondary;
2513                 /*
2514                  * Ethdev pointer is still required as input since
2515                  * the primary device is not accessible from the
2516                  * secondary process.
2517                  */
2518                 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
2519                 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
2520                 return eth_dev;
2521 err_secondary:
2522                 mlx5_dev_close(eth_dev);
2523                 return NULL;
2524         }
2525         /*
2526          * Some parameters ("tx_db_nc" in particular) are needed in
2527          * advance to create the dv/verbs device context. We process the
2528          * devargs here to get them, and process the devargs again
2529          * later to override some hardware settings.
2530          */
2531         err = mlx5_args(&config, dpdk_dev->devargs);
2532         if (err) {
2533                 err = rte_errno;
2534                 DRV_LOG(ERR, "failed to process device arguments: %s",
2535                         strerror(rte_errno));
2536                 goto error;
2537         }
2538         sh = mlx5_alloc_shared_ibctx(spawn, &config);
2539         if (!sh)
2540                 return NULL;
2541         config.devx = sh->devx;
2542 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
2543         config.dest_tir = 1;
2544 #endif
2545 #ifdef HAVE_IBV_MLX5_MOD_SWP
2546         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
2547 #endif
2548         /*
2549          * Multi-packet send is supported by ConnectX-4 Lx PF as well
2550          * as all ConnectX-5 devices.
2551          */
2552 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2553         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
2554 #endif
2555 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2556         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
2557 #endif
2558         mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
2559         if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
2560                 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
2561                         DRV_LOG(DEBUG, "enhanced MPW is supported");
2562                         mps = MLX5_MPW_ENHANCED;
2563                 } else {
2564                         DRV_LOG(DEBUG, "MPW is supported");
2565                         mps = MLX5_MPW;
2566                 }
2567         } else {
2568                 DRV_LOG(DEBUG, "MPW isn't supported");
2569                 mps = MLX5_MPW_DISABLED;
2570         }
2571 #ifdef HAVE_IBV_MLX5_MOD_SWP
2572         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
2573                 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
2574         DRV_LOG(DEBUG, "SWP support: %u", swp);
2575 #endif
2576         config.swp = !!swp;
2577 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2578         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
2579                 struct mlx5dv_striding_rq_caps mprq_caps =
2580                         dv_attr.striding_rq_caps;
2581
2582                 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
2583                         mprq_caps.min_single_stride_log_num_of_bytes);
2584                 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
2585                         mprq_caps.max_single_stride_log_num_of_bytes);
2586                 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
2587                         mprq_caps.min_single_wqe_log_num_of_strides);
2588                 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
2589                         mprq_caps.max_single_wqe_log_num_of_strides);
2590                 DRV_LOG(DEBUG, "\tsupported_qpts: %d",
2591                         mprq_caps.supported_qpts);
2592                 DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
2593                 mprq = 1;
2594                 mprq_min_stride_size_n =
2595                         mprq_caps.min_single_stride_log_num_of_bytes;
2596                 mprq_max_stride_size_n =
2597                         mprq_caps.max_single_stride_log_num_of_bytes;
2598                 mprq_min_stride_num_n =
2599                         mprq_caps.min_single_wqe_log_num_of_strides;
2600                 mprq_max_stride_num_n =
2601                         mprq_caps.max_single_wqe_log_num_of_strides;
2602         }
2603 #endif
2604         if (RTE_CACHE_LINE_SIZE == 128 &&
2605             !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
2606                 cqe_comp = 0;
2607         else
2608                 cqe_comp = 1;
2609         config.cqe_comp = cqe_comp;
2610 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
2611         /* Whether device supports 128B Rx CQE padding. */
2612         cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
2613                   (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
2614 #endif
2615 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2616         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
2617                 tunnel_en = ((dv_attr.tunnel_offloads_caps &
2618                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
2619                              (dv_attr.tunnel_offloads_caps &
2620                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
2621                              (dv_attr.tunnel_offloads_caps &
2622                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
2623         }
2624         DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
2625                 tunnel_en ? "" : "not ");
2626 #else
2627         DRV_LOG(WARNING,
2628                 "tunnel offloading disabled due to old OFED/rdma-core version");
2629 #endif
2630         config.tunnel_en = tunnel_en;
2631 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
2632         mpls_en = ((dv_attr.tunnel_offloads_caps &
2633                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
2634                    (dv_attr.tunnel_offloads_caps &
2635                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
2636         DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
2637                 mpls_en ? "" : "not ");
2638 #else
2639         DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
2640                 " old OFED/rdma-core version or firmware configuration");
2641 #endif
2642         config.mpls_en = mpls_en;
2643         /* Check port status. */
2644         err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
2645         if (err) {
2646                 DRV_LOG(ERR, "port query failed: %s", strerror(err));
2647                 goto error;
2648         }
2649         if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
2650                 DRV_LOG(ERR, "port is not configured in Ethernet mode");
2651                 err = EINVAL;
2652                 goto error;
2653         }
2654         if (port_attr.state != IBV_PORT_ACTIVE)
2655                 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
2656                         mlx5_glue->port_state_str(port_attr.state),
2657                         port_attr.state);
2658         /* Allocate private eth device data. */
2659         priv = rte_zmalloc("ethdev private structure",
2660                            sizeof(*priv),
2661                            RTE_CACHE_LINE_SIZE);
2662         if (priv == NULL) {
2663                 DRV_LOG(ERR, "priv allocation failure");
2664                 err = ENOMEM;
2665                 goto error;
2666         }
2667         priv->sh = sh;
2668         priv->ibv_port = spawn->ibv_port;
2669         priv->pci_dev = spawn->pci_dev;
2670         priv->mtu = RTE_ETHER_MTU;
2671         priv->mp_id.port_id = port_id;
2672         strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2673 #ifndef RTE_ARCH_64
2674         /* Initialize UAR access locks for 32bit implementations. */
2675         rte_spinlock_init(&priv->uar_lock_cq);
2676         for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
2677                 rte_spinlock_init(&priv->uar_lock[i]);
2678 #endif
2679         /* Some internal functions rely on Netlink sockets, open them now. */
2680         priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
2681         priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
2682         priv->representor = !!switch_info->representor;
2683         priv->master = !!switch_info->master;
2684         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
2685         priv->vport_meta_tag = 0;
2686         priv->vport_meta_mask = 0;
2687         priv->pf_bond = spawn->pf_bond;
2688 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2689         /*
2690          * The DevX port query API is implemented. E-Switch may use
2691          * either the vport or the reg_c[0] metadata register to match
2692          * on the vport index. The engaged part of the metadata
2693          * register is defined by the mask.
2694          */
2695         if (switch_info->representor || switch_info->master) {
2696                 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
2697                                       MLX5DV_DEVX_PORT_MATCH_REG_C_0;
2698                 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
2699                                                  &devx_port);
2700                 if (err) {
2701                         DRV_LOG(WARNING,
2702                                 "can't query devx port %d on device %s",
2703                                 spawn->ibv_port, spawn->ibv_dev->name);
2704                         devx_port.comp_mask = 0;
2705                 }
2706         }
2707         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
2708                 priv->vport_meta_tag = devx_port.reg_c_0.value;
2709                 priv->vport_meta_mask = devx_port.reg_c_0.mask;
2710                 if (!priv->vport_meta_mask) {
2711                         DRV_LOG(ERR, "vport zero mask for port %d"
2712                                      " on bonding device %s",
2713                                      spawn->ibv_port, spawn->ibv_dev->name);
2714                         err = ENOTSUP;
2715                         goto error;
2716                 }
2717                 if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
2718                         DRV_LOG(ERR, "invalid vport tag for port %d"
2719                                      " on bonding device %s",
2720                                      spawn->ibv_port, spawn->ibv_dev->name);
2721                         err = ENOTSUP;
2722                         goto error;
2723                 }
2724         }
2725         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
2726                 priv->vport_id = devx_port.vport_num;
2727         } else if (spawn->pf_bond >= 0) {
2728                 DRV_LOG(ERR, "can't deduce vport index for port %d"
2729                              " on bonding device %s",
2730                              spawn->ibv_port, spawn->ibv_dev->name);
2731                 err = ENOTSUP;
2732                 goto error;
2733         } else {
2734                 /* Deduce the vport index in a compatible way. */
2735                 priv->vport_id = switch_info->representor ?
2736                                  switch_info->port_name + 1 : -1;
2737         }
2738 #else
2739         /*
2740          * Kernel/rdma_core supports single E-Switch per PF configurations
2741          * only, and the vport_id field contains the vport index for the
2742          * associated VF, which is deduced from the representor port name.
2743          * For example, suppose the IB device port 10 has an attached
2744          * network device eth0 whose port name attribute is pf0vf2; we
2745          * deduce the VF number as 2 and set the vport index to 3 (2+1).
2746          * This assignment scheme should be changed if multiple E-Switch
2747          * instances per PF configurations and/or PCI subfunctions are
2748          * added.
2749          */
2750         priv->vport_id = switch_info->representor ?
2751                          switch_info->port_name + 1 : -1;
2752 #endif
2753         /* representor_id field keeps the unmodified VF index. */
2754         priv->representor_id = switch_info->representor ?
2755                                switch_info->port_name : -1;
2756         /*
2757          * Look for sibling devices in order to reuse their switch domain
2758          * if any, otherwise allocate one.
2759          */
2760         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2761                 const struct mlx5_priv *opriv =
2762                         rte_eth_devices[port_id].data->dev_private;
2763
2764                 if (!opriv ||
2765                     opriv->sh != priv->sh ||
2766                     opriv->domain_id ==
2767                     RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
2768                         continue;
2769                 priv->domain_id = opriv->domain_id;
2770                 break;
2771         }
2772         if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
2773                 err = rte_eth_switch_domain_alloc(&priv->domain_id);
2774                 if (err) {
2775                         err = rte_errno;
2776                         DRV_LOG(ERR, "unable to allocate switch domain: %s",
2777                                 strerror(rte_errno));
2778                         goto error;
2779                 }
2780                 own_domain_id = 1;
2781         }
2782         /* Override some values set by hardware configuration. */
2783         mlx5_args(&config, dpdk_dev->devargs);
2784         err = mlx5_dev_check_sibling_config(priv, &config);
2785         if (err)
2786                 goto error;
2787         config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
2788                             IBV_DEVICE_RAW_IP_CSUM);
2789         DRV_LOG(DEBUG, "checksum offloading is %ssupported",
2790                 (config.hw_csum ? "" : "not "));
2791 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
2792         !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
2793         DRV_LOG(DEBUG, "counters are not supported");
2794 #endif
2795 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
2796         if (config.dv_flow_en) {
2797                 DRV_LOG(WARNING, "DV flow is not supported");
2798                 config.dv_flow_en = 0;
2799         }
2800 #endif
2801         config.ind_table_max_size =
2802                 sh->device_attr.rss_caps.max_rwq_indirection_table_size;
2803         /*
2804          * Remove this check once DPDK supports larger/variable
2805          * indirection tables.
2806          */
2807         if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
2808                 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
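        /*
         * Example (illustrative capability value): a device reporting a
         * maximum indirection table of 2048 entries is clamped here to
         * ETH_RSS_RETA_SIZE_512, i.e. 512 entries.
         */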
2809         DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
2810                 config.ind_table_max_size);
2811         config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
2812                                   IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
2813         DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
2814                 (config.hw_vlan_strip ? "" : "not "));
2815         config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
2816                                  IBV_RAW_PACKET_CAP_SCATTER_FCS);
2817         DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
2818                 (config.hw_fcs_strip ? "" : "not "));
2819 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
2820         hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
2821 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
2822         hw_padding = !!(sh->device_attr.device_cap_flags_ex &
2823                         IBV_DEVICE_PCI_WRITE_END_PADDING);
2824 #endif
2825         if (config.hw_padding && !hw_padding) {
2826                 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
2827                 config.hw_padding = 0;
2828         } else if (config.hw_padding) {
2829                 DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
2830         }
2831         config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
2832                       (sh->device_attr.tso_caps.supported_qpts &
2833                        (1 << IBV_QPT_RAW_PACKET)));
2834         if (config.tso)
2835                 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
2836         /*
2837          * MPW is disabled by default, while the Enhanced MPW is enabled
2838          * by default.
2839          */
2840         if (config.mps == MLX5_ARG_UNSET)
2841                 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
2842                                                           MLX5_MPW_DISABLED;
2843         else
2844                 config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
2845         DRV_LOG(INFO, "%sMPS is %s",
2846                 config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
2847                 config.mps == MLX5_MPW ? "legacy " : "",
2848                 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
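        /*
         * Usage note (a sketch of the expected devargs behavior): with
         * "txq_mpw_en=1" on hardware supporting Enhanced MPW, config.mps
         * resolves to MLX5_MPW_ENHANCED above; "txq_mpw_en=0" forces
         * MLX5_MPW_DISABLED regardless of hardware support.
         */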
2849         if (config.cqe_comp && !cqe_comp) {
2850                 DRV_LOG(WARNING, "Rx CQE compression isn't supported");
2851                 config.cqe_comp = 0;
2852         }
2853         if (config.cqe_pad && !cqe_pad) {
2854                 DRV_LOG(WARNING, "Rx CQE padding isn't supported");
2855                 config.cqe_pad = 0;
2856         } else if (config.cqe_pad) {
2857                 DRV_LOG(INFO, "Rx CQE padding is enabled");
2858         }
2859         if (config.devx) {
2860                 priv->counter_fallback = 0;
2861                 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
2862                 if (err) {
2863                         err = -err;
2864                         goto error;
2865                 }
2866                 if (!config.hca_attr.flow_counters_dump)
2867                         priv->counter_fallback = 1;
2868 #ifndef HAVE_IBV_DEVX_ASYNC
2869                 priv->counter_fallback = 1;
2870 #endif
2871                 if (priv->counter_fallback)
2872                         DRV_LOG(INFO, "Use fall-back DV counter management");
2873                 /* Check for LRO support. */
2874                 if (config.dest_tir && config.hca_attr.lro_cap &&
2875                     config.dv_flow_en) {
2876                         /* TBD check tunnel lro caps. */
2877                         config.lro.supported = config.hca_attr.lro_cap;
2878                         DRV_LOG(DEBUG, "Device supports LRO");
2879                         /*
2880                          * If LRO timeout is not configured by application,
2881                          * use the minimal supported value.
2882                          */
2883                         if (!config.lro.timeout)
2884                                 config.lro.timeout =
2885                                 config.hca_attr.lro_timer_supported_periods[0];
2886                         DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
2887                                 config.lro.timeout);
2888                 }
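                /*
                 * Usage note (assuming the "lro_timeout_usec" devargs key
                 * defined elsewhere in this file): an application may set
                 * the session timeout explicitly, e.g.
                 * -w 0000:03:00.0,lro_timeout_usec=64.
                 */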
2889 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
2890                 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
2891                     config.dv_flow_en) {
2892                         uint8_t reg_c_mask =
2893                                 config.hca_attr.qos.flow_meter_reg_c_ids;
2894                         /*
2895                          * Meter needs two REG_C's for color match and pre-sfx
2896                          * flow match. Here get the REG_C for color match.
2897                          * REG_C_0 and REG_C_1 are reserved for the metadata feature.
2898                          */
2899                         reg_c_mask &= 0xfc;
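                        /*
                         * Worked example (hypothetical capability value):
                         * if flow_meter_reg_c_ids == 0xfc, the mask keeps
                         * all bits, ffs(0xfc) returns 3, and the color
                         * match register below resolves to REG_C_2
                         * (3 - 1 + REG_C_0).
                         */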
2900                         if (__builtin_popcount(reg_c_mask) < 1) {
2901                                 priv->mtr_en = 0;
2902                                 DRV_LOG(WARNING, "No available register for"
2903                                         " meter.");
2904                         } else {
2905                                 priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
2906                                                       REG_C_0;
2907                                 priv->mtr_en = 1;
2908                                 priv->mtr_reg_share =
2909                                       config.hca_attr.qos.flow_meter_reg_share;
2910                                 DRV_LOG(DEBUG, "The REG_C used by the meter is %d",
2911                                         priv->mtr_color_reg);
2912                         }
2913                 }
2914 #endif
2915         }
2916         if (config.mprq.enabled && mprq) {
2917                 if (config.mprq.stride_num_n &&
2918                     (config.mprq.stride_num_n > mprq_max_stride_num_n ||
2919                      config.mprq.stride_num_n < mprq_min_stride_num_n)) {
2920                         config.mprq.stride_num_n =
2921                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
2922                                                 mprq_min_stride_num_n),
2923                                         mprq_max_stride_num_n);
2924                         DRV_LOG(WARNING,
2925                                 "the number of strides"
2926                                 " for Multi-Packet RQ is out of range,"
2927                                 " setting default value (%u)",
2928                                 1 << config.mprq.stride_num_n);
2929                 }
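                /*
                 * Worked example (illustrative device range): with
                 * "mprq_log_stride_num=20" and a supported range of
                 * [3, 16], the requested value is out of range and the
                 * driver falls back to the clamped default computed above.
                 */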
2930                 if (config.mprq.stride_size_n &&
2931                     (config.mprq.stride_size_n > mprq_max_stride_size_n ||
2932                      config.mprq.stride_size_n < mprq_min_stride_size_n)) {
2933                         config.mprq.stride_size_n =
2934                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
2935                                                 mprq_min_stride_size_n),
2936                                         mprq_max_stride_size_n);
2937                         DRV_LOG(WARNING,
2938                                 "the size of a stride"
2939                                 " for Multi-Packet RQ is out of range,"
2940                                 " setting default value (%u)",
2941                                 1 << config.mprq.stride_size_n);
2942                 }
2943                 config.mprq.min_stride_size_n = mprq_min_stride_size_n;
2944                 config.mprq.max_stride_size_n = mprq_max_stride_size_n;
2945         } else if (config.mprq.enabled && !mprq) {
2946                 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
2947                 config.mprq.enabled = 0;
2948         }
2949         if (config.max_dump_files_num == 0)
2950                 config.max_dump_files_num = 128;
2951         eth_dev = rte_eth_dev_allocate(name);
2952         if (eth_dev == NULL) {
2953                 DRV_LOG(ERR, "can not allocate rte ethdev");
2954                 err = ENOMEM;
2955                 goto error;
2956         }
2957         /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
2958         eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
2959         if (priv->representor) {
2960                 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
2961                 eth_dev->data->representor_id = priv->representor_id;
2962         }
2963         /*
2964          * Store associated network device interface index. This index
2965          * is permanent throughout the lifetime of the device, so we may
2966          * store the ifindex here and use the cached value afterwards.
2967          */
2968         MLX5_ASSERT(spawn->ifindex);
2969         priv->if_index = spawn->ifindex;
2970         eth_dev->data->dev_private = priv;
2971         priv->dev_data = eth_dev->data;
2972         eth_dev->data->mac_addrs = priv->mac;
2973         eth_dev->device = dpdk_dev;
2974         /* Configure the first MAC address by default. */
2975         if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
2976                 DRV_LOG(ERR,
2977                         "port %u cannot get MAC address, is mlx5_core"
2978                         " loaded? (errno: %s)",
2979                         eth_dev->data->port_id, strerror(rte_errno));
2980                 err = ENODEV;
2981                 goto error;
2982         }
2983         DRV_LOG(INFO,
2984                 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
2985                 eth_dev->data->port_id,
2986                 mac.addr_bytes[0], mac.addr_bytes[1],
2987                 mac.addr_bytes[2], mac.addr_bytes[3],
2988                 mac.addr_bytes[4], mac.addr_bytes[5]);
2989 #ifdef RTE_LIBRTE_MLX5_DEBUG
2990         {
2991                 char ifname[IF_NAMESIZE];
2992
2993                 if (mlx5_get_ifname(eth_dev, &ifname) == 0)
2994                         DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
2995                                 eth_dev->data->port_id, ifname);
2996                 else
2997                         DRV_LOG(DEBUG, "port %u ifname is unknown",
2998                                 eth_dev->data->port_id);
2999         }
3000 #endif
3001         /* Get actual MTU if possible. */
3002         err = mlx5_get_mtu(eth_dev, &priv->mtu);
3003         if (err) {
3004                 err = rte_errno;
3005                 goto error;
3006         }
3007         DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
3008                 priv->mtu);
3009         /* Initialize burst functions to prevent crashes before link-up. */
3010         eth_dev->rx_pkt_burst = removed_rx_burst;
3011         eth_dev->tx_pkt_burst = removed_tx_burst;
3012         eth_dev->dev_ops = &mlx5_dev_ops;
3013         /* Register MAC address. */
3014         claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
3015         if (config.vf && config.vf_nl_en)
3016                 mlx5_nl_mac_addr_sync(priv->nl_socket_route,
3017                                       mlx5_ifindex(eth_dev),
3018                                       eth_dev->data->mac_addrs,
3019                                       MLX5_MAX_MAC_ADDRESSES);
3020         priv->flows = 0;
3021         priv->ctrl_flows = 0;
3022         TAILQ_INIT(&priv->flow_meters);
3023         TAILQ_INIT(&priv->flow_meter_profiles);
3024         /* Hint libmlx5 to use PMD allocator for data plane resources */
3025         struct mlx5dv_ctx_allocators alctr = {
3026                 .alloc = &mlx5_alloc_verbs_buf,
3027                 .free = &mlx5_free_verbs_buf,
3028                 .data = priv,
3029         };
3030         mlx5_glue->dv_set_context_attr(sh->ctx,
3031                                        MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
3032                                        (void *)((uintptr_t)&alctr));
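        /*
         * From this point libmlx5 is expected to route data-plane buffer
         * allocations through mlx5_alloc_verbs_buf()/mlx5_free_verbs_buf(),
         * receiving @p priv back as the opaque .data pointer set above.
         */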
3033         /* Bring Ethernet device up. */
3034         DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
3035                 eth_dev->data->port_id);
3036         mlx5_set_link_up(eth_dev);
3037         /*
3038          * Even though the interrupt handler is not installed yet,
3039          * interrupts will still trigger on the async_fd from
3040          * Verbs context returned by ibv_open_device().
3041          */
3042         mlx5_link_update(eth_dev, 0);
3043 #ifdef HAVE_MLX5DV_DR_ESWITCH
3044         if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
3045               (switch_info->representor || switch_info->master)))
3046                 config.dv_esw_en = 0;
3047 #else
3048         config.dv_esw_en = 0;
3049 #endif
3050         /* Detect minimal data bytes to inline. */
3051         mlx5_set_min_inline(spawn, &config);
3052         /* Store device configuration on private structure. */
3053         priv->config = config;
3054         /* Create context for virtual machine VLAN workaround. */
3055         priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
3056         if (config.dv_flow_en) {
3057                 err = mlx5_alloc_shared_dr(priv);
3058                 if (err)
3059                         goto error;
3060                 /*
3061                  * RSS id is shared with meter flow id. Meter flow id can only
3062                  * use the 24 MSB of the register.
3063                  */
3064                 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
3065                                      MLX5_MTR_COLOR_BITS);
3066                 if (!priv->qrss_id_pool) {
3067                         DRV_LOG(ERR, "can't create flow id pool");
3068                         err = ENOMEM;
3069                         goto error;
3070                 }
3071         }
3072         /* Supported Verbs flow priority number detection. */
3073         err = mlx5_flow_discover_priorities(eth_dev);
3074         if (err < 0) {
3075                 err = -err;
3076                 goto error;
3077         }
3078         priv->config.flow_prio = err;
3079         if (!priv->config.dv_esw_en &&
3080             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
3081                 DRV_LOG(WARNING, "metadata mode %u is not supported "
3082                                  "(no E-Switch)", priv->config.dv_xmeta_en);
3083                 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
3084         }
3085         mlx5_set_metadata_mask(eth_dev);
3086         if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
3087             !priv->sh->dv_regc0_mask) {
3088                 DRV_LOG(ERR, "metadata mode %u is not supported "
3089                              "(no metadata reg_c[0] is available)",
3090                              priv->config.dv_xmeta_en);
3091                 err = ENOTSUP;
3092                 goto error;
3093         }
3094         /*
3095          * Allocate the buffer for flow creation, just once.
3096          * The allocation must be done before any flow is created.
3097          */
3098         mlx5_flow_alloc_intermediate(eth_dev);
3099         /* Query availability of metadata reg_c's. */
3100         err = mlx5_flow_discover_mreg_c(eth_dev);
3101         if (err < 0) {
3102                 err = -err;
3103                 goto error;
3104         }
3105         if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
3106                 DRV_LOG(DEBUG,
3107                         "port %u extensive metadata register is not supported",
3108                         eth_dev->data->port_id);
3109                 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
3110                         DRV_LOG(ERR, "metadata mode %u is not supported "
3111                                      "(no metadata registers available)",
3112                                      priv->config.dv_xmeta_en);
3113                         err = ENOTSUP;
3114                         goto error;
3115                 }
3116         }
3117         if (priv->config.dv_flow_en &&
3118             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
3119             mlx5_flow_ext_mreg_supported(eth_dev) &&
3120             priv->sh->dv_regc0_mask) {
3121                 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
3122                                                       MLX5_FLOW_MREG_HTABLE_SZ);
3123                 if (!priv->mreg_cp_tbl) {
3124                         err = ENOMEM;
3125                         goto error;
3126                 }
3127         }
3128         return eth_dev;
3129 error:
3130         if (priv) {
3131                 if (priv->mreg_cp_tbl)
3132                         mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
3133                 if (priv->sh)
3134                         mlx5_free_shared_dr(priv);
3135                 if (priv->nl_socket_route >= 0)
3136                         close(priv->nl_socket_route);
3137                 if (priv->nl_socket_rdma >= 0)
3138                         close(priv->nl_socket_rdma);
3139                 if (priv->vmwa_context)
3140                         mlx5_vlan_vmwa_exit(priv->vmwa_context);
3141                 if (priv->qrss_id_pool)
3142                         mlx5_flow_id_pool_release(priv->qrss_id_pool);
3143                 if (own_domain_id)
3144                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
3145                 rte_free(priv);
3146                 if (eth_dev != NULL)
3147                         eth_dev->data->dev_private = NULL;
3148         }
3149         if (eth_dev != NULL) {
3150                 /* mac_addrs must not be freed alone, it is part of dev_private. */
3151                 eth_dev->data->mac_addrs = NULL;
3152                 rte_eth_dev_release_port(eth_dev);
3153         }
3154         if (sh)
3155                 mlx5_free_shared_ibctx(sh);
3156         MLX5_ASSERT(err > 0);
3157         rte_errno = err;
3158         return NULL;
3159 }
3160
3161 /**
3162  * Comparison callback to sort device data.
3163  *
3164  * This is meant to be used with qsort().
3165  *
3166  * @param[in] a
3167  *   Pointer to pointer to first data object.
3168  * @param[in] b
3169  *   Pointer to pointer to second data object.
3170  *
3171  * @return
3172  *   0 if both objects are equal, less than 0 if the first argument is less
3173  *   than the second, greater than 0 otherwise.
3174  */
3175 static int
3176 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
3177 {
3178         const struct mlx5_switch_info *si_a =
3179                 &((const struct mlx5_dev_spawn_data *)a)->info;
3180         const struct mlx5_switch_info *si_b =
3181                 &((const struct mlx5_dev_spawn_data *)b)->info;
3182         int ret;
3183
3184         /* Master device first. */
3185         ret = si_b->master - si_a->master;
3186         if (ret)
3187                 return ret;
3188         /* Then representor devices. */
3189         ret = si_b->representor - si_a->representor;
3190         if (ret)
3191                 return ret;
3192         /* Unidentified devices come last in no specific order. */
3193         if (!si_a->representor)
3194                 return 0;
3195         /* Order representors by name. */
3196         return si_a->port_name - si_b->port_name;
3197 }
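/*
 * Example ordering (hypothetical spawn list): after qsort() with this
 * comparator, { representor(port 2), master, representor(port 0) }
 * becomes { master, representor(port 0), representor(port 2) }.
 */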
3198
3199 /**
3200  * Match PCI information for possible slaves of bonding device.
3201  *
3202  * @param[in] ibv_dev
3203  *   Pointer to Infiniband device structure.
3204  * @param[in] pci_dev
3205  *   Pointer to PCI device structure to match PCI address.
3206  * @param[in] nl_rdma
3207  *   Netlink RDMA group socket handle.
3208  *
3209  * @return
3210  *   a negative value if no bonding device is found, otherwise
3211  *   the index of the slave PF in the bonding device.
3212  */
3213 static int
3214 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
3215                            const struct rte_pci_device *pci_dev,
3216                            int nl_rdma)
3217 {
3218         char ifname[IF_NAMESIZE + 1];
3219         unsigned int ifindex;
3220         unsigned int np, i;
3221         FILE *file = NULL;
3222         int pf = -1;
3223
3224         /*
3225          * Try to get the master device name. If something goes
3226          * wrong, assume there is no kernel support and no
3227          * bonding devices.
3228          */
3229         if (nl_rdma < 0)
3230                 return -1;
3231         if (!strstr(ibv_dev->name, "bond"))
3232                 return -1;
3233         np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
3234         if (!np)
3235                 return -1;
3236         /*
3237          * The master device might not be on the predefined
3238          * port (port index 1 is not guaranteed), so we have
3239          * to scan all Infiniband device ports and find the
3240          * master.
3241          */
3242         for (i = 1; i <= np; ++i) {
3243                 /* Check whether Infiniband port is populated. */
3244                 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
3245                 if (!ifindex)
3246                         continue;
3247                 if (!if_indextoname(ifindex, ifname))
3248                         continue;
3249                 /* Try to read bonding slave names from sysfs. */
3250                 MKSTR(slaves,
3251                       "/sys/class/net/%s/master/bonding/slaves", ifname);
3252                 file = fopen(slaves, "r");
3253                 if (file)
3254                         break;
3255         }
3256         if (!file)
3257                 return -1;
3258         /* Use safe format to check maximal buffer length. */
3259         MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
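        /*
         * Note: "%" RTE_STR(IF_NAMESIZE) "s" expands at compile time to a
         * bounded conversion such as "%16s" (assuming IF_NAMESIZE == 16 on
         * Linux), so fscanf() cannot overflow ifname[IF_NAMESIZE + 1].
         */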
3260         while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
3261                 char tmp_str[IF_NAMESIZE + 32];
3262                 struct rte_pci_addr pci_addr;
3263                 struct mlx5_switch_info info;
3264
3265                 /* Process slave interface names in the loop. */
3266                 snprintf(tmp_str, sizeof(tmp_str),
3267                          "/sys/class/net/%s", ifname);
3268                 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
3269                         DRV_LOG(WARNING, "can not get PCI address"
3270                                          " for netdev \"%s\"", ifname);
3271                         continue;
3272                 }
3273                 if (pci_dev->addr.domain != pci_addr.domain ||
3274                     pci_dev->addr.bus != pci_addr.bus ||
3275                     pci_dev->addr.devid != pci_addr.devid ||
3276                     pci_dev->addr.function != pci_addr.function)
3277                         continue;
3278                 /* Slave interface PCI address match found. */
3279                 fclose(file);
3280                 snprintf(tmp_str, sizeof(tmp_str),
3281                          "/sys/class/net/%s/phys_port_name", ifname);
3282                 file = fopen(tmp_str, "rb");
3283                 if (!file)
3284                         break;
3285                 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
3286                 if (fscanf(file, "%32s", tmp_str) == 1)
3287                         mlx5_translate_port_name(tmp_str, &info);
3288                 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
3289                     info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
3290                         pf = info.port_name;
3291                 break;
3292         }
3293         if (file)
3294                 fclose(file);
3295         return pf;
3296 }
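/*
 * Illustrative sysfs walk (hypothetical interface names): for a VF LAG
 * bond this function may read "/sys/class/net/ens1f0/master/bonding/slaves"
 * to get the slave list, match a slave's PCI address against @p pci_dev,
 * then parse its "phys_port_name" (e.g. "p0") to report slave PF index 0.
 */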
3297
3298 /**
3299  * DPDK callback to register a PCI device.
3300  *
3301  * This function spawns Ethernet devices out of a given PCI device.
3302  *
3303  * @param[in] pci_drv
3304  *   PCI driver structure (mlx5_driver).
3305  * @param[in] pci_dev
3306  *   PCI device information.
3307  *
3308  * @return
3309  *   0 on success, a negative errno value otherwise and rte_errno is set.
3310  */
3311 static int
3312 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
3313                struct rte_pci_device *pci_dev)
3314 {
3315         struct ibv_device **ibv_list;
3316         /*
3317          * Number of found IB devices matching the requested PCI BDF.
3318          * nd != 1 means there are multiple IB devices over the same
3319          * PCI device and we have representors and master.
3320          */
3321         unsigned int nd = 0;
3322         /*
3323          * Number of found IB device Ports. nd = 1 and np = 1..n means
3324          * we have a single multiport IB device, and there may be
3325          * representors attached to some of the found ports.
3326          */
3327         unsigned int np = 0;
3328         /*
3329          * Number of DPDK ethernet devices to spawn - either over
3330          * multiple IB devices or multiple ports of a single IB device.
3331          * Actually this is the number of spawn iterations.
3332          */
3333         unsigned int ns = 0;
3334         /*
3335          * Bonding device
3336          *   < 0 - no bonding device (single one)
3337          *  >= 0 - bonding device (value is slave PF index)
3338          */
3339         int bd = -1;
3340         struct mlx5_dev_spawn_data *list = NULL;
3341         struct mlx5_dev_config dev_config;
3342         int ret;
3343
3344         if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
3345                 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
3346                         " driver.");
3347                 return 1;
3348         }
3349         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
3350                 mlx5_pmd_socket_init();
3351         ret = mlx5_init_once();
3352         if (ret) {
3353                 DRV_LOG(ERR, "unable to init PMD global data: %s",
3354                         strerror(rte_errno));
3355                 return -rte_errno;
3356         }
3357         MLX5_ASSERT(pci_drv == &mlx5_driver);
3358         errno = 0;
3359         ibv_list = mlx5_glue->get_device_list(&ret);
3360         if (!ibv_list) {
3361                 rte_errno = errno ? errno : ENOSYS;
3362                 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
3363                 return -rte_errno;
3364         }
3365         /*
3366          * First scan the list of all Infiniband devices to find
3367          * matching ones, gathering them into the list.
3368          */
3369         struct ibv_device *ibv_match[ret + 1];
3370         int nl_route = mlx5_nl_init(NETLINK_ROUTE);
3371         int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
3372         unsigned int i;
3373
3374         while (ret-- > 0) {
3375                 struct rte_pci_addr pci_addr;
3376
3377                 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
3378                 bd = mlx5_device_bond_pci_match
3379                                 (ibv_list[ret], pci_dev, nl_rdma);
3380                 if (bd >= 0) {
3381                         /*
3382                          * Bonding device detected. Only one match is allowed;
3383                          * bonding is supported over a multi-port IB device and
3384                          * there should be no matches on representor PCI
3385                          * functions or non-VF-LAG bonding devices with the
3386                          * specified address.
3387                          */
3388                         if (nd) {
3389                                 DRV_LOG(ERR,
3390                                         "multiple PCI match on bonding device "
3391                                         "\"%s\" found", ibv_list[ret]->name);
3392                                 rte_errno = ENOENT;
3393                                 ret = -rte_errno;
3394                                 goto exit;
3395                         }
3396                         DRV_LOG(INFO, "PCI information matches for"
3397                                       " slave %d bonding device \"%s\"",
3398                                       bd, ibv_list[ret]->name);
3399                         ibv_match[nd++] = ibv_list[ret];
3400                         break;
3401                 }
3402                 if (mlx5_dev_to_pci_addr
3403                         (ibv_list[ret]->ibdev_path, &pci_addr))
3404                         continue;
3405                 if (pci_dev->addr.domain != pci_addr.domain ||
3406                     pci_dev->addr.bus != pci_addr.bus ||
3407                     pci_dev->addr.devid != pci_addr.devid ||
3408                     pci_dev->addr.function != pci_addr.function)
3409                         continue;
3410                 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
3411                         ibv_list[ret]->name);
3412                 ibv_match[nd++] = ibv_list[ret];
3413         }
3414         ibv_match[nd] = NULL;
3415         if (!nd) {
3416                 /* No device matches, just complain and bail out. */
3417                 DRV_LOG(WARNING,
3418                         "no Verbs device matches PCI device " PCI_PRI_FMT ","
3419                         " are kernel drivers loaded?",
3420                         pci_dev->addr.domain, pci_dev->addr.bus,
3421                         pci_dev->addr.devid, pci_dev->addr.function);
3422                 rte_errno = ENOENT;
3423                 ret = -rte_errno;
3424                 goto exit;
3425         }
3426         if (nd == 1) {
3427                 /*
3428                  * The found single matching device may have multiple ports.
3429                  * Each port may be a representor, so we have to check the
3430                  * port number and the representors' existence.
3431                  */
3432                 if (nl_rdma >= 0)
3433                         np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
3434                 if (!np)
3435                         DRV_LOG(WARNING, "can not get the number of ports"
3436                                          " for IB device \"%s\"", ibv_match[0]->name);
3437                 if (bd >= 0 && !np) {
3438                         DRV_LOG(ERR, "can not get ports"
3439                                      " for bonding device");
3440                         rte_errno = ENOENT;
3441                         ret = -rte_errno;
3442                         goto exit;
3443                 }
3444         }
3445 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
3446         if (bd >= 0) {
3447                 /*
3448                  * This may happen if there is VF LAG kernel support and
3449                  * the application is compiled with an older rdma_core library.
3450                  */
3451                 DRV_LOG(ERR,
3452                         "No kernel/verbs support for VF LAG bonding found.");
3453                 rte_errno = ENOTSUP;
3454                 ret = -rte_errno;
3455                 goto exit;
3456         }
3457 #endif
3458         /*
3459          * Now we can determine the maximal
3460          * number of devices to be spawned.
3461          */
3462         list = rte_zmalloc("device spawn data",
3463                          sizeof(struct mlx5_dev_spawn_data) *
3464                          (np ? np : nd),
3465                          RTE_CACHE_LINE_SIZE);
3466         if (!list) {
3467                 DRV_LOG(ERR, "spawn data array allocation failure");
3468                 rte_errno = ENOMEM;
3469                 ret = -rte_errno;
3470                 goto exit;
3471         }
3472         if (bd >= 0 || np > 1) {
3473                 /*
3474                  * Single IB device with multiple ports found,
3475                  * it may be an E-Switch master device with representors.
3476                  * We have to perform identification through the ports.
3477                  */
3478                 MLX5_ASSERT(nl_rdma >= 0);
3479                 MLX5_ASSERT(ns == 0);
3480                 MLX5_ASSERT(nd == 1);
3481                 MLX5_ASSERT(np);
3482                 for (i = 1; i <= np; ++i) {
3483                         list[ns].max_port = np;
3484                         list[ns].ibv_port = i;
3485                         list[ns].ibv_dev = ibv_match[0];
3486                         list[ns].eth_dev = NULL;
3487                         list[ns].pci_dev = pci_dev;
3488                         list[ns].pf_bond = bd;
3489                         list[ns].ifindex = mlx5_nl_ifindex
3490                                         (nl_rdma, list[ns].ibv_dev->name, i);
3491                         if (!list[ns].ifindex) {
3492                                 /*
3493                                  * No network interface index found for the
3494                                  * specified port, it means there is no
3495                                  * representor on this port. It's OK,
3496                                  * there can be disabled ports, for example
3497                                  * if sriov_numvfs < sriov_totalvfs.
3498                                  */
3499                                 continue;
3500                         }
3501                         ret = -1;
3502                         if (nl_route >= 0)
3503                                 ret = mlx5_nl_switch_info
3504                                                (nl_route,
3505                                                 list[ns].ifindex,
3506                                                 &list[ns].info);
3507                         if (ret || (!list[ns].info.representor &&
3508                                     !list[ns].info.master)) {
3509                                 /*
3510                                  * We failed to recognize representors with
3511                                  * Netlink, let's try to perform the task
3512                                  * with sysfs.
3513                                  */
3514                         ret = mlx5_sysfs_switch_info
3515                                                 (list[ns].ifindex,
3516                                                  &list[ns].info);
3517                         }
3518                         if (!ret && bd >= 0) {
3519                                 switch (list[ns].info.name_type) {
3520                                 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
3521                                         if (list[ns].info.port_name == bd)
3522                                                 ns++;
3523                                         break;
3524                                 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
3525                                         if (list[ns].info.pf_num == bd)
3526                                                 ns++;
3527                                         break;
3528                                 default:
3529                                         break;
3530                                 }
3531                                 continue;
3532                         }
3533                         if (!ret && (list[ns].info.representor ^
3534                                      list[ns].info.master))
3535                                 ns++;
3536                 }
3537                 if (!ns) {
3538                         DRV_LOG(ERR,
3539                                 "unable to recognize master/representors"
3540                                 " on the IB device with multiple ports");
3541                         rte_errno = ENOENT;
3542                         ret = -rte_errno;
3543                         goto exit;
3544                 }
3545         } else {
3546                 /*
3547                  * The existence of several matching entries (nd > 1) means
3548                  * port representors have been instantiated. No existing Verbs
3549                  * call nor sysfs entries can tell them apart, this can only
3550                  * be done through Netlink calls assuming kernel drivers are
3551                  * recent enough to support them.
3552                  *
3553                  * In the event of identification failure through Netlink,
3554                  * try again through sysfs, then:
3555                  *
3556                  * 1. If a single IB device matches (nd == 1) with a single
3557                  *    port (np = 0/1) and is not a representor, assume
3558                  *    no switch support.
3559                  *
3560                  * 2. Otherwise no safe assumptions can be made;
3561                  *    complain louder and bail out.
3562                  */
3563                 np = 1;
3564                 for (i = 0; i != nd; ++i) {
3565                         memset(&list[ns].info, 0, sizeof(list[ns].info));
3566                         list[ns].max_port = 1;
3567                         list[ns].ibv_port = 1;
3568                         list[ns].ibv_dev = ibv_match[i];
3569                         list[ns].eth_dev = NULL;
3570                         list[ns].pci_dev = pci_dev;
3571                         list[ns].pf_bond = -1;
3572                         list[ns].ifindex = 0;
3573                         if (nl_rdma >= 0)
3574                                 list[ns].ifindex = mlx5_nl_ifindex
3575                                         (nl_rdma, list[ns].ibv_dev->name, 1);
3576                         if (!list[ns].ifindex) {
3577                                 char ifname[IF_NAMESIZE];
3578
3579                                 /*
3580                                  * Netlink failed, it may happen with old
3581                                  * ib_core kernel drivers (before 4.16).
3582                                  * We can assume the driver is old because
3583                                  * here we are processing single-port IB
3584                                  * devices. Let's try sysfs to retrieve
3585                                  * the ifindex. The method works for the
3586                                  * master device only.
3587                                  */
3588                                 if (nd > 1) {
3589                                         /*
3590                                          * Multiple devices found, assume
3591                                          * representors; we can not distinguish
3592                                          * master/representor nor retrieve the
3593                                          * ifindex via sysfs.
3594                                          */
3595                                         continue;
3596                                 }
3597                                 ret = mlx5_get_master_ifname
3598                                         (ibv_match[i]->ibdev_path, &ifname);
3599                                 if (!ret)
3600                                         list[ns].ifindex =
3601                                                 if_nametoindex(ifname);
3602                                 if (!list[ns].ifindex) {
3603                                         /*
3604                                          * No network interface index found
3605                                          * for the specified device; it is
3606                                          * neither a representor nor a
3607                                          * master.
3608                                          */
3609                                         continue;
3610                                 }
3611                         }
3612                         ret = -1;
3613                         if (nl_route >= 0)
3614                                 ret = mlx5_nl_switch_info
3615                                                (nl_route,
3616                                                 list[ns].ifindex,
3617                                                 &list[ns].info);
3618                         if (ret || (!list[ns].info.representor &&
3619                                     !list[ns].info.master)) {
3620                                 /*
3621                                  * We failed to recognize representors with
3622                                  * Netlink, let's try to perform the task
3623                                  * with sysfs.
3624                                  */
3625                         ret = mlx5_sysfs_switch_info
3626                                                 (list[ns].ifindex,
3627                                                  &list[ns].info);
3628                         }
3629                         if (!ret && (list[ns].info.representor ^
3630                                      list[ns].info.master)) {
3631                                 ns++;
3632                         } else if ((nd == 1) &&
3633                                    !list[ns].info.representor &&
3634                                    !list[ns].info.master) {
3635                                 /*
3636                                  * Single IB device with
3637                                  * one physical port and
3638                                  * attached network device.
3639                                  * Maybe SR-IOV is not enabled
3640                                  * or there are no representors.
3641                                  */
3642                                 DRV_LOG(INFO, "no E-Switch support detected");
3643                                 ns++;
3644                                 break;
3645                         }
3646                 }
3647                 if (!ns) {
3648                         DRV_LOG(ERR,
3649                                 "unable to recognize master/representors"
3650                                 " on the multiple IB devices");
3651                         rte_errno = ENOENT;
3652                         ret = -rte_errno;
3653                         goto exit;
3654                 }
3655         }
3656         MLX5_ASSERT(ns);
3657         /*
3658          * Sort the list to probe devices in natural order for the user's
3659          * convenience (master first, then representors from lowest to highest ID).
3660          */
3661         qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
3662         /* Default configuration. */
3663         dev_config = (struct mlx5_dev_config){
3664                 .hw_padding = 0,
3665                 .mps = MLX5_ARG_UNSET,
3666                 .dbnc = MLX5_ARG_UNSET,
3667                 .rx_vec_en = 1,
3668                 .txq_inline_max = MLX5_ARG_UNSET,
3669                 .txq_inline_min = MLX5_ARG_UNSET,
3670                 .txq_inline_mpw = MLX5_ARG_UNSET,
3671                 .txqs_inline = MLX5_ARG_UNSET,
3672                 .vf_nl_en = 1,
3673                 .mr_ext_memseg_en = 1,
3674                 .mprq = {
3675                         .enabled = 0, /* Disabled by default. */
3676                         .stride_num_n = 0,
3677                         .stride_size_n = 0,
3678                         .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
3679                         .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
3680                 },
3681                 .dv_esw_en = 1,
3682                 .dv_flow_en = 1,
3683                 .log_hp_size = MLX5_ARG_UNSET,
3684         };
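        /*
         * Any of these defaults may be overridden per device through EAL
         * devargs, e.g. (illustrative PCI address):
         *   -w 0000:03:00.0,mprq_en=1,rxqs_min_mprq=2,txq_inline_max=128
         */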
3685         /* Device specific configuration. */
3686         switch (pci_dev->id.device_id) {
3687         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
3688         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
3689         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
3690         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
3691         case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
3692         case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
3693         case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
3694                 dev_config.vf = 1;
3695                 break;
3696         default:
3697                 break;
3698         }
3699         for (i = 0; i != ns; ++i) {
3700                 uint32_t restore;
3701
3702                 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
3703                                                  &list[i],
3704                                                  dev_config);
3705                 if (!list[i].eth_dev) {
3706                         if (rte_errno != EBUSY && rte_errno != EEXIST)
3707                                 break;
3708                         /* Device is disabled or already spawned. Ignore it. */
3709                         continue;
3710                 }
3711                 restore = list[i].eth_dev->data->dev_flags;
3712                 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
3713                 /* Restore non-PCI flags cleared by the above call. */
3714                 list[i].eth_dev->data->dev_flags |= restore;
3715                 rte_eth_dev_probing_finish(list[i].eth_dev);
3716         }
3717         if (i != ns) {
3718                 DRV_LOG(ERR,
3719                         "probe of PCI device " PCI_PRI_FMT " aborted after"
3720                         " encountering an error: %s",
3721                         pci_dev->addr.domain, pci_dev->addr.bus,
3722                         pci_dev->addr.devid, pci_dev->addr.function,
3723                         strerror(rte_errno));
3724                 ret = -rte_errno;
3725                 /* Roll back. */
3726                 while (i--) {
3727                         if (!list[i].eth_dev)
3728                                 continue;
3729                         mlx5_dev_close(list[i].eth_dev);
3730                         /* mac_addrs must not be freed, it is in dev_private. */
3731                         list[i].eth_dev->data->mac_addrs = NULL;
3732                         claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
3733                 }
3734                 /* Restore original error. */
3735                 rte_errno = -ret;
3736         } else {
3737                 ret = 0;
3738         }
3739 exit:
3740         /*
3741          * Do the routine cleanup:
3742          * - close opened Netlink sockets
3743          * - free allocated spawn data array
3744          * - free the Infiniband device list
3745          */
3746         if (nl_rdma >= 0)
3747                 close(nl_rdma);
3748         if (nl_route >= 0)
3749                 close(nl_route);
3750         if (list)
3751                 rte_free(list);
3752         MLX5_ASSERT(ibv_list);
3753         mlx5_glue->free_device_list(ibv_list);
3754         return ret;
3755 }
3756
3757 /**
3758  * Look for the ethernet device belonging to mlx5 driver.
3759  *
3760  * @param[in] port_id
3761  *   port_id to start looking for the device from.
3762  * @param[in] pci_dev
3763  *   Pointer to the hint PCI device. While a device is being probed,
3764  *   its siblings (the master and preceding representors) might not
3765  *   have a driver assigned yet (because mlx5_pci_probe() has not
3766  *   completed); in this case a match on the hint PCI device may be
3767  *   used to detect a sibling device.
3768  *
3769  * @return
3770  *   port_id of the found device, RTE_MAX_ETHPORTS if not found.
3771  */
3772 uint16_t
3773 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
3774 {
3775         while (port_id < RTE_MAX_ETHPORTS) {
3776                 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
3777
3778                 if (dev->state != RTE_ETH_DEV_UNUSED &&
3779                     dev->device &&
3780                     (dev->device == &pci_dev->device ||
3781                      (dev->device->driver &&
3782                      dev->device->driver->name &&
3783                      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
3784                         break;
3785                 port_id++;
3786         }
3787         if (port_id >= RTE_MAX_ETHPORTS)
3788                 return RTE_MAX_ETHPORTS;
3789         return port_id;
3790 }
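/*
 * Usage sketch (mirrors the iteration pattern of MLX5_ETH_FOREACH_DEV;
 * handle_port() is a hypothetical callback):
 *
 *   uint16_t port_id;
 *
 *   for (port_id = mlx5_eth_find_next(0, pci_dev);
 *        port_id < RTE_MAX_ETHPORTS;
 *        port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
 *           handle_port(port_id);
 */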
3791
3792 /**
3793  * DPDK callback to remove a PCI device.
3794  *
3795  * This function removes all Ethernet devices belonging to a given PCI device.
3796  *
3797  * @param[in] pci_dev
3798  *   Pointer to the PCI device.
3799  *
3800  * @return
3801  *   0 on success, the function cannot fail.
3802  */
3803 static int
3804 mlx5_pci_remove(struct rte_pci_device *pci_dev)
3805 {
3806         uint16_t port_id;
3807
3808         RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) {
3809                 /*
3810                  * mlx5_dev_close() is not registered for the secondary
3811                  * process, so call the close function explicitly for it.
3812                  */
3813                 if (rte_eal_process_type() == RTE_PROC_SECONDARY)
3814                         mlx5_dev_close(&rte_eth_devices[port_id]);
3815                 else
3816                         rte_eth_dev_close(port_id);
3817         }
3818         return 0;
3819 }
3820
3821 static const struct rte_pci_id mlx5_pci_id_map[] = {
3822         {
3823                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3824                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
3825         },
3826         {
3827                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3828                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
3829         },
3830         {
3831                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3832                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
3833         },
3834         {
3835                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3836                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
3837         },
3838         {
3839                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3840                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
3841         },
3842         {
3843                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3844                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
3845         },
3846         {
3847                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3848                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
3849         },
3850         {
3851                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3852                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
3853         },
3854         {
3855                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3856                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
3857         },
3858         {
3859                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3860                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
3861         },
3862         {
3863                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3864                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
3865         },
3866         {
3867                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3868                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
3869         },
3870         {
3871                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3872                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
3873         },
3874         {
3875                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3876                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
3877         },
3878         {
3879                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3880                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
3881         },
3882         {
3883                 .vendor_id = 0
3884         }
3885 };
3886
3887 static struct rte_pci_driver mlx5_driver = {
3888         .driver = {
3889                 .name = MLX5_DRIVER_NAME
3890         },
3891         .id_table = mlx5_pci_id_map,
3892         .probe = mlx5_pci_probe,
3893         .remove = mlx5_pci_remove,
3894         .dma_map = mlx5_dma_map,
3895         .dma_unmap = mlx5_dma_unmap,
3896         .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
3897                      RTE_PCI_DRV_PROBE_AGAIN,
3898 };
3899
3900 /**
3901  * Driver initialization routine.
3902  */
3903 RTE_INIT(rte_mlx5_pmd_init)
3904 {
3905         /* Initialize driver log type. */
3906         mlx5_logtype = rte_log_register("pmd.net.mlx5");
3907         if (mlx5_logtype >= 0)
3908                 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
3909
3910         /* Build the static tables for Verbs conversion. */
3911         mlx5_set_ptype_table();
3912         mlx5_set_cksum_table();
3913         mlx5_set_swp_types_table();
3914         if (mlx5_glue)
3915                 rte_pci_register(&mlx5_driver);
3916 }
3917
3918 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
3919 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
3920 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");