net/mlx5: fix assert in doorbell lookup
[dpdk.git] drivers/net/mlx5/mlx5.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <unistd.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <errno.h>
12 #include <net/if.h>
13 #include <sys/mman.h>
14 #include <linux/rtnetlink.h>
15
16 /* Verbs header. */
17 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic ignored "-Wpedantic"
20 #endif
21 #include <infiniband/verbs.h>
22 #ifdef PEDANTIC
23 #pragma GCC diagnostic error "-Wpedantic"
24 #endif
25
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_ethdev_pci.h>
29 #include <rte_pci.h>
30 #include <rte_bus_pci.h>
31 #include <rte_common.h>
32 #include <rte_kvargs.h>
33 #include <rte_rwlock.h>
34 #include <rte_spinlock.h>
35 #include <rte_string_fns.h>
36 #include <rte_alarm.h>
37
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41 #include <mlx5_common_mp.h>
42
43 #include "mlx5_defs.h"
44 #include "mlx5.h"
45 #include "mlx5_utils.h"
46 #include "mlx5_rxtx.h"
47 #include "mlx5_autoconf.h"
48 #include "mlx5_mr.h"
49 #include "mlx5_flow.h"
50 #include "rte_pmd_mlx5.h"
51
52 /* Device parameter to enable RX completion queue compression. */
53 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
54
55 /* Device parameter to enable RX completion entry padding to 128B. */
56 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
57
58 /* Device parameter to enable padding Rx packet to cacheline size. */
59 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
60
61 /* Device parameter to enable Multi-Packet Rx queue. */
62 #define MLX5_RX_MPRQ_EN "mprq_en"
63
64 /* Device parameter to configure log 2 of the number of strides for MPRQ. */
65 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
66
67 /* Device parameter to configure log 2 of the stride size for MPRQ. */
68 #define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"
69
70 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */
71 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
72
73 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
74 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
75
76 /* Device parameter to configure inline send. Deprecated, ignored. */
77 #define MLX5_TXQ_INLINE "txq_inline"
78
79 /* Device parameter to limit packet size to inline with ordinary SEND. */
80 #define MLX5_TXQ_INLINE_MAX "txq_inline_max"
81
82 /* Device parameter to configure minimal data size to inline. */
83 #define MLX5_TXQ_INLINE_MIN "txq_inline_min"
84
85 /* Device parameter to limit packet size to inline with Enhanced MPW. */
86 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
87
88 /*
89  * Device parameter to configure the number of TX queues threshold for
90  * enabling inline send.
91  */
92 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
93
94 /*
95  * Device parameter to configure the number of TX queues threshold for
96  * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
97  */
98 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
99
100 /* Device parameter to enable multi-packet send WQEs. */
101 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
102
103 /*
104  * Device parameter to force doorbell register mapping
105  * to non-cached region, eliminating the extra write memory barrier.
106  */
107 #define MLX5_TX_DB_NC "tx_db_nc"
108
109 /*
110  * Device parameter to include 2 dsegs in the title WQEBB.
111  * Deprecated, ignored.
112  */
113 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
114
115 /*
116  * Device parameter to limit the size of the inlined packet.
117  * Deprecated, ignored.
118  */
119 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
120
121 /*
122  * Device parameter to enable hardware Tx vector.
123  * Deprecated, ignored (no vectorized Tx routines anymore).
124  */
125 #define MLX5_TX_VEC_EN "tx_vec_en"
126
127 /* Device parameter to enable hardware Rx vector. */
128 #define MLX5_RX_VEC_EN "rx_vec_en"
129
130 /* Allow L3 VXLAN flow creation. */
131 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
132
133 /* Activate DV E-Switch flow steering. */
134 #define MLX5_DV_ESW_EN "dv_esw_en"
135
136 /* Activate DV flow steering. */
137 #define MLX5_DV_FLOW_EN "dv_flow_en"
138
139 /* Enable extensive flow metadata support. */
140 #define MLX5_DV_XMETA_EN "dv_xmeta_en"
141
142 /* Activate Netlink support in VF mode. */
143 #define MLX5_VF_NL_EN "vf_nl_en"
144
145 /* Enable extending memsegs when creating a MR. */
146 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
147
148 /* Select port representors to instantiate. */
149 #define MLX5_REPRESENTOR "representor"
150
151 /* Device parameter to configure the maximum number of dump files per queue. */
152 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
153
154 /* Configure timeout of LRO session (in microseconds). */
155 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
156
157 /*
158  * Device parameter to configure the total data buffer size for a single
159  * hairpin queue (logarithm value).
160  */
161 #define MLX5_HP_BUF_SIZE "hp_buf_log_sz"
162
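/*
 * Illustrative devargs usage (a sketch only; the PCI address and the
 * value choices below are hypothetical):
 *
 *     testpmd -w 0000:03:00.0,mprq_en=1,rxqs_min_mprq=2,tx_db_nc=1 -- -i
 *
 * Each key matches one of the parameter strings defined above and is
 * handled by the kvargs parser in this driver.
 */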
163 #ifndef HAVE_IBV_MLX5_MOD_MPW
164 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
165 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
166 #endif
167
168 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
169 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
170 #endif
171
172 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
173
174 /* Shared memory between primary and secondary processes. */
175 struct mlx5_shared_data *mlx5_shared_data;
176
177 /* Spinlock for mlx5_shared_data allocation. */
178 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
179
180 /* Process local data for secondary processes. */
181 static struct mlx5_local_data mlx5_local_data;
182
183 /** Driver-specific log messages type. */
184 int mlx5_logtype;
185
186 /** Data associated with devices to spawn. */
187 struct mlx5_dev_spawn_data {
188         uint32_t ifindex; /**< Network interface index. */
189         uint32_t max_port; /**< IB device maximal port index. */
190         uint32_t ibv_port; /**< IB device physical port index. */
191         int pf_bond; /**< Bonding device PF index, < 0 means no bonding. */
192         struct mlx5_switch_info info; /**< Switch information. */
193         struct ibv_device *ibv_dev; /**< Associated IB device. */
194         struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
195         struct rte_pci_device *pci_dev; /**< Backend PCI device. */
196 };
197
198 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
199 static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;
200
201 static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
202 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
203         {
204                 .size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
205                 .trunk_size = 64,
206                 .grow_trunk = 3,
207                 .grow_shift = 2,
208                 .need_lock = 0,
209                 .release_mem_en = 1,
210                 .malloc = rte_malloc_socket,
211                 .free = rte_free,
212                 .type = "mlx5_encap_decap_ipool",
213         },
214         {
215                 .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
216                 .trunk_size = 64,
217                 .grow_trunk = 3,
218                 .grow_shift = 2,
219                 .need_lock = 0,
220                 .release_mem_en = 1,
221                 .malloc = rte_malloc_socket,
222                 .free = rte_free,
223                 .type = "mlx5_push_vlan_ipool",
224         },
225         {
226                 .size = sizeof(struct mlx5_flow_dv_tag_resource),
227                 .trunk_size = 64,
228                 .grow_trunk = 3,
229                 .grow_shift = 2,
230                 .need_lock = 0,
231                 .release_mem_en = 1,
232                 .malloc = rte_malloc_socket,
233                 .free = rte_free,
234                 .type = "mlx5_tag_ipool",
235         },
236         {
237                 .size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
238                 .trunk_size = 64,
239                 .grow_trunk = 3,
240                 .grow_shift = 2,
241                 .need_lock = 0,
242                 .release_mem_en = 1,
243                 .malloc = rte_malloc_socket,
244                 .free = rte_free,
245                 .type = "mlx5_port_id_ipool",
246         },
247         {
248                 .size = sizeof(struct mlx5_flow_tbl_data_entry),
249                 .trunk_size = 64,
250                 .grow_trunk = 3,
251                 .grow_shift = 2,
252                 .need_lock = 0,
253                 .release_mem_en = 1,
254                 .malloc = rte_malloc_socket,
255                 .free = rte_free,
256                 .type = "mlx5_jump_ipool",
257         },
258 #endif
259         {
260                 .size = sizeof(struct mlx5_flow_meter),
261                 .trunk_size = 64,
262                 .grow_trunk = 3,
263                 .grow_shift = 2,
264                 .need_lock = 0,
265                 .release_mem_en = 1,
266                 .malloc = rte_malloc_socket,
267                 .free = rte_free,
268                 .type = "mlx5_meter_ipool",
269         },
270         {
271                 .size = sizeof(struct mlx5_flow_mreg_copy_resource),
272                 .trunk_size = 64,
273                 .grow_trunk = 3,
274                 .grow_shift = 2,
275                 .need_lock = 0,
276                 .release_mem_en = 1,
277                 .malloc = rte_malloc_socket,
278                 .free = rte_free,
279                 .type = "mlx5_mcp_ipool",
280         },
281         {
282                 .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
283                 .trunk_size = 64,
284                 .grow_trunk = 3,
285                 .grow_shift = 2,
286                 .need_lock = 0,
287                 .release_mem_en = 1,
288                 .malloc = rte_malloc_socket,
289                 .free = rte_free,
290                 .type = "mlx5_hrxq_ipool",
291         },
292         {
293                 .size = sizeof(struct mlx5_flow_handle),
294                 .trunk_size = 64,
295                 .grow_trunk = 3,
296                 .grow_shift = 2,
297                 .need_lock = 0,
298                 .release_mem_en = 1,
299                 .malloc = rte_malloc_socket,
300                 .free = rte_free,
301                 .type = "mlx5_flow_handle_ipool",
302         },
303         {
304                 .size = sizeof(struct rte_flow),
305                 .trunk_size = 4096,
306                 .need_lock = 1,
307                 .release_mem_en = 1,
308                 .malloc = rte_malloc_socket,
309                 .free = rte_free,
310                 .type = "rte_flow_ipool",
311         },
312 };
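/*
 * A note on the growth parameters above: per the indexed pool growth
 * logic (see mlx5_ipool_create()), trunk_size 64 with grow_shift 2 and
 * grow_trunk 3 is expected to yield trunks of 64, 256 and 1024 entries,
 * with every later trunk staying at 4096 entries.
 */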
313
314
315 #define MLX5_FLOW_MIN_ID_POOL_SIZE 512
316 #define MLX5_ID_GENERATION_ARRAY_FACTOR 16
317
318 #define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
319 #define MLX5_TAGS_HLIST_ARRAY_SIZE 8192
320
321 /**
322  * Allocate ID pool structure.
323  *
324  * @param[in] max_id
325  *   The maximum ID that can be allocated from the pool.
326  *
327  * @return
328  *   Pointer to pool object on success, NULL otherwise.
329  */
330 struct mlx5_flow_id_pool *
331 mlx5_flow_id_pool_alloc(uint32_t max_id)
332 {
333         struct mlx5_flow_id_pool *pool;
334         void *mem;
335
336         pool = rte_zmalloc("id pool allocation", sizeof(*pool),
337                            RTE_CACHE_LINE_SIZE);
338         if (!pool) {
339                 DRV_LOG(ERR, "can't allocate id pool");
340                 rte_errno  = ENOMEM;
341                 return NULL;
342         }
343         mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
344                           RTE_CACHE_LINE_SIZE);
345         if (!mem) {
346                 DRV_LOG(ERR, "can't allocate mem for id pool");
347                 rte_errno  = ENOMEM;
348                 goto error;
349         }
350         pool->free_arr = mem;
351         pool->curr = pool->free_arr;
352         pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
353         pool->base_index = 0;
354         pool->max_id = max_id;
355         return pool;
356 error:
357         rte_free(pool);
358         return NULL;
359 }
360
361 /**
362  * Release ID pool structure.
363  *
364  * @param[in] pool
365  *   Pointer to flow id pool object to free.
366  */
367 void
368 mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
369 {
370         rte_free(pool->free_arr);
371         rte_free(pool);
372 }
373
374 /**
375  * Generate ID.
376  *
377  * @param[in] pool
378  *   Pointer to flow id pool.
379  * @param[out] id
380  *   The generated ID.
381  *
382  * @return
383  *   0 on success, error value otherwise.
384  */
385 uint32_t
386 mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
387 {
388         if (pool->curr == pool->free_arr) {
389                 if (pool->base_index == pool->max_id) {
390                         rte_errno  = ENOMEM;
391                         DRV_LOG(ERR, "no free id");
392                         return -rte_errno;
393                 }
394                 *id = ++pool->base_index;
395                 return 0;
396         }
397         *id = *(--pool->curr);
398         return 0;
399 }
400
401 /**
402  * Release ID.
403  *
404  * @param[in] pool
405  *   Pointer to flow id pool.
406  * @param[in] id
407  *   The ID to release.
408  *
409  * @return
410  *   0 on success, error value otherwise.
411  */
412 uint32_t
413 mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
414 {
415         uint32_t size;
416         uint32_t size2;
417         void *mem;
418
419         if (pool->curr == pool->last) {
420                 size = pool->curr - pool->free_arr;
421                 size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
422                 MLX5_ASSERT(size2 > size);
423                 mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
424                 if (!mem) {
425                         DRV_LOG(ERR, "can't allocate mem for id pool");
426                         rte_errno  = ENOMEM;
427                         return -rte_errno;
428                 }
429                 memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
430                 rte_free(pool->free_arr);
431                 pool->free_arr = mem;
432                 pool->curr = pool->free_arr + size;
433                 pool->last = pool->free_arr + size2;
434         }
435         *pool->curr = id;
436         pool->curr++;
437         return 0;
438 }
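/*
 * Usage sketch for the flow ID pool API above (illustrative only):
 *
 *     struct mlx5_flow_id_pool *pool;
 *     uint32_t id;
 *
 *     pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
 *     if (pool && !mlx5_flow_id_get(pool, &id)) {
 *             ... use id as a unique flow identifier ...
 *             mlx5_flow_id_release(pool, id);
 *     }
 *     if (pool)
 *             mlx5_flow_id_pool_release(pool);
 */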
439
440 /**
441  * Initialize the counters management structure.
442  *
443  * @param[in] sh
444  *   Pointer to mlx5_ibv_shared object to free
445  */
446 static void
447 mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
448 {
449         uint8_t i;
450
451         TAILQ_INIT(&sh->cmng.flow_counters);
452         for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i)
453                 TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
454 }
455
456 /**
457  * Destroy all the resources allocated for a counter memory management.
458  *
459  * @param[in] mng
460  *   Pointer to the memory management structure.
461  */
462 static void
463 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
464 {
465         uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
466
467         LIST_REMOVE(mng, next);
468         claim_zero(mlx5_devx_cmd_destroy(mng->dm));
469         claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
470         rte_free(mem);
471 }
472
473 /**
474  * Close and release all the resources of the counters management.
475  *
476  * @param[in] sh
477  *   Pointer to mlx5_ibv_shared object to free.
478  */
479 static void
480 mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
481 {
482         struct mlx5_counter_stats_mem_mng *mng;
483         uint8_t i;
484         int j;
485         int retries = 1024;
486
487         rte_errno = 0;
488         while (--retries) {
489                 rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
490                 if (rte_errno != EINPROGRESS)
491                         break;
492                 rte_pause();
493         }
494         for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
495                 struct mlx5_flow_counter_pool *pool;
496                 uint32_t batch = !!(i % 2);
497
498                 if (!sh->cmng.ccont[i].pools)
499                         continue;
500                 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
501                 while (pool) {
502                         if (batch) {
503                                 if (pool->min_dcs)
504                                         claim_zero
505                                         (mlx5_devx_cmd_destroy(pool->min_dcs));
506                         }
507                         for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
508                                 if (pool->counters_raw[j].action)
509                                         claim_zero
510                                         (mlx5_glue->destroy_flow_action
511                                                (pool->counters_raw[j].action));
512                                 if (!batch && MLX5_GET_POOL_CNT_EXT
513                                     (pool, j)->dcs)
514                                         claim_zero(mlx5_devx_cmd_destroy
515                                                   (MLX5_GET_POOL_CNT_EXT
516                                                   (pool, j)->dcs));
517                         }
518                         TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool,
519                                      next);
520                         rte_free(pool);
521                         pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
522                 }
523                 rte_free(sh->cmng.ccont[i].pools);
524         }
525         mng = LIST_FIRST(&sh->cmng.mem_mngs);
526         while (mng) {
527                 mlx5_flow_destroy_counter_stat_mem_mng(mng);
528                 mng = LIST_FIRST(&sh->cmng.mem_mngs);
529         }
530         memset(&sh->cmng, 0, sizeof(sh->cmng));
531 }
532
533 /**
534  * Initialize the flow resources' indexed mempool.
535  *
536  * @param[in] sh
537  *   Pointer to mlx5_ibv_shared object.
538  * @param[in] config
539  *   Pointer to the user device configuration.
540  */
541 static void
542 mlx5_flow_ipool_create(struct mlx5_ibv_shared *sh,
543                        const struct mlx5_dev_config *config __rte_unused)
544 {
545         uint8_t i;
546
547 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
548         /*
549          * Even when DV support is compiled in, the user may choose
550          * Verbs mode at runtime; the mlx5 flow handle is then only
551          * MLX5_FLOW_HANDLE_VERBS_SIZE bytes, so shrink the pool entry.
552          */
553         if (!config->dv_flow_en)
554                 mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size =
555                                         MLX5_FLOW_HANDLE_VERBS_SIZE;
556 #endif
557         for (i = 0; i < MLX5_IPOOL_MAX; ++i)
558                 sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]);
559 }
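/*
 * Objects in these pools are referenced by 32-bit indexes rather than
 * pointers. An illustrative round-trip, assuming the mlx5_ipool_*
 * helpers declared in mlx5_utils.h:
 *
 *     uint32_t idx;
 *     struct rte_flow *flow;
 *
 *     flow = mlx5_ipool_zmalloc(sh->ipool[MLX5_IPOOL_RTE_FLOW], &idx);
 *     flow = mlx5_ipool_get(sh->ipool[MLX5_IPOOL_RTE_FLOW], idx);
 *     mlx5_ipool_free(sh->ipool[MLX5_IPOOL_RTE_FLOW], idx);
 */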
560
561 /**
562  * Release the flow resources' indexed mempool.
563  *
564  * @param[in] sh
565  *   Pointer to mlx5_ibv_shared object.
566  */
567 static void
568 mlx5_flow_ipool_destroy(struct mlx5_ibv_shared *sh)
569 {
570         uint8_t i;
571
572         for (i = 0; i < MLX5_IPOOL_MAX; ++i)
573                 mlx5_ipool_destroy(sh->ipool[i]);
574 }
575
576 /**
577  * Extract pdn of PD object using DV API.
578  *
579  * @param[in] pd
580  *   Pointer to the verbs PD object.
581  * @param[out] pdn
582  *   Pointer to the PD object number variable.
583  *
584  * @return
585  *   0 on success, error value otherwise.
586  */
587 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
588 static int
589 mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
590 {
591         struct mlx5dv_obj obj;
592         struct mlx5dv_pd pd_info;
593         int ret = 0;
594
595         obj.pd.in = pd;
596         obj.pd.out = &pd_info;
597         ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
598         if (ret) {
599                 DRV_LOG(DEBUG, "Failed to get PD object info");
600                 return ret;
601         }
602         *pdn = pd_info.pdn;
603         return 0;
604 }
605 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
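/*
 * Illustrative call (under HAVE_IBV_FLOW_DV_SUPPORT only):
 *
 *     uint32_t pdn;
 *     if (mlx5_get_pdn(pd, &pdn) == 0)
 *             ... pdn now holds the PD object number ...
 */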
606
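/**
 * Configure the doorbell register mapping environment variable before
 * device creation, since rdma-core samples MLX5_SHUT_UP_BF at that time.
 *
 * @param[in] config
 *   Pointer to device configuration structure.
 *
 * @return
 *   The previous state of the environment variable to restore later,
 *   MLX5_ARG_UNSET if the variable was not set.
 */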
607 static int
608 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
609 {
610         char *env;
611         int value;
612
613         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
614         /* Save the environment variable state to restore it later. */
615         env = getenv(MLX5_SHUT_UP_BF);
616         value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
617         if (config->dbnc == MLX5_ARG_UNSET)
618                 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
619         else
620                 setenv(MLX5_SHUT_UP_BF,
621                        config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
622         return value;
623 }
624
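/**
 * Restore the MLX5_SHUT_UP_BF environment variable to the state saved
 * by mlx5_config_doorbell_mapping_env().
 *
 * @param[in] value
 *   The saved state, MLX5_ARG_UNSET if the variable was not set.
 */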
625 static void
626 mlx5_restore_doorbell_mapping_env(int value)
627 {
628         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
629         /* Restore the original environment variable state. */
630         if (value == MLX5_ARG_UNSET)
631                 unsetenv(MLX5_SHUT_UP_BF);
632         else
633                 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
634 }
635
636 /**
637  * Allocate shared IB device context. For a multiport device the
638  * master and representors share this context; for a single-port
639  * dedicated IB device the context is used by the given port only.
640  *
641  * The routine first searches the existing contexts for the specified
642  * IB device name; if one is found, the shared context is reused and
643  * its reference counter is incremented. If none is found, a new
644  * context is created and initialized with the specified IB device
645  * and parameters.
646  *
647  * @param[in] spawn
648  *   Pointer to the IB device attributes (name, port, etc).
649  * @param[in] config
650  *   Pointer to device configuration structure.
651  *
652  * @return
653  *   Pointer to mlx5_ibv_shared object on success,
654  *   otherwise NULL and rte_errno is set.
655  */
656 static struct mlx5_ibv_shared *
657 mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
658                         const struct mlx5_dev_config *config)
659 {
660         struct mlx5_ibv_shared *sh;
661         int dbmap_env;
662         int err = 0;
663         uint32_t i;
664 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
665         struct mlx5_devx_tis_attr tis_attr = { 0 };
666 #endif
667
668         MLX5_ASSERT(spawn);
669         /* Secondary process should not create the shared context. */
670         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
671         pthread_mutex_lock(&mlx5_ibv_list_mutex);
672         /* Search for IB context by device name. */
673         LIST_FOREACH(sh, &mlx5_ibv_list, next) {
674                 if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
675                         sh->refcnt++;
676                         goto exit;
677                 }
678         }
679         /* No device found, we have to create new shared context. */
680         MLX5_ASSERT(spawn->max_port);
681         sh = rte_zmalloc("ethdev shared ib context",
682                          sizeof(struct mlx5_ibv_shared) +
683                          spawn->max_port *
684                          sizeof(struct mlx5_ibv_shared_port),
685                          RTE_CACHE_LINE_SIZE);
686         if (!sh) {
687                 DRV_LOG(ERR, "shared context allocation failure");
688                 rte_errno  = ENOMEM;
689                 goto exit;
690         }
691         /*
692          * Configure environment variable "MLX5_SHUT_UP_BF"
693          * before the device creation. The rdma_core library
694          * checks the variable at device creation and
695          * stores the result internally.
696          */
697         dbmap_env = mlx5_config_doorbell_mapping_env(config);
698         /* Try to open IB device with DV first, then usual Verbs. */
699         errno = 0;
700         sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
701         if (sh->ctx) {
702                 sh->devx = 1;
703                 DRV_LOG(DEBUG, "DevX is supported");
704                 /* The device is created, no need for environment. */
705                 mlx5_restore_doorbell_mapping_env(dbmap_env);
706         } else {
707                 /* The environment variable is still configured. */
708                 sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
709                 err = errno ? errno : ENODEV;
710                 /*
711                  * The environment variable is not needed anymore,
712                  * all device creation attempts are completed.
713                  */
714                 mlx5_restore_doorbell_mapping_env(dbmap_env);
715                 if (!sh->ctx)
716                         goto error;
717                 DRV_LOG(DEBUG, "DevX is NOT supported");
718         }
719         err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
720         if (err) {
721                 DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
722                 goto error;
723         }
724         sh->refcnt = 1;
725         sh->max_port = spawn->max_port;
726         strncpy(sh->ibdev_name, sh->ctx->device->name,
727                 sizeof(sh->ibdev_name));
728         strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
729                 sizeof(sh->ibdev_path));
730         pthread_mutex_init(&sh->intr_mutex, NULL);
731         /*
732          * Setting port_id to the maximum disallowed value
733          * (RTE_MAX_ETHPORTS) means there is no interrupt
734          * subhandler installed for the given port index i.
735          */
736         for (i = 0; i < sh->max_port; i++) {
737                 sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
738                 sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
739         }
740         sh->pd = mlx5_glue->alloc_pd(sh->ctx);
741         if (sh->pd == NULL) {
742                 DRV_LOG(ERR, "PD allocation failure");
743                 err = ENOMEM;
744                 goto error;
745         }
746 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
747         if (sh->devx) {
748                 err = mlx5_get_pdn(sh->pd, &sh->pdn);
749                 if (err) {
750                         DRV_LOG(ERR, "Failed to extract pdn from PD");
751                         goto error;
752                 }
753                 sh->td = mlx5_devx_cmd_create_td(sh->ctx);
754                 if (!sh->td) {
755                         DRV_LOG(ERR, "TD allocation failure");
756                         err = ENOMEM;
757                         goto error;
758                 }
759                 tis_attr.transport_domain = sh->td->id;
760                 sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
761                 if (!sh->tis) {
762                         DRV_LOG(ERR, "TIS allocation failure");
763                         err = ENOMEM;
764                         goto error;
765                 }
766         }
767         sh->flow_id_pool = mlx5_flow_id_pool_alloc
768                                         ((1 << HAIRPIN_FLOW_ID_BITS) - 1);
769         if (!sh->flow_id_pool) {
770                 DRV_LOG(ERR, "can't create flow id pool");
771                 err = ENOMEM;
772                 goto error;
773         }
774 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
775         /*
776          * Once the device is added to the list of memory event
777          * callback, its global MR cache table cannot be expanded
778          * on the fly because of deadlock. If it overflows, lookup
779          * should be done by searching MR list linearly, which is slow.
780          *
781          * At this point the device is not added to the memory
782          * event list yet, context is just being created.
783          */
784         err = mlx5_mr_btree_init(&sh->share_cache.cache,
785                                  MLX5_MR_BTREE_CACHE_N * 2,
786                                  spawn->pci_dev->device.numa_node);
787         if (err) {
788                 err = rte_errno;
789                 goto error;
790         }
791         mlx5_flow_counters_mng_init(sh);
792         mlx5_flow_ipool_create(sh, config);
793         /* Add device to memory callback list. */
794         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
795         LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
796                          sh, mem_event_cb);
797         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
798         /* Add context to the global device list. */
799         LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
800 exit:
801         pthread_mutex_unlock(&mlx5_ibv_list_mutex);
802         return sh;
803 error:
804         pthread_mutex_unlock(&mlx5_ibv_list_mutex);
805         MLX5_ASSERT(sh);
806         if (sh->tis)
807                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
808         if (sh->td)
809                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
810         if (sh->pd)
811                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
812         if (sh->ctx)
813                 claim_zero(mlx5_glue->close_device(sh->ctx));
814         if (sh->flow_id_pool)
815                 mlx5_flow_id_pool_release(sh->flow_id_pool);
816         rte_free(sh);
817         MLX5_ASSERT(err > 0);
818         rte_errno = err;
819         return NULL;
820 }
821
822 /**
823  * Free shared IB device context. Decrement counter and if zero free
824  * all allocated resources and close handles.
825  *
826  * @param[in] sh
827  *   Pointer to mlx5_ibv_shared object to free
828  */
829 static void
830 mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
831 {
832         pthread_mutex_lock(&mlx5_ibv_list_mutex);
833 #ifdef RTE_LIBRTE_MLX5_DEBUG
834         /* Check the object presence in the list. */
835         struct mlx5_ibv_shared *lctx;
836
837         LIST_FOREACH(lctx, &mlx5_ibv_list, next)
838                 if (lctx == sh)
839                         break;
840         MLX5_ASSERT(lctx);
841         if (lctx != sh) {
842                 DRV_LOG(ERR, "Freeing non-existing shared IB context");
843                 goto exit;
844         }
845 #endif
846         MLX5_ASSERT(sh);
847         MLX5_ASSERT(sh->refcnt);
848         /* Secondary process should not free the shared context. */
849         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
850         if (--sh->refcnt)
851                 goto exit;
852         /* Remove from memory callback device list. */
853         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
854         LIST_REMOVE(sh, mem_event_cb);
855         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
856         /* Release created Memory Regions. */
857         mlx5_mr_release_cache(&sh->share_cache);
858         /* Remove context from the global device list. */
859         LIST_REMOVE(sh, next);
860         /*
861          *  Ensure there is no async event handler installed.
862          *  Only primary process handles async device events.
863          */
864         mlx5_flow_counters_mng_close(sh);
865         mlx5_flow_ipool_destroy(sh);
866         MLX5_ASSERT(!sh->intr_cnt);
867         if (sh->intr_cnt)
868                 mlx5_intr_callback_unregister
869                         (&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
870 #ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
871         if (sh->devx_intr_cnt) {
872                 if (sh->intr_handle_devx.fd)
873                         rte_intr_callback_unregister(&sh->intr_handle_devx,
874                                           mlx5_dev_interrupt_handler_devx, sh);
875                 if (sh->devx_comp)
876                         mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
877         }
878 #endif
879         pthread_mutex_destroy(&sh->intr_mutex);
880         if (sh->pd)
881                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
882         if (sh->tis)
883                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
884         if (sh->td)
885                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
886         if (sh->ctx)
887                 claim_zero(mlx5_glue->close_device(sh->ctx));
888         if (sh->flow_id_pool)
889                 mlx5_flow_id_pool_release(sh->flow_id_pool);
890         rte_free(sh);
891 exit:
892         pthread_mutex_unlock(&mlx5_ibv_list_mutex);
893 }
894
895 /**
896  * Destroy table hash list and all the root entries per domain.
897  *
898  * @param[in] priv
899  *   Pointer to the private device data structure.
900  */
901 static void
902 mlx5_free_table_hash_list(struct mlx5_priv *priv)
903 {
904         struct mlx5_ibv_shared *sh = priv->sh;
905         struct mlx5_flow_tbl_data_entry *tbl_data;
906         union mlx5_flow_tbl_key table_key = {
907                 {
908                         .table_id = 0,
909                         .reserved = 0,
910                         .domain = 0,
911                         .direction = 0,
912                 }
913         };
914         struct mlx5_hlist_entry *pos;
915
916         if (!sh->flow_tbls)
917                 return;
918         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
919         if (pos) {
920                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
921                                         entry);
922                 MLX5_ASSERT(tbl_data);
923                 mlx5_hlist_remove(sh->flow_tbls, pos);
924                 rte_free(tbl_data);
925         }
926         table_key.direction = 1;
927         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
928         if (pos) {
929                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
930                                         entry);
931                 MLX5_ASSERT(tbl_data);
932                 mlx5_hlist_remove(sh->flow_tbls, pos);
933                 rte_free(tbl_data);
934         }
935         table_key.direction = 0;
936         table_key.domain = 1;
937         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
938         if (pos) {
939                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
940                                         entry);
941                 MLX5_ASSERT(tbl_data);
942                 mlx5_hlist_remove(sh->flow_tbls, pos);
943                 rte_free(tbl_data);
944         }
945         mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
946 }
947
948 /**
949  * Initialize flow table hash list and create the root tables entry
950  * for each domain.
951  *
952  * @param[in] priv
953  *   Pointer to the private device data structure.
954  *
955  * @return
956  *   Zero on success, positive error code otherwise.
957  */
958 static int
959 mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
960 {
961         struct mlx5_ibv_shared *sh = priv->sh;
962         char s[MLX5_HLIST_NAMESIZE];
963         int err = 0;
964
965         MLX5_ASSERT(sh);
966         snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
967         sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
968         if (!sh->flow_tbls) {
969                 DRV_LOG(ERR, "flow tables hash list creation failed.");
970                 err = ENOMEM;
971                 return err;
972         }
973 #ifndef HAVE_MLX5DV_DR
974         /*
975          * Without DR support, the zero (root) tables should still be
976          * created because DV expects to see them even if they cannot
977          * be created by RDMA-CORE.
978          */
979         union mlx5_flow_tbl_key table_key = {
980                 {
981                         .table_id = 0,
982                         .reserved = 0,
983                         .domain = 0,
984                         .direction = 0,
985                 }
986         };
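        /*
         * Three root entries are inserted below: NIC Rx (domain = 0,
         * direction = 0), NIC Tx (direction = 1) and FDB (domain = 1),
         * mirroring the lookups in mlx5_free_table_hash_list().
         */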
987         struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
988                                                           sizeof(*tbl_data), 0);
989
990         if (!tbl_data) {
991                 err = ENOMEM;
992                 goto error;
993         }
994         tbl_data->entry.key = table_key.v64;
995         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
996         if (err)
997                 goto error;
998         rte_atomic32_init(&tbl_data->tbl.refcnt);
999         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1000         table_key.direction = 1;
1001         tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
1002         if (!tbl_data) {
1003                 err = ENOMEM;
1004                 goto error;
1005         }
1006         tbl_data->entry.key = table_key.v64;
1007         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
1008         if (err)
1009                 goto error;
1010         rte_atomic32_init(&tbl_data->tbl.refcnt);
1011         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1012         table_key.direction = 0;
1013         table_key.domain = 1;
1014         tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
1015         if (!tbl_data) {
1016                 err = ENOMEM;
1017                 goto error;
1018         }
1019         tbl_data->entry.key = table_key.v64;
1020         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
1021         if (err)
1022                 goto error;
1023         rte_atomic32_init(&tbl_data->tbl.refcnt);
1024         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1025         return err;
1026 error:
1027         mlx5_free_table_hash_list(priv);
1028 #endif /* HAVE_MLX5DV_DR */
1029         return err;
1030 }
1031
1032 /**
1033  * Initialize DR related data within private structure.
1034  * Routine checks the reference counter and does actual
1035  * resources creation/initialization only if counter is zero.
1036  *
1037  * @param[in] priv
1038  *   Pointer to the private device data structure.
1039  *
1040  * @return
1041  *   Zero on success, positive error code otherwise.
1042  */
1043 static int
1044 mlx5_alloc_shared_dr(struct mlx5_priv *priv)
1045 {
1046         struct mlx5_ibv_shared *sh = priv->sh;
1047         char s[MLX5_HLIST_NAMESIZE];
1048         int err = 0;
1049
1050         if (!sh->flow_tbls)
1051                 err = mlx5_alloc_table_hash_list(priv);
1052         else
1053                 DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
1054                         (void *)sh->flow_tbls);
1055         if (err)
1056                 return err;
1057         /* Create tags hash list table. */
1058         snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
1059         sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
1060         if (!sh->tag_table) {
1061                 DRV_LOG(ERR, "tags hash list creation failed.");
1062                 err = ENOMEM;
1063                 goto error;
1064         }
1065 #ifdef HAVE_MLX5DV_DR
1066         void *domain;
1067
1068         if (sh->dv_refcnt) {
1069                 /* Shared DV/DR structures are already initialized. */
1070                 sh->dv_refcnt++;
1071                 priv->dr_shared = 1;
1072                 return 0;
1073         }
1074         /* Reference counter is zero, we should initialize structures. */
1075         domain = mlx5_glue->dr_create_domain(sh->ctx,
1076                                              MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
1077         if (!domain) {
1078                 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
1079                 err = errno;
1080                 goto error;
1081         }
1082         sh->rx_domain = domain;
1083         domain = mlx5_glue->dr_create_domain(sh->ctx,
1084                                              MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
1085         if (!domain) {
1086                 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
1087                 err = errno;
1088                 goto error;
1089         }
1090         pthread_mutex_init(&sh->dv_mutex, NULL);
1091         sh->tx_domain = domain;
1092 #ifdef HAVE_MLX5DV_DR_ESWITCH
1093         if (priv->config.dv_esw_en) {
1094                 domain  = mlx5_glue->dr_create_domain
1095                         (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
1096                 if (!domain) {
1097                         DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
1098                         err = errno;
1099                         goto error;
1100                 }
1101                 sh->fdb_domain = domain;
1102                 sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
1103         }
1104 #endif
1105         sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
1106 #endif /* HAVE_MLX5DV_DR */
1107         sh->dv_refcnt++;
1108         priv->dr_shared = 1;
1109         return 0;
1110 error:
1111         /* Rollback the created objects. */
1112         if (sh->rx_domain) {
1113                 mlx5_glue->dr_destroy_domain(sh->rx_domain);
1114                 sh->rx_domain = NULL;
1115         }
1116         if (sh->tx_domain) {
1117                 mlx5_glue->dr_destroy_domain(sh->tx_domain);
1118                 sh->tx_domain = NULL;
1119         }
1120         if (sh->fdb_domain) {
1121                 mlx5_glue->dr_destroy_domain(sh->fdb_domain);
1122                 sh->fdb_domain = NULL;
1123         }
1124         if (sh->esw_drop_action) {
1125                 mlx5_glue->destroy_flow_action(sh->esw_drop_action);
1126                 sh->esw_drop_action = NULL;
1127         }
1128         if (sh->pop_vlan_action) {
1129                 mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
1130                 sh->pop_vlan_action = NULL;
1131         }
1132         if (sh->tag_table) {
1133                 /* Tags should have been destroyed with the flows before. */
1134                 mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
1135                 sh->tag_table = NULL;
1136         }
1137         mlx5_free_table_hash_list(priv);
1138         return err;
1139 }
1140
1141 /**
1142  * Destroy DR related data within private structure.
1143  *
1144  * @param[in] priv
1145  *   Pointer to the private device data structure.
1146  */
1147 static void
1148 mlx5_free_shared_dr(struct mlx5_priv *priv)
1149 {
1150         struct mlx5_ibv_shared *sh;
1151
1152         if (!priv->dr_shared)
1153                 return;
1154         priv->dr_shared = 0;
1155         sh = priv->sh;
1156         MLX5_ASSERT(sh);
1157 #ifdef HAVE_MLX5DV_DR
1158         MLX5_ASSERT(sh->dv_refcnt);
1159         if (sh->dv_refcnt && --sh->dv_refcnt)
1160                 return;
1161         if (sh->rx_domain) {
1162                 mlx5_glue->dr_destroy_domain(sh->rx_domain);
1163                 sh->rx_domain = NULL;
1164         }
1165         if (sh->tx_domain) {
1166                 mlx5_glue->dr_destroy_domain(sh->tx_domain);
1167                 sh->tx_domain = NULL;
1168         }
1169 #ifdef HAVE_MLX5DV_DR_ESWITCH
1170         if (sh->fdb_domain) {
1171                 mlx5_glue->dr_destroy_domain(sh->fdb_domain);
1172                 sh->fdb_domain = NULL;
1173         }
1174         if (sh->esw_drop_action) {
1175                 mlx5_glue->destroy_flow_action(sh->esw_drop_action);
1176                 sh->esw_drop_action = NULL;
1177         }
1178 #endif
1179         if (sh->pop_vlan_action) {
1180                 mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
1181                 sh->pop_vlan_action = NULL;
1182         }
1183         pthread_mutex_destroy(&sh->dv_mutex);
1184 #endif /* HAVE_MLX5DV_DR */
1185         if (sh->tag_table) {
1186         /* Tags should have been destroyed with the flows before. */
1187                 mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
1188                 sh->tag_table = NULL;
1189         }
1190         mlx5_free_table_hash_list(priv);
1191 }
1192
1193 /**
1194  * Initialize shared data between primary and secondary process.
1195  *
1196  * A memzone is reserved by primary process and secondary processes attach to
1197  * the memzone.
1198  *
1199  * @return
1200  *   0 on success, a negative errno value otherwise and rte_errno is set.
1201  */
1202 static int
1203 mlx5_init_shared_data(void)
1204 {
1205         const struct rte_memzone *mz;
1206         int ret = 0;
1207
1208         rte_spinlock_lock(&mlx5_shared_data_lock);
1209         if (mlx5_shared_data == NULL) {
1210                 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1211                         /* Allocate shared memory. */
1212                         mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
1213                                                  sizeof(*mlx5_shared_data),
1214                                                  SOCKET_ID_ANY, 0);
1215                         if (mz == NULL) {
1216                                 DRV_LOG(ERR,
1217                                         "Cannot allocate mlx5 shared data");
1218                                 ret = -rte_errno;
1219                                 goto error;
1220                         }
1221                         mlx5_shared_data = mz->addr;
1222                         memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
1223                         rte_spinlock_init(&mlx5_shared_data->lock);
1224                 } else {
1225                         /* Lookup allocated shared memory. */
1226                         mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
1227                         if (mz == NULL) {
1228                                 DRV_LOG(ERR,
1229                                         "Cannot attach mlx5 shared data");
1230                                 ret = -rte_errno;
1231                                 goto error;
1232                         }
1233                         mlx5_shared_data = mz->addr;
1234                         memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
1235                 }
1236         }
1237 error:
1238         rte_spinlock_unlock(&mlx5_shared_data_lock);
1239         return ret;
1240 }
1241
1242 /**
1243  * Retrieve integer value from environment variable.
1244  *
1245  * @param[in] name
1246  *   Environment variable name.
1247  *
1248  * @return
1249  *   Integer value, 0 if the variable is not set.
1250  */
1251 int
1252 mlx5_getenv_int(const char *name)
1253 {
1254         const char *val = getenv(name);
1255
1256         if (val == NULL)
1257                 return 0;
1258         return atoi(val);
1259 }
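/*
 * Example (hypothetical variable name):
 *
 *     if (mlx5_getenv_int("MLX5_EXAMPLE_LEVEL") > 0)
 *             ... enable additional diagnostics ...
 *
 * atoi() semantics apply, so a non-numeric value also yields 0.
 */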
1260
1261 /**
1262  * Verbs callback to allocate memory. The function should allocate the
1263  * requested size from space residing inside a huge page.
1264  * Note that all allocations must respect the alignment required by
1265  * libmlx5 (currently sysconf(_SC_PAGESIZE)).
1266  *
1267  * @param[in] size
1268  *   The size in bytes of the memory to allocate.
1269  * @param[in] data
1270  *   A pointer to the callback data.
1271  *
1272  * @return
1273  *   Allocated buffer, NULL otherwise and rte_errno is set.
1274  */
1275 static void *
1276 mlx5_alloc_verbs_buf(size_t size, void *data)
1277 {
1278         struct mlx5_priv *priv = data;
1279         void *ret;
1280         size_t alignment = sysconf(_SC_PAGESIZE);
1281         unsigned int socket = SOCKET_ID_ANY;
1282
1283         if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
1284                 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
1285
1286                 socket = ctrl->socket;
1287         } else if (priv->verbs_alloc_ctx.type ==
1288                    MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
1289                 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
1290
1291                 socket = ctrl->socket;
1292         }
1293         MLX5_ASSERT(data != NULL);
1294         ret = rte_malloc_socket(__func__, size, alignment, socket);
1295         if (!ret && size)
1296                 rte_errno = ENOMEM;
1297         return ret;
1298 }
1299
1300 /**
1301  * Verbs callback to free memory.
1302  *
1303  * @param[in] ptr
1304  *   A pointer to the memory to free.
1305  * @param[in] data
1306  *   A pointer to the callback data.
1307  */
1308 static void
1309 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
1310 {
1311         MLX5_ASSERT(data != NULL);
1312         rte_free(ptr);
1313 }
1314
1315 /**
1316  * DPDK callback to add a UDP tunnel port.
1317  *
1318  * @param[in] dev
1319  *   Pointer to the Ethernet device structure.
1320  * @param[in] udp_tunnel
1321  *   Pointer to the UDP tunnel description.
1322  *
1323  * @return
1324  *   0 for valid UDP ports and tunnel types, -ENOTSUP otherwise.
1325  */
1326 int
1327 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
1328                          struct rte_eth_udp_tunnel *udp_tunnel)
1329 {
1330         MLX5_ASSERT(udp_tunnel != NULL);
1331         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
1332             udp_tunnel->udp_port == 4789)
1333                 return 0;
1334         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
1335             udp_tunnel->udp_port == 4790)
1336                 return 0;
1337         return -ENOTSUP;
1338 }
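/*
 * 4789 and 4790 are the IANA-assigned default ports for VXLAN and
 * VXLAN-GPE respectively; the device matches these tunnels implicitly,
 * hence only the default ports are accepted here.
 */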
1339
1340 /**
1341  * Initialize process private data structure.
1342  *
1343  * @param dev
1344  *   Pointer to Ethernet device structure.
1345  *
1346  * @return
1347  *   0 on success, a negative errno value otherwise and rte_errno is set.
1348  */
1349 int
1350 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1351 {
1352         struct mlx5_priv *priv = dev->data->dev_private;
1353         struct mlx5_proc_priv *ppriv;
1354         size_t ppriv_size;
1355
1356         /*
1357          * UAR register table follows the process private structure. BlueFlame
1358          * registers for Tx queues are stored in the table.
1359          */
1360         ppriv_size =
1361                 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1362         ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1363                                   RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1364         if (!ppriv) {
1365                 rte_errno = ENOMEM;
1366                 return -rte_errno;
1367         }
1368         ppriv->uar_table_sz = ppriv_size;
1369         dev->process_private = ppriv;
1370         return 0;
1371 }
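/*
 * Sizing example for the structure above: with 16 configured Tx queues
 * on a 64-bit system the UAR table adds 16 * sizeof(void *) = 128 bytes
 * to sizeof(struct mlx5_proc_priv).
 */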
1372
1373 /**
1374  * Un-initialize process private data structure.
1375  *
1376  * @param dev
1377  *   Pointer to Ethernet device structure.
1378  */
1379 static void
1380 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1381 {
1382         if (!dev->process_private)
1383                 return;
1384         rte_free(dev->process_private);
1385         dev->process_private = NULL;
1386 }
1387
1388 /**
1389  * DPDK callback to close the device.
1390  *
1391  * Destroy all queues and objects, free memory.
1392  *
1393  * @param dev
1394  *   Pointer to Ethernet device structure.
1395  */
1396 static void
1397 mlx5_dev_close(struct rte_eth_dev *dev)
1398 {
1399         struct mlx5_priv *priv = dev->data->dev_private;
1400         unsigned int i;
1401         int ret;
1402
1403         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1404                 dev->data->port_id,
1405                 ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
1406         /* In case mlx5_dev_stop() has not been called. */
1407         mlx5_dev_interrupt_handler_uninstall(dev);
1408         mlx5_dev_interrupt_handler_devx_uninstall(dev);
1409         /*
1410          * If default mreg copy action is removed at the stop stage,
1411          * the search finds nothing and no further action is taken.
1412          */
1413         mlx5_flow_stop_default(dev);
1414         mlx5_traffic_disable(dev);
1415         /*
1416          * If all the flows are already flushed in the device stop stage,
1417          * then this will return directly without any action.
1418          */
1419         mlx5_flow_list_flush(dev, &priv->flows, true);
1420         mlx5_flow_meter_flush(dev, NULL);
1421         /* Free the intermediate buffers for flow creation. */
1422         mlx5_flow_free_intermediate(dev);
1423         /* Prevent crashes when queues are still in use. */
1424         dev->rx_pkt_burst = removed_rx_burst;
1425         dev->tx_pkt_burst = removed_tx_burst;
1426         rte_wmb();
1427         /* Disable datapath on secondary process. */
1428         mlx5_mp_req_stop_rxtx(dev);
1429         if (priv->rxqs != NULL) {
1430                 /* XXX race condition if mlx5_rx_burst() is still running. */
1431                 usleep(1000);
1432                 for (i = 0; (i != priv->rxqs_n); ++i)
1433                         mlx5_rxq_release(dev, i);
1434                 priv->rxqs_n = 0;
1435                 priv->rxqs = NULL;
1436         }
1437         if (priv->txqs != NULL) {
1438                 /* XXX race condition if mlx5_tx_burst() is still running. */
1439                 usleep(1000);
1440                 for (i = 0; (i != priv->txqs_n); ++i)
1441                         mlx5_txq_release(dev, i);
1442                 priv->txqs_n = 0;
1443                 priv->txqs = NULL;
1444         }
1445         mlx5_proc_priv_uninit(dev);
1446         if (priv->mreg_cp_tbl)
1447                 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1448         mlx5_mprq_free_mp(dev);
1449         mlx5_free_shared_dr(priv);
1450         if (priv->rss_conf.rss_key != NULL)
1451                 rte_free(priv->rss_conf.rss_key);
1452         if (priv->reta_idx != NULL)
1453                 rte_free(priv->reta_idx);
1454         if (priv->config.vf)
1455                 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1456                                        dev->data->mac_addrs,
1457                                        MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1458         if (priv->nl_socket_route >= 0)
1459                 close(priv->nl_socket_route);
1460         if (priv->nl_socket_rdma >= 0)
1461                 close(priv->nl_socket_rdma);
1462         if (priv->vmwa_context)
1463                 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1464         ret = mlx5_hrxq_verify(dev);
1465         if (ret)
1466                 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1467                         dev->data->port_id);
1468         ret = mlx5_ind_table_obj_verify(dev);
1469         if (ret)
1470                 DRV_LOG(WARNING, "port %u some indirection table still remain",
1471                         dev->data->port_id);
1472         ret = mlx5_rxq_obj_verify(dev);
1473         if (ret)
1474                 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1475                         dev->data->port_id);
1476         ret = mlx5_rxq_verify(dev);
1477         if (ret)
1478                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1479                         dev->data->port_id);
1480         ret = mlx5_txq_obj_verify(dev);
1481         if (ret)
1482                 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1483                         dev->data->port_id);
1484         ret = mlx5_txq_verify(dev);
1485         if (ret)
1486                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1487                         dev->data->port_id);
1488         ret = mlx5_flow_verify(dev);
1489         if (ret)
1490                 DRV_LOG(WARNING, "port %u some flows still remain",
1491                         dev->data->port_id);
1492         if (priv->sh) {
1493                 /*
1494                  * Free the shared context in the last turn, because the
1495                  * cleanup routines above may use some shared fields, like
1496                  * mlx5_nl_mac_addr_flush() using ibdev_path for retrieving
1497                  * the ifindex if Netlink fails.
1498                  */
1499                 mlx5_free_shared_ibctx(priv->sh);
1500                 priv->sh = NULL;
1501         }
1502         if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1503                 unsigned int c = 0;
1504                 uint16_t port_id;
1505
1506                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1507                         struct mlx5_priv *opriv =
1508                                 rte_eth_devices[port_id].data->dev_private;
1509
1510                         if (!opriv ||
1511                             opriv->domain_id != priv->domain_id ||
1512                             &rte_eth_devices[port_id] == dev)
1513                                 continue;
1514                         ++c;
1515                         break;
1516                 }
1517                 if (!c)
1518                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1519         }
1520         memset(priv, 0, sizeof(*priv));
1521         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1522         /*
1523          * Reset mac_addrs to NULL such that it is not freed as part of
1524          * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1525          * it is freed when dev_private is freed.
1526          */
1527         dev->data->mac_addrs = NULL;
1528 }
1529
1530 const struct eth_dev_ops mlx5_dev_ops = {
1531         .dev_configure = mlx5_dev_configure,
1532         .dev_start = mlx5_dev_start,
1533         .dev_stop = mlx5_dev_stop,
1534         .dev_set_link_down = mlx5_set_link_down,
1535         .dev_set_link_up = mlx5_set_link_up,
1536         .dev_close = mlx5_dev_close,
1537         .promiscuous_enable = mlx5_promiscuous_enable,
1538         .promiscuous_disable = mlx5_promiscuous_disable,
1539         .allmulticast_enable = mlx5_allmulticast_enable,
1540         .allmulticast_disable = mlx5_allmulticast_disable,
1541         .link_update = mlx5_link_update,
1542         .stats_get = mlx5_stats_get,
1543         .stats_reset = mlx5_stats_reset,
1544         .xstats_get = mlx5_xstats_get,
1545         .xstats_reset = mlx5_xstats_reset,
1546         .xstats_get_names = mlx5_xstats_get_names,
1547         .fw_version_get = mlx5_fw_version_get,
1548         .dev_infos_get = mlx5_dev_infos_get,
1549         .read_clock = mlx5_read_clock,
1550         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1551         .vlan_filter_set = mlx5_vlan_filter_set,
1552         .rx_queue_setup = mlx5_rx_queue_setup,
1553         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1554         .tx_queue_setup = mlx5_tx_queue_setup,
1555         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1556         .rx_queue_release = mlx5_rx_queue_release,
1557         .tx_queue_release = mlx5_tx_queue_release,
1558         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1559         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1560         .mac_addr_remove = mlx5_mac_addr_remove,
1561         .mac_addr_add = mlx5_mac_addr_add,
1562         .mac_addr_set = mlx5_mac_addr_set,
1563         .set_mc_addr_list = mlx5_set_mc_addr_list,
1564         .mtu_set = mlx5_dev_set_mtu,
1565         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1566         .vlan_offload_set = mlx5_vlan_offload_set,
1567         .reta_update = mlx5_dev_rss_reta_update,
1568         .reta_query = mlx5_dev_rss_reta_query,
1569         .rss_hash_update = mlx5_rss_hash_update,
1570         .rss_hash_conf_get = mlx5_rss_hash_conf_get,
1571         .filter_ctrl = mlx5_dev_filter_ctrl,
1572         .rx_descriptor_status = mlx5_rx_descriptor_status,
1573         .tx_descriptor_status = mlx5_tx_descriptor_status,
1574         .rxq_info_get = mlx5_rxq_info_get,
1575         .txq_info_get = mlx5_txq_info_get,
1576         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1577         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1578         .rx_queue_count = mlx5_rx_queue_count,
1579         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1580         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1581         .is_removed = mlx5_is_removed,
1582         .udp_tunnel_port_add  = mlx5_udp_tunnel_port_add,
1583         .get_module_info = mlx5_get_module_info,
1584         .get_module_eeprom = mlx5_get_module_eeprom,
1585         .hairpin_cap_get = mlx5_hairpin_cap_get,
1586         .mtr_ops_get = mlx5_flow_meter_ops_get,
1587 };
1588
1589 /* Available operations from secondary process. */
1590 static const struct eth_dev_ops mlx5_dev_sec_ops = {
1591         .stats_get = mlx5_stats_get,
1592         .stats_reset = mlx5_stats_reset,
1593         .xstats_get = mlx5_xstats_get,
1594         .xstats_reset = mlx5_xstats_reset,
1595         .xstats_get_names = mlx5_xstats_get_names,
1596         .fw_version_get = mlx5_fw_version_get,
1597         .dev_infos_get = mlx5_dev_infos_get,
1598         .rx_descriptor_status = mlx5_rx_descriptor_status,
1599         .tx_descriptor_status = mlx5_tx_descriptor_status,
1600         .rxq_info_get = mlx5_rxq_info_get,
1601         .txq_info_get = mlx5_txq_info_get,
1602         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1603         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1604         .get_module_info = mlx5_get_module_info,
1605         .get_module_eeprom = mlx5_get_module_eeprom,
1606 };
1607
1608 /* Available operations in flow isolated mode. */
1609 const struct eth_dev_ops mlx5_dev_ops_isolate = {
1610         .dev_configure = mlx5_dev_configure,
1611         .dev_start = mlx5_dev_start,
1612         .dev_stop = mlx5_dev_stop,
1613         .dev_set_link_down = mlx5_set_link_down,
1614         .dev_set_link_up = mlx5_set_link_up,
1615         .dev_close = mlx5_dev_close,
1616         .promiscuous_enable = mlx5_promiscuous_enable,
1617         .promiscuous_disable = mlx5_promiscuous_disable,
1618         .allmulticast_enable = mlx5_allmulticast_enable,
1619         .allmulticast_disable = mlx5_allmulticast_disable,
1620         .link_update = mlx5_link_update,
1621         .stats_get = mlx5_stats_get,
1622         .stats_reset = mlx5_stats_reset,
1623         .xstats_get = mlx5_xstats_get,
1624         .xstats_reset = mlx5_xstats_reset,
1625         .xstats_get_names = mlx5_xstats_get_names,
1626         .fw_version_get = mlx5_fw_version_get,
1627         .dev_infos_get = mlx5_dev_infos_get,
1628         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1629         .vlan_filter_set = mlx5_vlan_filter_set,
1630         .rx_queue_setup = mlx5_rx_queue_setup,
1631         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1632         .tx_queue_setup = mlx5_tx_queue_setup,
1633         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1634         .rx_queue_release = mlx5_rx_queue_release,
1635         .tx_queue_release = mlx5_tx_queue_release,
1636         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1637         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1638         .mac_addr_remove = mlx5_mac_addr_remove,
1639         .mac_addr_add = mlx5_mac_addr_add,
1640         .mac_addr_set = mlx5_mac_addr_set,
1641         .set_mc_addr_list = mlx5_set_mc_addr_list,
1642         .mtu_set = mlx5_dev_set_mtu,
1643         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1644         .vlan_offload_set = mlx5_vlan_offload_set,
1645         .filter_ctrl = mlx5_dev_filter_ctrl,
1646         .rx_descriptor_status = mlx5_rx_descriptor_status,
1647         .tx_descriptor_status = mlx5_tx_descriptor_status,
1648         .rxq_info_get = mlx5_rxq_info_get,
1649         .txq_info_get = mlx5_txq_info_get,
1650         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1651         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1652         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1653         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1654         .is_removed = mlx5_is_removed,
1655         .get_module_info = mlx5_get_module_info,
1656         .get_module_eeprom = mlx5_get_module_eeprom,
1657         .hairpin_cap_get = mlx5_hairpin_cap_get,
1658         .mtr_ops_get = mlx5_flow_meter_ops_get,
1659 };
1660
1661 /**
1662  * Verify and store value for device argument.
1663  *
1664  * @param[in] key
1665  *   Key argument to verify.
1666  * @param[in] val
1667  *   Value associated with key.
1668  * @param opaque
1669  *   User data.
1670  *
1671  * @return
1672  *   0 on success, a negative errno value otherwise and rte_errno is set.
1673  */
1674 static int
1675 mlx5_args_check(const char *key, const char *val, void *opaque)
1676 {
1677         struct mlx5_dev_config *config = opaque;
1678         unsigned long tmp;
1679
1680         /* No-op, port representors are processed in mlx5_dev_spawn(). */
1681         if (!strcmp(MLX5_REPRESENTOR, key))
1682                 return 0;
1683         errno = 0;
1684         tmp = strtoul(val, NULL, 0);
1685         if (errno) {
1686                 rte_errno = errno;
1687                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1688                 return -rte_errno;
1689         }
1690         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1691                 config->cqe_comp = !!tmp;
1692         } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1693                 config->cqe_pad = !!tmp;
1694         } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1695                 config->hw_padding = !!tmp;
1696         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1697                 config->mprq.enabled = !!tmp;
1698         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1699                 config->mprq.stride_num_n = tmp;
1700         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1701                 config->mprq.stride_size_n = tmp;
1702         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1703                 config->mprq.max_memcpy_len = tmp;
1704         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1705                 config->mprq.min_rxqs_num = tmp;
1706         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1707                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1708                                  " converted to txq_inline_max", key);
1709                 config->txq_inline_max = tmp;
1710         } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1711                 config->txq_inline_max = tmp;
1712         } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1713                 config->txq_inline_min = tmp;
1714         } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1715                 config->txq_inline_mpw = tmp;
1716         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1717                 config->txqs_inline = tmp;
1718         } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1719                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1720         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1721                 config->mps = !!tmp;
1722         } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1723                 if (tmp != MLX5_TXDB_CACHED &&
1724                     tmp != MLX5_TXDB_NCACHED &&
1725                     tmp != MLX5_TXDB_HEURISTIC) {
1726                         DRV_LOG(ERR, "invalid Tx doorbell "
1727                                      "mapping parameter");
1728                         rte_errno = EINVAL;
1729                         return -rte_errno;
1730                 }
1731                 config->dbnc = tmp;
1732         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1733                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1734         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1735                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1736                                  " converted to txq_inline_mpw", key);
1737                 config->txq_inline_mpw = tmp;
1738         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1739                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1740         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1741                 config->rx_vec_en = !!tmp;
1742         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1743                 config->l3_vxlan_en = !!tmp;
1744         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1745                 config->vf_nl_en = !!tmp;
1746         } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1747                 config->dv_esw_en = !!tmp;
1748         } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1749                 config->dv_flow_en = !!tmp;
1750         } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1751                 if (tmp != MLX5_XMETA_MODE_LEGACY &&
1752                     tmp != MLX5_XMETA_MODE_META16 &&
1753                     tmp != MLX5_XMETA_MODE_META32) {
1754                         DRV_LOG(ERR, "invalid extensive "
1755                                      "metadata parameter");
1756                         rte_errno = EINVAL;
1757                         return -rte_errno;
1758                 }
1759                 config->dv_xmeta_en = tmp;
1760         } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1761                 config->mr_ext_memseg_en = !!tmp;
1762         } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1763                 config->max_dump_files_num = tmp;
1764         } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1765                 config->lro.timeout = tmp;
1766         } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1767                 DRV_LOG(DEBUG, "class argument is %s.", val);
1768         } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1769                 config->log_hp_size = tmp;
1770         } else {
1771                 DRV_LOG(WARNING, "%s: unknown parameter", key);
1772                 rte_errno = EINVAL;
1773                 return -rte_errno;
1774         }
1775         return 0;
1776 }
1777
1778 /**
1779  * Parse device parameters.
1780  *
1781  * @param config
1782  *   Pointer to device configuration structure.
1783  * @param devargs
1784  *   Device arguments structure.
1785  *
1786  * @return
1787  *   0 on success, a negative errno value otherwise and rte_errno is set.
1788  */
1789 static int
1790 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1791 {
1792         const char **params = (const char *[]){
1793                 MLX5_RXQ_CQE_COMP_EN,
1794                 MLX5_RXQ_CQE_PAD_EN,
1795                 MLX5_RXQ_PKT_PAD_EN,
1796                 MLX5_RX_MPRQ_EN,
1797                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1798                 MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1799                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1800                 MLX5_RXQS_MIN_MPRQ,
1801                 MLX5_TXQ_INLINE,
1802                 MLX5_TXQ_INLINE_MIN,
1803                 MLX5_TXQ_INLINE_MAX,
1804                 MLX5_TXQ_INLINE_MPW,
1805                 MLX5_TXQS_MIN_INLINE,
1806                 MLX5_TXQS_MAX_VEC,
1807                 MLX5_TXQ_MPW_EN,
1808                 MLX5_TXQ_MPW_HDR_DSEG_EN,
1809                 MLX5_TXQ_MAX_INLINE_LEN,
1810                 MLX5_TX_DB_NC,
1811                 MLX5_TX_VEC_EN,
1812                 MLX5_RX_VEC_EN,
1813                 MLX5_L3_VXLAN_EN,
1814                 MLX5_VF_NL_EN,
1815                 MLX5_DV_ESW_EN,
1816                 MLX5_DV_FLOW_EN,
1817                 MLX5_DV_XMETA_EN,
1818                 MLX5_MR_EXT_MEMSEG_EN,
1819                 MLX5_REPRESENTOR,
1820                 MLX5_MAX_DUMP_FILES_NUM,
1821                 MLX5_LRO_TIMEOUT_USEC,
1822                 MLX5_CLASS_ARG_NAME,
1823                 MLX5_HP_BUF_SIZE,
1824                 NULL,
1825         };
1826         struct rte_kvargs *kvlist;
1827         int ret = 0;
1828         int i;
1829
1830         if (devargs == NULL)
1831                 return 0;
1832         /* The UGLY cast in the params declaration above is to pass checkpatch. */
1833         kvlist = rte_kvargs_parse(devargs->args, params);
1834         if (kvlist == NULL) {
1835                 rte_errno = EINVAL;
1836                 return -rte_errno;
1837         }
1838         /* Process parameters. */
1839         for (i = 0; (params[i] != NULL); ++i) {
1840                 if (rte_kvargs_count(kvlist, params[i])) {
1841                         ret = rte_kvargs_process(kvlist, params[i],
1842                                                  mlx5_args_check, config);
1843                         if (ret) {
1844                                 rte_errno = EINVAL;
1845                                 rte_kvargs_free(kvlist);
1846                                 return -rte_errno;
1847                         }
1848                 }
1849         }
1850         rte_kvargs_free(kvlist);
1851         return 0;
1852 }
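
/*
 * Usage sketch (illustrative only, not part of the driver): how a devargs
 * string is split into key/value pairs with rte_kvargs and fed through a
 * checker callback, mirroring what mlx5_args() does above. The callback,
 * key list and devargs string here are hypothetical.
 */
#if 0
#include <stdio.h>
#include <errno.h>
#include <rte_kvargs.h>

static int
example_check(const char *key, const char *val, void *opaque)
{
	(void)opaque;
	printf("devarg %s=%s\n", key, val);
	return 0;
}

static int
example_parse(void)
{
	const char *const keys[] = { "rxq_cqe_comp_en", "txq_inline_max",
				     NULL };
	struct rte_kvargs *kvlist =
		rte_kvargs_parse("rxq_cqe_comp_en=1,txq_inline_max=128", keys);

	if (kvlist == NULL)
		return -EINVAL;
	/* Invoke the callback once per occurrence of each known key. */
	rte_kvargs_process(kvlist, "rxq_cqe_comp_en", example_check, NULL);
	rte_kvargs_process(kvlist, "txq_inline_max", example_check, NULL);
	rte_kvargs_free(kvlist);
	return 0;
}
#endif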
1853
1854 static struct rte_pci_driver mlx5_driver;
1855
1856 /**
1857  * PMD global initialization.
1858  *
1859  * Independent of any individual device, this function initializes global
1860  * per-PMD data structures, distinguishing primary and secondary processes.
1861  * Hence, the initialization is performed once per process.
1862  *
1863  * @return
1864  *   0 on success, a negative errno value otherwise and rte_errno is set.
1865  */
1866 static int
1867 mlx5_init_once(void)
1868 {
1869         struct mlx5_shared_data *sd;
1870         struct mlx5_local_data *ld = &mlx5_local_data;
1871         int ret = 0;
1872
1873         if (mlx5_init_shared_data())
1874                 return -rte_errno;
1875         sd = mlx5_shared_data;
1876         MLX5_ASSERT(sd);
1877         rte_spinlock_lock(&sd->lock);
1878         switch (rte_eal_process_type()) {
1879         case RTE_PROC_PRIMARY:
1880                 if (sd->init_done)
1881                         break;
1882                 LIST_INIT(&sd->mem_event_cb_list);
1883                 rte_rwlock_init(&sd->mem_event_rwlock);
1884                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
1885                                                 mlx5_mr_mem_event_cb, NULL);
1886                 ret = mlx5_mp_init_primary(MLX5_MP_NAME,
1887                                            mlx5_mp_primary_handle);
1888                 if (ret)
1889                         goto out;
1890                 sd->init_done = true;
1891                 break;
1892         case RTE_PROC_SECONDARY:
1893                 if (ld->init_done)
1894                         break;
1895                 ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
1896                                              mlx5_mp_secondary_handle);
1897                 if (ret)
1898                         goto out;
1899                 ++sd->secondary_cnt;
1900                 ld->init_done = true;
1901                 break;
1902         default:
1903                 break;
1904         }
1905 out:
1906         rte_spinlock_unlock(&sd->lock);
1907         return ret;
1908 }
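
/*
 * Minimal sketch (illustrative, not driver code) of the once-per-process
 * initialization pattern used by mlx5_init_once() above: a lock plus an
 * "init_done" flag make repeated probe calls within the same process
 * no-ops. All names below are hypothetical.
 */
#if 0
#include <stdbool.h>
#include <rte_spinlock.h>

static bool example_init_done;
static rte_spinlock_t example_lock = RTE_SPINLOCK_INITIALIZER;

static void
example_init_once(void)
{
	rte_spinlock_lock(&example_lock);
	if (!example_init_done) {
		/* One-time, process-local setup goes here. */
		example_init_done = true;
	}
	rte_spinlock_unlock(&example_lock);
}
#endif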
1909
1910 /**
1911  * Configures the minimal amount of data to inline into WQE
1912  * while sending packets.
1913  *
1914  * - txq_inline_min has the highest priority, if this
1915  *   key is specified in devargs,
1916  * - if DevX is enabled, the inline mode is queried from the
1917  *   device (HCA attributes and NIC vport context if needed),
1918  * - otherwise, L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
1919  *   and none (0 bytes) for other NICs.
1920  *
1921  * @param spawn
1922  *   Verbs device parameters (name, port, switch_info) to spawn.
1923  * @param config
1924  *   Device configuration parameters.
1925  */
1926 static void
1927 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
1928                     struct mlx5_dev_config *config)
1929 {
1930         if (config->txq_inline_min != MLX5_ARG_UNSET) {
1931                 /* Application defines size of inlined data explicitly. */
1932                 switch (spawn->pci_dev->id.device_id) {
1933                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1934                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1935                         if (config->txq_inline_min <
1936                                        (int)MLX5_INLINE_HSIZE_L2) {
1937                                 DRV_LOG(DEBUG,
1938                                         "txq_inline_min aligned to minimal"
1939                                         " ConnectX-4 required value %d",
1940                                         (int)MLX5_INLINE_HSIZE_L2);
1941                                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1942                         }
1943                         break;
1944                 }
1945                 goto exit;
1946         }
1947         if (config->hca_attr.eth_net_offloads) {
1948                 /* We have DevX enabled, inline mode queried successfully. */
1949                 switch (config->hca_attr.wqe_inline_mode) {
1950                 case MLX5_CAP_INLINE_MODE_L2:
1951                         /* Outer L2 header must be inlined. */
1952                         config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1953                         goto exit;
1954                 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
1955                         /* No inline data are required by NIC. */
1956                         config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1957                         config->hw_vlan_insert =
1958                                 config->hca_attr.wqe_vlan_insert;
1959                         DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
1960                         goto exit;
1961                 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
1962                         /* Inline mode is defined by the NIC vport context. */
1963                         if (!config->hca_attr.eth_virt)
1964                                 break;
1965                         switch (config->hca_attr.vport_inline_mode) {
1966                         case MLX5_INLINE_MODE_NONE:
1967                                 config->txq_inline_min =
1968                                         MLX5_INLINE_HSIZE_NONE;
1969                                 goto exit;
1970                         case MLX5_INLINE_MODE_L2:
1971                                 config->txq_inline_min =
1972                                         MLX5_INLINE_HSIZE_L2;
1973                                 goto exit;
1974                         case MLX5_INLINE_MODE_IP:
1975                                 config->txq_inline_min =
1976                                         MLX5_INLINE_HSIZE_L3;
1977                                 goto exit;
1978                         case MLX5_INLINE_MODE_TCP_UDP:
1979                                 config->txq_inline_min =
1980                                         MLX5_INLINE_HSIZE_L4;
1981                                 goto exit;
1982                         case MLX5_INLINE_MODE_INNER_L2:
1983                                 config->txq_inline_min =
1984                                         MLX5_INLINE_HSIZE_INNER_L2;
1985                                 goto exit;
1986                         case MLX5_INLINE_MODE_INNER_IP:
1987                                 config->txq_inline_min =
1988                                         MLX5_INLINE_HSIZE_INNER_L3;
1989                                 goto exit;
1990                         case MLX5_INLINE_MODE_INNER_TCP_UDP:
1991                                 config->txq_inline_min =
1992                                         MLX5_INLINE_HSIZE_INNER_L4;
1993                                 goto exit;
1994                         }
1995                 }
1996         }
1997         /*
1998          * We get here if we are unable to deduce the
1999          * inline data size with DevX. Try the PCI ID
2000          * to identify older NICs.
2001          */
2002         switch (spawn->pci_dev->id.device_id) {
2003         case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
2004         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
2005         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
2006         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
2007                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
2008                 config->hw_vlan_insert = 0;
2009                 break;
2010         case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
2011         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
2012         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
2013         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
2014                 /*
2015                  * These NICs support VLAN insertion from WQE and
2016                  * report the wqe_vlan_insert flag. However, there is a bug
2017                  * that may break PFC control, so the feature is disabled.
2018                  */
2019                 config->hw_vlan_insert = 0;
2020                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2021                 break;
2022         default:
2023                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2024                 break;
2025         }
2026 exit:
2027         DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
2028 }
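
/*
 * Example (illustrative): the devargs key takes precedence over both the
 * DevX query and the PCI-ID fallback above, so the minimal inline size
 * can be forced from the command line; 18 bytes corresponds to
 * MLX5_INLINE_HSIZE_L2 (14B Ethernet + 4B VLAN). The PCI address below
 * is a placeholder:
 *
 *   dpdk-testpmd -w 0000:03:00.0,txq_inline_min=18 -- -i
 */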
2029
2030 /**
2031  * Configures the metadata mask fields in the shared context.
2032  *
2033  * @param [in] dev
2034  *   Pointer to Ethernet device.
2035  */
2036 static void
2037 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
2038 {
2039         struct mlx5_priv *priv = dev->data->dev_private;
2040         struct mlx5_ibv_shared *sh = priv->sh;
2041         uint32_t meta, mark, reg_c0;
2042
2043         reg_c0 = ~priv->vport_meta_mask;
2044         switch (priv->config.dv_xmeta_en) {
2045         case MLX5_XMETA_MODE_LEGACY:
2046                 meta = UINT32_MAX;
2047                 mark = MLX5_FLOW_MARK_MASK;
2048                 break;
2049         case MLX5_XMETA_MODE_META16:
2050                 meta = reg_c0 >> rte_bsf32(reg_c0);
2051                 mark = MLX5_FLOW_MARK_MASK;
2052                 break;
2053         case MLX5_XMETA_MODE_META32:
2054                 meta = UINT32_MAX;
2055                 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
2056                 break;
2057         default:
2058                 meta = 0;
2059                 mark = 0;
2060                 MLX5_ASSERT(false);
2061                 break;
2062         }
2063         if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
2064                 DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
2065                                  sh->dv_mark_mask, mark);
2066         else
2067                 sh->dv_mark_mask = mark;
2068         if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
2069                 DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
2070                                  sh->dv_meta_mask, meta);
2071         else
2072                 sh->dv_meta_mask = meta;
2073         if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
2074                 DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
2075                                  sh->dv_regc0_mask, reg_c0);
2076         else
2077                 sh->dv_regc0_mask = reg_c0;
2078         DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
2079         DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
2080         DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
2081         DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
2082 }
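
/*
 * Worked example (illustrative, hypothetical values): with dv_xmeta_en=1
 * (META16) and vport_meta_mask = 0xffff0000, the masks computed above
 * come out as follows.
 */
#if 0
static void
example_meta16_masks(void)
{
	uint32_t vport_meta_mask = 0xffff0000;	/* hypothetical */
	uint32_t reg_c0 = ~vport_meta_mask;	/* 0x0000ffff */
	/* rte_bsf32(0x0000ffff) == 0, so META16 keeps the low 16 bits. */
	uint32_t meta = reg_c0 >> rte_bsf32(reg_c0);	/* 0x0000ffff */
	uint32_t mark = MLX5_FLOW_MARK_MASK;	/* unchanged for META16 */

	(void)meta;
	(void)mark;
}
#endif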
2083
2084 /**
2085  * Allocate a page of door-bells and register it using the DevX API.
2086  *
2087  * @param [in] dev
2088  *   Pointer to Ethernet device.
2089  *
2090  * @return
2091  *   Pointer to new page on success, NULL otherwise.
2092  */
2093 static struct mlx5_devx_dbr_page *
2094 mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
2095 {
2096         struct mlx5_priv *priv = dev->data->dev_private;
2097         struct mlx5_devx_dbr_page *page;
2098
2099         /* Allocate space for door-bell page and management data. */
2100         page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
2101                                  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
2102         if (!page) {
2103                 DRV_LOG(ERR, "port %u cannot allocate dbr page",
2104                         dev->data->port_id);
2105                 return NULL;
2106         }
2107         /* Register allocated memory. */
2108         page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
2109                                               MLX5_DBR_PAGE_SIZE, 0);
2110         if (!page->umem) {
2111                 DRV_LOG(ERR, "port %u cannot umem reg dbr page",
2112                         dev->data->port_id);
2113                 rte_free(page);
2114                 return NULL;
2115         }
2116         return page;
2117 }
2118
2119 /**
2120  * Find the next available door-bell, allocate a new page if needed.
2121  *
2122  * @param [in] dev
2123  *   Pointer to Ethernet device.
2124  * @param [out] dbr_page
2125  *   Door-bell page containing the page data.
2126  *
2127  * @return
2128  *   Door-bell address offset on success, a negative error value otherwise.
2129  */
2130 int64_t
2131 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
2132 {
2133         struct mlx5_priv *priv = dev->data->dev_private;
2134         struct mlx5_devx_dbr_page *page = NULL;
2135         uint32_t i, j;
2136
2137         LIST_FOREACH(page, &priv->dbrpgs, next)
2138                 if (page->dbr_count < MLX5_DBR_PER_PAGE)
2139                         break;
2140         if (!page) { /* No page with free door-bell exists. */
2141                 page = mlx5_alloc_dbr_page(dev);
2142                 if (!page) /* Failed to allocate new page. */
2143                         return (-1);
2144                 LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
2145         }
2146         /* Loop to find a bitmap word with a clear bit. */
2147         for (i = 0;
2148              i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
2149              i++)
2150                 ; /* Empty. */
2151         /* Find the first clear bit. */
2152         MLX5_ASSERT(i < MLX5_DBR_BITMAP_SIZE);
2153         j = rte_bsf64(~page->dbr_bitmap[i]);
2154         page->dbr_bitmap[i] |= (UINT64_C(1) << j);
2155         page->dbr_count++;
2156         *dbr_page = page;
2157         return (((i * 64) + j) * sizeof(uint64_t));
2158 }
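
/*
 * Offset math sketch (illustrative): for bitmap word i and bit j the
 * function returns ((i * 64) + j) * 8, i.e. the byte offset of an 8-byte
 * door-bell record within the page. A caller would derive the record
 * address roughly like this (variable names are hypothetical):
 */
#if 0
static void
example_dbr_use(struct rte_eth_dev *dev)
{
	struct mlx5_devx_dbr_page *dbr_page;
	int64_t off = mlx5_get_dbr(dev, &dbr_page);

	if (off >= 0) {
		uint64_t *dbr =
			(uint64_t *)((uintptr_t)dbr_page->dbrs + off);
		/* ...program the queue door-bell via dbr... */
		(void)dbr;
	}
}
#endif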
2159
2160 /**
2161  * Release a door-bell record.
2162  *
2163  * @param [in] dev
2164  *   Pointer to Ethernet device.
2165  * @param [in] umem_id
2166  *   UMEM ID of page containing the door-bell record to release.
2167  * @param [in] offset
2168  *   Offset of door-bell record in page.
2169  *
2170  * @return
2171  *   0 on success, a negative error value otherwise.
2172  */
2173 int32_t
2174 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
2175 {
2176         struct mlx5_priv *priv = dev->data->dev_private;
2177         struct mlx5_devx_dbr_page *page = NULL;
2178         int ret = 0;
2179
2180         LIST_FOREACH(page, &priv->dbrpgs, next)
2181                 /* Find the page this address belongs to. */
2182                 if (page->umem->umem_id == umem_id)
2183                         break;
2184         if (!page)
2185                 return -EINVAL;
2186         page->dbr_count--;
2187         if (!page->dbr_count) {
2188                 /* Page not used, free it and remove from list. */
2189                 LIST_REMOVE(page, next);
2190                 if (page->umem)
2191                         ret = -mlx5_glue->devx_umem_dereg(page->umem);
2192                 rte_free(page);
2193         } else {
2194                 /* Mark in bitmap that this door-bell is not in use. */
2195                 offset /= MLX5_DBR_SIZE;
2196                 int i = offset / 64;
2197                 int j = offset % 64;
2198
2199                 page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
2200         }
2201         return ret;
2202 }
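
/*
 * Pairing sketch (illustrative): a record obtained with mlx5_get_dbr()
 * is returned through mlx5_release_dbr() using the page's umem ID and
 * the same byte offset; the last release of a page deregisters and frees
 * it. Variable names follow the hypothetical sketch after mlx5_get_dbr().
 */
#if 0
	mlx5_release_dbr(dev, dbr_page->umem->umem_id, (uint64_t)off);
#endif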
2203
2204 int
2205 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
2206 {
2207         static const char *const dynf_names[] = {
2208                 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
2209                 RTE_MBUF_DYNFLAG_METADATA_NAME
2210         };
2211         unsigned int i;
2212
2213         if (n < RTE_DIM(dynf_names))
2214                 return -ENOMEM;
2215         for (i = 0; i < RTE_DIM(dynf_names); i++) {
2216                 if (names[i] == NULL)
2217                         return -EINVAL;
2218                 strcpy(names[i], dynf_names[i]);
2219         }
2220         return RTE_DIM(dynf_names);
2221 }
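
/*
 * Caller sketch (illustrative): the application passes pre-allocated name
 * buffers; the function fills them and returns the number of dynamic flag
 * names, -ENOMEM if the array is too small, or -EINVAL on a NULL entry.
 */
#if 0
static int
example_dyn_flags(void)
{
	char buf[2][RTE_MBUF_DYN_NAMESIZE];
	char *names[2] = { buf[0], buf[1] };

	return rte_pmd_mlx5_get_dyn_flag_names(names, RTE_DIM(names));
}
#endif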
2222
2223 /**
2224  * Check sibling device configurations.
2225  *
2226  * Sibling devices sharing the Infiniband device context
2227  * should have compatible configurations. This applies to
2228  * representors and bonding slaves.
2229  *
2230  * @param priv
2231  *   Private device descriptor.
2232  * @param config
2233  *   Configuration of the device to be created.
2234  *
2235  * @return
2236  *   0 on success, EINVAL otherwise
2237  */
2238 static int
2239 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
2240                               struct mlx5_dev_config *config)
2241 {
2242         struct mlx5_ibv_shared *sh = priv->sh;
2243         struct mlx5_dev_config *sh_conf = NULL;
2244         uint16_t port_id;
2245
2246         MLX5_ASSERT(sh);
2247         /* Nothing to compare for the single/first device. */
2248         if (sh->refcnt == 1)
2249                 return 0;
2250         /* Find the device with shared context. */
2251         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2252                 struct mlx5_priv *opriv =
2253                         rte_eth_devices[port_id].data->dev_private;
2254
2255                 if (opriv && opriv != priv && opriv->sh == sh) {
2256                         sh_conf = &opriv->config;
2257                         break;
2258                 }
2259         }
2260         if (!sh_conf)
2261                 return 0;
2262         if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
2263                 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
2264                              " for shared %s context", sh->ibdev_name);
2265                 rte_errno = EINVAL;
2266                 return rte_errno;
2267         }
2268         if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
2269                 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
2270                              " for shared %s context", sh->ibdev_name);
2271                 rte_errno = EINVAL;
2272                 return rte_errno;
2273         }
2274         return 0;
2275 }
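
/*
 * Example (illustrative): two ports sharing one Infiniband context must
 * agree on dv_flow_en and dv_xmeta_en. Probing a sibling (representor or
 * bonding slave) with, e.g., dv_flow_en=1 after the first port was probed
 * with dv_flow_en=0 fails the check above with EINVAL.
 */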
2276 /**
2277  * Spawn an Ethernet device from Verbs information.
2278  *
2279  * @param dpdk_dev
2280  *   Backing DPDK device.
2281  * @param spawn
2282  *   Verbs device parameters (name, port, switch_info) to spawn.
2283  * @param config
2284  *   Device configuration parameters.
2285  *
2286  * @return
2287  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
2288  *   is set. The following errors are defined:
2289  *
2290  *   EBUSY: device is not supposed to be spawned.
2291  *   EEXIST: device is already spawned.
2292  */
2293 static struct rte_eth_dev *
2294 mlx5_dev_spawn(struct rte_device *dpdk_dev,
2295                struct mlx5_dev_spawn_data *spawn,
2296                struct mlx5_dev_config config)
2297 {
2298         const struct mlx5_switch_info *switch_info = &spawn->info;
2299         struct mlx5_ibv_shared *sh = NULL;
2300         struct ibv_port_attr port_attr;
2301         struct mlx5dv_context dv_attr = { .comp_mask = 0 };
2302         struct rte_eth_dev *eth_dev = NULL;
2303         struct mlx5_priv *priv = NULL;
2304         int err = 0;
2305         unsigned int hw_padding = 0;
2306         unsigned int mps;
2307         unsigned int cqe_comp;
2308         unsigned int cqe_pad = 0;
2309         unsigned int tunnel_en = 0;
2310         unsigned int mpls_en = 0;
2311         unsigned int swp = 0;
2312         unsigned int mprq = 0;
2313         unsigned int mprq_min_stride_size_n = 0;
2314         unsigned int mprq_max_stride_size_n = 0;
2315         unsigned int mprq_min_stride_num_n = 0;
2316         unsigned int mprq_max_stride_num_n = 0;
2317         struct rte_ether_addr mac;
2318         char name[RTE_ETH_NAME_MAX_LEN];
2319         int own_domain_id = 0;
2320         uint16_t port_id;
2321         unsigned int i;
2322 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2323         struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
2324 #endif
2325
2326         /* Determine if this port representor is supposed to be spawned. */
2327         if (switch_info->representor && dpdk_dev->devargs) {
2328                 struct rte_eth_devargs eth_da;
2329
2330                 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
2331                 if (err) {
2332                         rte_errno = -err;
2333                         DRV_LOG(ERR, "failed to process device arguments: %s",
2334                                 strerror(rte_errno));
2335                         return NULL;
2336                 }
2337                 for (i = 0; i < eth_da.nb_representor_ports; ++i)
2338                         if (eth_da.representor_ports[i] ==
2339                             (uint16_t)switch_info->port_name)
2340                                 break;
2341                 if (i == eth_da.nb_representor_ports) {
2342                         rte_errno = EBUSY;
2343                         return NULL;
2344                 }
2345         }
2346         /* Build device name. */
2347         if (spawn->pf_bond < 0) {
2348                 /* Single device. */
2349                 if (!switch_info->representor)
2350                         strlcpy(name, dpdk_dev->name, sizeof(name));
2351                 else
2352                         snprintf(name, sizeof(name), "%s_representor_%u",
2353                                  dpdk_dev->name, switch_info->port_name);
2354         } else {
2355                 /* Bonding device. */
2356                 if (!switch_info->representor)
2357                         snprintf(name, sizeof(name), "%s_%s",
2358                                  dpdk_dev->name, spawn->ibv_dev->name);
2359                 else
2360                         snprintf(name, sizeof(name), "%s_%s_representor_%u",
2361                                  dpdk_dev->name, spawn->ibv_dev->name,
2362                                  switch_info->port_name);
2363         }
2364         /* Check if the device is already spawned. */
2365         if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
2366                 rte_errno = EEXIST;
2367                 return NULL;
2368         }
2369         DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
2370         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
2371                 struct mlx5_mp_id mp_id;
2372
2373                 eth_dev = rte_eth_dev_attach_secondary(name);
2374                 if (eth_dev == NULL) {
2375                         DRV_LOG(ERR, "can not attach rte ethdev");
2376                         rte_errno = ENOMEM;
2377                         return NULL;
2378                 }
2379                 eth_dev->device = dpdk_dev;
2380                 eth_dev->dev_ops = &mlx5_dev_sec_ops;
2381                 err = mlx5_proc_priv_init(eth_dev);
2382                 if (err)
2383                         return NULL;
2384                 mp_id.port_id = eth_dev->data->port_id;
2385                 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2386                 /* Receive command fd from the primary process. */
2387                 err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
2388                 if (err < 0)
2389                         return NULL;
2390                 /* Remap UAR for Tx queues. */
2391                 err = mlx5_tx_uar_init_secondary(eth_dev, err);
2392                 if (err)
2393                         return NULL;
2394                 /*
2395                  * Ethdev pointer is still required as input since
2396                  * the primary device is not accessible from the
2397                  * secondary process.
2398                  */
2399                 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
2400                 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
2401                 return eth_dev;
2402         }
2403         /*
2404          * Some parameters ("tx_db_nc" in particular) are needed in
2405          * advance to create the dv/verbs device context. We process
2406          * the devargs here to get them, and later process the devargs
2407          * again to override some hardware settings.
2408          */
2409         err = mlx5_args(&config, dpdk_dev->devargs);
2410         if (err) {
2411                 err = rte_errno;
2412                 DRV_LOG(ERR, "failed to process device arguments: %s",
2413                         strerror(rte_errno));
2414                 goto error;
2415         }
2416         sh = mlx5_alloc_shared_ibctx(spawn, &config);
2417         if (!sh)
2418                 return NULL;
2419         config.devx = sh->devx;
2420 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
2421         config.dest_tir = 1;
2422 #endif
2423 #ifdef HAVE_IBV_MLX5_MOD_SWP
2424         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
2425 #endif
2426         /*
2427          * Multi-packet send is supported by ConnectX-4 Lx PF as well
2428          * as all ConnectX-5 devices.
2429          */
2430 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2431         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
2432 #endif
2433 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2434         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
2435 #endif
2436         mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
2437         if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
2438                 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
2439                         DRV_LOG(DEBUG, "enhanced MPW is supported");
2440                         mps = MLX5_MPW_ENHANCED;
2441                 } else {
2442                         DRV_LOG(DEBUG, "MPW is supported");
2443                         mps = MLX5_MPW;
2444                 }
2445         } else {
2446                 DRV_LOG(DEBUG, "MPW isn't supported");
2447                 mps = MLX5_MPW_DISABLED;
2448         }
2449 #ifdef HAVE_IBV_MLX5_MOD_SWP
2450         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
2451                 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
2452         DRV_LOG(DEBUG, "SWP support: %u", swp);
2453 #endif
2454         config.swp = !!swp;
2455 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2456         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
2457                 struct mlx5dv_striding_rq_caps mprq_caps =
2458                         dv_attr.striding_rq_caps;
2459
2460                 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
2461                         mprq_caps.min_single_stride_log_num_of_bytes);
2462                 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
2463                         mprq_caps.max_single_stride_log_num_of_bytes);
2464                 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
2465                         mprq_caps.min_single_wqe_log_num_of_strides);
2466                 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
2467                         mprq_caps.max_single_wqe_log_num_of_strides);
2468                 DRV_LOG(DEBUG, "\tsupported_qpts: %d",
2469                         mprq_caps.supported_qpts);
2470                 DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
2471                 mprq = 1;
2472                 mprq_min_stride_size_n =
2473                         mprq_caps.min_single_stride_log_num_of_bytes;
2474                 mprq_max_stride_size_n =
2475                         mprq_caps.max_single_stride_log_num_of_bytes;
2476                 mprq_min_stride_num_n =
2477                         mprq_caps.min_single_wqe_log_num_of_strides;
2478                 mprq_max_stride_num_n =
2479                         mprq_caps.max_single_wqe_log_num_of_strides;
2480         }
2481 #endif
2482         if (RTE_CACHE_LINE_SIZE == 128 &&
2483             !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
2484                 cqe_comp = 0;
2485         else
2486                 cqe_comp = 1;
2487         config.cqe_comp = cqe_comp;
2488 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
2489         /* Whether device supports 128B Rx CQE padding. */
2490         cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
2491                   (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
2492 #endif
2493 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2494         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
2495                 tunnel_en = ((dv_attr.tunnel_offloads_caps &
2496                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
2497                              (dv_attr.tunnel_offloads_caps &
2498                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
2499                              (dv_attr.tunnel_offloads_caps &
2500                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
2501         }
2502         DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
2503                 tunnel_en ? "" : "not ");
2504 #else
2505         DRV_LOG(WARNING,
2506                 "tunnel offloading disabled due to old OFED/rdma-core version");
2507 #endif
2508         config.tunnel_en = tunnel_en;
2509 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
2510         mpls_en = ((dv_attr.tunnel_offloads_caps &
2511                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
2512                    (dv_attr.tunnel_offloads_caps &
2513                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
2514         DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
2515                 mpls_en ? "" : "not ");
2516 #else
2517         DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
2518                 " old OFED/rdma-core version or firmware configuration");
2519 #endif
2520         config.mpls_en = mpls_en;
2521         /* Check port status. */
2522         err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
2523         if (err) {
2524                 DRV_LOG(ERR, "port query failed: %s", strerror(err));
2525                 goto error;
2526         }
2527         if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
2528                 DRV_LOG(ERR, "port is not configured in Ethernet mode");
2529                 err = EINVAL;
2530                 goto error;
2531         }
2532         if (port_attr.state != IBV_PORT_ACTIVE)
2533                 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
2534                         mlx5_glue->port_state_str(port_attr.state),
2535                         port_attr.state);
2536         /* Allocate private eth device data. */
2537         priv = rte_zmalloc("ethdev private structure",
2538                            sizeof(*priv),
2539                            RTE_CACHE_LINE_SIZE);
2540         if (priv == NULL) {
2541                 DRV_LOG(ERR, "priv allocation failure");
2542                 err = ENOMEM;
2543                 goto error;
2544         }
2545         priv->sh = sh;
2546         priv->ibv_port = spawn->ibv_port;
2547         priv->pci_dev = spawn->pci_dev;
2548         priv->mtu = RTE_ETHER_MTU;
2549         priv->mp_id.port_id = port_id;
2550         strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2551 #ifndef RTE_ARCH_64
2552         /* Initialize UAR access locks for 32bit implementations. */
2553         rte_spinlock_init(&priv->uar_lock_cq);
2554         for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
2555                 rte_spinlock_init(&priv->uar_lock[i]);
2556 #endif
2557         /* Some internal functions rely on Netlink sockets, open them now. */
2558         priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
2559         priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
2560         priv->representor = !!switch_info->representor;
2561         priv->master = !!switch_info->master;
2562         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
2563         priv->vport_meta_tag = 0;
2564         priv->vport_meta_mask = 0;
2565         priv->pf_bond = spawn->pf_bond;
2566 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2567         /*
2568          * The DevX port query API is implemented. E-Switch may use
2569          * either vport or reg_c[0] metadata register to match on
2570          * vport index. The engaged part of the metadata register is
2571          * defined by the mask.
2572          */
2573         if (switch_info->representor || switch_info->master) {
2574                 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
2575                                       MLX5DV_DEVX_PORT_MATCH_REG_C_0;
2576                 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
2577                                                  &devx_port);
2578                 if (err) {
2579                         DRV_LOG(WARNING,
2580                                 "can't query devx port %d on device %s",
2581                                 spawn->ibv_port, spawn->ibv_dev->name);
2582                         devx_port.comp_mask = 0;
2583                 }
2584         }
2585         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
2586                 priv->vport_meta_tag = devx_port.reg_c_0.value;
2587                 priv->vport_meta_mask = devx_port.reg_c_0.mask;
2588                 if (!priv->vport_meta_mask) {
2589                         DRV_LOG(ERR, "vport zero mask for port %d"
2590                                      " on bonding device %s",
2591                                      spawn->ibv_port, spawn->ibv_dev->name);
2592                         err = ENOTSUP;
2593                         goto error;
2594                 }
2595                 if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
2596                         DRV_LOG(ERR, "invalid vport tag for port %d"
2597                                      " on bonding device %s",
2598                                      spawn->ibv_port, spawn->ibv_dev->name);
2599                         err = ENOTSUP;
2600                         goto error;
2601                 }
2602         }
2603         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
2604                 priv->vport_id = devx_port.vport_num;
2605         } else if (spawn->pf_bond >= 0) {
2606                 DRV_LOG(ERR, "can't deduce vport index for port %d"
2607                              " on bonding device %s",
2608                              spawn->ibv_port, spawn->ibv_dev->name);
2609                 err = ENOTSUP;
2610                 goto error;
2611         } else {
2612                 /* Deduce the vport index in a compatible way. */
2613                 priv->vport_id = switch_info->representor ?
2614                                  switch_info->port_name + 1 : -1;
2615         }
2616 #else
2617         /*
2618          * The kernel/rdma_core supports only a single E-Switch per PF
2619          * configuration, and the vport_id field contains the vport index
2620          * for the associated VF, which is deduced from the representor
2621          * port name. For example, suppose the IB device port 10 has an
2622          * attached network device eth0 whose port name attribute is
2623          * pf0vf2; we can deduce the VF number as 2 and set the vport
2624          * index to 3 (2 + 1). This assignment schema should be changed
2625          * if multiple E-Switch instances per PF or/and PCI subfunctions
2626          * are added.
2627          */
2628         priv->vport_id = switch_info->representor ?
2629                          switch_info->port_name + 1 : -1;
2630 #endif
2631         /* representor_id field keeps the unmodified VF index. */
2632         priv->representor_id = switch_info->representor ?
2633                                switch_info->port_name : -1;
2634         /*
2635          * Look for sibling devices in order to reuse their switch domain
2636          * if any, otherwise allocate one.
2637          */
2638         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2639                 const struct mlx5_priv *opriv =
2640                         rte_eth_devices[port_id].data->dev_private;
2641
2642                 if (!opriv ||
2643                     opriv->sh != priv->sh ||
2644                     opriv->domain_id ==
2645                     RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
2646                         continue;
2647                 priv->domain_id = opriv->domain_id;
2648                 break;
2649         }
2650         if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
2651                 err = rte_eth_switch_domain_alloc(&priv->domain_id);
2652                 if (err) {
2653                         err = rte_errno;
2654                         DRV_LOG(ERR, "unable to allocate switch domain: %s",
2655                                 strerror(rte_errno));
2656                         goto error;
2657                 }
2658                 own_domain_id = 1;
2659         }
2660         /* Override some values set by hardware configuration. */
2661         mlx5_args(&config, dpdk_dev->devargs);
2662         err = mlx5_dev_check_sibling_config(priv, &config);
2663         if (err)
2664                 goto error;
2665         config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
2666                             IBV_DEVICE_RAW_IP_CSUM);
2667         DRV_LOG(DEBUG, "checksum offloading is %ssupported",
2668                 (config.hw_csum ? "" : "not "));
2669 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
2670         !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
2671         DRV_LOG(DEBUG, "counters are not supported");
2672 #endif
2673 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
2674         if (config.dv_flow_en) {
2675                 DRV_LOG(WARNING, "DV flow is not supported");
2676                 config.dv_flow_en = 0;
2677         }
2678 #endif
2679         config.ind_table_max_size =
2680                 sh->device_attr.rss_caps.max_rwq_indirection_table_size;
2681         /*
2682          * Remove this check once DPDK supports larger/variable
2683          * indirection tables.
2684          */
2685         if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
2686                 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
2687         DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
2688                 config.ind_table_max_size);
2689         config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
2690                                   IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
2691         DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
2692                 (config.hw_vlan_strip ? "" : "not "));
2693         config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
2694                                  IBV_RAW_PACKET_CAP_SCATTER_FCS);
2695         DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
2696                 (config.hw_fcs_strip ? "" : "not "));
2697 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
2698         hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
2699 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
2700         hw_padding = !!(sh->device_attr.device_cap_flags_ex &
2701                         IBV_DEVICE_PCI_WRITE_END_PADDING);
2702 #endif
2703         if (config.hw_padding && !hw_padding) {
2704                 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
2705                 config.hw_padding = 0;
2706         } else if (config.hw_padding) {
2707                 DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
2708         }
2709         config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
2710                       (sh->device_attr.tso_caps.supported_qpts &
2711                        (1 << IBV_QPT_RAW_PACKET)));
2712         if (config.tso)
2713                 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
2714         /*
2715          * MPW is disabled by default, while the Enhanced MPW is enabled
2716          * by default.
2717          */
2718         if (config.mps == MLX5_ARG_UNSET)
2719                 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
2720                                                           MLX5_MPW_DISABLED;
2721         else
2722                 config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
2723         DRV_LOG(INFO, "%sMPS is %s",
2724                 config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
2725                 config.mps == MLX5_MPW ? "legacy " : "",
2726                 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
2727         if (config.cqe_comp && !cqe_comp) {
2728                 DRV_LOG(WARNING, "Rx CQE compression isn't supported");
2729                 config.cqe_comp = 0;
2730         }
2731         if (config.cqe_pad && !cqe_pad) {
2732                 DRV_LOG(WARNING, "Rx CQE padding isn't supported");
2733                 config.cqe_pad = 0;
2734         } else if (config.cqe_pad) {
2735                 DRV_LOG(INFO, "Rx CQE padding is enabled");
2736         }
2737         if (config.devx) {
2738                 priv->counter_fallback = 0;
2739                 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
2740                 if (err) {
2741                         err = -err;
2742                         goto error;
2743                 }
2744                 if (!config.hca_attr.flow_counters_dump)
2745                         priv->counter_fallback = 1;
2746 #ifndef HAVE_IBV_DEVX_ASYNC
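                /* No DEVX async support in rdma-core, use fallback counters. */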
2747                 priv->counter_fallback = 1;
2748 #endif
2749                 if (priv->counter_fallback)
2750                         DRV_LOG(INFO, "Use fall-back DV counter management");
2751                 /* Check for LRO support. */
2752                 if (config.dest_tir && config.hca_attr.lro_cap &&
2753                     config.dv_flow_en) {
2754                         /* TBD check tunnel lro caps. */
2755                         config.lro.supported = config.hca_attr.lro_cap;
2756                         DRV_LOG(DEBUG, "Device supports LRO");
2757                         /*
2758                          * If LRO timeout is not configured by application,
2759                          * use the minimal supported value.
2760                          */
2761                         if (!config.lro.timeout)
2762                                 config.lro.timeout =
2763                                 config.hca_attr.lro_timer_supported_periods[0];
2764                         DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
2765                                 config.lro.timeout);
2766                 }
2767 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
2768                 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
2769                     config.dv_flow_en) {
2770                         uint8_t reg_c_mask =
2771                                 config.hca_attr.qos.flow_meter_reg_c_ids;
2772                         /*
2773                          * Meter needs two REG_C's for color match and pre-sfx
2774                          * flow match. Here get the REG_C for color match.
2775                          * REG_C_0 and REG_C_1 are reserved for the metadata feature.
2776                          */
2777                         reg_c_mask &= 0xfc;
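                        /*
                         * Illustrative example: a mask of 0x1c keeps
                         * REG_C_2..REG_C_4 available; ffs(0x1c) - 1 == 2,
                         * so REG_C_2 is picked as the color register.
                         */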
2778                         if (__builtin_popcount(reg_c_mask) < 1) {
2779                                 priv->mtr_en = 0;
2780                                 DRV_LOG(WARNING, "No available register for"
2781                                         " meter.");
2782                         } else {
2783                                 priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
2784                                                       REG_C_0;
2785                                 priv->mtr_en = 1;
2786                                 priv->mtr_reg_share =
2787                                       config.hca_attr.qos.flow_meter_reg_share;
2788                                 DRV_LOG(DEBUG, "The REG_C used by the meter is %d",
2789                                         priv->mtr_color_reg);
2790                         }
2791                 }
2792 #endif
2793         }
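        /*
         * Clamp user-requested MPRQ stride parameters to the window the
         * device reports; an out-of-range value is replaced with the
         * build-time default, itself clamped as
         * RTE_MIN(RTE_MAX(default, min), max).
         */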
2794         if (config.mprq.enabled && mprq) {
2795                 if (config.mprq.stride_num_n &&
2796                     (config.mprq.stride_num_n > mprq_max_stride_num_n ||
2797                      config.mprq.stride_num_n < mprq_min_stride_num_n)) {
2798                         config.mprq.stride_num_n =
2799                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
2800                                                 mprq_min_stride_num_n),
2801                                         mprq_max_stride_num_n);
2802                         DRV_LOG(WARNING,
2803                                 "the number of strides"
2804                                 " for Multi-Packet RQ is out of range,"
2805                                 " setting default value (%u)",
2806                                 1 << config.mprq.stride_num_n);
2807                 }
2808                 if (config.mprq.stride_size_n &&
2809                     (config.mprq.stride_size_n > mprq_max_stride_size_n ||
2810                      config.mprq.stride_size_n < mprq_min_stride_size_n)) {
2811                         config.mprq.stride_size_n =
2812                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
2813                                                 mprq_min_stride_size_n),
2814                                         mprq_max_stride_size_n);
2815                         DRV_LOG(WARNING,
2816                                 "the size of a stride"
2817                                 " for Multi-Packet RQ is out of range,"
2818                                 " setting default value (%u)",
2819                                 1 << config.mprq.stride_size_n);
2820                 }
2821                 config.mprq.min_stride_size_n = mprq_min_stride_size_n;
2822                 config.mprq.max_stride_size_n = mprq_max_stride_size_n;
2823         } else if (config.mprq.enabled && !mprq) {
2824                 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
2825                 config.mprq.enabled = 0;
2826         }
2827         if (config.max_dump_files_num == 0)
2828                 config.max_dump_files_num = 128;
2829         eth_dev = rte_eth_dev_allocate(name);
2830         if (eth_dev == NULL) {
2831                 DRV_LOG(ERR, "cannot allocate rte ethdev");
2832                 err = ENOMEM;
2833                 goto error;
2834         }
2835         /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
2836         eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
2837         if (priv->representor) {
2838                 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
2839                 eth_dev->data->representor_id = priv->representor_id;
2840         }
2841         /*
2842          * Store associated network device interface index. This index
2843          * is permanent throughout the lifetime of the device. So, we may
2844          * store the ifindex here and use the cached value afterwards.
2845          */
2846         MLX5_ASSERT(spawn->ifindex);
2847         priv->if_index = spawn->ifindex;
2848         eth_dev->data->dev_private = priv;
2849         priv->dev_data = eth_dev->data;
2850         eth_dev->data->mac_addrs = priv->mac;
2851         eth_dev->device = dpdk_dev;
2852         /* Configure the first MAC address by default. */
2853         if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
2854                 DRV_LOG(ERR,
2855                         "port %u cannot get MAC address, is mlx5_en"
2856                         " loaded? (errno: %s)",
2857                         eth_dev->data->port_id, strerror(rte_errno));
2858                 err = ENODEV;
2859                 goto error;
2860         }
2861         DRV_LOG(INFO,
2862                 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
2863                 eth_dev->data->port_id,
2864                 mac.addr_bytes[0], mac.addr_bytes[1],
2865                 mac.addr_bytes[2], mac.addr_bytes[3],
2866                 mac.addr_bytes[4], mac.addr_bytes[5]);
2867 #ifdef RTE_LIBRTE_MLX5_DEBUG
2868         {
2869                 char ifname[IF_NAMESIZE];
2870
2871                 if (mlx5_get_ifname(eth_dev, &ifname) == 0)
2872                         DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
2873                                 eth_dev->data->port_id, ifname);
2874                 else
2875                         DRV_LOG(DEBUG, "port %u ifname is unknown",
2876                                 eth_dev->data->port_id);
2877         }
2878 #endif
2879         /* Get actual MTU if possible. */
2880         err = mlx5_get_mtu(eth_dev, &priv->mtu);
2881         if (err) {
2882                 err = rte_errno;
2883                 goto error;
2884         }
2885         DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
2886                 priv->mtu);
2887         /* Initialize burst functions to prevent crashes before link-up. */
2888         eth_dev->rx_pkt_burst = removed_rx_burst;
2889         eth_dev->tx_pkt_burst = removed_tx_burst;
2890         eth_dev->dev_ops = &mlx5_dev_ops;
2891         /* Register MAC address. */
2892         claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
2893         if (config.vf && config.vf_nl_en)
2894                 mlx5_nl_mac_addr_sync(priv->nl_socket_route,
2895                                       mlx5_ifindex(eth_dev),
2896                                       eth_dev->data->mac_addrs,
2897                                       MLX5_MAX_MAC_ADDRESSES);
2898         priv->flows = 0;
2899         priv->ctrl_flows = 0;
2900         TAILQ_INIT(&priv->flow_meters);
2901         TAILQ_INIT(&priv->flow_meter_profiles);
2902         /* Hint libmlx5 to use PMD allocator for data plane resources. */
2903         struct mlx5dv_ctx_allocators alctr = {
2904                 .alloc = &mlx5_alloc_verbs_buf,
2905                 .free = &mlx5_free_verbs_buf,
2906                 .data = priv,
2907         };
2908         mlx5_glue->dv_set_context_attr(sh->ctx,
2909                                        MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
2910                                        (void *)((uintptr_t)&alctr));
2911         /* Bring Ethernet device up. */
2912         DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
2913                 eth_dev->data->port_id);
2914         mlx5_set_link_up(eth_dev);
2915         /*
2916          * Even though the interrupt handler is not installed yet,
2917          * interrupts will still trigger on the async_fd from
2918          * Verbs context returned by ibv_open_device().
2919          */
2920         mlx5_link_update(eth_dev, 0);
2921 #ifdef HAVE_MLX5DV_DR_ESWITCH
2922         if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
2923               (switch_info->representor || switch_info->master)))
2924                 config.dv_esw_en = 0;
2925 #else
2926         config.dv_esw_en = 0;
2927 #endif
2928         /* Detect minimal data bytes to inline. */
2929         mlx5_set_min_inline(spawn, &config);
2930         /* Store device configuration on private structure. */
2931         priv->config = config;
2932         /* Create context for virtual machine VLAN workaround. */
2933         priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
2934         if (config.dv_flow_en) {
2935                 err = mlx5_alloc_shared_dr(priv);
2936                 if (err)
2937                         goto error;
2938                 /*
2939                  * RSS id is shared with meter flow id. Meter flow id can only
2940                  * use the 24 MSB of the register.
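                 * For example, assuming MLX5_MTR_COLOR_BITS == 8, the
                 * remaining id space is UINT32_MAX >> 8 == 0x00FFFFFF.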
2941                  */
2942                 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
2943                                      MLX5_MTR_COLOR_BITS);
2944                 if (!priv->qrss_id_pool) {
2945                         DRV_LOG(ERR, "can't create flow id pool");
2946                         err = ENOMEM;
2947                         goto error;
2948                 }
2949         }
2950         /* Supported Verbs flow priority number detection. */
2951         err = mlx5_flow_discover_priorities(eth_dev);
2952         if (err < 0) {
2953                 err = -err;
2954                 goto error;
2955         }
2956         priv->config.flow_prio = err;
2957         if (!priv->config.dv_esw_en &&
2958             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2959                 DRV_LOG(WARNING, "metadata mode %u is not supported "
2960                                  "(no E-Switch)", priv->config.dv_xmeta_en);
2961                 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
2962         }
2963         mlx5_set_metadata_mask(eth_dev);
2964         if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2965             !priv->sh->dv_regc0_mask) {
2966                 DRV_LOG(ERR, "metadata mode %u is not supported "
2967                              "(no metadata reg_c[0] is available)",
2968                              priv->config.dv_xmeta_en);
2969                 err = ENOTSUP;
2970                 goto error;
2971         }
2972         /*
2973          * Allocate the buffer for flow creation, just once.
2974          * The allocation must be done before any flow is created.
2975          */
2976         mlx5_flow_alloc_intermediate(eth_dev);
2977         /* Query availability of metadata reg_c's. */
2978         err = mlx5_flow_discover_mreg_c(eth_dev);
2979         if (err < 0) {
2980                 err = -err;
2981                 goto error;
2982         }
2983         if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
2984                 DRV_LOG(DEBUG,
2985                         "port %u extensive metadata register is not supported",
2986                         eth_dev->data->port_id);
2987                 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2988                         DRV_LOG(ERR, "metadata mode %u is not supported "
2989                                      "(no metadata registers available)",
2990                                      priv->config.dv_xmeta_en);
2991                         err = ENOTSUP;
2992                         goto error;
2993                 }
2994         }
2995         if (priv->config.dv_flow_en &&
2996             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2997             mlx5_flow_ext_mreg_supported(eth_dev) &&
2998             priv->sh->dv_regc0_mask) {
2999                 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
3000                                                       MLX5_FLOW_MREG_HTABLE_SZ);
3001                 if (!priv->mreg_cp_tbl) {
3002                         err = ENOMEM;
3003                         goto error;
3004                 }
3005         }
3006         return eth_dev;
3007 error:
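        /*
         * Release resources in roughly the reverse order of acquisition;
         * err must hold a positive errno value here (asserted below).
         */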
3008         if (priv) {
3009                 if (priv->mreg_cp_tbl)
3010                         mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
3011                 if (priv->sh)
3012                         mlx5_free_shared_dr(priv);
3013                 if (priv->nl_socket_route >= 0)
3014                         close(priv->nl_socket_route);
3015                 if (priv->nl_socket_rdma >= 0)
3016                         close(priv->nl_socket_rdma);
3017                 if (priv->vmwa_context)
3018                         mlx5_vlan_vmwa_exit(priv->vmwa_context);
3019                 if (priv->qrss_id_pool)
3020                         mlx5_flow_id_pool_release(priv->qrss_id_pool);
3021                 if (own_domain_id)
3022                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
3023                 rte_free(priv);
3024                 if (eth_dev != NULL)
3025                         eth_dev->data->dev_private = NULL;
3026         }
3027         if (eth_dev != NULL) {
3028                 /* mac_addrs must not be freed alone; it is part of dev_private. */
3029                 eth_dev->data->mac_addrs = NULL;
3030                 rte_eth_dev_release_port(eth_dev);
3031         }
3032         if (sh)
3033                 mlx5_free_shared_ibctx(sh);
3034         MLX5_ASSERT(err > 0);
3035         rte_errno = err;
3036         return NULL;
3037 }
3038
3039 /**
3040  * Comparison callback to sort device data.
3041  *
3042  * This is meant to be used with qsort().
3043  *
3044  * @param[in] a
3045  *   Pointer to pointer to first data object.
3046  * @param[in] b
3047  *   Pointer to pointer to second data object.
3048  *
3049  * @return
3050  *   0 if both objects are equal, less than 0 if the first argument is less
3051  *   than the second, greater than 0 otherwise.
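 *
 *   For example, qsort() with this callback orders a spawn list as:
 *   master first, then representors by ascending port name, then
 *   unidentified devices in no specific order.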
3052  */
3053 static int
3054 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
3055 {
3056         const struct mlx5_switch_info *si_a =
3057                 &((const struct mlx5_dev_spawn_data *)a)->info;
3058         const struct mlx5_switch_info *si_b =
3059                 &((const struct mlx5_dev_spawn_data *)b)->info;
3060         int ret;
3061
3062         /* Master device first. */
3063         ret = si_b->master - si_a->master;
3064         if (ret)
3065                 return ret;
3066         /* Then representor devices. */
3067         ret = si_b->representor - si_a->representor;
3068         if (ret)
3069                 return ret;
3070         /* Unidentified devices come last in no specific order. */
3071         if (!si_a->representor)
3072                 return 0;
3073         /* Order representors by name. */
3074         return si_a->port_name - si_b->port_name;
3075 }
3076
3077 /**
3078  * Match PCI information for possible slaves of bonding device.
3079  *
3080  * @param[in] ibv_dev
3081  *   Pointer to Infiniband device structure.
3082  * @param[in] pci_dev
3083  *   Pointer to PCI device structure to match PCI address.
3084  * @param[in] nl_rdma
3085  *   Netlink RDMA group socket handle.
3086  *
3087  * @return
3088  *   negative value if no bonding device found, otherwise
3089  *   non-negative index of the slave PF in the bonding device.
3090  */
3091 static int
3092 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
3093                            const struct rte_pci_device *pci_dev,
3094                            int nl_rdma)
3095 {
3096         char ifname[IF_NAMESIZE + 1];
3097         unsigned int ifindex;
3098         unsigned int np, i;
3099         FILE *file = NULL;
3100         int pf = -1;
3101
3102         /*
3103          * Try to get the master device name. If something goes
3104          * wrong, assume a lack of kernel support and no
3105          * bonding devices.
3106          */
3107         if (nl_rdma < 0)
3108                 return -1;
3109         if (!strstr(ibv_dev->name, "bond"))
3110                 return -1;
3111         np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
3112         if (!np)
3113                 return -1;
3114         /*
3115          * The master device might not be on the predefined
3116          * port (port index 1 is not guaranteed), so we have
3117          * to scan all Infiniband device ports and find the
3118          * master.
3119          */
3120         for (i = 1; i <= np; ++i) {
3121                 /* Check whether Infiniband port is populated. */
3122                 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
3123                 if (!ifindex)
3124                         continue;
3125                 if (!if_indextoname(ifindex, ifname))
3126                         continue;
3127                 /* Try to read bonding slave names from sysfs. */
3128                 MKSTR(slaves,
3129                       "/sys/class/net/%s/master/bonding/slaves", ifname);
3130                 file = fopen(slaves, "r");
3131                 if (file)
3132                         break;
3133         }
3134         if (!file)
3135                 return -1;
3136         /* Use safe format to check maximal buffer length. */
3137         MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
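        /*
         * For example, with IF_NAMESIZE == 16 the format string expands
         * to "%16s", which cannot overflow ifname[IF_NAMESIZE + 1].
         */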
3138         while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
3139                 char tmp_str[IF_NAMESIZE + 32];
3140                 struct rte_pci_addr pci_addr;
3141                 struct mlx5_switch_info info;
3142
3143                 /* Process slave interface names in the loop. */
3144                 snprintf(tmp_str, sizeof(tmp_str),
3145                          "/sys/class/net/%s", ifname);
3146                 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
3147                         DRV_LOG(WARNING, "cannot get PCI address"
3148                                          " for netdev \"%s\"", ifname);
3149                         continue;
3150                 }
3151                 if (pci_dev->addr.domain != pci_addr.domain ||
3152                     pci_dev->addr.bus != pci_addr.bus ||
3153                     pci_dev->addr.devid != pci_addr.devid ||
3154                     pci_dev->addr.function != pci_addr.function)
3155                         continue;
3156                 /* Slave interface PCI address match found. */
3157                 fclose(file);
3158                 snprintf(tmp_str, sizeof(tmp_str),
3159                          "/sys/class/net/%s/phys_port_name", ifname);
3160                 file = fopen(tmp_str, "rb");
3161                 if (!file)
3162                         break;
3163                 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
3164                 if (fscanf(file, "%32s", tmp_str) == 1)
3165                         mlx5_translate_port_name(tmp_str, &info);
3166                 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
3167                     info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
3168                         pf = info.port_name;
3169                 break;
3170         }
3171         if (file)
3172                 fclose(file);
3173         return pf;
3174 }
3175
3176 /**
3177  * DPDK callback to register a PCI device.
3178  *
3179  * This function spawns Ethernet devices out of a given PCI device.
3180  *
3181  * @param[in] pci_drv
3182  *   PCI driver structure (mlx5_driver).
3183  * @param[in] pci_dev
3184  *   PCI device information.
3185  *
3186  * @return
3187  *   0 on success, a negative errno value otherwise and rte_errno is set.
3188  */
3189 static int
3190 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
3191                struct rte_pci_device *pci_dev)
3192 {
3193         struct ibv_device **ibv_list;
3194         /*
3195          * Number of found IB devices matching the requested PCI BDF.
3196          * nd != 1 means there are multiple IB devices over the same
3197          * PCI device, i.e. representors and a master.
3198          */
3199         unsigned int nd = 0;
3200         /*
3201          * Number of found IB device ports. nd = 1 and np = 1..n means
3202          * we have a single multiport IB device, and there may be
3203          * representors attached to some of the found ports.
3204          */
3205         unsigned int np = 0;
3206         /*
3207          * Number of DPDK Ethernet devices to spawn, either over
3208          * multiple IB devices or multiple ports of a single IB device.
3209          * Actually this is the number of iterations to perform.
3210          */
3211         unsigned int ns = 0;
3212         /*
3213          * Bonding device
3214          *   < 0 - no bonding device (single one)
3215          *  >= 0 - bonding device (value is slave PF index)
3216          */
3217         int bd = -1;
3218         struct mlx5_dev_spawn_data *list = NULL;
3219         struct mlx5_dev_config dev_config;
3220         int ret;
3221
3222         if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
3223                 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
3224                         " driver.");
3225                 return 1;
3226         }
3227         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
3228                 mlx5_pmd_socket_init();
3229         ret = mlx5_init_once();
3230         if (ret) {
3231                 DRV_LOG(ERR, "unable to init PMD global data: %s",
3232                         strerror(rte_errno));
3233                 return -rte_errno;
3234         }
3235         MLX5_ASSERT(pci_drv == &mlx5_driver);
3236         errno = 0;
3237         ibv_list = mlx5_glue->get_device_list(&ret);
3238         if (!ibv_list) {
3239                 rte_errno = errno ? errno : ENOSYS;
3240                 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
3241                 return -rte_errno;
3242         }
3243         /*
3244          * First scan the list of all Infiniband devices to find
3245          * matching ones, gathering into the list.
3246          */
3247         struct ibv_device *ibv_match[ret + 1];
3248         int nl_route = mlx5_nl_init(NETLINK_ROUTE);
3249         int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
3250         unsigned int i;
3251
3252         while (ret-- > 0) {
3253                 struct rte_pci_addr pci_addr;
3254
3255                 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
3256                 bd = mlx5_device_bond_pci_match
3257                                 (ibv_list[ret], pci_dev, nl_rdma);
3258                 if (bd >= 0) {
3259                         /*
3260                          * Bonding device detected. Only one match is allowed:
3261                          * bonding is supported over a multi-port IB device and
3262                          * there should be no matches on representor PCI
3263                          * functions or on non VF LAG bonding devices with the
3264                          * specified address.
3265                          */
3266                         if (nd) {
3267                                 DRV_LOG(ERR,
3268                                         "multiple PCI match on bonding device"
3269                                         " \"%s\" found", ibv_list[ret]->name);
3270                                 rte_errno = ENOENT;
3271                                 ret = -rte_errno;
3272                                 goto exit;
3273                         }
3274                         DRV_LOG(INFO, "PCI information matches for"
3275                                       " slave %d bonding device \"%s\"",
3276                                       bd, ibv_list[ret]->name);
3277                         ibv_match[nd++] = ibv_list[ret];
3278                         break;
3279                 }
3280                 if (mlx5_dev_to_pci_addr
3281                         (ibv_list[ret]->ibdev_path, &pci_addr))
3282                         continue;
3283                 if (pci_dev->addr.domain != pci_addr.domain ||
3284                     pci_dev->addr.bus != pci_addr.bus ||
3285                     pci_dev->addr.devid != pci_addr.devid ||
3286                     pci_dev->addr.function != pci_addr.function)
3287                         continue;
3288                 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
3289                         ibv_list[ret]->name);
3290                 ibv_match[nd++] = ibv_list[ret];
3291         }
3292         ibv_match[nd] = NULL;
3293         if (!nd) {
3294                 /* No device matches, just complain and bail out. */
3295                 DRV_LOG(WARNING,
3296                         "no Verbs device matches PCI device " PCI_PRI_FMT ","
3297                         " are kernel drivers loaded?",
3298                         pci_dev->addr.domain, pci_dev->addr.bus,
3299                         pci_dev->addr.devid, pci_dev->addr.function);
3300                 rte_errno = ENOENT;
3301                 ret = -rte_errno;
3302                 goto exit;
3303         }
3304         if (nd == 1) {
3305                 /*
3306                  * The found single matching device may have multiple ports.
3307                  * Each port may be a representor, so we have to check the
3308                  * port number and check for representors' existence.
3309                  */
3310                 if (nl_rdma >= 0)
3311                         np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
3312                 if (!np)
3313                         DRV_LOG(WARNING, "cannot get IB device \"%s\""
3314                                          " ports number", ibv_match[0]->name);
3315                 if (bd >= 0 && !np) {
3316                         DRV_LOG(ERR, "cannot get ports"
3317                                      " for bonding device");
3318                         rte_errno = ENOENT;
3319                         ret = -rte_errno;
3320                         goto exit;
3321                 }
3322         }
3323 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
3324         if (bd >= 0) {
3325                 /*
3326                  * This may happen if there is VF LAG kernel support and
3327                  * the application is compiled with an older rdma-core library.
3328                  */
3329                 DRV_LOG(ERR,
3330                         "No kernel/verbs support for VF LAG bonding found.");
3331                 rte_errno = ENOTSUP;
3332                 ret = -rte_errno;
3333                 goto exit;
3334         }
3335 #endif
3336         /*
3337          * Now we can determine the maximal
3338          * number of devices to be spawned.
3339          */
3340         list = rte_zmalloc("device spawn data",
3341                          sizeof(struct mlx5_dev_spawn_data) *
3342                          (np ? np : nd),
3343                          RTE_CACHE_LINE_SIZE);
3344         if (!list) {
3345                 DRV_LOG(ERR, "spawn data array allocation failure");
3346                 rte_errno = ENOMEM;
3347                 ret = -rte_errno;
3348                 goto exit;
3349         }
3350         if (bd >= 0 || np > 1) {
3351                 /*
3352                  * A single IB device with multiple ports was found;
3353                  * it may be an E-Switch master device with representors.
3354                  * We have to perform identification through the ports.
3355                  */
3356                 MLX5_ASSERT(nl_rdma >= 0);
3357                 MLX5_ASSERT(ns == 0);
3358                 MLX5_ASSERT(nd == 1);
3359                 MLX5_ASSERT(np);
3360                 for (i = 1; i <= np; ++i) {
3361                         list[ns].max_port = np;
3362                         list[ns].ibv_port = i;
3363                         list[ns].ibv_dev = ibv_match[0];
3364                         list[ns].eth_dev = NULL;
3365                         list[ns].pci_dev = pci_dev;
3366                         list[ns].pf_bond = bd;
3367                         list[ns].ifindex = mlx5_nl_ifindex
3368                                         (nl_rdma, list[ns].ibv_dev->name, i);
3369                         if (!list[ns].ifindex) {
3370                                 /*
3371                                  * No network interface index found for the
3372                                  * specified port, it means there is no
3373                                  * representor on this port. It's OK,
3374                                  * there can be disabled ports, for example
3375                                  * if sriov_numvfs < sriov_totalvfs.
3376                                  */
3377                                 continue;
3378                         }
3379                         ret = -1;
3380                         if (nl_route >= 0)
3381                                 ret = mlx5_nl_switch_info
3382                                                (nl_route,
3383                                                 list[ns].ifindex,
3384                                                 &list[ns].info);
3385                         if (ret || (!list[ns].info.representor &&
3386                                     !list[ns].info.master)) {
3387                                 /*
3388                                  * We failed to recognize representors with
3389                                  * Netlink, let's try to perform the task
3390                                  * with sysfs.
3391                                  */
3392                                 ret = mlx5_sysfs_switch_info
3393                                                 (list[ns].ifindex,
3394                                                  &list[ns].info);
3395                         }
3396                         if (!ret && bd >= 0) {
3397                                 switch (list[ns].info.name_type) {
3398                                 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
3399                                         if (list[ns].info.port_name == bd)
3400                                                 ns++;
3401                                         break;
3402                                 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
3403                                         if (list[ns].info.pf_num == bd)
3404                                                 ns++;
3405                                         break;
3406                                 default:
3407                                         break;
3408                                 }
3409                                 continue;
3410                         }
3411                         if (!ret && (list[ns].info.representor ^
3412                                      list[ns].info.master))
3413                                 ns++;
3414                 }
3415                 if (!ns) {
3416                         DRV_LOG(ERR,
3417                                 "unable to recognize master/representors"
3418                                 " on the IB device with multiple ports");
3419                         rte_errno = ENOENT;
3420                         ret = -rte_errno;
3421                         goto exit;
3422                 }
3423         } else {
3424                 /*
3425                  * The existence of several matching entries (nd > 1) means
3426                  * port representors have been instantiated. No existing Verbs
3427                  * call nor sysfs entries can tell them apart, this can only
3428                  * be done through Netlink calls assuming kernel drivers are
3429                  * recent enough to support them.
3430                  *
3431                  * In the event of identification failure through Netlink,
3432                  * try again through sysfs, then:
3433                  *
3434                  * 1. If a single IB device matches (nd == 1) with a
3435                  *    single port (np = 0/1) and is not a representor,
3436                  *    assume no switch support.
3437                  *
3438                  * 2. Otherwise no safe assumptions can be made;
3439                  *    complain louder and bail out.
3440                  */
3441                 np = 1;
3442                 for (i = 0; i != nd; ++i) {
3443                         memset(&list[ns].info, 0, sizeof(list[ns].info));
3444                         list[ns].max_port = 1;
3445                         list[ns].ibv_port = 1;
3446                         list[ns].ibv_dev = ibv_match[i];
3447                         list[ns].eth_dev = NULL;
3448                         list[ns].pci_dev = pci_dev;
3449                         list[ns].pf_bond = -1;
3450                         list[ns].ifindex = 0;
3451                         if (nl_rdma >= 0)
3452                                 list[ns].ifindex = mlx5_nl_ifindex
3453                                         (nl_rdma, list[ns].ibv_dev->name, 1);
3454                         if (!list[ns].ifindex) {
3455                                 char ifname[IF_NAMESIZE];
3456
3457                                 /*
3458                                  * Netlink failed, which may happen with an
3459                                  * old ib_core kernel driver (before 4.16).
3460                                  * We can assume the driver is old because
3461                                  * here we are processing single-port IB
3462                                  * devices. Let's try sysfs to retrieve
3463                                  * the ifindex. The method works for the
3464                                  * master device only.
3465                                  */
3466                                 if (nd > 1) {
3467                                         /*
3468                                          * Multiple devices found; assume
3469                                          * representors. We cannot distinguish
3470                                          * master from representor to retrieve
3471                                          * the ifindex via sysfs.
3472                                          */
3473                                         continue;
3474                                 }
3475                                 ret = mlx5_get_master_ifname
3476                                         (ibv_match[i]->ibdev_path, &ifname);
3477                                 if (!ret)
3478                                         list[ns].ifindex =
3479                                                 if_nametoindex(ifname);
3480                                 if (!list[ns].ifindex) {
3481                                         /*
3482                                          * No network interface index found
3483                                          * for the specified device; it means
3484                                          * it is neither a representor
3485                                          * nor a master.
3486                                          */
3487                                         continue;
3488                                 }
3489                         }
3490                         ret = -1;
3491                         if (nl_route >= 0)
3492                                 ret = mlx5_nl_switch_info
3493                                                (nl_route,
3494                                                 list[ns].ifindex,
3495                                                 &list[ns].info);
3496                         if (ret || (!list[ns].info.representor &&
3497                                     !list[ns].info.master)) {
3498                                 /*
3499                                  * We failed to recognize representors with
3500                                  * Netlink, let's try to perform the task
3501                                  * with sysfs.
3502                                  */
3503                                 ret = mlx5_sysfs_switch_info
3504                                                 (list[ns].ifindex,
3505                                                  &list[ns].info);
3506                         }
3507                         if (!ret && (list[ns].info.representor ^
3508                                      list[ns].info.master)) {
3509                                 ns++;
3510                         } else if ((nd == 1) &&
3511                                    !list[ns].info.representor &&
3512                                    !list[ns].info.master) {
3513                                 /*
3514                                  * A single IB device with
3515                                  * one physical port and an
3516                                  * attached network device.
3517                                  * Maybe SR-IOV is not enabled
3518                                  * or there are no representors.
3519                                  */
3520                                 DRV_LOG(INFO, "no E-Switch support detected");
3521                                 ns++;
3522                                 break;
3523                         }
3524                 }
3525                 if (!ns) {
3526                         DRV_LOG(ERR,
3527                                 "unable to recognize master/representors"
3528                                 " on the multiple IB devices");
3529                         rte_errno = ENOENT;
3530                         ret = -rte_errno;
3531                         goto exit;
3532                 }
3533         }
3534         MLX5_ASSERT(ns);
3535         /*
3536          * Sort the list to probe devices in natural order for users' convenience
3537          * (i.e. master first, then representors from lowest to highest ID).
3538          */
3539         qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
3540         /* Default configuration. */
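        /*
         * Fields set to MLX5_ARG_UNSET are resolved later from devargs
         * or from the capabilities of each spawned device.
         */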
3541         dev_config = (struct mlx5_dev_config){
3542                 .hw_padding = 0,
3543                 .mps = MLX5_ARG_UNSET,
3544                 .dbnc = MLX5_ARG_UNSET,
3545                 .rx_vec_en = 1,
3546                 .txq_inline_max = MLX5_ARG_UNSET,
3547                 .txq_inline_min = MLX5_ARG_UNSET,
3548                 .txq_inline_mpw = MLX5_ARG_UNSET,
3549                 .txqs_inline = MLX5_ARG_UNSET,
3550                 .vf_nl_en = 1,
3551                 .mr_ext_memseg_en = 1,
3552                 .mprq = {
3553                         .enabled = 0, /* Disabled by default. */
3554                         .stride_num_n = 0,
3555                         .stride_size_n = 0,
3556                         .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
3557                         .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
3558                 },
3559                 .dv_esw_en = 1,
3560                 .dv_flow_en = 1,
3561                 .log_hp_size = MLX5_ARG_UNSET,
3562         };
3563         /* Device specific configuration. */
3564         switch (pci_dev->id.device_id) {
3565         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
3566         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
3567         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
3568         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
3569         case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
3570         case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
3571         case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
3572                 dev_config.vf = 1;
3573                 break;
3574         default:
3575                 break;
3576         }
3577         for (i = 0; i != ns; ++i) {
3578                 uint32_t restore;
3579
3580                 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
3581                                                  &list[i],
3582                                                  dev_config);
3583                 if (!list[i].eth_dev) {
3584                         if (rte_errno != EBUSY && rte_errno != EEXIST)
3585                                 break;
3586                         /* Device is disabled or already spawned. Ignore it. */
3587                         continue;
3588                 }
3589                 restore = list[i].eth_dev->data->dev_flags;
3590                 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
3591                 /* Restore non-PCI flags cleared by the above call. */
3592                 list[i].eth_dev->data->dev_flags |= restore;
3593                 mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev);
3594                 rte_eth_dev_probing_finish(list[i].eth_dev);
3595         }
3596         if (i != ns) {
3597                 DRV_LOG(ERR,
3598                         "probe of PCI device " PCI_PRI_FMT " aborted after"
3599                         " encountering an error: %s",
3600                         pci_dev->addr.domain, pci_dev->addr.bus,
3601                         pci_dev->addr.devid, pci_dev->addr.function,
3602                         strerror(rte_errno));
3603                 ret = -rte_errno;
3604                 /* Roll back. */
3605                 while (i--) {
3606                         if (!list[i].eth_dev)
3607                                 continue;
3608                         mlx5_dev_close(list[i].eth_dev);
3609                         /* mac_addrs must not be freed; it is in dev_private. */
3610                         list[i].eth_dev->data->mac_addrs = NULL;
3611                         claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
3612                 }
3613                 /* Restore original error. */
3614                 rte_errno = -ret;
3615         } else {
3616                 ret = 0;
3617         }
3618 exit:
3619         /*
3620          * Do the routine cleanup:
3621          * - close opened Netlink sockets
3622          * - free allocated spawn data array
3623          * - free the Infiniband device list
3624          */
3625         if (nl_rdma >= 0)
3626                 close(nl_rdma);
3627         if (nl_route >= 0)
3628                 close(nl_route);
3629         if (list)
3630                 rte_free(list);
3631         MLX5_ASSERT(ibv_list);
3632         mlx5_glue->free_device_list(ibv_list);
3633         return ret;
3634 }
3635
3636 /**
3637  * Look for the Ethernet device belonging to the mlx5 driver.
3638  *
3639  * @param[in] port_id
3640  *   port_id to start looking for the device from.
3641  * @param[in] pci_dev
3642  *   Pointer to the hint PCI device. While a device is being probed,
3643  *   its siblings (master and preceding representors) might not have
3644  *   a driver assigned yet, because mlx5_pci_probe() has not completed;
3645  *   in this case a match on the hint PCI device may be used to
3646  *   detect a sibling device.
3647  *
3648  * @return
3649  *   port_id of the found device, RTE_MAX_ETHPORTS if not found.
3650  */
3651 uint16_t
3652 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
3653 {
3654         while (port_id < RTE_MAX_ETHPORTS) {
3655                 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
3656
3657                 if (dev->state != RTE_ETH_DEV_UNUSED &&
3658                     dev->device &&
3659                     (dev->device == &pci_dev->device ||
3660                      (dev->device->driver &&
3661                      dev->device->driver->name &&
3662                      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
3663                         break;
3664                 port_id++;
3665         }
3666         if (port_id >= RTE_MAX_ETHPORTS)
3667                 return RTE_MAX_ETHPORTS;
3668         return port_id;
3669 }
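/*
 * Usage sketch (illustrative): iterate over all mlx5 sibling ports of
 * a PCI device:
 *
 *   uint16_t pid = mlx5_eth_find_next(0, pci_dev);
 *
 *   while (pid < RTE_MAX_ETHPORTS) {
 *           ... use rte_eth_devices[pid] ...
 *           pid = mlx5_eth_find_next(pid + 1, pci_dev);
 *   }
 */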
3670
3671 /**
3672  * DPDK callback to remove a PCI device.
3673  *
3674  * This function removes all Ethernet devices belonging to a given PCI device.
3675  *
3676  * @param[in] pci_dev
3677  *   Pointer to the PCI device.
3678  *
3679  * @return
3680  *   0 on success, the function cannot fail.
3681  */
3682 static int
3683 mlx5_pci_remove(struct rte_pci_device *pci_dev)
3684 {
3685         uint16_t port_id;
3686
3687         RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device)
3688                 rte_eth_dev_close(port_id);
3689         return 0;
3690 }
3691
3692 static const struct rte_pci_id mlx5_pci_id_map[] = {
3693         {
3694                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3695                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
3696         },
3697         {
3698                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3699                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
3700         },
3701         {
3702                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3703                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
3704         },
3705         {
3706                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3707                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
3708         },
3709         {
3710                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3711                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
3712         },
3713         {
3714                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3715                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
3716         },
3717         {
3718                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3719                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
3720         },
3721         {
3722                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3723                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
3724         },
3725         {
3726                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3727                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
3728         },
3729         {
3730                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3731                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
3732         },
3733         {
3734                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3735                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
3736         },
3737         {
3738                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3739                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
3740         },
3741         {
3742                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3743                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
3744         },
3745         {
3746                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3747                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
3748         },
3749         {
3750                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3751                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
3752         },
3753         {
3754                 .vendor_id = 0
3755         }
3756 };
3757
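/*
 * PCI driver descriptor. RTE_PCI_DRV_PROBE_AGAIN lets the PCI bus probe
 * an already-probed device again, e.g. to instantiate new representors.
 */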
3758 static struct rte_pci_driver mlx5_driver = {
3759         .driver = {
3760                 .name = MLX5_DRIVER_NAME
3761         },
3762         .id_table = mlx5_pci_id_map,
3763         .probe = mlx5_pci_probe,
3764         .remove = mlx5_pci_remove,
3765         .dma_map = mlx5_dma_map,
3766         .dma_unmap = mlx5_dma_unmap,
3767         .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
3768                      RTE_PCI_DRV_PROBE_AGAIN,
3769 };
3770
3771 /**
3772  * Driver initialization routine.
3773  */
3774 RTE_INIT(rte_mlx5_pmd_init)
3775 {
3776         /* Initialize driver log type. */
3777         mlx5_logtype = rte_log_register("pmd.net.mlx5");
3778         if (mlx5_logtype >= 0)
3779                 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
3780
3781         /* Build the static tables for Verbs conversion. */
3782         mlx5_set_ptype_table();
3783         mlx5_set_cksum_table();
3784         mlx5_set_swp_types_table();
3785         if (mlx5_glue)
3786                 rte_pci_register(&mlx5_driver);
3787 }
3788
3789 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
3790 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
3791 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");