0e79c799e0c548ee7d468b06ec295e7c34f211d1
[dpdk.git] / drivers / net / mlx5 / mlx5.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <unistd.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <errno.h>
12 #include <net/if.h>
13 #include <sys/mman.h>
14 #include <linux/rtnetlink.h>
15
16 /* Verbs header. */
17 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic ignored "-Wpedantic"
20 #endif
21 #include <infiniband/verbs.h>
22 #ifdef PEDANTIC
23 #pragma GCC diagnostic error "-Wpedantic"
24 #endif
25
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_ethdev_pci.h>
29 #include <rte_pci.h>
30 #include <rte_bus_pci.h>
31 #include <rte_common.h>
32 #include <rte_kvargs.h>
33 #include <rte_rwlock.h>
34 #include <rte_spinlock.h>
35 #include <rte_string_fns.h>
36 #include <rte_alarm.h>
37
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41 #include <mlx5_common_mp.h>
42
43 #include "mlx5_defs.h"
44 #include "mlx5.h"
45 #include "mlx5_utils.h"
46 #include "mlx5_rxtx.h"
47 #include "mlx5_autoconf.h"
48 #include "mlx5_mr.h"
49 #include "mlx5_flow.h"
50 #include "rte_pmd_mlx5.h"
51
52 /* Device parameter to enable RX completion queue compression. */
53 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
54
55 /* Device parameter to enable RX completion entry padding to 128B. */
56 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
57
58 /* Device parameter to enable padding Rx packet to cacheline size. */
59 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
60
61 /* Device parameter to enable Multi-Packet Rx queue. */
62 #define MLX5_RX_MPRQ_EN "mprq_en"
63
64 /* Device parameter to configure log 2 of the number of strides for MPRQ. */
65 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
66
67 /* Device parameter to configure log 2 of the stride size for MPRQ. */
68 #define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"
69
70 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */
71 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
72
73 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
74 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
75
76 /* Device parameter to configure inline send. Deprecated, ignored.*/
77 #define MLX5_TXQ_INLINE "txq_inline"
78
79 /* Device parameter to limit packet size to inline with ordinary SEND. */
80 #define MLX5_TXQ_INLINE_MAX "txq_inline_max"
81
82 /* Device parameter to configure minimal data size to inline. */
83 #define MLX5_TXQ_INLINE_MIN "txq_inline_min"
84
85 /* Device parameter to limit packet size to inline with Enhanced MPW. */
86 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
87
88 /*
89  * Device parameter to configure the number of TX queues threshold for
90  * enabling inline send.
91  */
92 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
93
94 /*
95  * Device parameter to configure the number of TX queues threshold for
96  * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
97  */
98 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
99
100 /* Device parameter to enable multi-packet send WQEs. */
101 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
102
103 /*
104  * Device parameter to force doorbell register mapping
105  * to non-cahed region eliminating the extra write memory barrier.
106  */
107 #define MLX5_TX_DB_NC "tx_db_nc"
108
109 /*
110  * Device parameter to include 2 dsegs in the title WQEBB.
111  * Deprecated, ignored.
112  */
113 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
114
115 /*
116  * Device parameter to limit the size of inlining packet.
117  * Deprecated, ignored.
118  */
119 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
120
121 /*
122  * Device parameter to enable hardware Tx vector.
123  * Deprecated, ignored (no vectorized Tx routines anymore).
124  */
125 #define MLX5_TX_VEC_EN "tx_vec_en"
126
127 /* Device parameter to enable hardware Rx vector. */
128 #define MLX5_RX_VEC_EN "rx_vec_en"
129
130 /* Allow L3 VXLAN flow creation. */
131 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
132
133 /* Activate DV E-Switch flow steering. */
134 #define MLX5_DV_ESW_EN "dv_esw_en"
135
136 /* Activate DV flow steering. */
137 #define MLX5_DV_FLOW_EN "dv_flow_en"
138
139 /* Enable extensive flow metadata support. */
140 #define MLX5_DV_XMETA_EN "dv_xmeta_en"
141
142 /* Activate Netlink support in VF mode. */
143 #define MLX5_VF_NL_EN "vf_nl_en"
144
145 /* Enable extending memsegs when creating a MR. */
146 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
147
148 /* Select port representors to instantiate. */
149 #define MLX5_REPRESENTOR "representor"
150
151 /* Device parameter to configure the maximum number of dump files per queue. */
152 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
153
154 /* Configure timeout of LRO session (in microseconds). */
155 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
156
157 /*
158  * Device parameter to configure the total data buffer size for a single
159  * hairpin queue (logarithm value).
160  */
161 #define MLX5_HP_BUF_SIZE "hp_buf_log_sz"
162
163 #ifndef HAVE_IBV_MLX5_MOD_MPW
164 #define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
165 #define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
166 #endif
167
168 #ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
169 #define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
170 #endif
171
172 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
173
174 /* Shared memory between primary and secondary processes. */
175 struct mlx5_shared_data *mlx5_shared_data;
176
177 /* Spinlock for mlx5_shared_data allocation. */
178 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
179
180 /* Process local data for secondary processes. */
181 static struct mlx5_local_data mlx5_local_data;
182
183 /** Driver-specific log messages type. */
184 int mlx5_logtype;
185
186 /** Data associated with devices to spawn. */
187 struct mlx5_dev_spawn_data {
188         uint32_t ifindex; /**< Network interface index. */
189         uint32_t max_port; /**< IB device maximal port index. */
190         uint32_t ibv_port; /**< IB device physical port index. */
191         int pf_bond; /**< bonding device PF index. < 0 - no bonding */
192         struct mlx5_switch_info info; /**< Switch information. */
193         struct ibv_device *ibv_dev; /**< Associated IB device. */
194         struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
195         struct rte_pci_device *pci_dev; /**< Backend PCI device. */
196 };
197
198 static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
199 static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;
200
201 static struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
202 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
203         {
204                 .size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
205                 .trunk_size = 64,
206                 .grow_trunk = 3,
207                 .grow_shift = 2,
208                 .need_lock = 0,
209                 .release_mem_en = 1,
210                 .malloc = rte_malloc_socket,
211                 .free = rte_free,
212                 .type = "mlx5_encap_decap_ipool",
213         },
214         {
215                 .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
216                 .trunk_size = 64,
217                 .grow_trunk = 3,
218                 .grow_shift = 2,
219                 .need_lock = 0,
220                 .release_mem_en = 1,
221                 .malloc = rte_malloc_socket,
222                 .free = rte_free,
223                 .type = "mlx5_push_vlan_ipool",
224         },
225         {
226                 .size = sizeof(struct mlx5_flow_dv_tag_resource),
227                 .trunk_size = 64,
228                 .grow_trunk = 3,
229                 .grow_shift = 2,
230                 .need_lock = 0,
231                 .release_mem_en = 1,
232                 .malloc = rte_malloc_socket,
233                 .free = rte_free,
234                 .type = "mlx5_tag_ipool",
235         },
236         {
237                 .size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
238                 .trunk_size = 64,
239                 .grow_trunk = 3,
240                 .grow_shift = 2,
241                 .need_lock = 0,
242                 .release_mem_en = 1,
243                 .malloc = rte_malloc_socket,
244                 .free = rte_free,
245                 .type = "mlx5_port_id_ipool",
246         },
247         {
248                 .size = sizeof(struct mlx5_flow_tbl_data_entry),
249                 .trunk_size = 64,
250                 .grow_trunk = 3,
251                 .grow_shift = 2,
252                 .need_lock = 0,
253                 .release_mem_en = 1,
254                 .malloc = rte_malloc_socket,
255                 .free = rte_free,
256                 .type = "mlx5_jump_ipool",
257         },
258 #endif
259         {
260                 .size = sizeof(struct mlx5_flow_meter),
261                 .trunk_size = 64,
262                 .grow_trunk = 3,
263                 .grow_shift = 2,
264                 .need_lock = 0,
265                 .release_mem_en = 1,
266                 .malloc = rte_malloc_socket,
267                 .free = rte_free,
268                 .type = "mlx5_meter_ipool",
269         },
270         {
271                 .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
272                 .trunk_size = 64,
273                 .grow_trunk = 3,
274                 .grow_shift = 2,
275                 .need_lock = 0,
276                 .release_mem_en = 1,
277                 .malloc = rte_malloc_socket,
278                 .free = rte_free,
279                 .type = "mlx5_hrxq_ipool",
280         },
281         {
282                 .size = sizeof(struct mlx5_flow_handle),
283                 .trunk_size = 64,
284                 .grow_trunk = 3,
285                 .grow_shift = 2,
286                 .need_lock = 0,
287                 .release_mem_en = 1,
288                 .malloc = rte_malloc_socket,
289                 .free = rte_free,
290                 .type = "mlx5_flow_handle_ipool",
291         },
292 };
293
294
295 #define MLX5_FLOW_MIN_ID_POOL_SIZE 512
296 #define MLX5_ID_GENERATION_ARRAY_FACTOR 16
297
298 #define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
299 #define MLX5_TAGS_HLIST_ARRAY_SIZE 8192
300
301 /**
302  * Allocate ID pool structure.
303  *
304  * @param[in] max_id
305  *   The maximum id can be allocated from the pool.
306  *
307  * @return
308  *   Pointer to pool object, NULL value otherwise.
309  */
310 struct mlx5_flow_id_pool *
311 mlx5_flow_id_pool_alloc(uint32_t max_id)
312 {
313         struct mlx5_flow_id_pool *pool;
314         void *mem;
315
316         pool = rte_zmalloc("id pool allocation", sizeof(*pool),
317                            RTE_CACHE_LINE_SIZE);
318         if (!pool) {
319                 DRV_LOG(ERR, "can't allocate id pool");
320                 rte_errno  = ENOMEM;
321                 return NULL;
322         }
323         mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
324                           RTE_CACHE_LINE_SIZE);
325         if (!mem) {
326                 DRV_LOG(ERR, "can't allocate mem for id pool");
327                 rte_errno  = ENOMEM;
328                 goto error;
329         }
330         pool->free_arr = mem;
331         pool->curr = pool->free_arr;
332         pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
333         pool->base_index = 0;
334         pool->max_id = max_id;
335         return pool;
336 error:
337         rte_free(pool);
338         return NULL;
339 }
340
341 /**
342  * Release ID pool structure.
343  *
344  * @param[in] pool
345  *   Pointer to flow id pool object to free.
346  */
347 void
348 mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
349 {
350         rte_free(pool->free_arr);
351         rte_free(pool);
352 }
353
354 /**
355  * Generate ID.
356  *
357  * @param[in] pool
358  *   Pointer to flow id pool.
359  * @param[out] id
360  *   The generated ID.
361  *
362  * @return
363  *   0 on success, error value otherwise.
364  */
365 uint32_t
366 mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
367 {
368         if (pool->curr == pool->free_arr) {
369                 if (pool->base_index == pool->max_id) {
370                         rte_errno  = ENOMEM;
371                         DRV_LOG(ERR, "no free id");
372                         return -rte_errno;
373                 }
374                 *id = ++pool->base_index;
375                 return 0;
376         }
377         *id = *(--pool->curr);
378         return 0;
379 }
380
381 /**
382  * Release ID.
383  *
384  * @param[in] pool
385  *   Pointer to flow id pool.
386  * @param[out] id
387  *   The generated ID.
388  *
389  * @return
390  *   0 on success, error value otherwise.
391  */
392 uint32_t
393 mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
394 {
395         uint32_t size;
396         uint32_t size2;
397         void *mem;
398
399         if (pool->curr == pool->last) {
400                 size = pool->curr - pool->free_arr;
401                 size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
402                 MLX5_ASSERT(size2 > size);
403                 mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
404                 if (!mem) {
405                         DRV_LOG(ERR, "can't allocate mem for id pool");
406                         rte_errno  = ENOMEM;
407                         return -rte_errno;
408                 }
409                 memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
410                 rte_free(pool->free_arr);
411                 pool->free_arr = mem;
412                 pool->curr = pool->free_arr + size;
413                 pool->last = pool->free_arr + size2;
414         }
415         *pool->curr = id;
416         pool->curr++;
417         return 0;
418 }
419
420 /**
421  * Initialize the counters management structure.
422  *
423  * @param[in] sh
424  *   Pointer to mlx5_ibv_shared object to free
425  */
426 static void
427 mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
428 {
429         uint8_t i;
430
431         TAILQ_INIT(&sh->cmng.flow_counters);
432         for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i)
433                 TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
434 }
435
436 /**
437  * Destroy all the resources allocated for a counter memory management.
438  *
439  * @param[in] mng
440  *   Pointer to the memory management structure.
441  */
442 static void
443 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
444 {
445         uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
446
447         LIST_REMOVE(mng, next);
448         claim_zero(mlx5_devx_cmd_destroy(mng->dm));
449         claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
450         rte_free(mem);
451 }
452
453 /**
454  * Close and release all the resources of the counters management.
455  *
456  * @param[in] sh
457  *   Pointer to mlx5_ibv_shared object to free.
458  */
459 static void
460 mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
461 {
462         struct mlx5_counter_stats_mem_mng *mng;
463         uint8_t i;
464         int j;
465         int retries = 1024;
466
467         rte_errno = 0;
468         while (--retries) {
469                 rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
470                 if (rte_errno != EINPROGRESS)
471                         break;
472                 rte_pause();
473         }
474         for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
475                 struct mlx5_flow_counter_pool *pool;
476                 uint32_t batch = !!(i % 2);
477
478                 if (!sh->cmng.ccont[i].pools)
479                         continue;
480                 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
481                 while (pool) {
482                         if (batch) {
483                                 if (pool->min_dcs)
484                                         claim_zero
485                                         (mlx5_devx_cmd_destroy(pool->min_dcs));
486                         }
487                         for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
488                                 if (pool->counters_raw[j].action)
489                                         claim_zero
490                                         (mlx5_glue->destroy_flow_action
491                                                (pool->counters_raw[j].action));
492                                 if (!batch && MLX5_GET_POOL_CNT_EXT
493                                     (pool, j)->dcs)
494                                         claim_zero(mlx5_devx_cmd_destroy
495                                                   (MLX5_GET_POOL_CNT_EXT
496                                                   (pool, j)->dcs));
497                         }
498                         TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool,
499                                      next);
500                         rte_free(pool);
501                         pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
502                 }
503                 rte_free(sh->cmng.ccont[i].pools);
504         }
505         mng = LIST_FIRST(&sh->cmng.mem_mngs);
506         while (mng) {
507                 mlx5_flow_destroy_counter_stat_mem_mng(mng);
508                 mng = LIST_FIRST(&sh->cmng.mem_mngs);
509         }
510         memset(&sh->cmng, 0, sizeof(sh->cmng));
511 }
512
513 /**
514  * Initialize the flow resources' indexed mempool.
515  *
516  * @param[in] sh
517  *   Pointer to mlx5_ibv_shared object.
518  * @param[in] sh
519  *   Pointer to user dev config.
520  */
521 static void
522 mlx5_flow_ipool_create(struct mlx5_ibv_shared *sh,
523                        const struct mlx5_dev_config *config __rte_unused)
524 {
525         uint8_t i;
526
527 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
528         /*
529          * While DV is supported, user chooses the verbs mode,
530          * the mlx5 flow handle size is different with the
531          * MLX5_FLOW_HANDLE_VERBS_SIZE.
532          */
533         if (!config->dv_flow_en)
534                 mlx5_ipool_cfg[MLX5_IPOOL_MLX5_FLOW].size =
535                                         MLX5_FLOW_HANDLE_VERBS_SIZE;
536 #endif
537         for (i = 0; i < MLX5_IPOOL_MAX; ++i)
538                 sh->ipool[i] = mlx5_ipool_create(&mlx5_ipool_cfg[i]);
539 }
540
541 /**
542  * Release the flow resources' indexed mempool.
543  *
544  * @param[in] sh
545  *   Pointer to mlx5_ibv_shared object.
546  */
547 static void
548 mlx5_flow_ipool_destroy(struct mlx5_ibv_shared *sh)
549 {
550         uint8_t i;
551
552         for (i = 0; i < MLX5_IPOOL_MAX; ++i)
553                 mlx5_ipool_destroy(sh->ipool[i]);
554 }
555
556 /**
557  * Extract pdn of PD object using DV API.
558  *
559  * @param[in] pd
560  *   Pointer to the verbs PD object.
561  * @param[out] pdn
562  *   Pointer to the PD object number variable.
563  *
564  * @return
565  *   0 on success, error value otherwise.
566  */
567 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
568 static int
569 mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
570 {
571         struct mlx5dv_obj obj;
572         struct mlx5dv_pd pd_info;
573         int ret = 0;
574
575         obj.pd.in = pd;
576         obj.pd.out = &pd_info;
577         ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
578         if (ret) {
579                 DRV_LOG(DEBUG, "Fail to get PD object info");
580                 return ret;
581         }
582         *pdn = pd_info.pdn;
583         return 0;
584 }
585 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
586
587 static int
588 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
589 {
590         char *env;
591         int value;
592
593         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
594         /* Get environment variable to store. */
595         env = getenv(MLX5_SHUT_UP_BF);
596         value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
597         if (config->dbnc == MLX5_ARG_UNSET)
598                 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
599         else
600                 setenv(MLX5_SHUT_UP_BF,
601                        config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
602         return value;
603 }
604
605 static void
606 mlx5_restore_doorbell_mapping_env(int value)
607 {
608         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
609         /* Restore the original environment variable state. */
610         if (value == MLX5_ARG_UNSET)
611                 unsetenv(MLX5_SHUT_UP_BF);
612         else
613                 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
614 }
615
616 /**
617  * Allocate shared IB device context. If there is multiport device the
618  * master and representors will share this context, if there is single
619  * port dedicated IB device, the context will be used by only given
620  * port due to unification.
621  *
622  * Routine first searches the context for the specified IB device name,
623  * if found the shared context assumed and reference counter is incremented.
624  * If no context found the new one is created and initialized with specified
625  * IB device context and parameters.
626  *
627  * @param[in] spawn
628  *   Pointer to the IB device attributes (name, port, etc).
629  * @param[in] config
630  *   Pointer to device configuration structure.
631  *
632  * @return
633  *   Pointer to mlx5_ibv_shared object on success,
634  *   otherwise NULL and rte_errno is set.
635  */
636 static struct mlx5_ibv_shared *
637 mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
638                         const struct mlx5_dev_config *config)
639 {
640         struct mlx5_ibv_shared *sh;
641         int dbmap_env;
642         int err = 0;
643         uint32_t i;
644 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
645         struct mlx5_devx_tis_attr tis_attr = { 0 };
646 #endif
647
648         MLX5_ASSERT(spawn);
649         /* Secondary process should not create the shared context. */
650         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
651         pthread_mutex_lock(&mlx5_ibv_list_mutex);
652         /* Search for IB context by device name. */
653         LIST_FOREACH(sh, &mlx5_ibv_list, next) {
654                 if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
655                         sh->refcnt++;
656                         goto exit;
657                 }
658         }
659         /* No device found, we have to create new shared context. */
660         MLX5_ASSERT(spawn->max_port);
661         sh = rte_zmalloc("ethdev shared ib context",
662                          sizeof(struct mlx5_ibv_shared) +
663                          spawn->max_port *
664                          sizeof(struct mlx5_ibv_shared_port),
665                          RTE_CACHE_LINE_SIZE);
666         if (!sh) {
667                 DRV_LOG(ERR, "shared context allocation failure");
668                 rte_errno  = ENOMEM;
669                 goto exit;
670         }
671         /*
672          * Configure environment variable "MLX5_BF_SHUT_UP"
673          * before the device creation. The rdma_core library
674          * checks the variable at device creation and
675          * stores the result internally.
676          */
677         dbmap_env = mlx5_config_doorbell_mapping_env(config);
678         /* Try to open IB device with DV first, then usual Verbs. */
679         errno = 0;
680         sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
681         if (sh->ctx) {
682                 sh->devx = 1;
683                 DRV_LOG(DEBUG, "DevX is supported");
684                 /* The device is created, no need for environment. */
685                 mlx5_restore_doorbell_mapping_env(dbmap_env);
686         } else {
687                 /* The environment variable is still configured. */
688                 sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
689                 err = errno ? errno : ENODEV;
690                 /*
691                  * The environment variable is not needed anymore,
692                  * all device creation attempts are completed.
693                  */
694                 mlx5_restore_doorbell_mapping_env(dbmap_env);
695                 if (!sh->ctx)
696                         goto error;
697                 DRV_LOG(DEBUG, "DevX is NOT supported");
698         }
699         err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
700         if (err) {
701                 DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
702                 goto error;
703         }
704         sh->refcnt = 1;
705         sh->max_port = spawn->max_port;
706         strncpy(sh->ibdev_name, sh->ctx->device->name,
707                 sizeof(sh->ibdev_name));
708         strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
709                 sizeof(sh->ibdev_path));
710         pthread_mutex_init(&sh->intr_mutex, NULL);
711         /*
712          * Setting port_id to max unallowed value means
713          * there is no interrupt subhandler installed for
714          * the given port index i.
715          */
716         for (i = 0; i < sh->max_port; i++) {
717                 sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
718                 sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
719         }
720         sh->pd = mlx5_glue->alloc_pd(sh->ctx);
721         if (sh->pd == NULL) {
722                 DRV_LOG(ERR, "PD allocation failure");
723                 err = ENOMEM;
724                 goto error;
725         }
726 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
727         if (sh->devx) {
728                 err = mlx5_get_pdn(sh->pd, &sh->pdn);
729                 if (err) {
730                         DRV_LOG(ERR, "Fail to extract pdn from PD");
731                         goto error;
732                 }
733                 sh->td = mlx5_devx_cmd_create_td(sh->ctx);
734                 if (!sh->td) {
735                         DRV_LOG(ERR, "TD allocation failure");
736                         err = ENOMEM;
737                         goto error;
738                 }
739                 tis_attr.transport_domain = sh->td->id;
740                 sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
741                 if (!sh->tis) {
742                         DRV_LOG(ERR, "TIS allocation failure");
743                         err = ENOMEM;
744                         goto error;
745                 }
746         }
747         sh->flow_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
748         if (!sh->flow_id_pool) {
749                 DRV_LOG(ERR, "can't create flow id pool");
750                 err = ENOMEM;
751                 goto error;
752         }
753 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
754         /*
755          * Once the device is added to the list of memory event
756          * callback, its global MR cache table cannot be expanded
757          * on the fly because of deadlock. If it overflows, lookup
758          * should be done by searching MR list linearly, which is slow.
759          *
760          * At this point the device is not added to the memory
761          * event list yet, context is just being created.
762          */
763         err = mlx5_mr_btree_init(&sh->share_cache.cache,
764                                  MLX5_MR_BTREE_CACHE_N * 2,
765                                  spawn->pci_dev->device.numa_node);
766         if (err) {
767                 err = rte_errno;
768                 goto error;
769         }
770         mlx5_flow_counters_mng_init(sh);
771         mlx5_flow_ipool_create(sh, config);
772         /* Add device to memory callback list. */
773         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
774         LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
775                          sh, mem_event_cb);
776         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
777         /* Add context to the global device list. */
778         LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
779 exit:
780         pthread_mutex_unlock(&mlx5_ibv_list_mutex);
781         return sh;
782 error:
783         pthread_mutex_unlock(&mlx5_ibv_list_mutex);
784         MLX5_ASSERT(sh);
785         if (sh->tis)
786                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
787         if (sh->td)
788                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
789         if (sh->pd)
790                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
791         if (sh->ctx)
792                 claim_zero(mlx5_glue->close_device(sh->ctx));
793         if (sh->flow_id_pool)
794                 mlx5_flow_id_pool_release(sh->flow_id_pool);
795         rte_free(sh);
796         MLX5_ASSERT(err > 0);
797         rte_errno = err;
798         return NULL;
799 }
800
801 /**
802  * Free shared IB device context. Decrement counter and if zero free
803  * all allocated resources and close handles.
804  *
805  * @param[in] sh
806  *   Pointer to mlx5_ibv_shared object to free
807  */
808 static void
809 mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
810 {
811         pthread_mutex_lock(&mlx5_ibv_list_mutex);
812 #ifdef RTE_LIBRTE_MLX5_DEBUG
813         /* Check the object presence in the list. */
814         struct mlx5_ibv_shared *lctx;
815
816         LIST_FOREACH(lctx, &mlx5_ibv_list, next)
817                 if (lctx == sh)
818                         break;
819         MLX5_ASSERT(lctx);
820         if (lctx != sh) {
821                 DRV_LOG(ERR, "Freeing non-existing shared IB context");
822                 goto exit;
823         }
824 #endif
825         MLX5_ASSERT(sh);
826         MLX5_ASSERT(sh->refcnt);
827         /* Secondary process should not free the shared context. */
828         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
829         if (--sh->refcnt)
830                 goto exit;
831         /* Remove from memory callback device list. */
832         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
833         LIST_REMOVE(sh, mem_event_cb);
834         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
835         /* Release created Memory Regions. */
836         mlx5_mr_release_cache(&sh->share_cache);
837         /* Remove context from the global device list. */
838         LIST_REMOVE(sh, next);
839         /*
840          *  Ensure there is no async event handler installed.
841          *  Only primary process handles async device events.
842          **/
843         mlx5_flow_counters_mng_close(sh);
844         mlx5_flow_ipool_destroy(sh);
845         MLX5_ASSERT(!sh->intr_cnt);
846         if (sh->intr_cnt)
847                 mlx5_intr_callback_unregister
848                         (&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
849 #ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
850         if (sh->devx_intr_cnt) {
851                 if (sh->intr_handle_devx.fd)
852                         rte_intr_callback_unregister(&sh->intr_handle_devx,
853                                           mlx5_dev_interrupt_handler_devx, sh);
854                 if (sh->devx_comp)
855                         mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
856         }
857 #endif
858         pthread_mutex_destroy(&sh->intr_mutex);
859         if (sh->pd)
860                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
861         if (sh->tis)
862                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
863         if (sh->td)
864                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
865         if (sh->ctx)
866                 claim_zero(mlx5_glue->close_device(sh->ctx));
867         if (sh->flow_id_pool)
868                 mlx5_flow_id_pool_release(sh->flow_id_pool);
869         rte_free(sh);
870 exit:
871         pthread_mutex_unlock(&mlx5_ibv_list_mutex);
872 }
873
874 /**
875  * Destroy table hash list and all the root entries per domain.
876  *
877  * @param[in] priv
878  *   Pointer to the private device data structure.
879  */
880 static void
881 mlx5_free_table_hash_list(struct mlx5_priv *priv)
882 {
883         struct mlx5_ibv_shared *sh = priv->sh;
884         struct mlx5_flow_tbl_data_entry *tbl_data;
885         union mlx5_flow_tbl_key table_key = {
886                 {
887                         .table_id = 0,
888                         .reserved = 0,
889                         .domain = 0,
890                         .direction = 0,
891                 }
892         };
893         struct mlx5_hlist_entry *pos;
894
895         if (!sh->flow_tbls)
896                 return;
897         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
898         if (pos) {
899                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
900                                         entry);
901                 MLX5_ASSERT(tbl_data);
902                 mlx5_hlist_remove(sh->flow_tbls, pos);
903                 rte_free(tbl_data);
904         }
905         table_key.direction = 1;
906         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
907         if (pos) {
908                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
909                                         entry);
910                 MLX5_ASSERT(tbl_data);
911                 mlx5_hlist_remove(sh->flow_tbls, pos);
912                 rte_free(tbl_data);
913         }
914         table_key.direction = 0;
915         table_key.domain = 1;
916         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
917         if (pos) {
918                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
919                                         entry);
920                 MLX5_ASSERT(tbl_data);
921                 mlx5_hlist_remove(sh->flow_tbls, pos);
922                 rte_free(tbl_data);
923         }
924         mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
925 }
926
927 /**
928  * Initialize flow table hash list and create the root tables entry
929  * for each domain.
930  *
931  * @param[in] priv
932  *   Pointer to the private device data structure.
933  *
934  * @return
935  *   Zero on success, positive error code otherwise.
936  */
937 static int
938 mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
939 {
940         struct mlx5_ibv_shared *sh = priv->sh;
941         char s[MLX5_HLIST_NAMESIZE];
942         int err = 0;
943
944         MLX5_ASSERT(sh);
945         snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
946         sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
947         if (!sh->flow_tbls) {
948                 DRV_LOG(ERR, "flow tables with hash creation failed.\n");
949                 err = ENOMEM;
950                 return err;
951         }
952 #ifndef HAVE_MLX5DV_DR
953         /*
954          * In case we have not DR support, the zero tables should be created
955          * because DV expect to see them even if they cannot be created by
956          * RDMA-CORE.
957          */
958         union mlx5_flow_tbl_key table_key = {
959                 {
960                         .table_id = 0,
961                         .reserved = 0,
962                         .domain = 0,
963                         .direction = 0,
964                 }
965         };
966         struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
967                                                           sizeof(*tbl_data), 0);
968
969         if (!tbl_data) {
970                 err = ENOMEM;
971                 goto error;
972         }
973         tbl_data->entry.key = table_key.v64;
974         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
975         if (err)
976                 goto error;
977         rte_atomic32_init(&tbl_data->tbl.refcnt);
978         rte_atomic32_inc(&tbl_data->tbl.refcnt);
979         table_key.direction = 1;
980         tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
981         if (!tbl_data) {
982                 err = ENOMEM;
983                 goto error;
984         }
985         tbl_data->entry.key = table_key.v64;
986         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
987         if (err)
988                 goto error;
989         rte_atomic32_init(&tbl_data->tbl.refcnt);
990         rte_atomic32_inc(&tbl_data->tbl.refcnt);
991         table_key.direction = 0;
992         table_key.domain = 1;
993         tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
994         if (!tbl_data) {
995                 err = ENOMEM;
996                 goto error;
997         }
998         tbl_data->entry.key = table_key.v64;
999         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
1000         if (err)
1001                 goto error;
1002         rte_atomic32_init(&tbl_data->tbl.refcnt);
1003         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1004         return err;
1005 error:
1006         mlx5_free_table_hash_list(priv);
1007 #endif /* HAVE_MLX5DV_DR */
1008         return err;
1009 }
1010
1011 /**
1012  * Initialize DR related data within private structure.
1013  * Routine checks the reference counter and does actual
1014  * resources creation/initialization only if counter is zero.
1015  *
1016  * @param[in] priv
1017  *   Pointer to the private device data structure.
1018  *
1019  * @return
1020  *   Zero on success, positive error code otherwise.
1021  */
1022 static int
1023 mlx5_alloc_shared_dr(struct mlx5_priv *priv)
1024 {
1025         struct mlx5_ibv_shared *sh = priv->sh;
1026         char s[MLX5_HLIST_NAMESIZE];
1027         int err = 0;
1028
1029         if (!sh->flow_tbls)
1030                 err = mlx5_alloc_table_hash_list(priv);
1031         else
1032                 DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n",
1033                         (void *)sh->flow_tbls);
1034         if (err)
1035                 return err;
1036         /* Create tags hash list table. */
1037         snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
1038         sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
1039         if (!sh->tag_table) {
1040                 DRV_LOG(ERR, "tags with hash creation failed.\n");
1041                 err = ENOMEM;
1042                 goto error;
1043         }
1044 #ifdef HAVE_MLX5DV_DR
1045         void *domain;
1046
1047         if (sh->dv_refcnt) {
1048                 /* Shared DV/DR structures is already initialized. */
1049                 sh->dv_refcnt++;
1050                 priv->dr_shared = 1;
1051                 return 0;
1052         }
1053         /* Reference counter is zero, we should initialize structures. */
1054         domain = mlx5_glue->dr_create_domain(sh->ctx,
1055                                              MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
1056         if (!domain) {
1057                 DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
1058                 err = errno;
1059                 goto error;
1060         }
1061         sh->rx_domain = domain;
1062         domain = mlx5_glue->dr_create_domain(sh->ctx,
1063                                              MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
1064         if (!domain) {
1065                 DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
1066                 err = errno;
1067                 goto error;
1068         }
1069         pthread_mutex_init(&sh->dv_mutex, NULL);
1070         sh->tx_domain = domain;
1071 #ifdef HAVE_MLX5DV_DR_ESWITCH
1072         if (priv->config.dv_esw_en) {
1073                 domain  = mlx5_glue->dr_create_domain
1074                         (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
1075                 if (!domain) {
1076                         DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
1077                         err = errno;
1078                         goto error;
1079                 }
1080                 sh->fdb_domain = domain;
1081                 sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
1082         }
1083 #endif
1084         sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
1085 #endif /* HAVE_MLX5DV_DR */
1086         sh->dv_refcnt++;
1087         priv->dr_shared = 1;
1088         return 0;
1089 error:
1090         /* Rollback the created objects. */
1091         if (sh->rx_domain) {
1092                 mlx5_glue->dr_destroy_domain(sh->rx_domain);
1093                 sh->rx_domain = NULL;
1094         }
1095         if (sh->tx_domain) {
1096                 mlx5_glue->dr_destroy_domain(sh->tx_domain);
1097                 sh->tx_domain = NULL;
1098         }
1099         if (sh->fdb_domain) {
1100                 mlx5_glue->dr_destroy_domain(sh->fdb_domain);
1101                 sh->fdb_domain = NULL;
1102         }
1103         if (sh->esw_drop_action) {
1104                 mlx5_glue->destroy_flow_action(sh->esw_drop_action);
1105                 sh->esw_drop_action = NULL;
1106         }
1107         if (sh->pop_vlan_action) {
1108                 mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
1109                 sh->pop_vlan_action = NULL;
1110         }
1111         if (sh->tag_table) {
1112                 /* tags should be destroyed with flow before. */
1113                 mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
1114                 sh->tag_table = NULL;
1115         }
1116         mlx5_free_table_hash_list(priv);
1117         return err;
1118 }
1119
1120 /**
1121  * Destroy DR related data within private structure.
1122  *
1123  * @param[in] priv
1124  *   Pointer to the private device data structure.
1125  */
1126 static void
1127 mlx5_free_shared_dr(struct mlx5_priv *priv)
1128 {
1129         struct mlx5_ibv_shared *sh;
1130
1131         if (!priv->dr_shared)
1132                 return;
1133         priv->dr_shared = 0;
1134         sh = priv->sh;
1135         MLX5_ASSERT(sh);
1136 #ifdef HAVE_MLX5DV_DR
1137         MLX5_ASSERT(sh->dv_refcnt);
1138         if (sh->dv_refcnt && --sh->dv_refcnt)
1139                 return;
1140         if (sh->rx_domain) {
1141                 mlx5_glue->dr_destroy_domain(sh->rx_domain);
1142                 sh->rx_domain = NULL;
1143         }
1144         if (sh->tx_domain) {
1145                 mlx5_glue->dr_destroy_domain(sh->tx_domain);
1146                 sh->tx_domain = NULL;
1147         }
1148 #ifdef HAVE_MLX5DV_DR_ESWITCH
1149         if (sh->fdb_domain) {
1150                 mlx5_glue->dr_destroy_domain(sh->fdb_domain);
1151                 sh->fdb_domain = NULL;
1152         }
1153         if (sh->esw_drop_action) {
1154                 mlx5_glue->destroy_flow_action(sh->esw_drop_action);
1155                 sh->esw_drop_action = NULL;
1156         }
1157 #endif
1158         if (sh->pop_vlan_action) {
1159                 mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
1160                 sh->pop_vlan_action = NULL;
1161         }
1162         pthread_mutex_destroy(&sh->dv_mutex);
1163 #endif /* HAVE_MLX5DV_DR */
1164         if (sh->tag_table) {
1165                 /* tags should be destroyed with flow before. */
1166                 mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
1167                 sh->tag_table = NULL;
1168         }
1169         mlx5_free_table_hash_list(priv);
1170 }
1171
1172 /**
1173  * Initialize shared data between primary and secondary process.
1174  *
1175  * A memzone is reserved by primary process and secondary processes attach to
1176  * the memzone.
1177  *
1178  * @return
1179  *   0 on success, a negative errno value otherwise and rte_errno is set.
1180  */
1181 static int
1182 mlx5_init_shared_data(void)
1183 {
1184         const struct rte_memzone *mz;
1185         int ret = 0;
1186
1187         rte_spinlock_lock(&mlx5_shared_data_lock);
1188         if (mlx5_shared_data == NULL) {
1189                 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1190                         /* Allocate shared memory. */
1191                         mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
1192                                                  sizeof(*mlx5_shared_data),
1193                                                  SOCKET_ID_ANY, 0);
1194                         if (mz == NULL) {
1195                                 DRV_LOG(ERR,
1196                                         "Cannot allocate mlx5 shared data");
1197                                 ret = -rte_errno;
1198                                 goto error;
1199                         }
1200                         mlx5_shared_data = mz->addr;
1201                         memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
1202                         rte_spinlock_init(&mlx5_shared_data->lock);
1203                 } else {
1204                         /* Lookup allocated shared memory. */
1205                         mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
1206                         if (mz == NULL) {
1207                                 DRV_LOG(ERR,
1208                                         "Cannot attach mlx5 shared data");
1209                                 ret = -rte_errno;
1210                                 goto error;
1211                         }
1212                         mlx5_shared_data = mz->addr;
1213                         memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
1214                 }
1215         }
1216 error:
1217         rte_spinlock_unlock(&mlx5_shared_data_lock);
1218         return ret;
1219 }
1220
1221 /**
1222  * Retrieve integer value from environment variable.
1223  *
1224  * @param[in] name
1225  *   Environment variable name.
1226  *
1227  * @return
1228  *   Integer value, 0 if the variable is not set.
1229  */
1230 int
1231 mlx5_getenv_int(const char *name)
1232 {
1233         const char *val = getenv(name);
1234
1235         if (val == NULL)
1236                 return 0;
1237         return atoi(val);
1238 }
1239
1240 /**
1241  * Verbs callback to allocate a memory. This function should allocate the space
1242  * according to the size provided residing inside a huge page.
1243  * Please note that all allocation must respect the alignment from libmlx5
1244  * (i.e. currently sysconf(_SC_PAGESIZE)).
1245  *
1246  * @param[in] size
1247  *   The size in bytes of the memory to allocate.
1248  * @param[in] data
1249  *   A pointer to the callback data.
1250  *
1251  * @return
1252  *   Allocated buffer, NULL otherwise and rte_errno is set.
1253  */
1254 static void *
1255 mlx5_alloc_verbs_buf(size_t size, void *data)
1256 {
1257         struct mlx5_priv *priv = data;
1258         void *ret;
1259         size_t alignment = sysconf(_SC_PAGESIZE);
1260         unsigned int socket = SOCKET_ID_ANY;
1261
1262         if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
1263                 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
1264
1265                 socket = ctrl->socket;
1266         } else if (priv->verbs_alloc_ctx.type ==
1267                    MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
1268                 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
1269
1270                 socket = ctrl->socket;
1271         }
1272         MLX5_ASSERT(data != NULL);
1273         ret = rte_malloc_socket(__func__, size, alignment, socket);
1274         if (!ret && size)
1275                 rte_errno = ENOMEM;
1276         return ret;
1277 }
1278
1279 /**
1280  * Verbs callback to free a memory.
1281  *
1282  * @param[in] ptr
1283  *   A pointer to the memory to free.
1284  * @param[in] data
1285  *   A pointer to the callback data.
1286  */
1287 static void
1288 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
1289 {
1290         MLX5_ASSERT(data != NULL);
1291         rte_free(ptr);
1292 }
1293
1294 /**
1295  * DPDK callback to add udp tunnel port
1296  *
1297  * @param[in] dev
1298  *   A pointer to eth_dev
1299  * @param[in] udp_tunnel
1300  *   A pointer to udp tunnel
1301  *
1302  * @return
1303  *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
1304  */
1305 int
1306 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
1307                          struct rte_eth_udp_tunnel *udp_tunnel)
1308 {
1309         MLX5_ASSERT(udp_tunnel != NULL);
1310         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
1311             udp_tunnel->udp_port == 4789)
1312                 return 0;
1313         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
1314             udp_tunnel->udp_port == 4790)
1315                 return 0;
1316         return -ENOTSUP;
1317 }
1318
1319 /**
1320  * Initialize process private data structure.
1321  *
1322  * @param dev
1323  *   Pointer to Ethernet device structure.
1324  *
1325  * @return
1326  *   0 on success, a negative errno value otherwise and rte_errno is set.
1327  */
1328 int
1329 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1330 {
1331         struct mlx5_priv *priv = dev->data->dev_private;
1332         struct mlx5_proc_priv *ppriv;
1333         size_t ppriv_size;
1334
1335         /*
1336          * UAR register table follows the process private structure. BlueFlame
1337          * registers for Tx queues are stored in the table.
1338          */
1339         ppriv_size =
1340                 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1341         ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1342                                   RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1343         if (!ppriv) {
1344                 rte_errno = ENOMEM;
1345                 return -rte_errno;
1346         }
1347         ppriv->uar_table_sz = ppriv_size;
1348         dev->process_private = ppriv;
1349         return 0;
1350 }
1351
1352 /**
1353  * Un-initialize process private data structure.
1354  *
1355  * @param dev
1356  *   Pointer to Ethernet device structure.
1357  */
1358 static void
1359 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1360 {
1361         if (!dev->process_private)
1362                 return;
1363         rte_free(dev->process_private);
1364         dev->process_private = NULL;
1365 }
1366
1367 /**
1368  * DPDK callback to close the device.
1369  *
1370  * Destroy all queues and objects, free memory.
1371  *
1372  * @param dev
1373  *   Pointer to Ethernet device structure.
1374  */
1375 static void
1376 mlx5_dev_close(struct rte_eth_dev *dev)
1377 {
1378         struct mlx5_priv *priv = dev->data->dev_private;
1379         unsigned int i;
1380         int ret;
1381
1382         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1383                 dev->data->port_id,
1384                 ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
1385         /* In case mlx5_dev_stop() has not been called. */
1386         mlx5_dev_interrupt_handler_uninstall(dev);
1387         mlx5_dev_interrupt_handler_devx_uninstall(dev);
1388         /*
1389          * If default mreg copy action is removed at the stop stage,
1390          * the search will return none and nothing will be done anymore.
1391          */
1392         mlx5_flow_stop_default(dev);
1393         mlx5_traffic_disable(dev);
1394         /*
1395          * If all the flows are already flushed in the device stop stage,
1396          * then this will return directly without any action.
1397          */
1398         mlx5_flow_list_flush(dev, &priv->flows, true);
1399         mlx5_flow_meter_flush(dev, NULL);
1400         /* Free the intermediate buffers for flow creation. */
1401         mlx5_flow_free_intermediate(dev);
1402         /* Prevent crashes when queues are still in use. */
1403         dev->rx_pkt_burst = removed_rx_burst;
1404         dev->tx_pkt_burst = removed_tx_burst;
1405         rte_wmb();
1406         /* Disable datapath on secondary process. */
1407         mlx5_mp_req_stop_rxtx(dev);
1408         if (priv->rxqs != NULL) {
1409                 /* XXX race condition if mlx5_rx_burst() is still running. */
1410                 usleep(1000);
1411                 for (i = 0; (i != priv->rxqs_n); ++i)
1412                         mlx5_rxq_release(dev, i);
1413                 priv->rxqs_n = 0;
1414                 priv->rxqs = NULL;
1415         }
1416         if (priv->txqs != NULL) {
1417                 /* XXX race condition if mlx5_tx_burst() is still running. */
1418                 usleep(1000);
1419                 for (i = 0; (i != priv->txqs_n); ++i)
1420                         mlx5_txq_release(dev, i);
1421                 priv->txqs_n = 0;
1422                 priv->txqs = NULL;
1423         }
1424         mlx5_proc_priv_uninit(dev);
1425         if (priv->mreg_cp_tbl)
1426                 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1427         mlx5_mprq_free_mp(dev);
1428         mlx5_free_shared_dr(priv);
1429         if (priv->rss_conf.rss_key != NULL)
1430                 rte_free(priv->rss_conf.rss_key);
1431         if (priv->reta_idx != NULL)
1432                 rte_free(priv->reta_idx);
1433         if (priv->config.vf)
1434                 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1435                                        dev->data->mac_addrs,
1436                                        MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1437         if (priv->nl_socket_route >= 0)
1438                 close(priv->nl_socket_route);
1439         if (priv->nl_socket_rdma >= 0)
1440                 close(priv->nl_socket_rdma);
1441         if (priv->vmwa_context)
1442                 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1443         ret = mlx5_hrxq_verify(dev);
1444         if (ret)
1445                 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1446                         dev->data->port_id);
1447         ret = mlx5_ind_table_obj_verify(dev);
1448         if (ret)
1449                 DRV_LOG(WARNING, "port %u some indirection table still remain",
1450                         dev->data->port_id);
1451         ret = mlx5_rxq_obj_verify(dev);
1452         if (ret)
1453                 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1454                         dev->data->port_id);
1455         ret = mlx5_rxq_verify(dev);
1456         if (ret)
1457                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1458                         dev->data->port_id);
1459         ret = mlx5_txq_obj_verify(dev);
1460         if (ret)
1461                 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1462                         dev->data->port_id);
1463         ret = mlx5_txq_verify(dev);
1464         if (ret)
1465                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1466                         dev->data->port_id);
1467         ret = mlx5_flow_verify(dev);
1468         if (ret)
1469                 DRV_LOG(WARNING, "port %u some flows still remain",
1470                         dev->data->port_id);
1471         if (priv->sh) {
1472                 /*
1473                  * Free the shared context in last turn, because the cleanup
1474                  * routines above may use some shared fields, like
1475                  * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieveing
1476                  * ifindex if Netlink fails.
1477                  */
1478                 mlx5_free_shared_ibctx(priv->sh);
1479                 priv->sh = NULL;
1480         }
1481         if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1482                 unsigned int c = 0;
1483                 uint16_t port_id;
1484
1485                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1486                         struct mlx5_priv *opriv =
1487                                 rte_eth_devices[port_id].data->dev_private;
1488
1489                         if (!opriv ||
1490                             opriv->domain_id != priv->domain_id ||
1491                             &rte_eth_devices[port_id] == dev)
1492                                 continue;
1493                         ++c;
1494                         break;
1495                 }
1496                 if (!c)
1497                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1498         }
1499         memset(priv, 0, sizeof(*priv));
1500         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1501         /*
1502          * Reset mac_addrs to NULL such that it is not freed as part of
1503          * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1504          * it is freed when dev_private is freed.
1505          */
1506         dev->data->mac_addrs = NULL;
1507 }
1508
1509 const struct eth_dev_ops mlx5_dev_ops = {
1510         .dev_configure = mlx5_dev_configure,
1511         .dev_start = mlx5_dev_start,
1512         .dev_stop = mlx5_dev_stop,
1513         .dev_set_link_down = mlx5_set_link_down,
1514         .dev_set_link_up = mlx5_set_link_up,
1515         .dev_close = mlx5_dev_close,
1516         .promiscuous_enable = mlx5_promiscuous_enable,
1517         .promiscuous_disable = mlx5_promiscuous_disable,
1518         .allmulticast_enable = mlx5_allmulticast_enable,
1519         .allmulticast_disable = mlx5_allmulticast_disable,
1520         .link_update = mlx5_link_update,
1521         .stats_get = mlx5_stats_get,
1522         .stats_reset = mlx5_stats_reset,
1523         .xstats_get = mlx5_xstats_get,
1524         .xstats_reset = mlx5_xstats_reset,
1525         .xstats_get_names = mlx5_xstats_get_names,
1526         .fw_version_get = mlx5_fw_version_get,
1527         .dev_infos_get = mlx5_dev_infos_get,
1528         .read_clock = mlx5_read_clock,
1529         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1530         .vlan_filter_set = mlx5_vlan_filter_set,
1531         .rx_queue_setup = mlx5_rx_queue_setup,
1532         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1533         .tx_queue_setup = mlx5_tx_queue_setup,
1534         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1535         .rx_queue_release = mlx5_rx_queue_release,
1536         .tx_queue_release = mlx5_tx_queue_release,
1537         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1538         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1539         .mac_addr_remove = mlx5_mac_addr_remove,
1540         .mac_addr_add = mlx5_mac_addr_add,
1541         .mac_addr_set = mlx5_mac_addr_set,
1542         .set_mc_addr_list = mlx5_set_mc_addr_list,
1543         .mtu_set = mlx5_dev_set_mtu,
1544         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1545         .vlan_offload_set = mlx5_vlan_offload_set,
1546         .reta_update = mlx5_dev_rss_reta_update,
1547         .reta_query = mlx5_dev_rss_reta_query,
1548         .rss_hash_update = mlx5_rss_hash_update,
1549         .rss_hash_conf_get = mlx5_rss_hash_conf_get,
1550         .filter_ctrl = mlx5_dev_filter_ctrl,
1551         .rx_descriptor_status = mlx5_rx_descriptor_status,
1552         .tx_descriptor_status = mlx5_tx_descriptor_status,
1553         .rxq_info_get = mlx5_rxq_info_get,
1554         .txq_info_get = mlx5_txq_info_get,
1555         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1556         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1557         .rx_queue_count = mlx5_rx_queue_count,
1558         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1559         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1560         .is_removed = mlx5_is_removed,
1561         .udp_tunnel_port_add  = mlx5_udp_tunnel_port_add,
1562         .get_module_info = mlx5_get_module_info,
1563         .get_module_eeprom = mlx5_get_module_eeprom,
1564         .hairpin_cap_get = mlx5_hairpin_cap_get,
1565         .mtr_ops_get = mlx5_flow_meter_ops_get,
1566 };
1567
1568 /* Available operations from secondary process. */
1569 static const struct eth_dev_ops mlx5_dev_sec_ops = {
1570         .stats_get = mlx5_stats_get,
1571         .stats_reset = mlx5_stats_reset,
1572         .xstats_get = mlx5_xstats_get,
1573         .xstats_reset = mlx5_xstats_reset,
1574         .xstats_get_names = mlx5_xstats_get_names,
1575         .fw_version_get = mlx5_fw_version_get,
1576         .dev_infos_get = mlx5_dev_infos_get,
1577         .rx_descriptor_status = mlx5_rx_descriptor_status,
1578         .tx_descriptor_status = mlx5_tx_descriptor_status,
1579         .rxq_info_get = mlx5_rxq_info_get,
1580         .txq_info_get = mlx5_txq_info_get,
1581         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1582         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1583         .get_module_info = mlx5_get_module_info,
1584         .get_module_eeprom = mlx5_get_module_eeprom,
1585 };
1586
1587 /* Available operations in flow isolated mode. */
1588 const struct eth_dev_ops mlx5_dev_ops_isolate = {
1589         .dev_configure = mlx5_dev_configure,
1590         .dev_start = mlx5_dev_start,
1591         .dev_stop = mlx5_dev_stop,
1592         .dev_set_link_down = mlx5_set_link_down,
1593         .dev_set_link_up = mlx5_set_link_up,
1594         .dev_close = mlx5_dev_close,
1595         .promiscuous_enable = mlx5_promiscuous_enable,
1596         .promiscuous_disable = mlx5_promiscuous_disable,
1597         .allmulticast_enable = mlx5_allmulticast_enable,
1598         .allmulticast_disable = mlx5_allmulticast_disable,
1599         .link_update = mlx5_link_update,
1600         .stats_get = mlx5_stats_get,
1601         .stats_reset = mlx5_stats_reset,
1602         .xstats_get = mlx5_xstats_get,
1603         .xstats_reset = mlx5_xstats_reset,
1604         .xstats_get_names = mlx5_xstats_get_names,
1605         .fw_version_get = mlx5_fw_version_get,
1606         .dev_infos_get = mlx5_dev_infos_get,
1607         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1608         .vlan_filter_set = mlx5_vlan_filter_set,
1609         .rx_queue_setup = mlx5_rx_queue_setup,
1610         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1611         .tx_queue_setup = mlx5_tx_queue_setup,
1612         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1613         .rx_queue_release = mlx5_rx_queue_release,
1614         .tx_queue_release = mlx5_tx_queue_release,
1615         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1616         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1617         .mac_addr_remove = mlx5_mac_addr_remove,
1618         .mac_addr_add = mlx5_mac_addr_add,
1619         .mac_addr_set = mlx5_mac_addr_set,
1620         .set_mc_addr_list = mlx5_set_mc_addr_list,
1621         .mtu_set = mlx5_dev_set_mtu,
1622         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1623         .vlan_offload_set = mlx5_vlan_offload_set,
1624         .filter_ctrl = mlx5_dev_filter_ctrl,
1625         .rx_descriptor_status = mlx5_rx_descriptor_status,
1626         .tx_descriptor_status = mlx5_tx_descriptor_status,
1627         .rxq_info_get = mlx5_rxq_info_get,
1628         .txq_info_get = mlx5_txq_info_get,
1629         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1630         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1631         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1632         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1633         .is_removed = mlx5_is_removed,
1634         .get_module_info = mlx5_get_module_info,
1635         .get_module_eeprom = mlx5_get_module_eeprom,
1636         .hairpin_cap_get = mlx5_hairpin_cap_get,
1637         .mtr_ops_get = mlx5_flow_meter_ops_get,
1638 };
1639
1640 /**
1641  * Verify and store value for device argument.
1642  *
1643  * @param[in] key
1644  *   Key argument to verify.
1645  * @param[in] val
1646  *   Value associated with key.
1647  * @param opaque
1648  *   User data.
1649  *
1650  * @return
1651  *   0 on success, a negative errno value otherwise and rte_errno is set.
1652  */
1653 static int
1654 mlx5_args_check(const char *key, const char *val, void *opaque)
1655 {
1656         struct mlx5_dev_config *config = opaque;
1657         unsigned long tmp;
1658
1659         /* No-op, port representors are processed in mlx5_dev_spawn(). */
1660         if (!strcmp(MLX5_REPRESENTOR, key))
1661                 return 0;
1662         errno = 0;
1663         tmp = strtoul(val, NULL, 0);
1664         if (errno) {
1665                 rte_errno = errno;
1666                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1667                 return -rte_errno;
1668         }
1669         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1670                 config->cqe_comp = !!tmp;
1671         } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1672                 config->cqe_pad = !!tmp;
1673         } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1674                 config->hw_padding = !!tmp;
1675         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1676                 config->mprq.enabled = !!tmp;
1677         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1678                 config->mprq.stride_num_n = tmp;
1679         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1680                 config->mprq.stride_size_n = tmp;
1681         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1682                 config->mprq.max_memcpy_len = tmp;
1683         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1684                 config->mprq.min_rxqs_num = tmp;
1685         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1686                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1687                                  " converted to txq_inline_max", key);
1688                 config->txq_inline_max = tmp;
1689         } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1690                 config->txq_inline_max = tmp;
1691         } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1692                 config->txq_inline_min = tmp;
1693         } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1694                 config->txq_inline_mpw = tmp;
1695         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1696                 config->txqs_inline = tmp;
1697         } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1698                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1699         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1700                 config->mps = !!tmp;
1701         } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1702                 if (tmp != MLX5_TXDB_CACHED &&
1703                     tmp != MLX5_TXDB_NCACHED &&
1704                     tmp != MLX5_TXDB_HEURISTIC) {
1705                         DRV_LOG(ERR, "invalid Tx doorbell "
1706                                      "mapping parameter");
1707                         rte_errno = EINVAL;
1708                         return -rte_errno;
1709                 }
1710                 config->dbnc = tmp;
1711         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1712                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1713         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1714                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1715                                  " converted to txq_inline_mpw", key);
1716                 config->txq_inline_mpw = tmp;
1717         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1718                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1719         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1720                 config->rx_vec_en = !!tmp;
1721         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1722                 config->l3_vxlan_en = !!tmp;
1723         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1724                 config->vf_nl_en = !!tmp;
1725         } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1726                 config->dv_esw_en = !!tmp;
1727         } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1728                 config->dv_flow_en = !!tmp;
1729         } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1730                 if (tmp != MLX5_XMETA_MODE_LEGACY &&
1731                     tmp != MLX5_XMETA_MODE_META16 &&
1732                     tmp != MLX5_XMETA_MODE_META32) {
1733                         DRV_LOG(ERR, "invalid extensive "
1734                                      "metadata parameter");
1735                         rte_errno = EINVAL;
1736                         return -rte_errno;
1737                 }
1738                 config->dv_xmeta_en = tmp;
1739         } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1740                 config->mr_ext_memseg_en = !!tmp;
1741         } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1742                 config->max_dump_files_num = tmp;
1743         } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1744                 config->lro.timeout = tmp;
1745         } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1746                 DRV_LOG(DEBUG, "class argument is %s.", val);
1747         } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1748                 config->log_hp_size = tmp;
1749         } else {
1750                 DRV_LOG(WARNING, "%s: unknown parameter", key);
1751                 rte_errno = EINVAL;
1752                 return -rte_errno;
1753         }
1754         return 0;
1755 }
1756
1757 /**
1758  * Parse device parameters.
1759  *
1760  * @param config
1761  *   Pointer to device configuration structure.
1762  * @param devargs
1763  *   Device arguments structure.
1764  *
1765  * @return
1766  *   0 on success, a negative errno value otherwise and rte_errno is set.
1767  */
1768 static int
1769 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1770 {
1771         const char **params = (const char *[]){
1772                 MLX5_RXQ_CQE_COMP_EN,
1773                 MLX5_RXQ_CQE_PAD_EN,
1774                 MLX5_RXQ_PKT_PAD_EN,
1775                 MLX5_RX_MPRQ_EN,
1776                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1777                 MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1778                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1779                 MLX5_RXQS_MIN_MPRQ,
1780                 MLX5_TXQ_INLINE,
1781                 MLX5_TXQ_INLINE_MIN,
1782                 MLX5_TXQ_INLINE_MAX,
1783                 MLX5_TXQ_INLINE_MPW,
1784                 MLX5_TXQS_MIN_INLINE,
1785                 MLX5_TXQS_MAX_VEC,
1786                 MLX5_TXQ_MPW_EN,
1787                 MLX5_TXQ_MPW_HDR_DSEG_EN,
1788                 MLX5_TXQ_MAX_INLINE_LEN,
1789                 MLX5_TX_DB_NC,
1790                 MLX5_TX_VEC_EN,
1791                 MLX5_RX_VEC_EN,
1792                 MLX5_L3_VXLAN_EN,
1793                 MLX5_VF_NL_EN,
1794                 MLX5_DV_ESW_EN,
1795                 MLX5_DV_FLOW_EN,
1796                 MLX5_DV_XMETA_EN,
1797                 MLX5_MR_EXT_MEMSEG_EN,
1798                 MLX5_REPRESENTOR,
1799                 MLX5_MAX_DUMP_FILES_NUM,
1800                 MLX5_LRO_TIMEOUT_USEC,
1801                 MLX5_CLASS_ARG_NAME,
1802                 MLX5_HP_BUF_SIZE,
1803                 NULL,
1804         };
1805         struct rte_kvargs *kvlist;
1806         int ret = 0;
1807         int i;
1808
1809         if (devargs == NULL)
1810                 return 0;
1811         /* Following UGLY cast is done to pass checkpatch. */
1812         kvlist = rte_kvargs_parse(devargs->args, params);
1813         if (kvlist == NULL) {
1814                 rte_errno = EINVAL;
1815                 return -rte_errno;
1816         }
1817         /* Process parameters. */
1818         for (i = 0; (params[i] != NULL); ++i) {
1819                 if (rte_kvargs_count(kvlist, params[i])) {
1820                         ret = rte_kvargs_process(kvlist, params[i],
1821                                                  mlx5_args_check, config);
1822                         if (ret) {
1823                                 rte_errno = EINVAL;
1824                                 rte_kvargs_free(kvlist);
1825                                 return -rte_errno;
1826                         }
1827                 }
1828         }
1829         rte_kvargs_free(kvlist);
1830         return 0;
1831 }
1832
1833 static struct rte_pci_driver mlx5_driver;
1834
1835 /**
1836  * PMD global initialization.
1837  *
1838  * Independent from individual device, this function initializes global
1839  * per-PMD data structures distinguishing primary and secondary processes.
1840  * Hence, each initialization is called once per a process.
1841  *
1842  * @return
1843  *   0 on success, a negative errno value otherwise and rte_errno is set.
1844  */
1845 static int
1846 mlx5_init_once(void)
1847 {
1848         struct mlx5_shared_data *sd;
1849         struct mlx5_local_data *ld = &mlx5_local_data;
1850         int ret = 0;
1851
1852         if (mlx5_init_shared_data())
1853                 return -rte_errno;
1854         sd = mlx5_shared_data;
1855         MLX5_ASSERT(sd);
1856         rte_spinlock_lock(&sd->lock);
1857         switch (rte_eal_process_type()) {
1858         case RTE_PROC_PRIMARY:
1859                 if (sd->init_done)
1860                         break;
1861                 LIST_INIT(&sd->mem_event_cb_list);
1862                 rte_rwlock_init(&sd->mem_event_rwlock);
1863                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
1864                                                 mlx5_mr_mem_event_cb, NULL);
1865                 ret = mlx5_mp_init_primary(MLX5_MP_NAME,
1866                                            mlx5_mp_primary_handle);
1867                 if (ret)
1868                         goto out;
1869                 sd->init_done = true;
1870                 break;
1871         case RTE_PROC_SECONDARY:
1872                 if (ld->init_done)
1873                         break;
1874                 ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
1875                                              mlx5_mp_secondary_handle);
1876                 if (ret)
1877                         goto out;
1878                 ++sd->secondary_cnt;
1879                 ld->init_done = true;
1880                 break;
1881         default:
1882                 break;
1883         }
1884 out:
1885         rte_spinlock_unlock(&sd->lock);
1886         return ret;
1887 }
1888
1889 /**
1890  * Configures the minimal amount of data to inline into WQE
1891  * while sending packets.
1892  *
1893  * - the txq_inline_min has the maximal priority, if this
1894  *   key is specified in devargs
1895  * - if DevX is enabled the inline mode is queried from the
1896  *   device (HCA attributes and NIC vport context if needed).
1897  * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
1898  *   and none (0 bytes) for other NICs
1899  *
1900  * @param spawn
1901  *   Verbs device parameters (name, port, switch_info) to spawn.
1902  * @param config
1903  *   Device configuration parameters.
1904  */
1905 static void
1906 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
1907                     struct mlx5_dev_config *config)
1908 {
1909         if (config->txq_inline_min != MLX5_ARG_UNSET) {
1910                 /* Application defines size of inlined data explicitly. */
1911                 switch (spawn->pci_dev->id.device_id) {
1912                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1913                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1914                         if (config->txq_inline_min <
1915                                        (int)MLX5_INLINE_HSIZE_L2) {
1916                                 DRV_LOG(DEBUG,
1917                                         "txq_inline_mix aligned to minimal"
1918                                         " ConnectX-4 required value %d",
1919                                         (int)MLX5_INLINE_HSIZE_L2);
1920                                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1921                         }
1922                         break;
1923                 }
1924                 goto exit;
1925         }
1926         if (config->hca_attr.eth_net_offloads) {
1927                 /* We have DevX enabled, inline mode queried successfully. */
1928                 switch (config->hca_attr.wqe_inline_mode) {
1929                 case MLX5_CAP_INLINE_MODE_L2:
1930                         /* outer L2 header must be inlined. */
1931                         config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1932                         goto exit;
1933                 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
1934                         /* No inline data are required by NIC. */
1935                         config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1936                         config->hw_vlan_insert =
1937                                 config->hca_attr.wqe_vlan_insert;
1938                         DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
1939                         goto exit;
1940                 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
1941                         /* inline mode is defined by NIC vport context. */
1942                         if (!config->hca_attr.eth_virt)
1943                                 break;
1944                         switch (config->hca_attr.vport_inline_mode) {
1945                         case MLX5_INLINE_MODE_NONE:
1946                                 config->txq_inline_min =
1947                                         MLX5_INLINE_HSIZE_NONE;
1948                                 goto exit;
1949                         case MLX5_INLINE_MODE_L2:
1950                                 config->txq_inline_min =
1951                                         MLX5_INLINE_HSIZE_L2;
1952                                 goto exit;
1953                         case MLX5_INLINE_MODE_IP:
1954                                 config->txq_inline_min =
1955                                         MLX5_INLINE_HSIZE_L3;
1956                                 goto exit;
1957                         case MLX5_INLINE_MODE_TCP_UDP:
1958                                 config->txq_inline_min =
1959                                         MLX5_INLINE_HSIZE_L4;
1960                                 goto exit;
1961                         case MLX5_INLINE_MODE_INNER_L2:
1962                                 config->txq_inline_min =
1963                                         MLX5_INLINE_HSIZE_INNER_L2;
1964                                 goto exit;
1965                         case MLX5_INLINE_MODE_INNER_IP:
1966                                 config->txq_inline_min =
1967                                         MLX5_INLINE_HSIZE_INNER_L3;
1968                                 goto exit;
1969                         case MLX5_INLINE_MODE_INNER_TCP_UDP:
1970                                 config->txq_inline_min =
1971                                         MLX5_INLINE_HSIZE_INNER_L4;
1972                                 goto exit;
1973                         }
1974                 }
1975         }
1976         /*
1977          * We get here if we are unable to deduce
1978          * inline data size with DevX. Try PCI ID
1979          * to determine old NICs.
1980          */
1981         switch (spawn->pci_dev->id.device_id) {
1982         case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1983         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1984         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
1985         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
1986                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1987                 config->hw_vlan_insert = 0;
1988                 break;
1989         case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
1990         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
1991         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
1992         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
1993                 /*
1994                  * These NICs support VLAN insertion from WQE and
1995                  * report the wqe_vlan_insert flag. But there is the bug
1996                  * and PFC control may be broken, so disable feature.
1997                  */
1998                 config->hw_vlan_insert = 0;
1999                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2000                 break;
2001         default:
2002                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
2003                 break;
2004         }
2005 exit:
2006         DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
2007 }
2008
2009 /**
2010  * Configures the metadata mask fields in the shared context.
2011  *
2012  * @param [in] dev
2013  *   Pointer to Ethernet device.
2014  */
2015 static void
2016 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
2017 {
2018         struct mlx5_priv *priv = dev->data->dev_private;
2019         struct mlx5_ibv_shared *sh = priv->sh;
2020         uint32_t meta, mark, reg_c0;
2021
2022         reg_c0 = ~priv->vport_meta_mask;
2023         switch (priv->config.dv_xmeta_en) {
2024         case MLX5_XMETA_MODE_LEGACY:
2025                 meta = UINT32_MAX;
2026                 mark = MLX5_FLOW_MARK_MASK;
2027                 break;
2028         case MLX5_XMETA_MODE_META16:
2029                 meta = reg_c0 >> rte_bsf32(reg_c0);
2030                 mark = MLX5_FLOW_MARK_MASK;
2031                 break;
2032         case MLX5_XMETA_MODE_META32:
2033                 meta = UINT32_MAX;
2034                 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
2035                 break;
2036         default:
2037                 meta = 0;
2038                 mark = 0;
2039                 MLX5_ASSERT(false);
2040                 break;
2041         }
2042         if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
2043                 DRV_LOG(WARNING, "metadata MARK mask mismatche %08X:%08X",
2044                                  sh->dv_mark_mask, mark);
2045         else
2046                 sh->dv_mark_mask = mark;
2047         if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
2048                 DRV_LOG(WARNING, "metadata META mask mismatche %08X:%08X",
2049                                  sh->dv_meta_mask, meta);
2050         else
2051                 sh->dv_meta_mask = meta;
2052         if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
2053                 DRV_LOG(WARNING, "metadata reg_c0 mask mismatche %08X:%08X",
2054                                  sh->dv_meta_mask, reg_c0);
2055         else
2056                 sh->dv_regc0_mask = reg_c0;
2057         DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
2058         DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
2059         DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
2060         DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
2061 }
2062
2063 /**
2064  * Allocate page of door-bells and register it using DevX API.
2065  *
2066  * @param [in] dev
2067  *   Pointer to Ethernet device.
2068  *
2069  * @return
2070  *   Pointer to new page on success, NULL otherwise.
2071  */
2072 static struct mlx5_devx_dbr_page *
2073 mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
2074 {
2075         struct mlx5_priv *priv = dev->data->dev_private;
2076         struct mlx5_devx_dbr_page *page;
2077
2078         /* Allocate space for door-bell page and management data. */
2079         page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
2080                                  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
2081         if (!page) {
2082                 DRV_LOG(ERR, "port %u cannot allocate dbr page",
2083                         dev->data->port_id);
2084                 return NULL;
2085         }
2086         /* Register allocated memory. */
2087         page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
2088                                               MLX5_DBR_PAGE_SIZE, 0);
2089         if (!page->umem) {
2090                 DRV_LOG(ERR, "port %u cannot umem reg dbr page",
2091                         dev->data->port_id);
2092                 rte_free(page);
2093                 return NULL;
2094         }
2095         return page;
2096 }
2097
2098 /**
2099  * Find the next available door-bell, allocate new page if needed.
2100  *
2101  * @param [in] dev
2102  *   Pointer to Ethernet device.
2103  * @param [out] dbr_page
2104  *   Door-bell page containing the page data.
2105  *
2106  * @return
2107  *   Door-bell address offset on success, a negative error value otherwise.
2108  */
2109 int64_t
2110 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
2111 {
2112         struct mlx5_priv *priv = dev->data->dev_private;
2113         struct mlx5_devx_dbr_page *page = NULL;
2114         uint32_t i, j;
2115
2116         LIST_FOREACH(page, &priv->dbrpgs, next)
2117                 if (page->dbr_count < MLX5_DBR_PER_PAGE)
2118                         break;
2119         if (!page) { /* No page with free door-bell exists. */
2120                 page = mlx5_alloc_dbr_page(dev);
2121                 if (!page) /* Failed to allocate new page. */
2122                         return (-1);
2123                 LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
2124         }
2125         /* Loop to find bitmap part with clear bit. */
2126         for (i = 0;
2127              i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
2128              i++)
2129                 ; /* Empty. */
2130         /* Find the first clear bit. */
2131         j = rte_bsf64(~page->dbr_bitmap[i]);
2132         MLX5_ASSERT(i < (MLX5_DBR_PER_PAGE / 64));
2133         page->dbr_bitmap[i] |= (1 << j);
2134         page->dbr_count++;
2135         *dbr_page = page;
2136         return (((i * 64) + j) * sizeof(uint64_t));
2137 }
2138
2139 /**
2140  * Release a door-bell record.
2141  *
2142  * @param [in] dev
2143  *   Pointer to Ethernet device.
2144  * @param [in] umem_id
2145  *   UMEM ID of page containing the door-bell record to release.
2146  * @param [in] offset
2147  *   Offset of door-bell record in page.
2148  *
2149  * @return
2150  *   0 on success, a negative error value otherwise.
2151  */
2152 int32_t
2153 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
2154 {
2155         struct mlx5_priv *priv = dev->data->dev_private;
2156         struct mlx5_devx_dbr_page *page = NULL;
2157         int ret = 0;
2158
2159         LIST_FOREACH(page, &priv->dbrpgs, next)
2160                 /* Find the page this address belongs to. */
2161                 if (page->umem->umem_id == umem_id)
2162                         break;
2163         if (!page)
2164                 return -EINVAL;
2165         page->dbr_count--;
2166         if (!page->dbr_count) {
2167                 /* Page not used, free it and remove from list. */
2168                 LIST_REMOVE(page, next);
2169                 if (page->umem)
2170                         ret = -mlx5_glue->devx_umem_dereg(page->umem);
2171                 rte_free(page);
2172         } else {
2173                 /* Mark in bitmap that this door-bell is not in use. */
2174                 offset /= MLX5_DBR_SIZE;
2175                 int i = offset / 64;
2176                 int j = offset % 64;
2177
2178                 page->dbr_bitmap[i] &= ~(1 << j);
2179         }
2180         return ret;
2181 }
2182
2183 int
2184 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
2185 {
2186         static const char *const dynf_names[] = {
2187                 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
2188                 RTE_MBUF_DYNFLAG_METADATA_NAME
2189         };
2190         unsigned int i;
2191
2192         if (n < RTE_DIM(dynf_names))
2193                 return -ENOMEM;
2194         for (i = 0; i < RTE_DIM(dynf_names); i++) {
2195                 if (names[i] == NULL)
2196                         return -EINVAL;
2197                 strcpy(names[i], dynf_names[i]);
2198         }
2199         return RTE_DIM(dynf_names);
2200 }
2201
2202 /**
2203  * Check sibling device configurations.
2204  *
2205  * Sibling devices sharing the Infiniband device context
2206  * should have compatible configurations. This regards
2207  * representors and bonding slaves.
2208  *
2209  * @param priv
2210  *   Private device descriptor.
2211  * @param config
2212  *   Configuration of the device is going to be created.
2213  *
2214  * @return
2215  *   0 on success, EINVAL otherwise
2216  */
2217 static int
2218 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
2219                               struct mlx5_dev_config *config)
2220 {
2221         struct mlx5_ibv_shared *sh = priv->sh;
2222         struct mlx5_dev_config *sh_conf = NULL;
2223         uint16_t port_id;
2224
2225         MLX5_ASSERT(sh);
2226         /* Nothing to compare for the single/first device. */
2227         if (sh->refcnt == 1)
2228                 return 0;
2229         /* Find the device with shared context. */
2230         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2231                 struct mlx5_priv *opriv =
2232                         rte_eth_devices[port_id].data->dev_private;
2233
2234                 if (opriv && opriv != priv && opriv->sh == sh) {
2235                         sh_conf = &opriv->config;
2236                         break;
2237                 }
2238         }
2239         if (!sh_conf)
2240                 return 0;
2241         if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
2242                 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
2243                              " for shared %s context", sh->ibdev_name);
2244                 rte_errno = EINVAL;
2245                 return rte_errno;
2246         }
2247         if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
2248                 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
2249                              " for shared %s context", sh->ibdev_name);
2250                 rte_errno = EINVAL;
2251                 return rte_errno;
2252         }
2253         return 0;
2254 }
2255 /**
2256  * Spawn an Ethernet device from Verbs information.
2257  *
2258  * @param dpdk_dev
2259  *   Backing DPDK device.
2260  * @param spawn
2261  *   Verbs device parameters (name, port, switch_info) to spawn.
2262  * @param config
2263  *   Device configuration parameters.
2264  *
2265  * @return
2266  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
2267  *   is set. The following errors are defined:
2268  *
2269  *   EBUSY: device is not supposed to be spawned.
2270  *   EEXIST: device is already spawned
2271  */
2272 static struct rte_eth_dev *
2273 mlx5_dev_spawn(struct rte_device *dpdk_dev,
2274                struct mlx5_dev_spawn_data *spawn,
2275                struct mlx5_dev_config config)
2276 {
2277         const struct mlx5_switch_info *switch_info = &spawn->info;
2278         struct mlx5_ibv_shared *sh = NULL;
2279         struct ibv_port_attr port_attr;
2280         struct mlx5dv_context dv_attr = { .comp_mask = 0 };
2281         struct rte_eth_dev *eth_dev = NULL;
2282         struct mlx5_priv *priv = NULL;
2283         int err = 0;
2284         unsigned int hw_padding = 0;
2285         unsigned int mps;
2286         unsigned int cqe_comp;
2287         unsigned int cqe_pad = 0;
2288         unsigned int tunnel_en = 0;
2289         unsigned int mpls_en = 0;
2290         unsigned int swp = 0;
2291         unsigned int mprq = 0;
2292         unsigned int mprq_min_stride_size_n = 0;
2293         unsigned int mprq_max_stride_size_n = 0;
2294         unsigned int mprq_min_stride_num_n = 0;
2295         unsigned int mprq_max_stride_num_n = 0;
2296         struct rte_ether_addr mac;
2297         char name[RTE_ETH_NAME_MAX_LEN];
2298         int own_domain_id = 0;
2299         uint16_t port_id;
2300         unsigned int i;
2301 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2302         struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
2303 #endif
2304
2305         /* Determine if this port representor is supposed to be spawned. */
2306         if (switch_info->representor && dpdk_dev->devargs) {
2307                 struct rte_eth_devargs eth_da;
2308
2309                 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
2310                 if (err) {
2311                         rte_errno = -err;
2312                         DRV_LOG(ERR, "failed to process device arguments: %s",
2313                                 strerror(rte_errno));
2314                         return NULL;
2315                 }
2316                 for (i = 0; i < eth_da.nb_representor_ports; ++i)
2317                         if (eth_da.representor_ports[i] ==
2318                             (uint16_t)switch_info->port_name)
2319                                 break;
2320                 if (i == eth_da.nb_representor_ports) {
2321                         rte_errno = EBUSY;
2322                         return NULL;
2323                 }
2324         }
2325         /* Build device name. */
2326         if (spawn->pf_bond <  0) {
2327                 /* Single device. */
2328                 if (!switch_info->representor)
2329                         strlcpy(name, dpdk_dev->name, sizeof(name));
2330                 else
2331                         snprintf(name, sizeof(name), "%s_representor_%u",
2332                                  dpdk_dev->name, switch_info->port_name);
2333         } else {
2334                 /* Bonding device. */
2335                 if (!switch_info->representor)
2336                         snprintf(name, sizeof(name), "%s_%s",
2337                                  dpdk_dev->name, spawn->ibv_dev->name);
2338                 else
2339                         snprintf(name, sizeof(name), "%s_%s_representor_%u",
2340                                  dpdk_dev->name, spawn->ibv_dev->name,
2341                                  switch_info->port_name);
2342         }
2343         /* check if the device is already spawned */
2344         if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
2345                 rte_errno = EEXIST;
2346                 return NULL;
2347         }
2348         DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
2349         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
2350                 struct mlx5_mp_id mp_id;
2351
2352                 eth_dev = rte_eth_dev_attach_secondary(name);
2353                 if (eth_dev == NULL) {
2354                         DRV_LOG(ERR, "can not attach rte ethdev");
2355                         rte_errno = ENOMEM;
2356                         return NULL;
2357                 }
2358                 eth_dev->device = dpdk_dev;
2359                 eth_dev->dev_ops = &mlx5_dev_sec_ops;
2360                 err = mlx5_proc_priv_init(eth_dev);
2361                 if (err)
2362                         return NULL;
2363                 mp_id.port_id = eth_dev->data->port_id;
2364                 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2365                 /* Receive command fd from primary process */
2366                 err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
2367                 if (err < 0)
2368                         return NULL;
2369                 /* Remap UAR for Tx queues. */
2370                 err = mlx5_tx_uar_init_secondary(eth_dev, err);
2371                 if (err)
2372                         return NULL;
2373                 /*
2374                  * Ethdev pointer is still required as input since
2375                  * the primary device is not accessible from the
2376                  * secondary process.
2377                  */
2378                 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
2379                 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
2380                 return eth_dev;
2381         }
2382         /*
2383          * Some parameters ("tx_db_nc" in particularly) are needed in
2384          * advance to create dv/verbs device context. We proceed the
2385          * devargs here to get ones, and later proceed devargs again
2386          * to override some hardware settings.
2387          */
2388         err = mlx5_args(&config, dpdk_dev->devargs);
2389         if (err) {
2390                 err = rte_errno;
2391                 DRV_LOG(ERR, "failed to process device arguments: %s",
2392                         strerror(rte_errno));
2393                 goto error;
2394         }
2395         sh = mlx5_alloc_shared_ibctx(spawn, &config);
2396         if (!sh)
2397                 return NULL;
2398         config.devx = sh->devx;
2399 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
2400         config.dest_tir = 1;
2401 #endif
2402 #ifdef HAVE_IBV_MLX5_MOD_SWP
2403         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
2404 #endif
2405         /*
2406          * Multi-packet send is supported by ConnectX-4 Lx PF as well
2407          * as all ConnectX-5 devices.
2408          */
2409 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2410         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
2411 #endif
2412 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2413         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
2414 #endif
2415         mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
2416         if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
2417                 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
2418                         DRV_LOG(DEBUG, "enhanced MPW is supported");
2419                         mps = MLX5_MPW_ENHANCED;
2420                 } else {
2421                         DRV_LOG(DEBUG, "MPW is supported");
2422                         mps = MLX5_MPW;
2423                 }
2424         } else {
2425                 DRV_LOG(DEBUG, "MPW isn't supported");
2426                 mps = MLX5_MPW_DISABLED;
2427         }
2428 #ifdef HAVE_IBV_MLX5_MOD_SWP
2429         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
2430                 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
2431         DRV_LOG(DEBUG, "SWP support: %u", swp);
2432 #endif
2433         config.swp = !!swp;
2434 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2435         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
2436                 struct mlx5dv_striding_rq_caps mprq_caps =
2437                         dv_attr.striding_rq_caps;
2438
2439                 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
2440                         mprq_caps.min_single_stride_log_num_of_bytes);
2441                 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
2442                         mprq_caps.max_single_stride_log_num_of_bytes);
2443                 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
2444                         mprq_caps.min_single_wqe_log_num_of_strides);
2445                 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
2446                         mprq_caps.max_single_wqe_log_num_of_strides);
2447                 DRV_LOG(DEBUG, "\tsupported_qpts: %d",
2448                         mprq_caps.supported_qpts);
2449                 DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
2450                 mprq = 1;
2451                 mprq_min_stride_size_n =
2452                         mprq_caps.min_single_stride_log_num_of_bytes;
2453                 mprq_max_stride_size_n =
2454                         mprq_caps.max_single_stride_log_num_of_bytes;
2455                 mprq_min_stride_num_n =
2456                         mprq_caps.min_single_wqe_log_num_of_strides;
2457                 mprq_max_stride_num_n =
2458                         mprq_caps.max_single_wqe_log_num_of_strides;
2459         }
2460 #endif
2461         if (RTE_CACHE_LINE_SIZE == 128 &&
2462             !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
2463                 cqe_comp = 0;
2464         else
2465                 cqe_comp = 1;
2466         config.cqe_comp = cqe_comp;
2467 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
2468         /* Whether device supports 128B Rx CQE padding. */
2469         cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
2470                   (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
2471 #endif
2472 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2473         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
2474                 tunnel_en = ((dv_attr.tunnel_offloads_caps &
2475                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
2476                              (dv_attr.tunnel_offloads_caps &
2477                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
2478                              (dv_attr.tunnel_offloads_caps &
2479                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
2480         }
2481         DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
2482                 tunnel_en ? "" : "not ");
2483 #else
2484         DRV_LOG(WARNING,
2485                 "tunnel offloading disabled due to old OFED/rdma-core version");
2486 #endif
2487         config.tunnel_en = tunnel_en;
2488 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
2489         mpls_en = ((dv_attr.tunnel_offloads_caps &
2490                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
2491                    (dv_attr.tunnel_offloads_caps &
2492                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
2493         DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
2494                 mpls_en ? "" : "not ");
2495 #else
2496         DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
2497                 " old OFED/rdma-core version or firmware configuration");
2498 #endif
2499         config.mpls_en = mpls_en;
2500         /* Check port status. */
2501         err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
2502         if (err) {
2503                 DRV_LOG(ERR, "port query failed: %s", strerror(err));
2504                 goto error;
2505         }
2506         if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
2507                 DRV_LOG(ERR, "port is not configured in Ethernet mode");
2508                 err = EINVAL;
2509                 goto error;
2510         }
2511         if (port_attr.state != IBV_PORT_ACTIVE)
2512                 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
2513                         mlx5_glue->port_state_str(port_attr.state),
2514                         port_attr.state);
2515         /* Allocate private eth device data. */
2516         priv = rte_zmalloc("ethdev private structure",
2517                            sizeof(*priv),
2518                            RTE_CACHE_LINE_SIZE);
2519         if (priv == NULL) {
2520                 DRV_LOG(ERR, "priv allocation failure");
2521                 err = ENOMEM;
2522                 goto error;
2523         }
2524         priv->sh = sh;
2525         priv->ibv_port = spawn->ibv_port;
2526         priv->pci_dev = spawn->pci_dev;
2527         priv->mtu = RTE_ETHER_MTU;
2528         priv->mp_id.port_id = port_id;
2529         strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2530 #ifndef RTE_ARCH_64
2531         /* Initialize UAR access locks for 32bit implementations. */
2532         rte_spinlock_init(&priv->uar_lock_cq);
2533         for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
2534                 rte_spinlock_init(&priv->uar_lock[i]);
2535 #endif
2536         /* Some internal functions rely on Netlink sockets, open them now. */
2537         priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
2538         priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
2539         priv->representor = !!switch_info->representor;
2540         priv->master = !!switch_info->master;
2541         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
2542         priv->vport_meta_tag = 0;
2543         priv->vport_meta_mask = 0;
2544         priv->pf_bond = spawn->pf_bond;
2545 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2546         /*
2547          * The DevX port query API is implemented. E-Switch may use
2548          * either vport or reg_c[0] metadata register to match on
2549          * vport index. The engaged part of metadata register is
2550          * defined by mask.
2551          */
2552         if (switch_info->representor || switch_info->master) {
2553                 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
2554                                       MLX5DV_DEVX_PORT_MATCH_REG_C_0;
2555                 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
2556                                                  &devx_port);
2557                 if (err) {
2558                         DRV_LOG(WARNING,
2559                                 "can't query devx port %d on device %s",
2560                                 spawn->ibv_port, spawn->ibv_dev->name);
2561                         devx_port.comp_mask = 0;
2562                 }
2563         }
2564         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
2565                 priv->vport_meta_tag = devx_port.reg_c_0.value;
2566                 priv->vport_meta_mask = devx_port.reg_c_0.mask;
2567                 if (!priv->vport_meta_mask) {
2568                         DRV_LOG(ERR, "vport zero mask for port %d"
2569                                      " on bonding device %s",
2570                                      spawn->ibv_port, spawn->ibv_dev->name);
2571                         err = ENOTSUP;
2572                         goto error;
2573                 }
2574                 if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
2575                         DRV_LOG(ERR, "invalid vport tag for port %d"
2576                                      " on bonding device %s",
2577                                      spawn->ibv_port, spawn->ibv_dev->name);
2578                         err = ENOTSUP;
2579                         goto error;
2580                 }
2581         }
2582         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
2583                 priv->vport_id = devx_port.vport_num;
2584         } else if (spawn->pf_bond >= 0) {
2585                 DRV_LOG(ERR, "can't deduce vport index for port %d"
2586                              " on bonding device %s",
2587                              spawn->ibv_port, spawn->ibv_dev->name);
2588                 err = ENOTSUP;
2589                 goto error;
2590         } else {
2591                 /* Suppose vport index in compatible way. */
2592                 priv->vport_id = switch_info->representor ?
2593                                  switch_info->port_name + 1 : -1;
2594         }
2595 #else
2596         /*
2597          * Kernel/rdma_core support single E-Switch per PF configurations
2598          * only and vport_id field contains the vport index for
2599          * associated VF, which is deduced from representor port name.
2600          * For example, let's have the IB device port 10, it has
2601          * attached network device eth0, which has port name attribute
2602          * pf0vf2, we can deduce the VF number as 2, and set vport index
2603          * as 3 (2+1). This assigning schema should be changed if the
2604          * multiple E-Switch instances per PF configurations or/and PCI
2605          * subfunctions are added.
2606          */
2607         priv->vport_id = switch_info->representor ?
2608                          switch_info->port_name + 1 : -1;
2609 #endif
2610         /* representor_id field keeps the unmodified VF index. */
2611         priv->representor_id = switch_info->representor ?
2612                                switch_info->port_name : -1;
2613         /*
2614          * Look for sibling devices in order to reuse their switch domain
2615          * if any, otherwise allocate one.
2616          */
2617         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2618                 const struct mlx5_priv *opriv =
2619                         rte_eth_devices[port_id].data->dev_private;
2620
2621                 if (!opriv ||
2622                     opriv->sh != priv->sh ||
2623                         opriv->domain_id ==
2624                         RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
2625                         continue;
2626                 priv->domain_id = opriv->domain_id;
2627                 break;
2628         }
2629         if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
2630                 err = rte_eth_switch_domain_alloc(&priv->domain_id);
2631                 if (err) {
2632                         err = rte_errno;
2633                         DRV_LOG(ERR, "unable to allocate switch domain: %s",
2634                                 strerror(rte_errno));
2635                         goto error;
2636                 }
2637                 own_domain_id = 1;
2638         }
2639         /* Override some values set by hardware configuration. */
2640         mlx5_args(&config, dpdk_dev->devargs);
2641         err = mlx5_dev_check_sibling_config(priv, &config);
2642         if (err)
2643                 goto error;
2644         config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
2645                             IBV_DEVICE_RAW_IP_CSUM);
2646         DRV_LOG(DEBUG, "checksum offloading is %ssupported",
2647                 (config.hw_csum ? "" : "not "));
2648 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
2649         !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
2650         DRV_LOG(DEBUG, "counters are not supported");
2651 #endif
2652 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
2653         if (config.dv_flow_en) {
2654                 DRV_LOG(WARNING, "DV flow is not supported");
2655                 config.dv_flow_en = 0;
2656         }
2657 #endif
2658         config.ind_table_max_size =
2659                 sh->device_attr.rss_caps.max_rwq_indirection_table_size;
2660         /*
2661          * Remove this check once DPDK supports larger/variable
2662          * indirection tables.
2663          */
2664         if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
2665                 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
2666         DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
2667                 config.ind_table_max_size);
2668         config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
2669                                   IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
2670         DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
2671                 (config.hw_vlan_strip ? "" : "not "));
2672         config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
2673                                  IBV_RAW_PACKET_CAP_SCATTER_FCS);
2674         DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
2675                 (config.hw_fcs_strip ? "" : "not "));
2676 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
2677         hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
2678 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
2679         hw_padding = !!(sh->device_attr.device_cap_flags_ex &
2680                         IBV_DEVICE_PCI_WRITE_END_PADDING);
2681 #endif
2682         if (config.hw_padding && !hw_padding) {
2683                 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
2684                 config.hw_padding = 0;
2685         } else if (config.hw_padding) {
2686                 DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
2687         }
2688         config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
2689                       (sh->device_attr.tso_caps.supported_qpts &
2690                        (1 << IBV_QPT_RAW_PACKET)));
2691         if (config.tso)
2692                 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
2693         /*
2694          * MPW is disabled by default, while the Enhanced MPW is enabled
2695          * by default.
2696          */
2697         if (config.mps == MLX5_ARG_UNSET)
2698                 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
2699                                                           MLX5_MPW_DISABLED;
2700         else
2701                 config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
2702         DRV_LOG(INFO, "%sMPS is %s",
2703                 config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
2704                 config.mps == MLX5_MPW ? "legacy " : "",
2705                 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
2706         if (config.cqe_comp && !cqe_comp) {
2707                 DRV_LOG(WARNING, "Rx CQE compression isn't supported");
2708                 config.cqe_comp = 0;
2709         }
2710         if (config.cqe_pad && !cqe_pad) {
2711                 DRV_LOG(WARNING, "Rx CQE padding isn't supported");
2712                 config.cqe_pad = 0;
2713         } else if (config.cqe_pad) {
2714                 DRV_LOG(INFO, "Rx CQE padding is enabled");
2715         }
2716         if (config.devx) {
2717                 priv->counter_fallback = 0;
2718                 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
2719                 if (err) {
2720                         err = -err;
2721                         goto error;
2722                 }
2723                 if (!config.hca_attr.flow_counters_dump)
2724                         priv->counter_fallback = 1;
2725 #ifndef HAVE_IBV_DEVX_ASYNC
2726                 priv->counter_fallback = 1;
2727 #endif
2728                 if (priv->counter_fallback)
2729                         DRV_LOG(INFO, "Use fall-back DV counter management");
2730                 /* Check for LRO support. */
2731                 if (config.dest_tir && config.hca_attr.lro_cap &&
2732                     config.dv_flow_en) {
2733                         /* TBD check tunnel lro caps. */
2734                         config.lro.supported = config.hca_attr.lro_cap;
2735                         DRV_LOG(DEBUG, "Device supports LRO");
2736                         /*
2737                          * If LRO timeout is not configured by application,
2738                          * use the minimal supported value.
2739                          */
2740                         if (!config.lro.timeout)
2741                                 config.lro.timeout =
2742                                 config.hca_attr.lro_timer_supported_periods[0];
2743                         DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
2744                                 config.lro.timeout);
2745                 }
2746 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
2747                 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
2748                     config.dv_flow_en) {
2749                         uint8_t reg_c_mask =
2750                                 config.hca_attr.qos.flow_meter_reg_c_ids;
2751                         /*
2752                          * Meter needs two REG_C's for color match and pre-sfx
2753                          * flow match. Here get the REG_C for color match.
2754                          * REG_C_0 and REG_C_1 is reserved for metadata feature.
2755                          */
2756                         reg_c_mask &= 0xfc;
2757                         if (__builtin_popcount(reg_c_mask) < 1) {
2758                                 priv->mtr_en = 0;
2759                                 DRV_LOG(WARNING, "No available register for"
2760                                         " meter.");
2761                         } else {
2762                                 priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
2763                                                       REG_C_0;
2764                                 priv->mtr_en = 1;
2765                                 priv->mtr_reg_share =
2766                                       config.hca_attr.qos.flow_meter_reg_share;
2767                                 DRV_LOG(DEBUG, "The REG_C meter uses is %d",
2768                                         priv->mtr_color_reg);
2769                         }
2770                 }
2771 #endif
2772         }
2773         if (config.mprq.enabled && mprq) {
2774                 if (config.mprq.stride_num_n &&
2775                     (config.mprq.stride_num_n > mprq_max_stride_num_n ||
2776                      config.mprq.stride_num_n < mprq_min_stride_num_n)) {
2777                         config.mprq.stride_num_n =
2778                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
2779                                                 mprq_min_stride_num_n),
2780                                         mprq_max_stride_num_n);
2781                         DRV_LOG(WARNING,
2782                                 "the number of strides"
2783                                 " for Multi-Packet RQ is out of range,"
2784                                 " setting default value (%u)",
2785                                 1 << config.mprq.stride_num_n);
2786                 }
2787                 if (config.mprq.stride_size_n &&
2788                     (config.mprq.stride_size_n > mprq_max_stride_size_n ||
2789                      config.mprq.stride_size_n < mprq_min_stride_size_n)) {
2790                         config.mprq.stride_size_n =
2791                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
2792                                                 mprq_min_stride_size_n),
2793                                         mprq_max_stride_size_n);
2794                         DRV_LOG(WARNING,
2795                                 "the size of a stride"
2796                                 " for Multi-Packet RQ is out of range,"
2797                                 " setting default value (%u)",
2798                                 1 << config.mprq.stride_size_n);
2799                 }
2800                 config.mprq.min_stride_size_n = mprq_min_stride_size_n;
2801                 config.mprq.max_stride_size_n = mprq_max_stride_size_n;
2802         } else if (config.mprq.enabled && !mprq) {
2803                 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
2804                 config.mprq.enabled = 0;
2805         }
2806         if (config.max_dump_files_num == 0)
2807                 config.max_dump_files_num = 128;
2808         eth_dev = rte_eth_dev_allocate(name);
2809         if (eth_dev == NULL) {
2810                 DRV_LOG(ERR, "can not allocate rte ethdev");
2811                 err = ENOMEM;
2812                 goto error;
2813         }
2814         /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
2815         eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
2816         if (priv->representor) {
2817                 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
2818                 eth_dev->data->representor_id = priv->representor_id;
2819         }
2820         /*
2821          * Store associated network device interface index. This index
2822          * is permanent throughout the lifetime of device. So, we may store
2823          * the ifindex here and use the cached value further.
2824          */
2825         MLX5_ASSERT(spawn->ifindex);
2826         priv->if_index = spawn->ifindex;
2827         eth_dev->data->dev_private = priv;
2828         priv->dev_data = eth_dev->data;
2829         eth_dev->data->mac_addrs = priv->mac;
2830         eth_dev->device = dpdk_dev;
2831         /* Configure the first MAC address by default. */
2832         if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
2833                 DRV_LOG(ERR,
2834                         "port %u cannot get MAC address, is mlx5_en"
2835                         " loaded? (errno: %s)",
2836                         eth_dev->data->port_id, strerror(rte_errno));
2837                 err = ENODEV;
2838                 goto error;
2839         }
2840         DRV_LOG(INFO,
2841                 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
2842                 eth_dev->data->port_id,
2843                 mac.addr_bytes[0], mac.addr_bytes[1],
2844                 mac.addr_bytes[2], mac.addr_bytes[3],
2845                 mac.addr_bytes[4], mac.addr_bytes[5]);
2846 #ifdef RTE_LIBRTE_MLX5_DEBUG
2847         {
2848                 char ifname[IF_NAMESIZE];
2849
2850                 if (mlx5_get_ifname(eth_dev, &ifname) == 0)
2851                         DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
2852                                 eth_dev->data->port_id, ifname);
2853                 else
2854                         DRV_LOG(DEBUG, "port %u ifname is unknown",
2855                                 eth_dev->data->port_id);
2856         }
2857 #endif
2858         /* Get actual MTU if possible. */
2859         err = mlx5_get_mtu(eth_dev, &priv->mtu);
2860         if (err) {
2861                 err = rte_errno;
2862                 goto error;
2863         }
2864         DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
2865                 priv->mtu);
2866         /* Initialize burst functions to prevent crashes before link-up. */
2867         eth_dev->rx_pkt_burst = removed_rx_burst;
2868         eth_dev->tx_pkt_burst = removed_tx_burst;
2869         eth_dev->dev_ops = &mlx5_dev_ops;
2870         /* Register MAC address. */
2871         claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
2872         if (config.vf && config.vf_nl_en)
2873                 mlx5_nl_mac_addr_sync(priv->nl_socket_route,
2874                                       mlx5_ifindex(eth_dev),
2875                                       eth_dev->data->mac_addrs,
2876                                       MLX5_MAX_MAC_ADDRESSES);
2877         TAILQ_INIT(&priv->flows);
2878         TAILQ_INIT(&priv->ctrl_flows);
2879         TAILQ_INIT(&priv->flow_meters);
2880         TAILQ_INIT(&priv->flow_meter_profiles);
2881         /* Hint libmlx5 to use PMD allocator for data plane resources */
2882         struct mlx5dv_ctx_allocators alctr = {
2883                 .alloc = &mlx5_alloc_verbs_buf,
2884                 .free = &mlx5_free_verbs_buf,
2885                 .data = priv,
2886         };
2887         mlx5_glue->dv_set_context_attr(sh->ctx,
2888                                        MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
2889                                        (void *)((uintptr_t)&alctr));
2890         /* Bring Ethernet device up. */
2891         DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
2892                 eth_dev->data->port_id);
2893         mlx5_set_link_up(eth_dev);
2894         /*
2895          * Even though the interrupt handler is not installed yet,
2896          * interrupts will still trigger on the async_fd from
2897          * Verbs context returned by ibv_open_device().
2898          */
2899         mlx5_link_update(eth_dev, 0);
2900 #ifdef HAVE_MLX5DV_DR_ESWITCH
2901         if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
2902               (switch_info->representor || switch_info->master)))
2903                 config.dv_esw_en = 0;
2904 #else
2905         config.dv_esw_en = 0;
2906 #endif
2907         /* Detect minimal data bytes to inline. */
2908         mlx5_set_min_inline(spawn, &config);
2909         /* Store device configuration on private structure. */
2910         priv->config = config;
2911         /* Create context for virtual machine VLAN workaround. */
2912         priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
2913         if (config.dv_flow_en) {
2914                 err = mlx5_alloc_shared_dr(priv);
2915                 if (err)
2916                         goto error;
2917                 /*
2918                  * RSS id is shared with meter flow id. Meter flow id can only
2919                  * use the 24 MSB of the register.
2920                  */
2921                 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
2922                                      MLX5_MTR_COLOR_BITS);
2923                 if (!priv->qrss_id_pool) {
2924                         DRV_LOG(ERR, "can't create flow id pool");
2925                         err = ENOMEM;
2926                         goto error;
2927                 }
2928         }
2929         /* Supported Verbs flow priority number detection. */
2930         err = mlx5_flow_discover_priorities(eth_dev);
2931         if (err < 0) {
2932                 err = -err;
2933                 goto error;
2934         }
2935         priv->config.flow_prio = err;
2936         if (!priv->config.dv_esw_en &&
2937             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2938                 DRV_LOG(WARNING, "metadata mode %u is not supported "
2939                                  "(no E-Switch)", priv->config.dv_xmeta_en);
2940                 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
2941         }
2942         mlx5_set_metadata_mask(eth_dev);
2943         if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2944             !priv->sh->dv_regc0_mask) {
2945                 DRV_LOG(ERR, "metadata mode %u is not supported "
2946                              "(no metadata reg_c[0] is available)",
2947                              priv->config.dv_xmeta_en);
2948                         err = ENOTSUP;
2949                         goto error;
2950         }
2951         /*
2952          * Allocate the buffer for flow creating, just once.
2953          * The allocation must be done before any flow creating.
2954          */
2955         mlx5_flow_alloc_intermediate(eth_dev);
2956         /* Query availibility of metadata reg_c's. */
2957         err = mlx5_flow_discover_mreg_c(eth_dev);
2958         if (err < 0) {
2959                 err = -err;
2960                 goto error;
2961         }
2962         if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
2963                 DRV_LOG(DEBUG,
2964                         "port %u extensive metadata register is not supported",
2965                         eth_dev->data->port_id);
2966                 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2967                         DRV_LOG(ERR, "metadata mode %u is not supported "
2968                                      "(no metadata registers available)",
2969                                      priv->config.dv_xmeta_en);
2970                         err = ENOTSUP;
2971                         goto error;
2972                 }
2973         }
2974         if (priv->config.dv_flow_en &&
2975             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2976             mlx5_flow_ext_mreg_supported(eth_dev) &&
2977             priv->sh->dv_regc0_mask) {
2978                 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
2979                                                       MLX5_FLOW_MREG_HTABLE_SZ);
2980                 if (!priv->mreg_cp_tbl) {
2981                         err = ENOMEM;
2982                         goto error;
2983                 }
2984         }
2985         return eth_dev;
2986 error:
2987         if (priv) {
2988                 if (priv->mreg_cp_tbl)
2989                         mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
2990                 if (priv->sh)
2991                         mlx5_free_shared_dr(priv);
2992                 if (priv->nl_socket_route >= 0)
2993                         close(priv->nl_socket_route);
2994                 if (priv->nl_socket_rdma >= 0)
2995                         close(priv->nl_socket_rdma);
2996                 if (priv->vmwa_context)
2997                         mlx5_vlan_vmwa_exit(priv->vmwa_context);
2998                 if (priv->qrss_id_pool)
2999                         mlx5_flow_id_pool_release(priv->qrss_id_pool);
3000                 if (own_domain_id)
3001                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
3002                 rte_free(priv);
3003                 if (eth_dev != NULL)
3004                         eth_dev->data->dev_private = NULL;
3005         }
3006         if (eth_dev != NULL) {
3007                 /* mac_addrs must not be freed alone because part of dev_private */
3008                 eth_dev->data->mac_addrs = NULL;
3009                 rte_eth_dev_release_port(eth_dev);
3010         }
3011         if (sh)
3012                 mlx5_free_shared_ibctx(sh);
3013         MLX5_ASSERT(err > 0);
3014         rte_errno = err;
3015         return NULL;
3016 }
3017
3018 /**
3019  * Comparison callback to sort device data.
3020  *
3021  * This is meant to be used with qsort().
3022  *
3023  * @param a[in]
3024  *   Pointer to pointer to first data object.
3025  * @param b[in]
3026  *   Pointer to pointer to second data object.
3027  *
3028  * @return
3029  *   0 if both objects are equal, less than 0 if the first argument is less
3030  *   than the second, greater than 0 otherwise.
3031  */
3032 static int
3033 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
3034 {
3035         const struct mlx5_switch_info *si_a =
3036                 &((const struct mlx5_dev_spawn_data *)a)->info;
3037         const struct mlx5_switch_info *si_b =
3038                 &((const struct mlx5_dev_spawn_data *)b)->info;
3039         int ret;
3040
3041         /* Master device first. */
3042         ret = si_b->master - si_a->master;
3043         if (ret)
3044                 return ret;
3045         /* Then representor devices. */
3046         ret = si_b->representor - si_a->representor;
3047         if (ret)
3048                 return ret;
3049         /* Unidentified devices come last in no specific order. */
3050         if (!si_a->representor)
3051                 return 0;
3052         /* Order representors by name. */
3053         return si_a->port_name - si_b->port_name;
3054 }
3055
3056 /**
3057  * Match PCI information for possible slaves of bonding device.
3058  *
3059  * @param[in] ibv_dev
3060  *   Pointer to Infiniband device structure.
3061  * @param[in] pci_dev
3062  *   Pointer to PCI device structure to match PCI address.
3063  * @param[in] nl_rdma
3064  *   Netlink RDMA group socket handle.
3065  *
3066  * @return
3067  *   negative value if no bonding device found, otherwise
3068  *   positive index of slave PF in bonding.
3069  */
3070 static int
3071 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
3072                            const struct rte_pci_device *pci_dev,
3073                            int nl_rdma)
3074 {
3075         char ifname[IF_NAMESIZE + 1];
3076         unsigned int ifindex;
3077         unsigned int np, i;
3078         FILE *file = NULL;
3079         int pf = -1;
3080
3081         /*
3082          * Try to get master device name. If something goes
3083          * wrong suppose the lack of kernel support and no
3084          * bonding devices.
3085          */
3086         if (nl_rdma < 0)
3087                 return -1;
3088         if (!strstr(ibv_dev->name, "bond"))
3089                 return -1;
3090         np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
3091         if (!np)
3092                 return -1;
3093         /*
3094          * The Master device might not be on the predefined
3095          * port (not on port index 1, it is not garanted),
3096          * we have to scan all Infiniband device port and
3097          * find master.
3098          */
3099         for (i = 1; i <= np; ++i) {
3100                 /* Check whether Infiniband port is populated. */
3101                 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
3102                 if (!ifindex)
3103                         continue;
3104                 if (!if_indextoname(ifindex, ifname))
3105                         continue;
3106                 /* Try to read bonding slave names from sysfs. */
3107                 MKSTR(slaves,
3108                       "/sys/class/net/%s/master/bonding/slaves", ifname);
3109                 file = fopen(slaves, "r");
3110                 if (file)
3111                         break;
3112         }
3113         if (!file)
3114                 return -1;
3115         /* Use safe format to check maximal buffer length. */
3116         MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
3117         while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
3118                 char tmp_str[IF_NAMESIZE + 32];
3119                 struct rte_pci_addr pci_addr;
3120                 struct mlx5_switch_info info;
3121
3122                 /* Process slave interface names in the loop. */
3123                 snprintf(tmp_str, sizeof(tmp_str),
3124                          "/sys/class/net/%s", ifname);
3125                 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
3126                         DRV_LOG(WARNING, "can not get PCI address"
3127                                          " for netdev \"%s\"", ifname);
3128                         continue;
3129                 }
3130                 if (pci_dev->addr.domain != pci_addr.domain ||
3131                     pci_dev->addr.bus != pci_addr.bus ||
3132                     pci_dev->addr.devid != pci_addr.devid ||
3133                     pci_dev->addr.function != pci_addr.function)
3134                         continue;
3135                 /* Slave interface PCI address match found. */
3136                 fclose(file);
3137                 snprintf(tmp_str, sizeof(tmp_str),
3138                          "/sys/class/net/%s/phys_port_name", ifname);
3139                 file = fopen(tmp_str, "rb");
3140                 if (!file)
3141                         break;
3142                 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
3143                 if (fscanf(file, "%32s", tmp_str) == 1)
3144                         mlx5_translate_port_name(tmp_str, &info);
3145                 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
3146                     info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
3147                         pf = info.port_name;
3148                 break;
3149         }
3150         if (file)
3151                 fclose(file);
3152         return pf;
3153 }
3154
3155 /**
3156  * DPDK callback to register a PCI device.
3157  *
3158  * This function spawns Ethernet devices out of a given PCI device.
3159  *
3160  * @param[in] pci_drv
3161  *   PCI driver structure (mlx5_driver).
3162  * @param[in] pci_dev
3163  *   PCI device information.
3164  *
3165  * @return
3166  *   0 on success, a negative errno value otherwise and rte_errno is set.
3167  */
3168 static int
3169 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
3170                struct rte_pci_device *pci_dev)
3171 {
3172         struct ibv_device **ibv_list;
3173         /*
3174          * Number of found IB Devices matching with requested PCI BDF.
3175          * nd != 1 means there are multiple IB devices over the same
3176          * PCI device and we have representors and master.
3177          */
3178         unsigned int nd = 0;
3179         /*
3180          * Number of found IB device Ports. nd = 1 and np = 1..n means
3181          * we have the single multiport IB device, and there may be
3182          * representors attached to some of found ports.
3183          */
3184         unsigned int np = 0;
3185         /*
3186          * Number of DPDK ethernet devices to Spawn - either over
3187          * multiple IB devices or multiple ports of single IB device.
3188          * Actually this is the number of iterations to spawn.
3189          */
3190         unsigned int ns = 0;
3191         /*
3192          * Bonding device
3193          *   < 0 - no bonding device (single one)
3194          *  >= 0 - bonding device (value is slave PF index)
3195          */
3196         int bd = -1;
3197         struct mlx5_dev_spawn_data *list = NULL;
3198         struct mlx5_dev_config dev_config;
3199         int ret;
3200
3201         if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
3202                 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
3203                         " driver.");
3204                 return 1;
3205         }
3206         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
3207                 mlx5_pmd_socket_init();
3208         ret = mlx5_init_once();
3209         if (ret) {
3210                 DRV_LOG(ERR, "unable to init PMD global data: %s",
3211                         strerror(rte_errno));
3212                 return -rte_errno;
3213         }
3214         MLX5_ASSERT(pci_drv == &mlx5_driver);
3215         errno = 0;
3216         ibv_list = mlx5_glue->get_device_list(&ret);
3217         if (!ibv_list) {
3218                 rte_errno = errno ? errno : ENOSYS;
3219                 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
3220                 return -rte_errno;
3221         }
3222         /*
3223          * First scan the list of all Infiniband devices to find
3224          * matching ones, gathering into the list.
3225          */
3226         struct ibv_device *ibv_match[ret + 1];
3227         int nl_route = mlx5_nl_init(NETLINK_ROUTE);
3228         int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
3229         unsigned int i;
3230
3231         while (ret-- > 0) {
3232                 struct rte_pci_addr pci_addr;
3233
3234                 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
3235                 bd = mlx5_device_bond_pci_match
3236                                 (ibv_list[ret], pci_dev, nl_rdma);
3237                 if (bd >= 0) {
3238                         /*
3239                          * Bonding device detected. Only one match is allowed,
3240                          * the bonding is supported over multi-port IB device,
3241                          * there should be no matches on representor PCI
3242                          * functions or non VF LAG bonding devices with
3243                          * specified address.
3244                          */
3245                         if (nd) {
3246                                 DRV_LOG(ERR,
3247                                         "multiple PCI match on bonding device"
3248                                         "\"%s\" found", ibv_list[ret]->name);
3249                                 rte_errno = ENOENT;
3250                                 ret = -rte_errno;
3251                                 goto exit;
3252                         }
3253                         DRV_LOG(INFO, "PCI information matches for"
3254                                       " slave %d bonding device \"%s\"",
3255                                       bd, ibv_list[ret]->name);
3256                         ibv_match[nd++] = ibv_list[ret];
3257                         break;
3258                 }
3259                 if (mlx5_dev_to_pci_addr
3260                         (ibv_list[ret]->ibdev_path, &pci_addr))
3261                         continue;
3262                 if (pci_dev->addr.domain != pci_addr.domain ||
3263                     pci_dev->addr.bus != pci_addr.bus ||
3264                     pci_dev->addr.devid != pci_addr.devid ||
3265                     pci_dev->addr.function != pci_addr.function)
3266                         continue;
3267                 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
3268                         ibv_list[ret]->name);
3269                 ibv_match[nd++] = ibv_list[ret];
3270         }
3271         ibv_match[nd] = NULL;
3272         if (!nd) {
3273                 /* No device matches, just complain and bail out. */
3274                 DRV_LOG(WARNING,
3275                         "no Verbs device matches PCI device " PCI_PRI_FMT ","
3276                         " are kernel drivers loaded?",
3277                         pci_dev->addr.domain, pci_dev->addr.bus,
3278                         pci_dev->addr.devid, pci_dev->addr.function);
3279                 rte_errno = ENOENT;
3280                 ret = -rte_errno;
3281                 goto exit;
3282         }
3283         if (nd == 1) {
3284                 /*
3285                  * Found single matching device may have multiple ports.
3286                  * Each port may be representor, we have to check the port
3287                  * number and check the representors existence.
3288                  */
3289                 if (nl_rdma >= 0)
3290                         np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
3291                 if (!np)
3292                         DRV_LOG(WARNING, "can not get IB device \"%s\""
3293                                          " ports number", ibv_match[0]->name);
3294                 if (bd >= 0 && !np) {
3295                         DRV_LOG(ERR, "can not get ports"
3296                                      " for bonding device");
3297                         rte_errno = ENOENT;
3298                         ret = -rte_errno;
3299                         goto exit;
3300                 }
3301         }
3302 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
3303         if (bd >= 0) {
3304                 /*
3305                  * This may happen if there is VF LAG kernel support and
3306                  * application is compiled with older rdma_core library.
3307                  */
3308                 DRV_LOG(ERR,
3309                         "No kernel/verbs support for VF LAG bonding found.");
3310                 rte_errno = ENOTSUP;
3311                 ret = -rte_errno;
3312                 goto exit;
3313         }
3314 #endif
3315         /*
3316          * Now we can determine the maximal
3317          * amount of devices to be spawned.
3318          */
3319         list = rte_zmalloc("device spawn data",
3320                          sizeof(struct mlx5_dev_spawn_data) *
3321                          (np ? np : nd),
3322                          RTE_CACHE_LINE_SIZE);
3323         if (!list) {
3324                 DRV_LOG(ERR, "spawn data array allocation failure");
3325                 rte_errno = ENOMEM;
3326                 ret = -rte_errno;
3327                 goto exit;
3328         }
3329         if (bd >= 0 || np > 1) {
3330                 /*
3331                  * Single IB device with multiple ports found,
3332                  * it may be E-Switch master device and representors.
3333                  * We have to perform identification trough the ports.
3334                  */
3335                 MLX5_ASSERT(nl_rdma >= 0);
3336                 MLX5_ASSERT(ns == 0);
3337                 MLX5_ASSERT(nd == 1);
3338                 MLX5_ASSERT(np);
3339                 for (i = 1; i <= np; ++i) {
3340                         list[ns].max_port = np;
3341                         list[ns].ibv_port = i;
3342                         list[ns].ibv_dev = ibv_match[0];
3343                         list[ns].eth_dev = NULL;
3344                         list[ns].pci_dev = pci_dev;
3345                         list[ns].pf_bond = bd;
3346                         list[ns].ifindex = mlx5_nl_ifindex
3347                                         (nl_rdma, list[ns].ibv_dev->name, i);
3348                         if (!list[ns].ifindex) {
3349                                 /*
3350                                  * No network interface index found for the
3351                                  * specified port, it means there is no
3352                                  * representor on this port. It's OK,
3353                                  * there can be disabled ports, for example
3354                                  * if sriov_numvfs < sriov_totalvfs.
3355                                  */
3356                                 continue;
3357                         }
3358                         ret = -1;
3359                         if (nl_route >= 0)
3360                                 ret = mlx5_nl_switch_info
3361                                                (nl_route,
3362                                                 list[ns].ifindex,
3363                                                 &list[ns].info);
3364                         if (ret || (!list[ns].info.representor &&
3365                                     !list[ns].info.master)) {
3366                                 /*
3367                                  * We failed to recognize representors with
3368                                  * Netlink, let's try to perform the task
3369                                  * with sysfs.
3370                                  */
3371                                 ret =  mlx5_sysfs_switch_info
3372                                                 (list[ns].ifindex,
3373                                                  &list[ns].info);
3374                         }
3375                         if (!ret && bd >= 0) {
3376                                 switch (list[ns].info.name_type) {
3377                                 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
3378                                         if (list[ns].info.port_name == bd)
3379                                                 ns++;
3380                                         break;
3381                                 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
3382                                         if (list[ns].info.pf_num == bd)
3383                                                 ns++;
3384                                         break;
3385                                 default:
3386                                         break;
3387                                 }
3388                                 continue;
3389                         }
3390                         if (!ret && (list[ns].info.representor ^
3391                                      list[ns].info.master))
3392                                 ns++;
3393                 }
3394                 if (!ns) {
3395                         DRV_LOG(ERR,
3396                                 "unable to recognize master/representors"
3397                                 " on the IB device with multiple ports");
3398                         rte_errno = ENOENT;
3399                         ret = -rte_errno;
3400                         goto exit;
3401                 }
3402         } else {
3403                 /*
3404                  * The existence of several matching entries (nd > 1) means
3405                  * port representors have been instantiated. No existing Verbs
3406                  * call nor sysfs entries can tell them apart, this can only
3407                  * be done through Netlink calls assuming kernel drivers are
3408                  * recent enough to support them.
3409                  *
3410                  * In the event of identification failure through Netlink,
3411                  * try again through sysfs, then:
3412                  *
3413                  * 1. A single IB device matches (nd == 1) with single
3414                  *    port (np=0/1) and is not a representor, assume
3415                  *    no switch support.
3416                  *
3417                  * 2. Otherwise no safe assumptions can be made;
3418                  *    complain louder and bail out.
3419                  */
3420                 np = 1;
3421                 for (i = 0; i != nd; ++i) {
3422                         memset(&list[ns].info, 0, sizeof(list[ns].info));
3423                         list[ns].max_port = 1;
3424                         list[ns].ibv_port = 1;
3425                         list[ns].ibv_dev = ibv_match[i];
3426                         list[ns].eth_dev = NULL;
3427                         list[ns].pci_dev = pci_dev;
3428                         list[ns].pf_bond = -1;
3429                         list[ns].ifindex = 0;
3430                         if (nl_rdma >= 0)
3431                                 list[ns].ifindex = mlx5_nl_ifindex
3432                                         (nl_rdma, list[ns].ibv_dev->name, 1);
3433                         if (!list[ns].ifindex) {
3434                                 char ifname[IF_NAMESIZE];
3435
3436                                 /*
3437                                  * Netlink failed, it may happen with old
3438                                  * ib_core kernel driver (before 4.16).
3439                                  * We can assume there is old driver because
3440                                  * here we are processing single ports IB
3441                                  * devices. Let's try sysfs to retrieve
3442                                  * the ifindex. The method works for
3443                                  * master device only.
3444                                  */
3445                                 if (nd > 1) {
3446                                         /*
3447                                          * Multiple devices found, assume
3448                                          * representors, can not distinguish
3449                                          * master/representor and retrieve
3450                                          * ifindex via sysfs.
3451                                          */
3452                                         continue;
3453                                 }
3454                                 ret = mlx5_get_master_ifname
3455                                         (ibv_match[i]->ibdev_path, &ifname);
3456                                 if (!ret)
3457                                         list[ns].ifindex =
3458                                                 if_nametoindex(ifname);
3459                                 if (!list[ns].ifindex) {
3460                                         /*
3461                                          * No network interface index found
3462                                          * for the specified device, it means
3463                                          * there it is neither representor
3464                                          * nor master.
3465                                          */
3466                                         continue;
3467                                 }
3468                         }
3469                         ret = -1;
3470                         if (nl_route >= 0)
3471                                 ret = mlx5_nl_switch_info
3472                                                (nl_route,
3473                                                 list[ns].ifindex,
3474                                                 &list[ns].info);
3475                         if (ret || (!list[ns].info.representor &&
3476                                     !list[ns].info.master)) {
3477                                 /*
3478                                  * We failed to recognize representors with
3479                                  * Netlink, let's try to perform the task
3480                                  * with sysfs.
3481                                  */
3482                                 ret =  mlx5_sysfs_switch_info
3483                                                 (list[ns].ifindex,
3484                                                  &list[ns].info);
3485                         }
3486                         if (!ret && (list[ns].info.representor ^
3487                                      list[ns].info.master)) {
3488                                 ns++;
3489                         } else if ((nd == 1) &&
3490                                    !list[ns].info.representor &&
3491                                    !list[ns].info.master) {
3492                                 /*
3493                                  * Single IB device with
3494                                  * one physical port and
3495                                  * attached network device.
3496                                  * May be SRIOV is not enabled
3497                                  * or there is no representors.
3498                                  */
3499                                 DRV_LOG(INFO, "no E-Switch support detected");
3500                                 ns++;
3501                                 break;
3502                         }
3503                 }
3504                 if (!ns) {
3505                         DRV_LOG(ERR,
3506                                 "unable to recognize master/representors"
3507                                 " on the multiple IB devices");
3508                         rte_errno = ENOENT;
3509                         ret = -rte_errno;
3510                         goto exit;
3511                 }
3512         }
3513         MLX5_ASSERT(ns);
3514         /*
3515          * Sort list to probe devices in natural order for users convenience
3516          * (i.e. master first, then representors from lowest to highest ID).
3517          */
3518         qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
3519         /* Default configuration. */
3520         dev_config = (struct mlx5_dev_config){
3521                 .hw_padding = 0,
3522                 .mps = MLX5_ARG_UNSET,
3523                 .dbnc = MLX5_ARG_UNSET,
3524                 .rx_vec_en = 1,
3525                 .txq_inline_max = MLX5_ARG_UNSET,
3526                 .txq_inline_min = MLX5_ARG_UNSET,
3527                 .txq_inline_mpw = MLX5_ARG_UNSET,
3528                 .txqs_inline = MLX5_ARG_UNSET,
3529                 .vf_nl_en = 1,
3530                 .mr_ext_memseg_en = 1,
3531                 .mprq = {
3532                         .enabled = 0, /* Disabled by default. */
3533                         .stride_num_n = 0,
3534                         .stride_size_n = 0,
3535                         .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
3536                         .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
3537                 },
3538                 .dv_esw_en = 1,
3539                 .dv_flow_en = 1,
3540                 .log_hp_size = MLX5_ARG_UNSET,
3541         };
3542         /* Device specific configuration. */
3543         switch (pci_dev->id.device_id) {
3544         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
3545         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
3546         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
3547         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
3548         case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
3549         case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
3550         case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
3551                 dev_config.vf = 1;
3552                 break;
3553         default:
3554                 break;
3555         }
3556         for (i = 0; i != ns; ++i) {
3557                 uint32_t restore;
3558
3559                 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
3560                                                  &list[i],
3561                                                  dev_config);
3562                 if (!list[i].eth_dev) {
3563                         if (rte_errno != EBUSY && rte_errno != EEXIST)
3564                                 break;
3565                         /* Device is disabled or already spawned. Ignore it. */
3566                         continue;
3567                 }
3568                 restore = list[i].eth_dev->data->dev_flags;
3569                 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
3570                 /* Restore non-PCI flags cleared by the above call. */
3571                 list[i].eth_dev->data->dev_flags |= restore;
3572                 mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev);
3573                 rte_eth_dev_probing_finish(list[i].eth_dev);
3574         }
3575         if (i != ns) {
3576                 DRV_LOG(ERR,
3577                         "probe of PCI device " PCI_PRI_FMT " aborted after"
3578                         " encountering an error: %s",
3579                         pci_dev->addr.domain, pci_dev->addr.bus,
3580                         pci_dev->addr.devid, pci_dev->addr.function,
3581                         strerror(rte_errno));
3582                 ret = -rte_errno;
3583                 /* Roll back. */
3584                 while (i--) {
3585                         if (!list[i].eth_dev)
3586                                 continue;
3587                         mlx5_dev_close(list[i].eth_dev);
3588                         /* mac_addrs must not be freed because in dev_private */
3589                         list[i].eth_dev->data->mac_addrs = NULL;
3590                         claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
3591                 }
3592                 /* Restore original error. */
3593                 rte_errno = -ret;
3594         } else {
3595                 ret = 0;
3596         }
3597 exit:
3598         /*
3599          * Do the routine cleanup:
3600          * - close opened Netlink sockets
3601          * - free allocated spawn data array
3602          * - free the Infiniband device list
3603          */
3604         if (nl_rdma >= 0)
3605                 close(nl_rdma);
3606         if (nl_route >= 0)
3607                 close(nl_route);
3608         if (list)
3609                 rte_free(list);
3610         MLX5_ASSERT(ibv_list);
3611         mlx5_glue->free_device_list(ibv_list);
3612         return ret;
3613 }
3614
3615 /**
3616  * Look for the ethernet device belonging to mlx5 driver.
3617  *
3618  * @param[in] port_id
3619  *   port_id to start looking for device.
3620  * @param[in] pci_dev
3621  *   Pointer to the hint PCI device. When device is being probed
3622  *   the its siblings (master and preceding representors might
3623  *   not have assigned driver yet (because the mlx5_pci_probe()
3624  *   is not completed yet, for this case match on hint PCI
3625  *   device may be used to detect sibling device.
3626  *
3627  * @return
3628  *   port_id of found device, RTE_MAX_ETHPORT if not found.
3629  */
3630 uint16_t
3631 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
3632 {
3633         while (port_id < RTE_MAX_ETHPORTS) {
3634                 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
3635
3636                 if (dev->state != RTE_ETH_DEV_UNUSED &&
3637                     dev->device &&
3638                     (dev->device == &pci_dev->device ||
3639                      (dev->device->driver &&
3640                      dev->device->driver->name &&
3641                      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
3642                         break;
3643                 port_id++;
3644         }
3645         if (port_id >= RTE_MAX_ETHPORTS)
3646                 return RTE_MAX_ETHPORTS;
3647         return port_id;
3648 }
3649
3650 /**
3651  * DPDK callback to remove a PCI device.
3652  *
3653  * This function removes all Ethernet devices belong to a given PCI device.
3654  *
3655  * @param[in] pci_dev
3656  *   Pointer to the PCI device.
3657  *
3658  * @return
3659  *   0 on success, the function cannot fail.
3660  */
3661 static int
3662 mlx5_pci_remove(struct rte_pci_device *pci_dev)
3663 {
3664         uint16_t port_id;
3665
3666         RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device)
3667                 rte_eth_dev_close(port_id);
3668         return 0;
3669 }
3670
3671 static const struct rte_pci_id mlx5_pci_id_map[] = {
3672         {
3673                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3674                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
3675         },
3676         {
3677                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3678                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
3679         },
3680         {
3681                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3682                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
3683         },
3684         {
3685                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3686                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
3687         },
3688         {
3689                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3690                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
3691         },
3692         {
3693                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3694                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
3695         },
3696         {
3697                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3698                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
3699         },
3700         {
3701                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3702                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
3703         },
3704         {
3705                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3706                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
3707         },
3708         {
3709                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3710                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
3711         },
3712         {
3713                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3714                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
3715         },
3716         {
3717                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3718                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
3719         },
3720         {
3721                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3722                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
3723         },
3724         {
3725                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3726                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
3727         },
3728         {
3729                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3730                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
3731         },
3732         {
3733                 .vendor_id = 0
3734         }
3735 };
3736
3737 static struct rte_pci_driver mlx5_driver = {
3738         .driver = {
3739                 .name = MLX5_DRIVER_NAME
3740         },
3741         .id_table = mlx5_pci_id_map,
3742         .probe = mlx5_pci_probe,
3743         .remove = mlx5_pci_remove,
3744         .dma_map = mlx5_dma_map,
3745         .dma_unmap = mlx5_dma_unmap,
3746         .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
3747                      RTE_PCI_DRV_PROBE_AGAIN,
3748 };
3749
3750 /**
3751  * Driver initialization routine.
3752  */
3753 RTE_INIT(rte_mlx5_pmd_init)
3754 {
3755         /* Initialize driver log type. */
3756         mlx5_logtype = rte_log_register("pmd.net.mlx5");
3757         if (mlx5_logtype >= 0)
3758                 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
3759
3760         /* Build the static tables for Verbs conversion. */
3761         mlx5_set_ptype_table();
3762         mlx5_set_cksum_table();
3763         mlx5_set_swp_types_table();
3764         if (mlx5_glue)
3765                 rte_pci_register(&mlx5_driver);
3766 }
3767
3768 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
3769 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
3770 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");