common/mlx5: refactor IPC handling from net driver
dpdk.git: drivers/net/mlx5/mlx5.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable RX completion entry padding to 128B. */
#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"

/* Device parameter to enable padding Rx packet to cacheline size. */
#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to configure log 2 of the stride size for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. Deprecated, ignored. */
#define MLX5_TXQ_INLINE "txq_inline"

/* Device parameter to limit packet size to inline with ordinary SEND. */
#define MLX5_TXQ_INLINE_MAX "txq_inline_max"

/* Device parameter to configure minimal data size to inline. */
#define MLX5_TXQ_INLINE_MIN "txq_inline_min"

/* Device parameter to limit packet size to inline with Enhanced MPW. */
#define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
 */
#define MLX5_TXQS_MAX_VEC "txqs_max_vec"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/*
 * Device parameter to force doorbell register mapping
 * to non-cached region, eliminating the extra write memory barrier.
 */
#define MLX5_TX_DB_NC "tx_db_nc"

/*
 * Device parameter to include 2 dsegs in the title WQEBB.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/*
 * Device parameter to limit the size of inlining packet.
 * Deprecated, ignored.
 */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/*
 * Device parameter to enable hardware Tx vector.
 * Deprecated, ignored (no vectorized Tx routines anymore).
 */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate DV E-Switch flow steering. */
#define MLX5_DV_ESW_EN "dv_esw_en"

/* Activate DV flow steering. */
#define MLX5_DV_FLOW_EN "dv_flow_en"

/* Enable extensive flow metadata support. */
#define MLX5_DV_XMETA_EN "dv_xmeta_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

/* Enable extending memsegs when creating a MR. */
#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"

/* Select port representors to instantiate. */
#define MLX5_REPRESENTOR "representor"

/* Device parameter to configure the maximum number of dump files per queue. */
#define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"

/* Configure timeout of LRO session (in microseconds). */
#define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"

/*
 * Device parameter to configure the total data buffer size for a single
 * hairpin queue (logarithm value).
 */
#define MLX5_HP_BUF_SIZE "hp_buf_log_sz"

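/*
 * Devargs usage sketch (an illustrative note, not upstream text): the keys
 * defined above are passed per PCI device on the EAL command line, e.g.:
 *
 *   -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=256
 *
 * Values are parsed and validated by mlx5_args_check() below.
 */
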
#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/** Driver-specific log messages type. */
int mlx5_logtype;

/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
	uint32_t ifindex; /**< Network interface index. */
	uint32_t max_port; /**< IB device maximal port index. */
	uint32_t ibv_port; /**< IB device physical port index. */
	int pf_bond; /**< bonding device PF index. < 0 - no bonding */
	struct mlx5_switch_info info; /**< Switch information. */
	struct ibv_device *ibv_dev; /**< Associated IB device. */
	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
};

static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;

#define MLX5_FLOW_MIN_ID_POOL_SIZE 512
#define MLX5_ID_GENERATION_ARRAY_FACTOR 16

#define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

/**
 * Allocate ID pool structure.
 *
 * @param[in] max_id
 *   The maximum ID that can be allocated from the pool.
 *
 * @return
 *   Pointer to pool object on success, NULL otherwise.
 */
struct mlx5_flow_id_pool *
mlx5_flow_id_pool_alloc(uint32_t max_id)
{
	struct mlx5_flow_id_pool *pool;
	void *mem;

	pool = rte_zmalloc("id pool allocation", sizeof(*pool),
			   RTE_CACHE_LINE_SIZE);
	if (!pool) {
		DRV_LOG(ERR, "can't allocate id pool");
		rte_errno = ENOMEM;
		return NULL;
	}
	mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
			  RTE_CACHE_LINE_SIZE);
	if (!mem) {
		DRV_LOG(ERR, "can't allocate mem for id pool");
		rte_errno = ENOMEM;
		goto error;
	}
	pool->free_arr = mem;
	pool->curr = pool->free_arr;
	pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
	pool->base_index = 0;
	pool->max_id = max_id;
	return pool;
error:
	rte_free(pool);
	return NULL;
}

/**
 * Release ID pool structure.
 *
 * @param[in] pool
 *   Pointer to flow ID pool object to free.
 */
void
mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
{
	rte_free(pool->free_arr);
	rte_free(pool);
}

/**
 * Generate ID.
 *
 * @param[in] pool
 *   Pointer to flow ID pool.
 * @param[out] id
 *   The generated ID.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
{
	if (pool->curr == pool->free_arr) {
		if (pool->base_index == pool->max_id) {
			rte_errno = ENOMEM;
			DRV_LOG(ERR, "no free id");
			return -rte_errno;
		}
		*id = ++pool->base_index;
		return 0;
	}
	*id = *(--pool->curr);
	return 0;
}

/**
 * Release ID.
 *
 * @param[in] pool
 *   Pointer to flow ID pool.
 * @param[in] id
 *   The ID to release.
 *
 * @return
 *   0 on success, error value otherwise.
 */
uint32_t
mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
{
	uint32_t size;
	uint32_t size2;
	void *mem;

	if (pool->curr == pool->last) {
		size = pool->curr - pool->free_arr;
		size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
		MLX5_ASSERT(size2 > size);
		mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
		if (!mem) {
			DRV_LOG(ERR, "can't allocate mem for id pool");
			rte_errno = ENOMEM;
			return -rte_errno;
		}
		memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
		rte_free(pool->free_arr);
		pool->free_arr = mem;
		pool->curr = pool->free_arr + size;
		pool->last = pool->free_arr + size2;
	}
	*pool->curr = id;
	pool->curr++;
	return 0;
}

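/*
 * Illustrative usage sketch for the flow ID pool API above. This is an
 * assumption-labeled example, not part of the upstream driver: it only
 * shows the expected alloc/get/release/free pairing.
 */
static void __rte_unused
mlx5_flow_id_pool_usage_sketch(void)
{
	struct mlx5_flow_id_pool *pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
	uint32_t id;

	if (!pool)
		return; /* rte_errno was set by the allocator. */
	if (!mlx5_flow_id_get(pool, &id)) {
		/* ... use "id", e.g. as a flow mark/tag ... */
		mlx5_flow_id_release(pool, id);
	}
	mlx5_flow_id_pool_release(pool);
}
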
/**
 * Initialize the counters management structure.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object.
 */
static void
mlx5_flow_counters_mng_init(struct mlx5_ibv_shared *sh)
{
	uint8_t i;

	TAILQ_INIT(&sh->cmng.flow_counters);
	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i)
		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
}

/**
 * Destroy all the resources allocated for counter memory management.
 *
 * @param[in] mng
 *   Pointer to the memory management structure.
 */
static void
mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
{
	uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;

	LIST_REMOVE(mng, next);
	claim_zero(mlx5_devx_cmd_destroy(mng->dm));
	claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
	rte_free(mem);
}

/**
 * Close and release all the resources of the counters management.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free.
 */
static void
mlx5_flow_counters_mng_close(struct mlx5_ibv_shared *sh)
{
	struct mlx5_counter_stats_mem_mng *mng;
	uint8_t i;
	int j;
	int retries = 1024;

	rte_errno = 0;
	while (--retries) {
		rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
		if (rte_errno != EINPROGRESS)
			break;
		rte_pause();
	}
	for (i = 0; i < RTE_DIM(sh->cmng.ccont); ++i) {
		struct mlx5_flow_counter_pool *pool;
		uint32_t batch = !!(i % 2);

		if (!sh->cmng.ccont[i].pools)
			continue;
		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		while (pool) {
			if (batch) {
				if (pool->min_dcs)
					claim_zero
					(mlx5_devx_cmd_destroy(pool->min_dcs));
			}
			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
				if (pool->counters_raw[j].action)
					claim_zero
					(mlx5_glue->destroy_flow_action
					       (pool->counters_raw[j].action));
				if (!batch && MLX5_GET_POOL_CNT_EXT
				    (pool, j)->dcs)
					claim_zero(mlx5_devx_cmd_destroy
						  (MLX5_GET_POOL_CNT_EXT
						  (pool, j)->dcs));
			}
			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool,
				     next);
			rte_free(pool);
			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
		}
		rte_free(sh->cmng.ccont[i].pools);
	}
	mng = LIST_FIRST(&sh->cmng.mem_mngs);
	while (mng) {
		mlx5_flow_destroy_counter_stat_mem_mng(mng);
		mng = LIST_FIRST(&sh->cmng.mem_mngs);
	}
	memset(&sh->cmng, 0, sizeof(sh->cmng));
}

/**
 * Extract pdn of PD object using DV API.
 *
 * @param[in] pd
 *   Pointer to the verbs PD object.
 * @param[out] pdn
 *   Pointer to the PD object number variable.
 *
 * @return
 *   0 on success, error value otherwise.
 */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
static int
mlx5_get_pdn(struct ibv_pd *pd __rte_unused, uint32_t *pdn __rte_unused)
{
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get PD object info");
		return ret;
	}
	*pdn = pd_info.pdn;
	return 0;
}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */

static int
mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
{
	char *env;
	int value;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Get environment variable to store. */
	env = getenv(MLX5_SHUT_UP_BF);
	value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
	if (config->dbnc == MLX5_ARG_UNSET)
		setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
	else
		setenv(MLX5_SHUT_UP_BF,
		       config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
	return value;
}

static void
mlx5_restore_doorbell_mapping_env(int value)
{
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* Restore the original environment variable state. */
	if (value == MLX5_ARG_UNSET)
		unsetenv(MLX5_SHUT_UP_BF);
	else
		setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
}

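/*
 * Usage sketch for the two helpers above (an assumption-labeled example,
 * not upstream code): the environment tweak must bracket the rdma-core
 * device creation call, exactly as mlx5_alloc_shared_ibctx() does below,
 * because rdma-core samples MLX5_SHUT_UP_BF only at device creation.
 */
static void __rte_unused
mlx5_doorbell_env_bracket_sketch(const struct mlx5_dev_config *config,
				 struct ibv_device *ibv_dev)
{
	int saved = mlx5_config_doorbell_mapping_env(config);
	struct ibv_context *ctx = mlx5_glue->open_device(ibv_dev);

	mlx5_restore_doorbell_mapping_env(saved);
	if (ctx)
		claim_zero(mlx5_glue->close_device(ctx));
}
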
/**
 * Allocate the shared IB device context. If there is a multiport device,
 * the master and representors will share this context; if there is a
 * single-port dedicated IB device, the context will be used by the given
 * port only.
 *
 * The routine first searches the context list for the specified IB device
 * name; if found, the shared context is assumed and the reference counter
 * is incremented. If no context is found, a new one is created and
 * initialized with the specified IB device context and parameters.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 * @param[in] config
 *   Pointer to device configuration structure.
 *
 * @return
 *   Pointer to mlx5_ibv_shared object on success,
 *   otherwise NULL and rte_errno is set.
 */
static struct mlx5_ibv_shared *
mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
			const struct mlx5_dev_config *config)
{
	struct mlx5_ibv_shared *sh;
	int dbmap_env;
	int err = 0;
	uint32_t i;
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	struct mlx5_devx_tis_attr tis_attr = { 0 };
#endif

	MLX5_ASSERT(spawn);
	/* Secondary process should not create the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
	/* Search for IB context by device name. */
	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
			sh->refcnt++;
			goto exit;
		}
	}
	/* No device found, we have to create new shared context. */
	MLX5_ASSERT(spawn->max_port);
	sh = rte_zmalloc("ethdev shared ib context",
			 sizeof(struct mlx5_ibv_shared) +
			 spawn->max_port *
			 sizeof(struct mlx5_ibv_shared_port),
			 RTE_CACHE_LINE_SIZE);
	if (!sh) {
		DRV_LOG(ERR, "shared context allocation failure");
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * Configure environment variable "MLX5_SHUT_UP_BF"
	 * before the device creation. The rdma_core library
	 * checks the variable at device creation and
	 * stores the result internally.
	 */
	dbmap_env = mlx5_config_doorbell_mapping_env(config);
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
		/* The device is created, no need for environment. */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
	} else {
		/* The environment variable is still configured. */
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		err = errno ? errno : ENODEV;
		/*
		 * The environment variable is not needed anymore,
		 * all device creation attempts are completed.
		 */
		mlx5_restore_doorbell_mapping_env(dbmap_env);
		if (!sh->ctx)
			goto error;
		DRV_LOG(DEBUG, "DevX is NOT supported");
	}
	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
	if (err) {
		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
		goto error;
	}
	sh->refcnt = 1;
	sh->max_port = spawn->max_port;
	strncpy(sh->ibdev_name, sh->ctx->device->name,
		sizeof(sh->ibdev_name));
	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
		sizeof(sh->ibdev_path));
	pthread_mutex_init(&sh->intr_mutex, NULL);
	/*
	 * Setting port_id to the maximum disallowed value
	 * (RTE_MAX_ETHPORTS) means there is no interrupt
	 * subhandler installed for the given port index i.
	 */
	for (i = 0; i < sh->max_port; i++) {
		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
		sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
	}
	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
	if (sh->pd == NULL) {
		DRV_LOG(ERR, "PD allocation failure");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	if (sh->devx) {
		err = mlx5_get_pdn(sh->pd, &sh->pdn);
		if (err) {
			DRV_LOG(ERR, "Failed to extract pdn from PD");
			goto error;
		}
		sh->td = mlx5_devx_cmd_create_td(sh->ctx);
		if (!sh->td) {
			DRV_LOG(ERR, "TD allocation failure");
			err = ENOMEM;
			goto error;
		}
		tis_attr.transport_domain = sh->td->id;
		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
		if (!sh->tis) {
			DRV_LOG(ERR, "TIS allocation failure");
			err = ENOMEM;
			goto error;
		}
	}
	sh->flow_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
	if (!sh->flow_id_pool) {
		DRV_LOG(ERR, "can't create flow id pool");
		err = ENOMEM;
		goto error;
	}
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
	/*
	 * Once the device is added to the list of memory event
	 * callback, its global MR cache table cannot be expanded
	 * on the fly because of deadlock. If it overflows, lookup
	 * should be done by searching MR list linearly, which is slow.
	 *
	 * At this point the device is not added to the memory
	 * event list yet, context is just being created.
	 */
	err = mlx5_mr_btree_init(&sh->mr.cache,
				 MLX5_MR_BTREE_CACHE_N * 2,
				 spawn->pci_dev->device.numa_node);
	if (err) {
		err = rte_errno;
		goto error;
	}
	mlx5_flow_counters_mng_init(sh);
	/* Add device to memory callback list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
			 sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Add context to the global device list. */
	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	return sh;
error:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	MLX5_ASSERT(sh);
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
	MLX5_ASSERT(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Free the shared IB device context. Decrement the reference counter and,
 * if it reaches zero, free all allocated resources and close handles.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free.
 */
static void
mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
{
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
#ifdef RTE_LIBRTE_MLX5_DEBUG
	/* Check the object presence in the list. */
	struct mlx5_ibv_shared *lctx;

	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
		if (lctx == sh)
			break;
	MLX5_ASSERT(lctx);
	if (lctx != sh) {
		DRV_LOG(ERR, "Freeing non-existing shared IB context");
		goto exit;
	}
#endif
	MLX5_ASSERT(sh);
	MLX5_ASSERT(sh->refcnt);
	/* Secondary process should not free the shared context. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (--sh->refcnt)
		goto exit;
	/* Remove from memory callback device list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	LIST_REMOVE(sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	/* Release created Memory Regions. */
	mlx5_mr_release(sh);
	/* Remove context from the global device list. */
	LIST_REMOVE(sh, next);
	/*
	 * Ensure there is no async event handler installed.
	 * Only primary process handles async device events.
	 */
	mlx5_flow_counters_mng_close(sh);
	MLX5_ASSERT(!sh->intr_cnt);
	if (sh->intr_cnt)
		mlx5_intr_callback_unregister
			(&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
#ifdef HAVE_MLX5_DEVX_ASYNC_SUPPORT
	if (sh->devx_intr_cnt) {
		if (sh->intr_handle_devx.fd)
			rte_intr_callback_unregister(&sh->intr_handle_devx,
					  mlx5_dev_interrupt_handler_devx, sh);
		if (sh->devx_comp)
			mlx5dv_devx_destroy_cmd_comp(sh->devx_comp);
	}
#endif
	pthread_mutex_destroy(&sh->intr_mutex);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->tis)
		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
	if (sh->td)
		claim_zero(mlx5_devx_cmd_destroy(sh->td));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	if (sh->flow_id_pool)
		mlx5_flow_id_pool_release(sh->flow_id_pool);
	rte_free(sh);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
}

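/*
 * Pairing sketch (an assumption-labeled example, not upstream code):
 * every successful mlx5_alloc_shared_ibctx() must be balanced by one
 * mlx5_free_shared_ibctx(); the context is destroyed on the last release.
 */
static void __rte_unused
mlx5_shared_ibctx_refcnt_sketch(const struct mlx5_dev_spawn_data *spawn,
				const struct mlx5_dev_config *config)
{
	struct mlx5_ibv_shared *sh = mlx5_alloc_shared_ibctx(spawn, config);

	if (!sh)
		return; /* rte_errno was set by the allocator. */
	/* ... spawn one or more ethdev ports sharing "sh" ... */
	mlx5_free_shared_ibctx(sh);
}
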
/**
 * Destroy table hash list and all the root entries per domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	struct mlx5_flow_tbl_data_entry *tbl_data;
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_hlist_entry *pos;

	if (!sh->flow_tbls)
		return;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	table_key.direction = 0;
	table_key.domain = 1;
	pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
	if (pos) {
		tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
					entry);
		MLX5_ASSERT(tbl_data);
		mlx5_hlist_remove(sh->flow_tbls, pos);
		rte_free(tbl_data);
	}
	mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
}

/**
 * Initialize flow table hash list and create the root tables entry
 * for each domain.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	MLX5_ASSERT(sh);
	snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
	sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
	if (!sh->flow_tbls) {
		DRV_LOG(ERR, "flow tables hash list creation failed.");
		err = ENOMEM;
		return err;
	}
#ifndef HAVE_MLX5DV_DR
	/*
	 * If there is no DR support, the zero tables should be created
	 * because DV expects to see them even if they cannot be created
	 * by RDMA-CORE.
	 */
	union mlx5_flow_tbl_key table_key = {
		{
			.table_id = 0,
			.reserved = 0,
			.domain = 0,
			.direction = 0,
		}
	};
	struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
							  sizeof(*tbl_data), 0);

	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	table_key.direction = 0;
	table_key.domain = 1;
	tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
	if (!tbl_data) {
		err = ENOMEM;
		goto error;
	}
	tbl_data->entry.key = table_key.v64;
	err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
	if (err)
		goto error;
	rte_atomic32_init(&tbl_data->tbl.refcnt);
	rte_atomic32_inc(&tbl_data->tbl.refcnt);
	return err;
error:
	mlx5_free_table_hash_list(priv);
#endif /* HAVE_MLX5DV_DR */
	return err;
}

/**
 * Initialize DR-related data within the private structure.
 * The routine checks the reference counter and does the actual
 * resource creation/initialization only if the counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh = priv->sh;
	char s[MLX5_HLIST_NAMESIZE];
	int err = 0;

	if (!sh->flow_tbls)
		err = mlx5_alloc_table_hash_list(priv);
	else
		DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
			(void *)sh->flow_tbls);
	if (err)
		return err;
	/* Create tags hash list table. */
	snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
	sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
	if (!sh->tag_table) {
		DRV_LOG(ERR, "tags hash list creation failed.");
		err = ENOMEM;
		goto error;
	}
#ifdef HAVE_MLX5DV_DR
	void *domain;

	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;
error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	if (sh->tag_table) {
		/* Tags should have been destroyed with the flows already. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
	return err;
}

/**
 * Destroy DR-related data within the private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_shared_dr(struct mlx5_priv *priv)
{
	struct mlx5_ibv_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
	MLX5_ASSERT(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	if (sh->pop_vlan_action) {
		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
		sh->pop_vlan_action = NULL;
	}
	pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
	if (sh->tag_table) {
		/* Tags should have been destroyed with the flows already. */
		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
		sh->tag_table = NULL;
	}
	mlx5_free_table_hash_list(priv);
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by the primary process and secondary processes
 * attach to that memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}

/**
 * Retrieve an integer value from an environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

/**
 * Verbs callback to allocate memory. This function should allocate space
 * of the provided size, residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	MLX5_ASSERT(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	MLX5_ASSERT(data != NULL);
	rte_free(ptr);
}

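/*
 * Hook-up sketch for the two callbacks above (an assumption-labeled
 * example mirroring what the device spawn code does elsewhere, not a
 * second registration): libmlx5 is told to allocate queue memory
 * through DPDK by setting the context buffer allocators attribute.
 */
static void __rte_unused
mlx5_verbs_alloc_hookup_sketch(struct mlx5_ibv_shared *sh,
			       struct mlx5_priv *priv)
{
	struct mlx5dv_ctx_allocators alctr = {
		.alloc = &mlx5_alloc_verbs_buf,
		.free = &mlx5_free_verbs_buf,
		.data = priv,
	};

	mlx5_glue->dv_set_context_attr(sh->ctx,
				       MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
				       (void *)((uintptr_t)&alctr));
}
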
/**
 * DPDK callback to add a UDP tunnel port.
 *
 * @param[in] dev
 *   A pointer to the eth_dev.
 * @param[in] udp_tunnel
 *   A pointer to the UDP tunnel.
 *
 * @return
 *   0 for valid UDP ports and tunnels, -ENOTSUP otherwise.
 */
int
mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
			 struct rte_eth_udp_tunnel *udp_tunnel)
{
	MLX5_ASSERT(udp_tunnel != NULL);
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
	    udp_tunnel->udp_port == 4789)
		return 0;
	if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
	    udp_tunnel->udp_port == 4790)
		return 0;
	return -ENOTSUP;
}

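/*
 * Caller-side sketch (an assumption-labeled example, not upstream code):
 * an application validates the standard VXLAN port through the generic
 * ethdev API, which lands in the callback above.
 */
static int __rte_unused
mlx5_udp_tunnel_add_sketch(uint16_t port_id)
{
	struct rte_eth_udp_tunnel tunnel = {
		.udp_port = 4789,
		.prot_type = RTE_TUNNEL_TYPE_VXLAN,
	};

	return rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel);
}
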
1180 /**
1181  * Initialize process private data structure.
1182  *
1183  * @param dev
1184  *   Pointer to Ethernet device structure.
1185  *
1186  * @return
1187  *   0 on success, a negative errno value otherwise and rte_errno is set.
1188  */
1189 int
1190 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1191 {
1192         struct mlx5_priv *priv = dev->data->dev_private;
1193         struct mlx5_proc_priv *ppriv;
1194         size_t ppriv_size;
1195
1196         /*
1197          * UAR register table follows the process private structure. BlueFlame
1198          * registers for Tx queues are stored in the table.
1199          */
1200         ppriv_size =
1201                 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1202         ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1203                                   RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1204         if (!ppriv) {
1205                 rte_errno = ENOMEM;
1206                 return -rte_errno;
1207         }
1208         ppriv->uar_table_sz = ppriv_size;
1209         dev->process_private = ppriv;
1210         return 0;
1211 }
1212
1213 /**
1214  * Un-initialize process private data structure.
1215  *
1216  * @param dev
1217  *   Pointer to Ethernet device structure.
1218  */
1219 static void
1220 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1221 {
1222         if (!dev->process_private)
1223                 return;
1224         rte_free(dev->process_private);
1225         dev->process_private = NULL;
1226 }
1227
1228 /**
1229  * DPDK callback to close the device.
1230  *
1231  * Destroy all queues and objects, free memory.
1232  *
1233  * @param dev
1234  *   Pointer to Ethernet device structure.
1235  */
1236 static void
1237 mlx5_dev_close(struct rte_eth_dev *dev)
1238 {
1239         struct mlx5_priv *priv = dev->data->dev_private;
1240         unsigned int i;
1241         int ret;
1242
1243         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1244                 dev->data->port_id,
1245                 ((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
1246         /* In case mlx5_dev_stop() has not been called. */
1247         mlx5_dev_interrupt_handler_uninstall(dev);
1248         mlx5_dev_interrupt_handler_devx_uninstall(dev);
1249         /*
1250          * If default mreg copy action is removed at the stop stage,
1251          * the search will return none and nothing will be done anymore.
1252          */
1253         mlx5_flow_stop_default(dev);
1254         mlx5_traffic_disable(dev);
1255         /*
1256          * If all the flows are already flushed in the device stop stage,
1257          * then this will return directly without any action.
1258          */
1259         mlx5_flow_list_flush(dev, &priv->flows, true);
1260         mlx5_flow_meter_flush(dev, NULL);
1261         /* Free the intermediate buffers for flow creation. */
1262         mlx5_flow_free_intermediate(dev);
1263         /* Prevent crashes when queues are still in use. */
1264         dev->rx_pkt_burst = removed_rx_burst;
1265         dev->tx_pkt_burst = removed_tx_burst;
1266         rte_wmb();
1267         /* Disable datapath on secondary process. */
1268         mlx5_mp_req_stop_rxtx(dev);
1269         if (priv->rxqs != NULL) {
1270                 /* XXX race condition if mlx5_rx_burst() is still running. */
1271                 usleep(1000);
1272                 for (i = 0; (i != priv->rxqs_n); ++i)
1273                         mlx5_rxq_release(dev, i);
1274                 priv->rxqs_n = 0;
1275                 priv->rxqs = NULL;
1276         }
1277         if (priv->txqs != NULL) {
1278                 /* XXX race condition if mlx5_tx_burst() is still running. */
1279                 usleep(1000);
1280                 for (i = 0; (i != priv->txqs_n); ++i)
1281                         mlx5_txq_release(dev, i);
1282                 priv->txqs_n = 0;
1283                 priv->txqs = NULL;
1284         }
1285         mlx5_proc_priv_uninit(dev);
1286         if (priv->mreg_cp_tbl)
1287                 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1288         mlx5_mprq_free_mp(dev);
1289         mlx5_free_shared_dr(priv);
1290         if (priv->rss_conf.rss_key != NULL)
1291                 rte_free(priv->rss_conf.rss_key);
1292         if (priv->reta_idx != NULL)
1293                 rte_free(priv->reta_idx);
1294         if (priv->config.vf)
1295                 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1296                                        dev->data->mac_addrs,
1297                                        MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1298         if (priv->nl_socket_route >= 0)
1299                 close(priv->nl_socket_route);
1300         if (priv->nl_socket_rdma >= 0)
1301                 close(priv->nl_socket_rdma);
1302         if (priv->vmwa_context)
1303                 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1304         if (priv->sh) {
1305                 /*
1306                  * Free the shared context in last turn, because the cleanup
1307                  * routines above may use some shared fields, like
1308                  * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieveing
1309                  * ifindex if Netlink fails.
1310                  */
1311                 mlx5_free_shared_ibctx(priv->sh);
1312                 priv->sh = NULL;
1313         }
1314         ret = mlx5_hrxq_verify(dev);
1315         if (ret)
1316                 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1317                         dev->data->port_id);
1318         ret = mlx5_ind_table_obj_verify(dev);
1319         if (ret)
1320                 DRV_LOG(WARNING, "port %u some indirection table still remain",
1321                         dev->data->port_id);
1322         ret = mlx5_rxq_obj_verify(dev);
1323         if (ret)
1324                 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1325                         dev->data->port_id);
1326         ret = mlx5_rxq_verify(dev);
1327         if (ret)
1328                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1329                         dev->data->port_id);
1330         ret = mlx5_txq_obj_verify(dev);
1331         if (ret)
1332                 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1333                         dev->data->port_id);
1334         ret = mlx5_txq_verify(dev);
1335         if (ret)
1336                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1337                         dev->data->port_id);
1338         ret = mlx5_flow_verify(dev);
1339         if (ret)
1340                 DRV_LOG(WARNING, "port %u some flows still remain",
1341                         dev->data->port_id);
1342         if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1343                 unsigned int c = 0;
1344                 uint16_t port_id;
1345
1346                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1347                         struct mlx5_priv *opriv =
1348                                 rte_eth_devices[port_id].data->dev_private;
1349
1350                         if (!opriv ||
1351                             opriv->domain_id != priv->domain_id ||
1352                             &rte_eth_devices[port_id] == dev)
1353                                 continue;
1354                         ++c;
1355                         break;
1356                 }
1357                 if (!c)
1358                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1359         }
1360         memset(priv, 0, sizeof(*priv));
1361         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1362         /*
1363          * Reset mac_addrs to NULL such that it is not freed as part of
1364          * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1365          * it is freed when dev_private is freed.
1366          */
1367         dev->data->mac_addrs = NULL;
1368 }
1369
1370 const struct eth_dev_ops mlx5_dev_ops = {
1371         .dev_configure = mlx5_dev_configure,
1372         .dev_start = mlx5_dev_start,
1373         .dev_stop = mlx5_dev_stop,
1374         .dev_set_link_down = mlx5_set_link_down,
1375         .dev_set_link_up = mlx5_set_link_up,
1376         .dev_close = mlx5_dev_close,
1377         .promiscuous_enable = mlx5_promiscuous_enable,
1378         .promiscuous_disable = mlx5_promiscuous_disable,
1379         .allmulticast_enable = mlx5_allmulticast_enable,
1380         .allmulticast_disable = mlx5_allmulticast_disable,
1381         .link_update = mlx5_link_update,
1382         .stats_get = mlx5_stats_get,
1383         .stats_reset = mlx5_stats_reset,
1384         .xstats_get = mlx5_xstats_get,
1385         .xstats_reset = mlx5_xstats_reset,
1386         .xstats_get_names = mlx5_xstats_get_names,
1387         .fw_version_get = mlx5_fw_version_get,
1388         .dev_infos_get = mlx5_dev_infos_get,
1389         .read_clock = mlx5_read_clock,
1390         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1391         .vlan_filter_set = mlx5_vlan_filter_set,
1392         .rx_queue_setup = mlx5_rx_queue_setup,
1393         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1394         .tx_queue_setup = mlx5_tx_queue_setup,
1395         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1396         .rx_queue_release = mlx5_rx_queue_release,
1397         .tx_queue_release = mlx5_tx_queue_release,
1398         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1399         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1400         .mac_addr_remove = mlx5_mac_addr_remove,
1401         .mac_addr_add = mlx5_mac_addr_add,
1402         .mac_addr_set = mlx5_mac_addr_set,
1403         .set_mc_addr_list = mlx5_set_mc_addr_list,
1404         .mtu_set = mlx5_dev_set_mtu,
1405         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1406         .vlan_offload_set = mlx5_vlan_offload_set,
1407         .reta_update = mlx5_dev_rss_reta_update,
1408         .reta_query = mlx5_dev_rss_reta_query,
1409         .rss_hash_update = mlx5_rss_hash_update,
1410         .rss_hash_conf_get = mlx5_rss_hash_conf_get,
1411         .filter_ctrl = mlx5_dev_filter_ctrl,
1412         .rx_descriptor_status = mlx5_rx_descriptor_status,
1413         .tx_descriptor_status = mlx5_tx_descriptor_status,
1414         .rxq_info_get = mlx5_rxq_info_get,
1415         .txq_info_get = mlx5_txq_info_get,
1416         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1417         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1418         .rx_queue_count = mlx5_rx_queue_count,
1419         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1420         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1421         .is_removed = mlx5_is_removed,
1422         .udp_tunnel_port_add  = mlx5_udp_tunnel_port_add,
1423         .get_module_info = mlx5_get_module_info,
1424         .get_module_eeprom = mlx5_get_module_eeprom,
1425         .hairpin_cap_get = mlx5_hairpin_cap_get,
1426         .mtr_ops_get = mlx5_flow_meter_ops_get,
1427 };
1428
1429 /* Available operations from secondary process. */
1430 static const struct eth_dev_ops mlx5_dev_sec_ops = {
1431         .stats_get = mlx5_stats_get,
1432         .stats_reset = mlx5_stats_reset,
1433         .xstats_get = mlx5_xstats_get,
1434         .xstats_reset = mlx5_xstats_reset,
1435         .xstats_get_names = mlx5_xstats_get_names,
1436         .fw_version_get = mlx5_fw_version_get,
1437         .dev_infos_get = mlx5_dev_infos_get,
1438         .rx_descriptor_status = mlx5_rx_descriptor_status,
1439         .tx_descriptor_status = mlx5_tx_descriptor_status,
1440         .rxq_info_get = mlx5_rxq_info_get,
1441         .txq_info_get = mlx5_txq_info_get,
1442         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1443         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1444         .get_module_info = mlx5_get_module_info,
1445         .get_module_eeprom = mlx5_get_module_eeprom,
1446 };
1447
1448 /* Available operations in flow isolated mode. */
1449 const struct eth_dev_ops mlx5_dev_ops_isolate = {
1450         .dev_configure = mlx5_dev_configure,
1451         .dev_start = mlx5_dev_start,
1452         .dev_stop = mlx5_dev_stop,
1453         .dev_set_link_down = mlx5_set_link_down,
1454         .dev_set_link_up = mlx5_set_link_up,
1455         .dev_close = mlx5_dev_close,
1456         .promiscuous_enable = mlx5_promiscuous_enable,
1457         .promiscuous_disable = mlx5_promiscuous_disable,
1458         .allmulticast_enable = mlx5_allmulticast_enable,
1459         .allmulticast_disable = mlx5_allmulticast_disable,
1460         .link_update = mlx5_link_update,
1461         .stats_get = mlx5_stats_get,
1462         .stats_reset = mlx5_stats_reset,
1463         .xstats_get = mlx5_xstats_get,
1464         .xstats_reset = mlx5_xstats_reset,
1465         .xstats_get_names = mlx5_xstats_get_names,
1466         .fw_version_get = mlx5_fw_version_get,
1467         .dev_infos_get = mlx5_dev_infos_get,
1468         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
1469         .vlan_filter_set = mlx5_vlan_filter_set,
1470         .rx_queue_setup = mlx5_rx_queue_setup,
1471         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
1472         .tx_queue_setup = mlx5_tx_queue_setup,
1473         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
1474         .rx_queue_release = mlx5_rx_queue_release,
1475         .tx_queue_release = mlx5_tx_queue_release,
1476         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
1477         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
1478         .mac_addr_remove = mlx5_mac_addr_remove,
1479         .mac_addr_add = mlx5_mac_addr_add,
1480         .mac_addr_set = mlx5_mac_addr_set,
1481         .set_mc_addr_list = mlx5_set_mc_addr_list,
1482         .mtu_set = mlx5_dev_set_mtu,
1483         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
1484         .vlan_offload_set = mlx5_vlan_offload_set,
1485         .filter_ctrl = mlx5_dev_filter_ctrl,
1486         .rx_descriptor_status = mlx5_rx_descriptor_status,
1487         .tx_descriptor_status = mlx5_tx_descriptor_status,
1488         .rxq_info_get = mlx5_rxq_info_get,
1489         .txq_info_get = mlx5_txq_info_get,
1490         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
1491         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
1492         .rx_queue_intr_enable = mlx5_rx_intr_enable,
1493         .rx_queue_intr_disable = mlx5_rx_intr_disable,
1494         .is_removed = mlx5_is_removed,
1495         .get_module_info = mlx5_get_module_info,
1496         .get_module_eeprom = mlx5_get_module_eeprom,
1497         .hairpin_cap_get = mlx5_hairpin_cap_get,
1498         .mtr_ops_get = mlx5_flow_meter_ops_get,
1499 };
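
/*
 * Dispatch note (a sketch of the ethdev layer, not code from this file):
 * callbacks are resolved through dev->dev_ops at run time, so a secondary
 * process running with mlx5_dev_sec_ops simply has no control-path entries
 * and the ethdev layer rejects such calls, e.g.:
 *
 *   RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_start, -ENOTSUP);
 *
 * The same applies to mlx5_dev_ops_isolate above, which is installed while
 * flow isolation mode is enabled and omits the RSS/RETA callbacks.
 */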
1500
1501 /**
1502  * Verify and store value for device argument.
1503  *
1504  * @param[in] key
1505  *   Key argument to verify.
1506  * @param[in] val
1507  *   Value associated with key.
1508  * @param opaque
1509  *   User data.
1510  *
1511  * @return
1512  *   0 on success, a negative errno value otherwise and rte_errno is set.
1513  */
1514 static int
1515 mlx5_args_check(const char *key, const char *val, void *opaque)
1516 {
1517         struct mlx5_dev_config *config = opaque;
1518         unsigned long tmp;
1519
1520         /* No-op, port representors are processed in mlx5_dev_spawn(). */
1521         if (!strcmp(MLX5_REPRESENTOR, key))
1522                 return 0;
1523         errno = 0;
1524         tmp = strtoul(val, NULL, 0);
1525         if (errno) {
1526                 rte_errno = errno;
1527                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1528                 return -rte_errno;
1529         }
1530         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1531                 config->cqe_comp = !!tmp;
1532         } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1533                 config->cqe_pad = !!tmp;
1534         } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1535                 config->hw_padding = !!tmp;
1536         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1537                 config->mprq.enabled = !!tmp;
1538         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1539                 config->mprq.stride_num_n = tmp;
1540         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1541                 config->mprq.stride_size_n = tmp;
1542         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1543                 config->mprq.max_memcpy_len = tmp;
1544         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1545                 config->mprq.min_rxqs_num = tmp;
1546         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1547                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1548                                  " converted to txq_inline_max", key);
1549                 config->txq_inline_max = tmp;
1550         } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1551                 config->txq_inline_max = tmp;
1552         } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1553                 config->txq_inline_min = tmp;
1554         } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1555                 config->txq_inline_mpw = tmp;
1556         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1557                 config->txqs_inline = tmp;
1558         } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1559                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1560         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1561                 config->mps = !!tmp;
1562         } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1563                 if (tmp != MLX5_TXDB_CACHED &&
1564                     tmp != MLX5_TXDB_NCACHED &&
1565                     tmp != MLX5_TXDB_HEURISTIC) {
1566                         DRV_LOG(ERR, "invalid Tx doorbell "
1567                                      "mapping parameter");
1568                         rte_errno = EINVAL;
1569                         return -rte_errno;
1570                 }
1571                 config->dbnc = tmp;
1572         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1573                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1574         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1575                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1576                                  " converted to txq_inline_mpw", key);
1577                 config->txq_inline_mpw = tmp;
1578         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1579                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1580         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1581                 config->rx_vec_en = !!tmp;
1582         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1583                 config->l3_vxlan_en = !!tmp;
1584         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1585                 config->vf_nl_en = !!tmp;
1586         } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1587                 config->dv_esw_en = !!tmp;
1588         } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1589                 config->dv_flow_en = !!tmp;
1590         } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1591                 if (tmp != MLX5_XMETA_MODE_LEGACY &&
1592                     tmp != MLX5_XMETA_MODE_META16 &&
1593                     tmp != MLX5_XMETA_MODE_META32) {
1594                         DRV_LOG(ERR, "invalid extensive "
1595                                      "metadata parameter");
1596                         rte_errno = EINVAL;
1597                         return -rte_errno;
1598                 }
1599                 config->dv_xmeta_en = tmp;
1600         } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1601                 config->mr_ext_memseg_en = !!tmp;
1602         } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1603                 config->max_dump_files_num = tmp;
1604         } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1605                 config->lro.timeout = tmp;
1606         } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1607                 DRV_LOG(DEBUG, "class argument is %s.", val);
1608         } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1609                 config->log_hp_size = tmp;
1610         } else {
1611                 DRV_LOG(WARNING, "%s: unknown parameter", key);
1612                 rte_errno = EINVAL;
1613                 return -rte_errno;
1614         }
1615         return 0;
1616 }
1617
1618 /**
1619  * Parse device parameters.
1620  *
1621  * @param config
1622  *   Pointer to device configuration structure.
1623  * @param devargs
1624  *   Device arguments structure.
1625  *
1626  * @return
1627  *   0 on success, a negative errno value otherwise and rte_errno is set.
1628  */
1629 static int
1630 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1631 {
1632         const char **params = (const char *[]){
1633                 MLX5_RXQ_CQE_COMP_EN,
1634                 MLX5_RXQ_CQE_PAD_EN,
1635                 MLX5_RXQ_PKT_PAD_EN,
1636                 MLX5_RX_MPRQ_EN,
1637                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1638                 MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1639                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1640                 MLX5_RXQS_MIN_MPRQ,
1641                 MLX5_TXQ_INLINE,
1642                 MLX5_TXQ_INLINE_MIN,
1643                 MLX5_TXQ_INLINE_MAX,
1644                 MLX5_TXQ_INLINE_MPW,
1645                 MLX5_TXQS_MIN_INLINE,
1646                 MLX5_TXQS_MAX_VEC,
1647                 MLX5_TXQ_MPW_EN,
1648                 MLX5_TXQ_MPW_HDR_DSEG_EN,
1649                 MLX5_TXQ_MAX_INLINE_LEN,
1650                 MLX5_TX_DB_NC,
1651                 MLX5_TX_VEC_EN,
1652                 MLX5_RX_VEC_EN,
1653                 MLX5_L3_VXLAN_EN,
1654                 MLX5_VF_NL_EN,
1655                 MLX5_DV_ESW_EN,
1656                 MLX5_DV_FLOW_EN,
1657                 MLX5_DV_XMETA_EN,
1658                 MLX5_MR_EXT_MEMSEG_EN,
1659                 MLX5_REPRESENTOR,
1660                 MLX5_MAX_DUMP_FILES_NUM,
1661                 MLX5_LRO_TIMEOUT_USEC,
1662                 MLX5_CLASS_ARG_NAME,
1663                 MLX5_HP_BUF_SIZE,
1664                 NULL,
1665         };
1666         struct rte_kvargs *kvlist;
1667         int ret = 0;
1668         int i;
1669
1670         if (devargs == NULL)
1671                 return 0;
1672         /* The UGLY cast in the params initializer above passes checkpatch. */
1673         kvlist = rte_kvargs_parse(devargs->args, params);
1674         if (kvlist == NULL) {
1675                 rte_errno = EINVAL;
1676                 return -rte_errno;
1677         }
1678         /* Process parameters. */
1679         for (i = 0; (params[i] != NULL); ++i) {
1680                 if (rte_kvargs_count(kvlist, params[i])) {
1681                         ret = rte_kvargs_process(kvlist, params[i],
1682                                                  mlx5_args_check, config);
1683                         if (ret) {
1684                                 rte_errno = EINVAL;
1685                                 rte_kvargs_free(kvlist);
1686                                 return -rte_errno;
1687                         }
1688                 }
1689         }
1690         rte_kvargs_free(kvlist);
1691         return 0;
1692 }
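
/*
 * Devargs usage sketch (illustrative values, not taken from this file):
 * the keys listed above arrive as one comma-separated string, e.g. on
 * the testpmd command line:
 *
 *   testpmd -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=204
 *
 * rte_kvargs_parse() splits the string into key/value pairs and
 * rte_kvargs_process() feeds each pair to mlx5_args_check().
 */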
1693
1694 static struct rte_pci_driver mlx5_driver;
1695
1696 /**
1697  * PMD global initialization.
1698  *
1699  * Independent of any individual device, this function initializes global
1700  * per-PMD data structures, distinguishing primary and secondary processes.
1701  * Hence, it is called once per process.
1702  *
1703  * @return
1704  *   0 on success, a negative errno value otherwise and rte_errno is set.
1705  */
1706 static int
1707 mlx5_init_once(void)
1708 {
1709         struct mlx5_shared_data *sd;
1710         struct mlx5_local_data *ld = &mlx5_local_data;
1711         int ret = 0;
1712
1713         if (mlx5_init_shared_data())
1714                 return -rte_errno;
1715         sd = mlx5_shared_data;
1716         MLX5_ASSERT(sd);
1717         rte_spinlock_lock(&sd->lock);
1718         switch (rte_eal_process_type()) {
1719         case RTE_PROC_PRIMARY:
1720                 if (sd->init_done)
1721                         break;
1722                 LIST_INIT(&sd->mem_event_cb_list);
1723                 rte_rwlock_init(&sd->mem_event_rwlock);
1724                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
1725                                                 mlx5_mr_mem_event_cb, NULL);
1726                 ret = mlx5_mp_init_primary(MLX5_MP_NAME,
1727                                            mlx5_mp_primary_handle);
1728                 if (ret)
1729                         goto out;
1730                 sd->init_done = true;
1731                 break;
1732         case RTE_PROC_SECONDARY:
1733                 if (ld->init_done)
1734                         break;
1735                 ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
1736                                              mlx5_mp_secondary_handle);
1737                 if (ret)
1738                         goto out;
1739                 ++sd->secondary_cnt;
1740                 ld->init_done = true;
1741                 break;
1742         default:
1743                 break;
1744         }
1745 out:
1746         rte_spinlock_unlock(&sd->lock);
1747         return ret;
1748 }
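
/*
 * Resulting IPC topology (a sketch based on mlx5_common_mp): the primary
 * process registers an IPC action under MLX5_MP_NAME and serves requests
 * such as MLX5_MP_REQ_VERBS_CMD_FD coming from secondaries, while each
 * secondary process registers its own handler and increments
 * sd->secondary_cnt so the primary knows there are peers to broadcast to.
 */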
1749
1750 /**
1751  * Configures the minimal amount of data to inline into WQE
1752  * while sending packets.
1753  *
1754  * - the txq_inline_min key, if present in devargs, takes
1755  *   the highest priority.
1756  * - if DevX is enabled, the inline mode is queried from the
1757  *   device (HCA attributes and NIC vport context if needed).
1758  * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
1759  *   and none (0 bytes) for other NICs.
1760  *
1761  * @param spawn
1762  *   Verbs device parameters (name, port, switch_info) to spawn.
1763  * @param config
1764  *   Device configuration parameters.
1765  */
1766 static void
1767 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
1768                     struct mlx5_dev_config *config)
1769 {
1770         if (config->txq_inline_min != MLX5_ARG_UNSET) {
1771                 /* Application defines size of inlined data explicitly. */
1772                 switch (spawn->pci_dev->id.device_id) {
1773                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1774                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1775                         if (config->txq_inline_min <
1776                                        (int)MLX5_INLINE_HSIZE_L2) {
1777                                 DRV_LOG(DEBUG,
1778                                         "txq_inline_min aligned to minimal"
1779                                         " ConnectX-4 required value %d",
1780                                         (int)MLX5_INLINE_HSIZE_L2);
1781                                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1782                         }
1783                         break;
1784                 }
1785                 goto exit;
1786         }
1787         if (config->hca_attr.eth_net_offloads) {
1788                 /* We have DevX enabled, inline mode queried successfully. */
1789                 switch (config->hca_attr.wqe_inline_mode) {
1790                 case MLX5_CAP_INLINE_MODE_L2:
1791                         /* outer L2 header must be inlined. */
1792                         config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1793                         goto exit;
1794                 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
1795                         /* No inline data are required by NIC. */
1796                         config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1797                         config->hw_vlan_insert =
1798                                 config->hca_attr.wqe_vlan_insert;
1799                         DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
1800                         goto exit;
1801                 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
1802                         /* inline mode is defined by NIC vport context. */
1803                         if (!config->hca_attr.eth_virt)
1804                                 break;
1805                         switch (config->hca_attr.vport_inline_mode) {
1806                         case MLX5_INLINE_MODE_NONE:
1807                                 config->txq_inline_min =
1808                                         MLX5_INLINE_HSIZE_NONE;
1809                                 goto exit;
1810                         case MLX5_INLINE_MODE_L2:
1811                                 config->txq_inline_min =
1812                                         MLX5_INLINE_HSIZE_L2;
1813                                 goto exit;
1814                         case MLX5_INLINE_MODE_IP:
1815                                 config->txq_inline_min =
1816                                         MLX5_INLINE_HSIZE_L3;
1817                                 goto exit;
1818                         case MLX5_INLINE_MODE_TCP_UDP:
1819                                 config->txq_inline_min =
1820                                         MLX5_INLINE_HSIZE_L4;
1821                                 goto exit;
1822                         case MLX5_INLINE_MODE_INNER_L2:
1823                                 config->txq_inline_min =
1824                                         MLX5_INLINE_HSIZE_INNER_L2;
1825                                 goto exit;
1826                         case MLX5_INLINE_MODE_INNER_IP:
1827                                 config->txq_inline_min =
1828                                         MLX5_INLINE_HSIZE_INNER_L3;
1829                                 goto exit;
1830                         case MLX5_INLINE_MODE_INNER_TCP_UDP:
1831                                 config->txq_inline_min =
1832                                         MLX5_INLINE_HSIZE_INNER_L4;
1833                                 goto exit;
1834                         }
1835                 }
1836         }
1837         /*
1838          * We get here if we are unable to deduce
1839          * the inline data size with DevX. Try the PCI ID
1840          * to identify older NICs.
1841          */
1842         switch (spawn->pci_dev->id.device_id) {
1843         case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1844         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1845         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
1846         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
1847                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1848                 config->hw_vlan_insert = 0;
1849                 break;
1850         case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
1851         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
1852         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
1853         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
1854                 /*
1855                  * These NICs support VLAN insertion from WQE and
1856                  * report the wqe_vlan_insert flag. But there is a bug
1857                  * that may break PFC control, so the feature is disabled.
1858                  */
1859                 config->hw_vlan_insert = 0;
1860                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1861                 break;
1862         default:
1863                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1864                 break;
1865         }
1866 exit:
1867         DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
1868 }
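
/*
 * Example of the resulting setting (values follow the description above):
 * on a ConnectX-4 Lx with no DevX attributes available, the PCI ID branch
 * selects MLX5_INLINE_HSIZE_L2, i.e. 18 bytes covering the L2 header with
 * VLAN, so every Tx WQE inlines at least the Ethernet header.
 */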
1869
1870 /**
1871  * Configures the metadata mask fields in the shared context.
1872  *
1873  * @param [in] dev
1874  *   Pointer to Ethernet device.
1875  */
1876 static void
1877 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
1878 {
1879         struct mlx5_priv *priv = dev->data->dev_private;
1880         struct mlx5_ibv_shared *sh = priv->sh;
1881         uint32_t meta, mark, reg_c0;
1882
1883         reg_c0 = ~priv->vport_meta_mask;
1884         switch (priv->config.dv_xmeta_en) {
1885         case MLX5_XMETA_MODE_LEGACY:
1886                 meta = UINT32_MAX;
1887                 mark = MLX5_FLOW_MARK_MASK;
1888                 break;
1889         case MLX5_XMETA_MODE_META16:
1890                 meta = reg_c0 >> rte_bsf32(reg_c0);
1891                 mark = MLX5_FLOW_MARK_MASK;
1892                 break;
1893         case MLX5_XMETA_MODE_META32:
1894                 meta = UINT32_MAX;
1895                 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
1896                 break;
1897         default:
1898                 meta = 0;
1899                 mark = 0;
1900                 MLX5_ASSERT(false);
1901                 break;
1902         }
1903         if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
1904                 DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
1905                                  sh->dv_mark_mask, mark);
1906         else
1907                 sh->dv_mark_mask = mark;
1908         if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
1909                 DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
1910                                  sh->dv_meta_mask, meta);
1911         else
1912                 sh->dv_meta_mask = meta;
1913         if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
1914                 DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
1915                                  sh->dv_regc0_mask, reg_c0);
1916         else
1917                 sh->dv_regc0_mask = reg_c0;
1918         DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
1919         DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
1920         DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
1921         DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
1922 }
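
/*
 * Worked example for MLX5_XMETA_MODE_META16 above (mask value assumed):
 * with priv->vport_meta_mask == 0x0000ffff the split of REG_C_0 is
 * reg_c0 = ~0x0000ffff = 0xffff0000, rte_bsf32(reg_c0) = 16 and
 * meta = 0xffff0000 >> 16 = 0x0000ffff, i.e. the upper 16 bits carry
 * the flow metadata while the lower 16 bits match the vport.
 */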
1923
1924 /**
1925  * Allocate page of door-bells and register it using DevX API.
1926  *
1927  * @param [in] dev
1928  *   Pointer to Ethernet device.
1929  *
1930  * @return
1931  *   Pointer to new page on success, NULL otherwise.
1932  */
1933 static struct mlx5_devx_dbr_page *
1934 mlx5_alloc_dbr_page(struct rte_eth_dev *dev)
1935 {
1936         struct mlx5_priv *priv = dev->data->dev_private;
1937         struct mlx5_devx_dbr_page *page;
1938
1939         /* Allocate space for door-bell page and management data. */
1940         page = rte_calloc_socket(__func__, 1, sizeof(struct mlx5_devx_dbr_page),
1941                                  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1942         if (!page) {
1943                 DRV_LOG(ERR, "port %u cannot allocate dbr page",
1944                         dev->data->port_id);
1945                 return NULL;
1946         }
1947         /* Register allocated memory. */
1948         page->umem = mlx5_glue->devx_umem_reg(priv->sh->ctx, page->dbrs,
1949                                               MLX5_DBR_PAGE_SIZE, 0);
1950         if (!page->umem) {
1951                 DRV_LOG(ERR, "port %u cannot umem reg dbr page",
1952                         dev->data->port_id);
1953                 rte_free(page);
1954                 return NULL;
1955         }
1956         return page;
1957 }
1958
1959 /**
1960  * Find the next available door-bell, allocate new page if needed.
1961  *
1962  * @param [in] dev
1963  *   Pointer to Ethernet device.
1964  * @param [out] dbr_page
1965  *   Door-bell page containing the page data.
1966  *
1967  * @return
1968  *   Door-bell address offset on success, a negative error value otherwise.
1969  */
1970 int64_t
1971 mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page)
1972 {
1973         struct mlx5_priv *priv = dev->data->dev_private;
1974         struct mlx5_devx_dbr_page *page = NULL;
1975         uint32_t i, j;
1976
1977         LIST_FOREACH(page, &priv->dbrpgs, next)
1978                 if (page->dbr_count < MLX5_DBR_PER_PAGE)
1979                         break;
1980         if (!page) { /* No page with free door-bell exists. */
1981                 page = mlx5_alloc_dbr_page(dev);
1982                 if (!page) /* Failed to allocate new page. */
1983                         return (-1);
1984                 LIST_INSERT_HEAD(&priv->dbrpgs, page, next);
1985         }
1986         /* Loop to find bitmap part with clear bit. */
1987         for (i = 0;
1988              i < MLX5_DBR_BITMAP_SIZE && page->dbr_bitmap[i] == UINT64_MAX;
1989              i++)
1990                 ; /* Empty. */
1991         MLX5_ASSERT(i < (MLX5_DBR_PER_PAGE / 64));
1992         /* Find the first clear bit. */
1993         j = rte_bsf64(~page->dbr_bitmap[i]);
1994         page->dbr_bitmap[i] |= (UINT64_C(1) << j);
1995         page->dbr_count++;
1996         *dbr_page = page;
1997         return (((i * 64) + j) * sizeof(uint64_t));
1998 }
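
/*
 * Worked example for the returned offset (bitmap state assumed): if the
 * first bitmap word with a clear bit is i = 1 and rte_bsf64() yields
 * j = 5, the door-bell index is 1 * 64 + 5 = 69 and the returned byte
 * offset is 69 * sizeof(uint64_t) = 552 within the MLX5_DBR_PAGE_SIZE
 * page registered by mlx5_alloc_dbr_page().
 */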
1999
2000 /**
2001  * Release a door-bell record.
2002  *
2003  * @param [in] dev
2004  *   Pointer to Ethernet device.
2005  * @param [in] umem_id
2006  *   UMEM ID of page containing the door-bell record to release.
2007  * @param [in] offset
2008  *   Offset of door-bell record in page.
2009  *
2010  * @return
2011  *   0 on success, a negative error value otherwise.
2012  */
2013 int32_t
2014 mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset)
2015 {
2016         struct mlx5_priv *priv = dev->data->dev_private;
2017         struct mlx5_devx_dbr_page *page = NULL;
2018         int ret = 0;
2019
2020         LIST_FOREACH(page, &priv->dbrpgs, next)
2021                 /* Find the page this address belongs to. */
2022                 if (page->umem->umem_id == umem_id)
2023                         break;
2024         if (!page)
2025                 return -EINVAL;
2026         page->dbr_count--;
2027         if (!page->dbr_count) {
2028                 /* Page not used, free it and remove from list. */
2029                 LIST_REMOVE(page, next);
2030                 if (page->umem)
2031                         ret = -mlx5_glue->devx_umem_dereg(page->umem);
2032                 rte_free(page);
2033         } else {
2034                 /* Mark in bitmap that this door-bell is not in use. */
2035                 offset /= MLX5_DBR_SIZE;
2036                 int i = offset / 64;
2037                 int j = offset % 64;
2038
2039                 page->dbr_bitmap[i] &= ~(UINT64_C(1) << j);
2040         }
2041         return ret;
2042 }
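
/*
 * Typical pairing (a sketch, queue code differs in details): the offset
 * and page obtained from mlx5_get_dbr() are kept by the queue object and
 * handed back here on teardown:
 *
 *   struct mlx5_devx_dbr_page *dbr_page;
 *   int64_t off = mlx5_get_dbr(dev, &dbr_page);
 *   ... use dbr_page->dbrs + off as the queue door-bell ...
 *   mlx5_release_dbr(dev, dbr_page->umem->umem_id, off);
 */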
2043
2044 int
2045 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
2046 {
2047         static const char *const dynf_names[] = {
2048                 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
2049                 RTE_MBUF_DYNFLAG_METADATA_NAME
2050         };
2051         unsigned int i;
2052
2053         if (n < RTE_DIM(dynf_names))
2054                 return -ENOMEM;
2055         for (i = 0; i < RTE_DIM(dynf_names); i++) {
2056                 if (names[i] == NULL)
2057                         return -EINVAL;
2058                 strcpy(names[i], dynf_names[i]);
2059         }
2060         return RTE_DIM(dynf_names);
2061 }
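
/*
 * Caller-side sketch (buffer sizes are illustrative): the caller owns the
 * name storage and must size every entry for the longest flag name:
 *
 *   char buf[4][RTE_MBUF_DYN_NAMESIZE];
 *   char *names[4] = { buf[0], buf[1], buf[2], buf[3] };
 *   int n = rte_pmd_mlx5_get_dyn_flag_names(names, 4);
 *
 * A negative value means the array is too small (-ENOMEM) or an entry is
 * NULL (-EINVAL); otherwise n is the number of names actually filled in.
 */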
2062
2063 /**
2064  * Check sibling device configurations.
2065  *
2066  * Sibling devices sharing the Infiniband device context
2067  * should have compatible configurations. This applies to
2068  * representors and bonding slaves.
2069  *
2070  * @param priv
2071  *   Private device descriptor.
2072  * @param config
2073  *   Configuration of the device to be created.
2074  *
2075  * @return
2076  *   0 on success, EINVAL otherwise
2077  */
2078 static int
2079 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
2080                               struct mlx5_dev_config *config)
2081 {
2082         struct mlx5_ibv_shared *sh = priv->sh;
2083         struct mlx5_dev_config *sh_conf = NULL;
2084         uint16_t port_id;
2085
2086         MLX5_ASSERT(sh);
2087         /* Nothing to compare for the single/first device. */
2088         if (sh->refcnt == 1)
2089                 return 0;
2090         /* Find the device with shared context. */
2091         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2092                 struct mlx5_priv *opriv =
2093                         rte_eth_devices[port_id].data->dev_private;
2094
2095                 if (opriv && opriv != priv && opriv->sh == sh) {
2096                         sh_conf = &opriv->config;
2097                         break;
2098                 }
2099         }
2100         if (!sh_conf)
2101                 return 0;
2102         if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
2103                 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
2104                              " for shared %s context", sh->ibdev_name);
2105                 rte_errno = EINVAL;
2106                 return rte_errno;
2107         }
2108         if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
2109                 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
2110                              " for shared %s context", sh->ibdev_name);
2111                 rte_errno = EINVAL;
2112                 return rte_errno;
2113         }
2114         return 0;
2115 }

2116 /**
2117  * Spawn an Ethernet device from Verbs information.
2118  *
2119  * @param dpdk_dev
2120  *   Backing DPDK device.
2121  * @param spawn
2122  *   Verbs device parameters (name, port, switch_info) to spawn.
2123  * @param config
2124  *   Device configuration parameters.
2125  *
2126  * @return
2127  *   A valid Ethernet device object on success, NULL otherwise and rte_errno
2128  *   is set. The following errors are defined:
2129  *
2130  *   EBUSY: device is not supposed to be spawned.
2131  *   EEXIST: device is already spawned
2132  */
2133 static struct rte_eth_dev *
2134 mlx5_dev_spawn(struct rte_device *dpdk_dev,
2135                struct mlx5_dev_spawn_data *spawn,
2136                struct mlx5_dev_config config)
2137 {
2138         const struct mlx5_switch_info *switch_info = &spawn->info;
2139         struct mlx5_ibv_shared *sh = NULL;
2140         struct ibv_port_attr port_attr;
2141         struct mlx5dv_context dv_attr = { .comp_mask = 0 };
2142         struct rte_eth_dev *eth_dev = NULL;
2143         struct mlx5_priv *priv = NULL;
2144         int err = 0;
2145         unsigned int hw_padding = 0;
2146         unsigned int mps;
2147         unsigned int cqe_comp;
2148         unsigned int cqe_pad = 0;
2149         unsigned int tunnel_en = 0;
2150         unsigned int mpls_en = 0;
2151         unsigned int swp = 0;
2152         unsigned int mprq = 0;
2153         unsigned int mprq_min_stride_size_n = 0;
2154         unsigned int mprq_max_stride_size_n = 0;
2155         unsigned int mprq_min_stride_num_n = 0;
2156         unsigned int mprq_max_stride_num_n = 0;
2157         struct rte_ether_addr mac;
2158         char name[RTE_ETH_NAME_MAX_LEN];
2159         int own_domain_id = 0;
2160         uint16_t port_id;
2161         unsigned int i;
2162 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2163         struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
2164 #endif
2165
2166         /* Determine if this port representor is supposed to be spawned. */
2167         if (switch_info->representor && dpdk_dev->devargs) {
2168                 struct rte_eth_devargs eth_da;
2169
2170                 err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
2171                 if (err) {
2172                         rte_errno = -err;
2173                         DRV_LOG(ERR, "failed to process device arguments: %s",
2174                                 strerror(rte_errno));
2175                         return NULL;
2176                 }
2177                 for (i = 0; i < eth_da.nb_representor_ports; ++i)
2178                         if (eth_da.representor_ports[i] ==
2179                             (uint16_t)switch_info->port_name)
2180                                 break;
2181                 if (i == eth_da.nb_representor_ports) {
2182                         rte_errno = EBUSY;
2183                         return NULL;
2184                 }
2185         }
2186         /* Build device name. */
2187         if (spawn->pf_bond < 0) {
2188                 /* Single device. */
2189                 if (!switch_info->representor)
2190                         strlcpy(name, dpdk_dev->name, sizeof(name));
2191                 else
2192                         snprintf(name, sizeof(name), "%s_representor_%u",
2193                                  dpdk_dev->name, switch_info->port_name);
2194         } else {
2195                 /* Bonding device. */
2196                 if (!switch_info->representor)
2197                         snprintf(name, sizeof(name), "%s_%s",
2198                                  dpdk_dev->name, spawn->ibv_dev->name);
2199                 else
2200                         snprintf(name, sizeof(name), "%s_%s_representor_%u",
2201                                  dpdk_dev->name, spawn->ibv_dev->name,
2202                                  switch_info->port_name);
2203         }
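        /*
         * Resulting name examples (illustrative): "0000:03:00.0" for a
         * plain PF, "0000:03:00.0_representor_2" for the representor of
         * VF 2, and "0000:03:00.0_mlx5_bond_0_representor_2" when PF
         * bonding is in effect and the IB device is named "mlx5_bond_0".
         */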
2204         /* Check if the device is already spawned. */
2205         if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
2206                 rte_errno = EEXIST;
2207                 return NULL;
2208         }
2209         DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
2210         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
2211                 struct mlx5_mp_id mp_id;
2212
2213                 eth_dev = rte_eth_dev_attach_secondary(name);
2214                 if (eth_dev == NULL) {
2215                         DRV_LOG(ERR, "can not attach rte ethdev");
2216                         rte_errno = ENOMEM;
2217                         return NULL;
2218                 }
2219                 eth_dev->device = dpdk_dev;
2220                 eth_dev->dev_ops = &mlx5_dev_sec_ops;
2221                 err = mlx5_proc_priv_init(eth_dev);
2222                 if (err)
2223                         return NULL;
2224                 mp_id.port_id = eth_dev->data->port_id;
2225                 strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2226                 /* Receive command fd from primary process */
2227                 err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
2228                 if (err < 0)
2229                         return NULL;
2230                 /* Remap UAR for Tx queues. */
2231                 err = mlx5_tx_uar_init_secondary(eth_dev, err);
2232                 if (err)
2233                         return NULL;
2234                 /*
2235                  * Ethdev pointer is still required as input since
2236                  * the primary device is not accessible from the
2237                  * secondary process.
2238                  */
2239                 eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
2240                 eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
2241                 return eth_dev;
2242         }
2243         /*
2244          * Some parameters ("tx_db_nc" in particular) are needed in
2245          * advance to create the dv/verbs device context. We process
2246          * the devargs here to get them, and process the devargs again
2247          * later to override some hardware settings.
2248          */
2249         err = mlx5_args(&config, dpdk_dev->devargs);
2250         if (err) {
2251                 err = rte_errno;
2252                 DRV_LOG(ERR, "failed to process device arguments: %s",
2253                         strerror(rte_errno));
2254                 goto error;
2255         }
2256         sh = mlx5_alloc_shared_ibctx(spawn, &config);
2257         if (!sh)
2258                 return NULL;
2259         config.devx = sh->devx;
2260 #ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
2261         config.dest_tir = 1;
2262 #endif
2263 #ifdef HAVE_IBV_MLX5_MOD_SWP
2264         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
2265 #endif
2266         /*
2267          * Multi-packet send is supported by ConnectX-4 Lx PF as well
2268          * as all ConnectX-5 devices.
2269          */
2270 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2271         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
2272 #endif
2273 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2274         dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
2275 #endif
2276         mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
2277         if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
2278                 if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
2279                         DRV_LOG(DEBUG, "enhanced MPW is supported");
2280                         mps = MLX5_MPW_ENHANCED;
2281                 } else {
2282                         DRV_LOG(DEBUG, "MPW is supported");
2283                         mps = MLX5_MPW;
2284                 }
2285         } else {
2286                 DRV_LOG(DEBUG, "MPW isn't supported");
2287                 mps = MLX5_MPW_DISABLED;
2288         }
2289 #ifdef HAVE_IBV_MLX5_MOD_SWP
2290         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
2291                 swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
2292         DRV_LOG(DEBUG, "SWP support: %u", swp);
2293 #endif
2294         config.swp = !!swp;
2295 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
2296         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
2297                 struct mlx5dv_striding_rq_caps mprq_caps =
2298                         dv_attr.striding_rq_caps;
2299
2300                 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
2301                         mprq_caps.min_single_stride_log_num_of_bytes);
2302                 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
2303                         mprq_caps.max_single_stride_log_num_of_bytes);
2304                 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
2305                         mprq_caps.min_single_wqe_log_num_of_strides);
2306                 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
2307                         mprq_caps.max_single_wqe_log_num_of_strides);
2308                 DRV_LOG(DEBUG, "\tsupported_qpts: %d",
2309                         mprq_caps.supported_qpts);
2310                 DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
2311                 mprq = 1;
2312                 mprq_min_stride_size_n =
2313                         mprq_caps.min_single_stride_log_num_of_bytes;
2314                 mprq_max_stride_size_n =
2315                         mprq_caps.max_single_stride_log_num_of_bytes;
2316                 mprq_min_stride_num_n =
2317                         mprq_caps.min_single_wqe_log_num_of_strides;
2318                 mprq_max_stride_num_n =
2319                         mprq_caps.max_single_wqe_log_num_of_strides;
2320         }
2321 #endif
2322         if (RTE_CACHE_LINE_SIZE == 128 &&
2323             !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
2324                 cqe_comp = 0;
2325         else
2326                 cqe_comp = 1;
2327         config.cqe_comp = cqe_comp;
2328 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
2329         /* Whether device supports 128B Rx CQE padding. */
2330         cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
2331                   (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
2332 #endif
2333 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
2334         if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
2335                 tunnel_en = ((dv_attr.tunnel_offloads_caps &
2336                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
2337                              (dv_attr.tunnel_offloads_caps &
2338                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
2339                              (dv_attr.tunnel_offloads_caps &
2340                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
2341         }
2342         DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
2343                 tunnel_en ? "" : "not ");
2344 #else
2345         DRV_LOG(WARNING,
2346                 "tunnel offloading disabled due to old OFED/rdma-core version");
2347 #endif
2348         config.tunnel_en = tunnel_en;
2349 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
2350         mpls_en = ((dv_attr.tunnel_offloads_caps &
2351                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
2352                    (dv_attr.tunnel_offloads_caps &
2353                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
2354         DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
2355                 mpls_en ? "" : "not ");
2356 #else
2357         DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
2358                 " old OFED/rdma-core version or firmware configuration");
2359 #endif
2360         config.mpls_en = mpls_en;
2361         /* Check port status. */
2362         err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr);
2363         if (err) {
2364                 DRV_LOG(ERR, "port query failed: %s", strerror(err));
2365                 goto error;
2366         }
2367         if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
2368                 DRV_LOG(ERR, "port is not configured in Ethernet mode");
2369                 err = EINVAL;
2370                 goto error;
2371         }
2372         if (port_attr.state != IBV_PORT_ACTIVE)
2373                 DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
2374                         mlx5_glue->port_state_str(port_attr.state),
2375                         port_attr.state);
2376         /* Allocate private eth device data. */
2377         priv = rte_zmalloc("ethdev private structure",
2378                            sizeof(*priv),
2379                            RTE_CACHE_LINE_SIZE);
2380         if (priv == NULL) {
2381                 DRV_LOG(ERR, "priv allocation failure");
2382                 err = ENOMEM;
2383                 goto error;
2384         }
2385         priv->sh = sh;
2386         priv->ibv_port = spawn->ibv_port;
2387         priv->pci_dev = spawn->pci_dev;
2388         priv->mtu = RTE_ETHER_MTU;
2389         priv->mp_id.port_id = port_id;
2390         strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
2391 #ifndef RTE_ARCH_64
2392         /* Initialize UAR access locks for 32bit implementations. */
2393         rte_spinlock_init(&priv->uar_lock_cq);
2394         for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
2395                 rte_spinlock_init(&priv->uar_lock[i]);
2396 #endif
2397         /* Some internal functions rely on Netlink sockets, open them now. */
2398         priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
2399         priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
2400         priv->representor = !!switch_info->representor;
2401         priv->master = !!switch_info->master;
2402         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
2403         priv->vport_meta_tag = 0;
2404         priv->vport_meta_mask = 0;
2405         priv->pf_bond = spawn->pf_bond;
2406 #ifdef HAVE_MLX5DV_DR_DEVX_PORT
2407         /*
2408          * The DevX port query API is implemented. E-Switch may use
2409          * either vport or reg_c[0] metadata register to match on
2410          * vport index. The engaged part of metadata register is
2411          * defined by mask.
2412          */
2413         if (switch_info->representor || switch_info->master) {
2414                 devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
2415                                       MLX5DV_DEVX_PORT_MATCH_REG_C_0;
2416                 err = mlx5_glue->devx_port_query(sh->ctx, spawn->ibv_port,
2417                                                  &devx_port);
2418                 if (err) {
2419                         DRV_LOG(WARNING,
2420                                 "can't query devx port %d on device %s",
2421                                 spawn->ibv_port, spawn->ibv_dev->name);
2422                         devx_port.comp_mask = 0;
2423                 }
2424         }
2425         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
2426                 priv->vport_meta_tag = devx_port.reg_c_0.value;
2427                 priv->vport_meta_mask = devx_port.reg_c_0.mask;
2428                 if (!priv->vport_meta_mask) {
2429                         DRV_LOG(ERR, "vport zero mask for port %d"
2430                                      " on bonding device %s",
2431                                      spawn->ibv_port, spawn->ibv_dev->name);
2432                         err = ENOTSUP;
2433                         goto error;
2434                 }
2435                 if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
2436                         DRV_LOG(ERR, "invalid vport tag for port %d"
2437                                      " on bonding device %s",
2438                                      spawn->ibv_port, spawn->ibv_dev->name);
2439                         err = ENOTSUP;
2440                         goto error;
2441                 }
2442         }
2443         if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
2444                 priv->vport_id = devx_port.vport_num;
2445         } else if (spawn->pf_bond >= 0) {
2446                 DRV_LOG(ERR, "can't deduce vport index for port %d"
2447                              " on bonding device %s",
2448                              spawn->ibv_port, spawn->ibv_dev->name);
2449                 err = ENOTSUP;
2450                 goto error;
2451         } else {
2452                 /* Deduce the vport index in a compatible way. */
2453                 priv->vport_id = switch_info->representor ?
2454                                  switch_info->port_name + 1 : -1;
2455         }
2456 #else
2457         /*
2458          * The kernel/rdma_core supports single E-Switch per PF
2459          * configurations only and the vport_id field contains the
2460          * vport index for the associated VF, which is deduced from
2461          * the representor port name. For example, given IB device
2462          * port 10 with the attached network device eth0, whose port
2463          * name attribute is pf0vf2, we can deduce the VF number as 2
2464          * and set the vport index as 3 (2 + 1). This assignment
2465          * scheme should be changed if multiple E-Switch instances
2466          * per PF and/or PCI subfunctions are added.
2467          */
2468         priv->vport_id = switch_info->representor ?
2469                          switch_info->port_name + 1 : -1;
2470 #endif
2471         /* representor_id field keeps the unmodified VF index. */
2472         priv->representor_id = switch_info->representor ?
2473                                switch_info->port_name : -1;
2474         /*
2475          * Look for sibling devices in order to reuse their switch domain
2476          * if any, otherwise allocate one.
2477          */
2478         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
2479                 const struct mlx5_priv *opriv =
2480                         rte_eth_devices[port_id].data->dev_private;
2481
2482                 if (!opriv ||
2483                     opriv->sh != priv->sh ||
2484                     opriv->domain_id ==
2485                     RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
2486                         continue;
2487                 priv->domain_id = opriv->domain_id;
2488                 break;
2489         }
2490         if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
2491                 err = rte_eth_switch_domain_alloc(&priv->domain_id);
2492                 if (err) {
2493                         err = rte_errno;
2494                         DRV_LOG(ERR, "unable to allocate switch domain: %s",
2495                                 strerror(rte_errno));
2496                         goto error;
2497                 }
2498                 own_domain_id = 1;
2499         }
2500         /* Override some values set by hardware configuration. */
2501         mlx5_args(&config, dpdk_dev->devargs);
2502         err = mlx5_dev_check_sibling_config(priv, &config);
2503         if (err)
2504                 goto error;
2505         config.hw_csum = !!(sh->device_attr.device_cap_flags_ex &
2506                             IBV_DEVICE_RAW_IP_CSUM);
2507         DRV_LOG(DEBUG, "checksum offloading is %ssupported",
2508                 (config.hw_csum ? "" : "not "));
2509 #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
2510         !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
2511         DRV_LOG(DEBUG, "counters are not supported");
2512 #endif
2513 #if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
2514         if (config.dv_flow_en) {
2515                 DRV_LOG(WARNING, "DV flow is not supported");
2516                 config.dv_flow_en = 0;
2517         }
2518 #endif
2519         config.ind_table_max_size =
2520                 sh->device_attr.rss_caps.max_rwq_indirection_table_size;
2521         /*
2522          * Remove this check once DPDK supports larger/variable
2523          * indirection tables.
2524          */
2525         if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
2526                 config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
2527         DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
2528                 config.ind_table_max_size);
2529         config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
2530                                   IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
2531         DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
2532                 (config.hw_vlan_strip ? "" : "not "));
2533         config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
2534                                  IBV_RAW_PACKET_CAP_SCATTER_FCS);
2535         DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
2536                 (config.hw_fcs_strip ? "" : "not "));
2537 #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
2538         hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
2539 #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
2540         hw_padding = !!(sh->device_attr.device_cap_flags_ex &
2541                         IBV_DEVICE_PCI_WRITE_END_PADDING);
2542 #endif
2543         if (config.hw_padding && !hw_padding) {
2544                 DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
2545                 config.hw_padding = 0;
2546         } else if (config.hw_padding) {
2547                 DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
2548         }
2549         config.tso = (sh->device_attr.tso_caps.max_tso > 0 &&
2550                       (sh->device_attr.tso_caps.supported_qpts &
2551                        (1 << IBV_QPT_RAW_PACKET)));
2552         if (config.tso)
2553                 config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso;
2554         /*
2555          * MPW is disabled by default, while the Enhanced MPW is enabled
2556          * by default.
2557          */
2558         if (config.mps == MLX5_ARG_UNSET)
2559                 config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
2560                                                           MLX5_MPW_DISABLED;
2561         else
2562                 config.mps = config.mps ? mps : MLX5_MPW_DISABLED;
2563         DRV_LOG(INFO, "%sMPS is %s",
2564                 config.mps == MLX5_MPW_ENHANCED ? "enhanced " :
2565                 config.mps == MLX5_MPW ? "legacy " : "",
2566                 config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
2567         if (config.cqe_comp && !cqe_comp) {
2568                 DRV_LOG(WARNING, "Rx CQE compression isn't supported");
2569                 config.cqe_comp = 0;
2570         }
2571         if (config.cqe_pad && !cqe_pad) {
2572                 DRV_LOG(WARNING, "Rx CQE padding isn't supported");
2573                 config.cqe_pad = 0;
2574         } else if (config.cqe_pad) {
2575                 DRV_LOG(INFO, "Rx CQE padding is enabled");
2576         }
2577         if (config.devx) {
2578                 priv->counter_fallback = 0;
2579                 err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr);
2580                 if (err) {
2581                         err = -err;
2582                         goto error;
2583                 }
2584                 if (!config.hca_attr.flow_counters_dump)
2585                         priv->counter_fallback = 1;
2586 #ifndef HAVE_IBV_DEVX_ASYNC
2587                 priv->counter_fallback = 1;
2588 #endif
2589                 if (priv->counter_fallback)
2590                         DRV_LOG(INFO, "Use fall-back DV counter management");
2591                 /* Check for LRO support. */
2592                 if (config.dest_tir && config.hca_attr.lro_cap &&
2593                     config.dv_flow_en) {
2594                         /* TBD check tunnel lro caps. */
2595                         config.lro.supported = config.hca_attr.lro_cap;
2596                         DRV_LOG(DEBUG, "Device supports LRO");
2597                         /*
2598                          * If LRO timeout is not configured by application,
2599                          * use the minimal supported value.
2600                          */
2601                         if (!config.lro.timeout)
2602                                 config.lro.timeout =
2603                                 config.hca_attr.lro_timer_supported_periods[0];
2604                         DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
2605                                 config.lro.timeout);
2606                 }
2607 #if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
2608                 if (config.hca_attr.qos.sup && config.hca_attr.qos.srtcm_sup &&
2609                     config.dv_flow_en) {
2610                         uint8_t reg_c_mask =
2611                                 config.hca_attr.qos.flow_meter_reg_c_ids;
2612                         /*
2613                          * Meter needs two REG_C's for color match and pre-sfx
2614                          * flow match. Here we get the REG_C for color match.
2615                          * REG_C_0 and REG_C_1 are reserved for the metadata feature.
2616                          */
2617                         reg_c_mask &= 0xfc;
2618                         if (__builtin_popcount(reg_c_mask) < 1) {
2619                                 priv->mtr_en = 0;
2620                                 DRV_LOG(WARNING, "No available register for"
2621                                         " meter.");
2622                         } else {
2623                                 priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
2624                                                       REG_C_0;
2625                                 priv->mtr_en = 1;
2626                                 priv->mtr_reg_share =
2627                                       config.hca_attr.qos.flow_meter_reg_share;
2628                                 DRV_LOG(DEBUG, "The REG_C used by the meter is %d",
2629                                         priv->mtr_color_reg);
2630                         }
2631                 }
2632 #endif
2633         }
2634         if (config.mprq.enabled && mprq) {
2635                 if (config.mprq.stride_num_n &&
2636                     (config.mprq.stride_num_n > mprq_max_stride_num_n ||
2637                      config.mprq.stride_num_n < mprq_min_stride_num_n)) {
2638                         config.mprq.stride_num_n =
2639                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
2640                                                 mprq_min_stride_num_n),
2641                                         mprq_max_stride_num_n);
2642                         DRV_LOG(WARNING,
2643                                 "the number of strides"
2644                                 " for Multi-Packet RQ is out of range,"
2645                                 " setting default value (%u)",
2646                                 1 << config.mprq.stride_num_n);
2647                 }
2648                 if (config.mprq.stride_size_n &&
2649                     (config.mprq.stride_size_n > mprq_max_stride_size_n ||
2650                      config.mprq.stride_size_n < mprq_min_stride_size_n)) {
2651                         config.mprq.stride_size_n =
2652                                 RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
2653                                                 mprq_min_stride_size_n),
2654                                         mprq_max_stride_size_n);
2655                         DRV_LOG(WARNING,
2656                                 "the size of a stride"
2657                                 " for Multi-Packet RQ is out of range,"
2658                                 " setting default value (%u)",
2659                                 1 << config.mprq.stride_size_n);
2660                 }
2661                 config.mprq.min_stride_size_n = mprq_min_stride_size_n;
2662                 config.mprq.max_stride_size_n = mprq_max_stride_size_n;
2663         } else if (config.mprq.enabled && !mprq) {
2664                 DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
2665                 config.mprq.enabled = 0;
2666         }
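        /*
         * Clamping example (capability values assumed): with
         * mprq_min_stride_num_n = 3 and mprq_max_stride_num_n = 16, a
         * devargs value mprq_log_stride_num=20 is out of range and is
         * replaced by RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, 3), 16),
         * i.e. the built-in default bounded by the device limits, with a
         * warning logged.
         */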
2667         if (config.max_dump_files_num == 0)
2668                 config.max_dump_files_num = 128;
2669         eth_dev = rte_eth_dev_allocate(name);
2670         if (eth_dev == NULL) {
2671                 DRV_LOG(ERR, "can not allocate rte ethdev");
2672                 err = ENOMEM;
2673                 goto error;
2674         }
2675         /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
2676         eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
2677         if (priv->representor) {
2678                 eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
2679                 eth_dev->data->representor_id = priv->representor_id;
2680         }
2681         /*
2682          * Store associated network device interface index. This index
2683          * is permanent throughout the lifetime of the device, so the
2684          * ifindex may be stored here and the cached value used later.
2685          */
2686         MLX5_ASSERT(spawn->ifindex);
2687         priv->if_index = spawn->ifindex;
2688         eth_dev->data->dev_private = priv;
2689         priv->dev_data = eth_dev->data;
2690         eth_dev->data->mac_addrs = priv->mac;
2691         eth_dev->device = dpdk_dev;
2692         /* Configure the first MAC address by default. */
2693         if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
2694                 DRV_LOG(ERR,
2695                         "port %u cannot get MAC address, is mlx5_en"
2696                         " loaded? (errno: %s)",
2697                         eth_dev->data->port_id, strerror(rte_errno));
2698                 err = ENODEV;
2699                 goto error;
2700         }
2701         DRV_LOG(INFO,
2702                 "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
2703                 eth_dev->data->port_id,
2704                 mac.addr_bytes[0], mac.addr_bytes[1],
2705                 mac.addr_bytes[2], mac.addr_bytes[3],
2706                 mac.addr_bytes[4], mac.addr_bytes[5]);
2707 #ifdef RTE_LIBRTE_MLX5_DEBUG
2708         {
2709                 char ifname[IF_NAMESIZE];
2710
2711                 if (mlx5_get_ifname(eth_dev, &ifname) == 0)
2712                         DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
2713                                 eth_dev->data->port_id, ifname);
2714                 else
2715                         DRV_LOG(DEBUG, "port %u ifname is unknown",
2716                                 eth_dev->data->port_id);
2717         }
2718 #endif
2719         /* Get actual MTU if possible. */
2720         err = mlx5_get_mtu(eth_dev, &priv->mtu);
2721         if (err) {
2722                 err = rte_errno;
2723                 goto error;
2724         }
2725         DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
2726                 priv->mtu);
2727         /* Initialize burst functions to prevent crashes before link-up. */
2728         eth_dev->rx_pkt_burst = removed_rx_burst;
2729         eth_dev->tx_pkt_burst = removed_tx_burst;
2730         eth_dev->dev_ops = &mlx5_dev_ops;
2731         /* Register MAC address. */
2732         claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
2733         if (config.vf && config.vf_nl_en)
2734                 mlx5_nl_mac_addr_sync(priv->nl_socket_route,
2735                                       mlx5_ifindex(eth_dev),
2736                                       eth_dev->data->mac_addrs,
2737                                       MLX5_MAX_MAC_ADDRESSES);
2738         TAILQ_INIT(&priv->flows);
2739         TAILQ_INIT(&priv->ctrl_flows);
2740         TAILQ_INIT(&priv->flow_meters);
2741         TAILQ_INIT(&priv->flow_meter_profiles);
2742         /* Hint libmlx5 to use PMD allocator for data plane resources. */
2743         struct mlx5dv_ctx_allocators alctr = {
2744                 .alloc = &mlx5_alloc_verbs_buf,
2745                 .free = &mlx5_free_verbs_buf,
2746                 .data = priv,
2747         };
2748         mlx5_glue->dv_set_context_attr(sh->ctx,
2749                                        MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
2750                                        (void *)((uintptr_t)&alctr));
2751         /* Bring Ethernet device up. */
2752         DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
2753                 eth_dev->data->port_id);
2754         mlx5_set_link_up(eth_dev);
2755         /*
2756          * Even though the interrupt handler is not installed yet,
2757          * interrupts will still trigger on the async_fd from
2758          * Verbs context returned by ibv_open_device().
2759          */
2760         mlx5_link_update(eth_dev, 0);
2761 #ifdef HAVE_MLX5DV_DR_ESWITCH
2762         if (!(config.hca_attr.eswitch_manager && config.dv_flow_en &&
2763               (switch_info->representor || switch_info->master)))
2764                 config.dv_esw_en = 0;
2765 #else
2766         config.dv_esw_en = 0;
2767 #endif
2768         /* Detect minimal data bytes to inline. */
2769         mlx5_set_min_inline(spawn, &config);
2770         /* Store device configuration on private structure. */
2771         priv->config = config;
2772         /* Create context for virtual machine VLAN workaround. */
2773         priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
2774         if (config.dv_flow_en) {
2775                 err = mlx5_alloc_shared_dr(priv);
2776                 if (err)
2777                         goto error;
2778                 /*
2779                  * RSS id is shared with meter flow id. Meter flow id can only
2780                  * use the 24 MSB of the register.
2781                  */
2782                 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
2783                                      MLX5_MTR_COLOR_BITS);
2784                 if (!priv->qrss_id_pool) {
2785                         DRV_LOG(ERR, "can't create flow id pool");
2786                         err = ENOMEM;
2787                         goto error;
2788                 }
2789         }
2790         /* Supported Verbs flow priority number detection. */
2791         err = mlx5_flow_discover_priorities(eth_dev);
2792         if (err < 0) {
2793                 err = -err;
2794                 goto error;
2795         }
2796         priv->config.flow_prio = err;
2797         if (!priv->config.dv_esw_en &&
2798             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2799                 DRV_LOG(WARNING, "metadata mode %u is not supported "
2800                                  "(no E-Switch)", priv->config.dv_xmeta_en);
2801                 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
2802         }
2803         mlx5_set_metadata_mask(eth_dev);
2804         if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2805             !priv->sh->dv_regc0_mask) {
2806                 DRV_LOG(ERR, "metadata mode %u is not supported "
2807                              "(no metadata reg_c[0] is available)",
2808                              priv->config.dv_xmeta_en);
2809                 err = ENOTSUP;
2810                 goto error;
2811         }
2812         /*
2813          * Allocate the buffer for flow creation, just once.
2814          * The allocation must be done before any flow is created.
2815          */
2816         mlx5_flow_alloc_intermediate(eth_dev);
2817         /* Query availability of metadata reg_c's. */
2818         err = mlx5_flow_discover_mreg_c(eth_dev);
2819         if (err < 0) {
2820                 err = -err;
2821                 goto error;
2822         }
2823         if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
2824                 DRV_LOG(DEBUG,
2825                         "port %u extensive metadata register is not supported",
2826                         eth_dev->data->port_id);
2827                 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
2828                         DRV_LOG(ERR, "metadata mode %u is not supported "
2829                                      "(no metadata registers available)",
2830                                      priv->config.dv_xmeta_en);
2831                         err = ENOTSUP;
2832                         goto error;
2833                 }
2834         }
2835         if (priv->config.dv_flow_en &&
2836             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
2837             mlx5_flow_ext_mreg_supported(eth_dev) &&
2838             priv->sh->dv_regc0_mask) {
2839                 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
2840                                                       MLX5_FLOW_MREG_HTABLE_SZ);
2841                 if (!priv->mreg_cp_tbl) {
2842                         err = ENOMEM;
2843                         goto error;
2844                 }
2845         }
2846         return eth_dev;
2847 error:
2848         if (priv) {
2849                 if (priv->mreg_cp_tbl)
2850                         mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
2851                 if (priv->sh)
2852                         mlx5_free_shared_dr(priv);
2853                 if (priv->nl_socket_route >= 0)
2854                         close(priv->nl_socket_route);
2855                 if (priv->nl_socket_rdma >= 0)
2856                         close(priv->nl_socket_rdma);
2857                 if (priv->vmwa_context)
2858                         mlx5_vlan_vmwa_exit(priv->vmwa_context);
2859                 if (priv->qrss_id_pool)
2860                         mlx5_flow_id_pool_release(priv->qrss_id_pool);
2861                 if (own_domain_id)
2862                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
2863                 rte_free(priv);
2864                 if (eth_dev != NULL)
2865                         eth_dev->data->dev_private = NULL;
2866         }
2867         if (eth_dev != NULL) {
2868                 /* mac_addrs must not be freed alone because it is part of dev_private. */
2869                 eth_dev->data->mac_addrs = NULL;
2870                 rte_eth_dev_release_port(eth_dev);
2871         }
2872         if (sh)
2873                 mlx5_free_shared_ibctx(sh);
2874         MLX5_ASSERT(err > 0);
2875         rte_errno = err;
2876         return NULL;
2877 }
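
/*
 * Caller-side sketch (added for exposition, not part of the original
 * driver): how a mlx5_dev_spawn() result is expected to be interpreted.
 * It mirrors the handling in mlx5_pci_probe() below, where EBUSY/EEXIST
 * mean the port is disabled or already spawned and are skipped.
 */
static int __rte_unused
mlx5_dev_spawn_check_example(struct rte_eth_dev *eth_dev)
{
	if (eth_dev != NULL)
		return 0;
	/* Non-fatal: the device is disabled or was already spawned. */
	if (rte_errno == EBUSY || rte_errno == EEXIST)
		return 1;
	/* Fatal: mlx5_dev_spawn() set rte_errno to a positive errno. */
	return -rte_errno;
}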
2878
2879 /**
2880  * Comparison callback to sort device data.
2881  *
2882  * This is meant to be used with qsort().
2883  *
2884  * @param[in] a
2885  *   Pointer to a pointer to the first data object.
2886  * @param[in] b
2887  *   Pointer to a pointer to the second data object.
2888  *
2889  * @return
2890  *   0 if both objects are equal, less than 0 if the first argument is less
2891  *   than the second, greater than 0 otherwise.
2892  */
2893 static int
2894 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
2895 {
2896         const struct mlx5_switch_info *si_a =
2897                 &((const struct mlx5_dev_spawn_data *)a)->info;
2898         const struct mlx5_switch_info *si_b =
2899                 &((const struct mlx5_dev_spawn_data *)b)->info;
2900         int ret;
2901
2902         /* Master device first. */
2903         ret = si_b->master - si_a->master;
2904         if (ret)
2905                 return ret;
2906         /* Then representor devices. */
2907         ret = si_b->representor - si_a->representor;
2908         if (ret)
2909                 return ret;
2910         /* Unidentified devices come last in no specific order. */
2911         if (!si_a->representor)
2912                 return 0;
2913         /* Order representors by name. */
2914         return si_a->port_name - si_b->port_name;
2915 }
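
/*
 * Usage sketch (for exposition only): the comparator above is meant for
 * qsort() over the spawn data array and yields the master first, then
 * representors ordered by port name, with unidentified devices last.
 * This mirrors the actual call in mlx5_pci_probe() below.
 */
static void __rte_unused
mlx5_dev_spawn_data_sort_example(struct mlx5_dev_spawn_data *list,
				 unsigned int ns)
{
	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
}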
2916
2917 /**
2918  * Match PCI information for possible slaves of bonding device.
2919  *
2920  * Match PCI information for possible slaves of a bonding device.
2921  *   Pointer to Infiniband device structure.
2922  * @param[in] pci_dev
2923  *   Pointer to PCI device structure to match PCI address.
2924  * @param[in] nl_rdma
2925  *   Netlink RDMA group socket handle.
2926  *
2927  * @return
2928  *   negative value if no bonding device found, otherwise
2929  *   non-negative index of the slave PF in the bonding.
2930  */
2931 static int
2932 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
2933                            const struct rte_pci_device *pci_dev,
2934                            int nl_rdma)
2935 {
2936         char ifname[IF_NAMESIZE + 1];
2937         unsigned int ifindex;
2938         unsigned int np, i;
2939         FILE *file = NULL;
2940         int pf = -1;
2941
2942         /*
2943          * Try to get the master device name. If something goes
2944          * wrong, assume there is no kernel support and no
2945          * bonding devices.
2946          */
2947         if (nl_rdma < 0)
2948                 return -1;
2949         if (!strstr(ibv_dev->name, "bond"))
2950                 return -1;
2951         np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
2952         if (!np)
2953                 return -1;
2954         /*
2955          * The master device might not be on the predefined
2956          * port (port index 1 is not guaranteed), so we have
2957          * to scan all Infiniband device ports and find the
2958          * master.
2959          */
2960         for (i = 1; i <= np; ++i) {
2961                 /* Check whether Infiniband port is populated. */
2962                 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
2963                 if (!ifindex)
2964                         continue;
2965                 if (!if_indextoname(ifindex, ifname))
2966                         continue;
2967                 /* Try to read bonding slave names from sysfs. */
2968                 MKSTR(slaves,
2969                       "/sys/class/net/%s/master/bonding/slaves", ifname);
2970                 file = fopen(slaves, "r");
2971                 if (file)
2972                         break;
2973         }
2974         if (!file)
2975                 return -1;
2976         /* Use safe format to check maximal buffer length. */
2977         MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
2978         while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
2979                 char tmp_str[IF_NAMESIZE + 32];
2980                 struct rte_pci_addr pci_addr;
2981                 struct mlx5_switch_info info;
2982
2983                 /* Process slave interface names in the loop. */
2984                 snprintf(tmp_str, sizeof(tmp_str),
2985                          "/sys/class/net/%s", ifname);
2986                 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
2987                         DRV_LOG(WARNING, "can not get PCI address"
2988                                          " for netdev \"%s\"", ifname);
2989                         continue;
2990                 }
2991                 if (pci_dev->addr.domain != pci_addr.domain ||
2992                     pci_dev->addr.bus != pci_addr.bus ||
2993                     pci_dev->addr.devid != pci_addr.devid ||
2994                     pci_dev->addr.function != pci_addr.function)
2995                         continue;
2996                 /* Slave interface PCI address match found. */
2997                 fclose(file);
2998                 snprintf(tmp_str, sizeof(tmp_str),
2999                          "/sys/class/net/%s/phys_port_name", ifname);
3000                 file = fopen(tmp_str, "rb");
3001                 if (!file)
3002                         break;
3003                 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
3004                 if (fscanf(file, "%32s", tmp_str) == 1)
3005                         mlx5_translate_port_name(tmp_str, &info);
3006                 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
3007                     info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
3008                         pf = info.port_name;
3009                 break;
3010         }
3011         if (file)
3012                 fclose(file);
3013         return pf;
3014 }
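
/*
 * Helper sketch (hypothetical, added for exposition): the full PCI BDF
 * comparison that is open-coded both in the function above and in
 * mlx5_pci_probe() below, shown here as a single predicate.
 */
static int __rte_unused
mlx5_pci_addr_match_example(const struct rte_pci_addr *a,
			    const struct rte_pci_addr *b)
{
	return a->domain == b->domain && a->bus == b->bus &&
	       a->devid == b->devid && a->function == b->function;
}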
3015
3016 /**
3017  * DPDK callback to register a PCI device.
3018  *
3019  * This function spawns Ethernet devices out of a given PCI device.
3020  *
3021  * @param[in] pci_drv
3022  *   PCI driver structure (mlx5_driver).
3023  * @param[in] pci_dev
3024  *   PCI device information.
3025  *
3026  * @return
3027  *   0 on success, a negative errno value otherwise and rte_errno is set.
3028  */
3029 static int
3030 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
3031                struct rte_pci_device *pci_dev)
3032 {
3033         struct ibv_device **ibv_list;
3034         /*
3035          * Number of found IB devices matching the requested PCI BDF.
3036          * nd != 1 means there are multiple IB devices over the same
3037          * PCI device and we have representors and master.
3038          */
3039         unsigned int nd = 0;
3040         /*
3041          * Number of found IB device ports. nd = 1 and np = 1..n means
3042          * there is a single multiport IB device, and there may be
3043          * representors attached to some of the found ports.
3044          */
3045         unsigned int np = 0;
3046         /*
3047          * Number of DPDK Ethernet devices to spawn, either over
3048          * multiple IB devices or multiple ports of a single IB device.
3049          * Effectively this is the number of spawn iterations.
3050          */
3051         unsigned int ns = 0;
3052         /*
3053          * Bonding device
3054          *   < 0 - no bonding device (single one)
3055          *  >= 0 - bonding device (value is slave PF index)
3056          */
3057         int bd = -1;
3058         struct mlx5_dev_spawn_data *list = NULL;
3059         struct mlx5_dev_config dev_config;
3060         int ret;
3061
3062         if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_NET) {
3063                 DRV_LOG(DEBUG, "Skip probing - should be probed by another"
3064                         " mlx5 driver.");
3065                 return 1;
3066         }
3067         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
3068                 mlx5_pmd_socket_init();
3069         ret = mlx5_init_once();
3070         if (ret) {
3071                 DRV_LOG(ERR, "unable to init PMD global data: %s",
3072                         strerror(rte_errno));
3073                 return -rte_errno;
3074         }
3075         MLX5_ASSERT(pci_drv == &mlx5_driver);
3076         errno = 0;
3077         ibv_list = mlx5_glue->get_device_list(&ret);
3078         if (!ibv_list) {
3079                 rte_errno = errno ? errno : ENOSYS;
3080                 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
3081                 return -rte_errno;
3082         }
3083         /*
3084          * First scan the list of all Infiniband devices to find
3085          * matching ones, gathering them into the match list.
3086          */
3087         struct ibv_device *ibv_match[ret + 1];
3088         int nl_route = mlx5_nl_init(NETLINK_ROUTE);
3089         int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
3090         unsigned int i;
3091
3092         while (ret-- > 0) {
3093                 struct rte_pci_addr pci_addr;
3094
3095                 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
3096                 bd = mlx5_device_bond_pci_match
3097                                 (ibv_list[ret], pci_dev, nl_rdma);
3098                 if (bd >= 0) {
3099                         /*
3100                          * Bonding device detected. Only one match is allowed;
3101                          * bonding is supported over a multi-port IB device,
3102                          * so there should be no matches on representor PCI
3103                          * functions or non VF LAG bonding devices with the
3104                          * specified address.
3105                          */
3106                         if (nd) {
3107                                 DRV_LOG(ERR,
3108                                         "multiple PCI match on bonding device"
3109                                         " \"%s\" found", ibv_list[ret]->name);
3110                                 rte_errno = ENOENT;
3111                                 ret = -rte_errno;
3112                                 goto exit;
3113                         }
3114                         DRV_LOG(INFO, "PCI information matches for"
3115                                       " slave %d bonding device \"%s\"",
3116                                       bd, ibv_list[ret]->name);
3117                         ibv_match[nd++] = ibv_list[ret];
3118                         break;
3119                 }
3120                 if (mlx5_dev_to_pci_addr
3121                         (ibv_list[ret]->ibdev_path, &pci_addr))
3122                         continue;
3123                 if (pci_dev->addr.domain != pci_addr.domain ||
3124                     pci_dev->addr.bus != pci_addr.bus ||
3125                     pci_dev->addr.devid != pci_addr.devid ||
3126                     pci_dev->addr.function != pci_addr.function)
3127                         continue;
3128                 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
3129                         ibv_list[ret]->name);
3130                 ibv_match[nd++] = ibv_list[ret];
3131         }
3132         ibv_match[nd] = NULL;
3133         if (!nd) {
3134                 /* No device matches, just complain and bail out. */
3135                 DRV_LOG(WARNING,
3136                         "no Verbs device matches PCI device " PCI_PRI_FMT ","
3137                         " are kernel drivers loaded?",
3138                         pci_dev->addr.domain, pci_dev->addr.bus,
3139                         pci_dev->addr.devid, pci_dev->addr.function);
3140                 rte_errno = ENOENT;
3141                 ret = -rte_errno;
3142                 goto exit;
3143         }
3144         if (nd == 1) {
3145                 /*
3146                  * The single matching device found may have multiple ports.
3147                  * Each port may be a representor, so we have to check the
3148                  * port number and the existence of representors.
3149                  */
3150                 if (nl_rdma >= 0)
3151                         np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
3152                 if (!np)
3153                         DRV_LOG(WARNING, "can not get the number of ports"
3154                                          " for IB device \"%s\"", ibv_match[0]->name);
3155                 if (bd >= 0 && !np) {
3156                         DRV_LOG(ERR, "can not get ports"
3157                                      " for bonding device");
3158                         rte_errno = ENOENT;
3159                         ret = -rte_errno;
3160                         goto exit;
3161                 }
3162         }
3163 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
3164         if (bd >= 0) {
3165                 /*
3166                  * This may happen if there is VF LAG kernel support and
3167                  * the application is compiled with an older rdma-core library.
3168                  */
3169                 DRV_LOG(ERR,
3170                         "No kernel/verbs support for VF LAG bonding found.");
3171                 rte_errno = ENOTSUP;
3172                 ret = -rte_errno;
3173                 goto exit;
3174         }
3175 #endif
3176         /*
3177          * Now we can determine the maximal
3178          * number of devices to be spawned.
3179          */
3180         list = rte_zmalloc("device spawn data",
3181                          sizeof(struct mlx5_dev_spawn_data) *
3182                          (np ? np : nd),
3183                          RTE_CACHE_LINE_SIZE);
3184         if (!list) {
3185                 DRV_LOG(ERR, "spawn data array allocation failure");
3186                 rte_errno = ENOMEM;
3187                 ret = -rte_errno;
3188                 goto exit;
3189         }
3190         if (bd >= 0 || np > 1) {
3191                 /*
3192                  * Single IB device with multiple ports found,
3193                  * it may be an E-Switch master device with representors.
3194                  * We have to perform identification through the ports.
3195                  */
3196                 MLX5_ASSERT(nl_rdma >= 0);
3197                 MLX5_ASSERT(ns == 0);
3198                 MLX5_ASSERT(nd == 1);
3199                 MLX5_ASSERT(np);
3200                 for (i = 1; i <= np; ++i) {
3201                         list[ns].max_port = np;
3202                         list[ns].ibv_port = i;
3203                         list[ns].ibv_dev = ibv_match[0];
3204                         list[ns].eth_dev = NULL;
3205                         list[ns].pci_dev = pci_dev;
3206                         list[ns].pf_bond = bd;
3207                         list[ns].ifindex = mlx5_nl_ifindex
3208                                         (nl_rdma, list[ns].ibv_dev->name, i);
3209                         if (!list[ns].ifindex) {
3210                                 /*
3211                                  * No network interface index found for the
3212                                  * specified port, which means there is no
3213                                  * representor on this port. It's OK,
3214                                  * there can be disabled ports, for example
3215                                  * if sriov_numvfs < sriov_totalvfs.
3216                                  */
3217                                 continue;
3218                         }
3219                         ret = -1;
3220                         if (nl_route >= 0)
3221                                 ret = mlx5_nl_switch_info
3222                                                (nl_route,
3223                                                 list[ns].ifindex,
3224                                                 &list[ns].info);
3225                         if (ret || (!list[ns].info.representor &&
3226                                     !list[ns].info.master)) {
3227                                 /*
3228                                  * We failed to recognize representors with
3229                                  * Netlink, let's try to perform the task
3230                                  * with sysfs.
3231                                  */
3232                                 ret = mlx5_sysfs_switch_info
3233                                                 (list[ns].ifindex,
3234                                                  &list[ns].info);
3235                         }
3236                         if (!ret && bd >= 0) {
3237                                 switch (list[ns].info.name_type) {
3238                                 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
3239                                         if (list[ns].info.port_name == bd)
3240                                                 ns++;
3241                                         break;
3242                                 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
3243                                         if (list[ns].info.pf_num == bd)
3244                                                 ns++;
3245                                         break;
3246                                 default:
3247                                         break;
3248                                 }
3249                                 continue;
3250                         }
3251                         if (!ret && (list[ns].info.representor ^
3252                                      list[ns].info.master))
3253                                 ns++;
3254                 }
3255                 if (!ns) {
3256                         DRV_LOG(ERR,
3257                                 "unable to recognize master/representors"
3258                                 " on the IB device with multiple ports");
3259                         rte_errno = ENOENT;
3260                         ret = -rte_errno;
3261                         goto exit;
3262                 }
3263         } else {
3264                 /*
3265                  * The existence of several matching entries (nd > 1) means
3266                  * port representors have been instantiated. No existing Verbs
3267                  * call nor sysfs entries can tell them apart, this can only
3268                  * be done through Netlink calls assuming kernel drivers are
3269                  * recent enough to support them.
3270                  *
3271                  * In the event of identification failure through Netlink,
3272                  * try again through sysfs, then:
3273                  *
3274                  * 1. A single IB device matches (nd == 1) with single
3275                  *    port (np=0/1) and is not a representor, assume
3276                  *    no switch support.
3277                  *
3278                  * 2. Otherwise no safe assumptions can be made;
3279                  *    complain louder and bail out.
3280                  */
3281                 np = 1;
3282                 for (i = 0; i != nd; ++i) {
3283                         memset(&list[ns].info, 0, sizeof(list[ns].info));
3284                         list[ns].max_port = 1;
3285                         list[ns].ibv_port = 1;
3286                         list[ns].ibv_dev = ibv_match[i];
3287                         list[ns].eth_dev = NULL;
3288                         list[ns].pci_dev = pci_dev;
3289                         list[ns].pf_bond = -1;
3290                         list[ns].ifindex = 0;
3291                         if (nl_rdma >= 0)
3292                                 list[ns].ifindex = mlx5_nl_ifindex
3293                                         (nl_rdma, list[ns].ibv_dev->name, 1);
3294                         if (!list[ns].ifindex) {
3295                                 char ifname[IF_NAMESIZE];
3296
3297                                 /*
3298                                  * Netlink failed, which may happen with an
3299                                  * old ib_core kernel driver (before 4.16).
3300                                  * We can assume the driver is old because
3301                                  * here we are processing single-port IB
3302                                  * devices. Let's try sysfs to retrieve
3303                                  * the ifindex. The method works for the
3304                                  * master device only.
3305                                  */
3306                                 if (nd > 1) {
3307                                         /*
3308                                          * Multiple devices found; assume
3309                                          * representors. We can not distinguish
3310                                          * a master from a representor, nor
3311                                          * retrieve the ifindex via sysfs.
3312                                          */
3313                                         continue;
3314                                 }
3315                                 ret = mlx5_get_master_ifname
3316                                         (ibv_match[i]->ibdev_path, &ifname);
3317                                 if (!ret)
3318                                         list[ns].ifindex =
3319                                                 if_nametoindex(ifname);
3320                                 if (!list[ns].ifindex) {
3321                                         /*
3322                                          * No network interface index found
3323                                          * for the specified device, which means
3324                                          * it is neither a representor
3325                                          * nor a master.
3326                                          */
3327                                         continue;
3328                                 }
3329                         }
3330                         ret = -1;
3331                         if (nl_route >= 0)
3332                                 ret = mlx5_nl_switch_info
3333                                                (nl_route,
3334                                                 list[ns].ifindex,
3335                                                 &list[ns].info);
3336                         if (ret || (!list[ns].info.representor &&
3337                                     !list[ns].info.master)) {
3338                                 /*
3339                                  * We failed to recognize representors with
3340                                  * Netlink, let's try to perform the task
3341                                  * with sysfs.
3342                                  */
3343                                 ret = mlx5_sysfs_switch_info
3344                                                 (list[ns].ifindex,
3345                                                  &list[ns].info);
3346                         }
3347                         if (!ret && (list[ns].info.representor ^
3348                                      list[ns].info.master)) {
3349                                 ns++;
3350                         } else if ((nd == 1) &&
3351                                    !list[ns].info.representor &&
3352                                    !list[ns].info.master) {
3353                                 /*
3354                                  * Single IB device with
3355                                  * one physical port and
3356                                  * attached network device.
3357                                  * Maybe SR-IOV is not enabled
3358                                  * or there are no representors.
3359                                  */
3360                                 DRV_LOG(INFO, "no E-Switch support detected");
3361                                 ns++;
3362                                 break;
3363                         }
3364                 }
3365                 if (!ns) {
3366                         DRV_LOG(ERR,
3367                                 "unable to recognize master/representors"
3368                                 " on the multiple IB devices");
3369                         rte_errno = ENOENT;
3370                         ret = -rte_errno;
3371                         goto exit;
3372                 }
3373         }
3374         MLX5_ASSERT(ns);
3375         /*
3376          * Sort list to probe devices in natural order for users' convenience
3377          * (i.e. master first, then representors from lowest to highest ID).
3378          */
3379         qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
3380         /* Default configuration. */
3381         dev_config = (struct mlx5_dev_config){
3382                 .hw_padding = 0,
3383                 .mps = MLX5_ARG_UNSET,
3384                 .dbnc = MLX5_ARG_UNSET,
3385                 .rx_vec_en = 1,
3386                 .txq_inline_max = MLX5_ARG_UNSET,
3387                 .txq_inline_min = MLX5_ARG_UNSET,
3388                 .txq_inline_mpw = MLX5_ARG_UNSET,
3389                 .txqs_inline = MLX5_ARG_UNSET,
3390                 .vf_nl_en = 1,
3391                 .mr_ext_memseg_en = 1,
3392                 .mprq = {
3393                         .enabled = 0, /* Disabled by default. */
3394                         .stride_num_n = 0,
3395                         .stride_size_n = 0,
3396                         .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
3397                         .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
3398                 },
3399                 .dv_esw_en = 1,
3400                 .dv_flow_en = 1,
3401                 .log_hp_size = MLX5_ARG_UNSET,
3402         };
3403         /* Device specific configuration. */
3404         switch (pci_dev->id.device_id) {
3405         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
3406         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
3407         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
3408         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
3409         case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
3410         case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
3411         case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
3412                 dev_config.vf = 1;
3413                 break;
3414         default:
3415                 break;
3416         }
3417         for (i = 0; i != ns; ++i) {
3418                 uint32_t restore;
3419
3420                 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
3421                                                  &list[i],
3422                                                  dev_config);
3423                 if (!list[i].eth_dev) {
3424                         if (rte_errno != EBUSY && rte_errno != EEXIST)
3425                                 break;
3426                         /* Device is disabled or already spawned. Ignore it. */
3427                         continue;
3428                 }
3429                 restore = list[i].eth_dev->data->dev_flags;
3430                 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
3431                 /* Restore non-PCI flags cleared by the above call. */
3432                 list[i].eth_dev->data->dev_flags |= restore;
3433                 mlx5_dev_interrupt_handler_devx_install(list[i].eth_dev);
3434                 rte_eth_dev_probing_finish(list[i].eth_dev);
3435         }
3436         if (i != ns) {
3437                 DRV_LOG(ERR,
3438                         "probe of PCI device " PCI_PRI_FMT " aborted after"
3439                         " encountering an error: %s",
3440                         pci_dev->addr.domain, pci_dev->addr.bus,
3441                         pci_dev->addr.devid, pci_dev->addr.function,
3442                         strerror(rte_errno));
3443                 ret = -rte_errno;
3444                 /* Roll back. */
3445                 while (i--) {
3446                         if (!list[i].eth_dev)
3447                                 continue;
3448                         mlx5_dev_close(list[i].eth_dev);
3449                         /* mac_addrs must not be freed because it is in dev_private. */
3450                         list[i].eth_dev->data->mac_addrs = NULL;
3451                         claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
3452                 }
3453                 /* Restore original error. */
3454                 rte_errno = -ret;
3455         } else {
3456                 ret = 0;
3457         }
3458 exit:
3459         /*
3460          * Do the routine cleanup:
3461          * - close opened Netlink sockets
3462          * - free allocated spawn data array
3463          * - free the Infiniband device list
3464          */
3465         if (nl_rdma >= 0)
3466                 close(nl_rdma);
3467         if (nl_route >= 0)
3468                 close(nl_route);
3469         if (list)
3470                 rte_free(list);
3471         MLX5_ASSERT(ibv_list);
3472         mlx5_glue->free_device_list(ibv_list);
3473         return ret;
3474 }
3475
3476 /**
3477  * Look for the ethernet device belonging to mlx5 driver.
3478  *
3479  * @param[in] port_id
3480  *   Port ID to start looking from.
3481  * @param[in] pci_dev
3482  *   Pointer to the hint PCI device. While a device is being probed,
3483  *   its siblings (the master and preceding representors) might not
3484  *   have an assigned driver yet, because mlx5_pci_probe() has not
3485  *   completed; in this case the match on the hint PCI device can be
3486  *   used to detect the sibling device.
3487  *
3488  * @return
3489  *   port_id of the found device, RTE_MAX_ETHPORTS if not found.
3490  */
3491 uint16_t
3492 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
3493 {
3494         while (port_id < RTE_MAX_ETHPORTS) {
3495                 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
3496
3497                 if (dev->state != RTE_ETH_DEV_UNUSED &&
3498                     dev->device &&
3499                     (dev->device == &pci_dev->device ||
3500                      (dev->device->driver &&
3501                      dev->device->driver->name &&
3502                      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
3503                         break;
3504                 port_id++;
3505         }
3506         if (port_id >= RTE_MAX_ETHPORTS)
3507                 return RTE_MAX_ETHPORTS;
3508         return port_id;
3509 }
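
/*
 * Usage sketch (illustrative, not part of the original driver): iterating
 * over all mlx5 ports belonging to a given PCI device by chaining
 * mlx5_eth_find_next() calls.
 */
static void __rte_unused
mlx5_eth_foreach_example(struct rte_pci_device *pci_dev)
{
	uint16_t port_id;

	for (port_id = mlx5_eth_find_next(0, pci_dev);
	     port_id < RTE_MAX_ETHPORTS;
	     port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
		DRV_LOG(DEBUG, "found mlx5 port %u", port_id);
}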
3510
3511 /**
3512  * DPDK callback to remove a PCI device.
3513  *
3514  * This function removes all Ethernet devices belonging to a given PCI device.
3515  *
3516  * @param[in] pci_dev
3517  *   Pointer to the PCI device.
3518  *
3519  * @return
3520  *   0 on success, the function cannot fail.
3521  */
3522 static int
3523 mlx5_pci_remove(struct rte_pci_device *pci_dev)
3524 {
3525         uint16_t port_id;
3526
3527         RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device)
3528                 rte_eth_dev_close(port_id);
3529         return 0;
3530 }
3531
3532 static const struct rte_pci_id mlx5_pci_id_map[] = {
3533         {
3534                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3535                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
3536         },
3537         {
3538                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3539                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
3540         },
3541         {
3542                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3543                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
3544         },
3545         {
3546                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3547                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
3548         },
3549         {
3550                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3551                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
3552         },
3553         {
3554                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3555                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
3556         },
3557         {
3558                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3559                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
3560         },
3561         {
3562                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3563                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
3564         },
3565         {
3566                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3567                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
3568         },
3569         {
3570                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3571                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
3572         },
3573         {
3574                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3575                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
3576         },
3577         {
3578                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3579                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
3580         },
3581         {
3582                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3583                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
3584         },
3585         {
3586                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3587                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
3588         },
3589         {
3590                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
3591                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
3592         },
3593         {
3594                 .vendor_id = 0
3595         }
3596 };
3597
3598 static struct rte_pci_driver mlx5_driver = {
3599         .driver = {
3600                 .name = MLX5_DRIVER_NAME
3601         },
3602         .id_table = mlx5_pci_id_map,
3603         .probe = mlx5_pci_probe,
3604         .remove = mlx5_pci_remove,
3605         .dma_map = mlx5_dma_map,
3606         .dma_unmap = mlx5_dma_unmap,
3607         .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
3608                      RTE_PCI_DRV_PROBE_AGAIN,
3609 };
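
/*
 * Note (exposition): RTE_PCI_DRV_PROBE_AGAIN allows EAL to invoke
 * mlx5_pci_probe() again on an already probed PCI device, e.g. to spawn
 * additional representors at runtime.
 */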
3610
3611 /**
3612  * Driver initialization routine.
3613  */
3614 RTE_INIT(rte_mlx5_pmd_init)
3615 {
3616         /* Initialize driver log type. */
3617         mlx5_logtype = rte_log_register("pmd.net.mlx5");
3618         if (mlx5_logtype >= 0)
3619                 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
3620
3621         /* Build the static tables for Verbs conversion. */
3622         mlx5_set_ptype_table();
3623         mlx5_set_cksum_table();
3624         mlx5_set_swp_types_table();
3625         if (mlx5_glue)
3626                 rte_pci_register(&mlx5_driver);
3627 }
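
/*
 * Usage note (not driver code): the NOTICE level registered above can be
 * overridden at runtime with the standard EAL option, e.g.
 * "--log-level=pmd.net.mlx5:debug".
 */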
3628
3629 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
3630 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
3631 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");