drivers/net/mlx5/mlx5.c (dpdk.git)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <unistd.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <errno.h>
12 #include <net/if.h>
13 #include <sys/mman.h>
14 #include <linux/rtnetlink.h>
15
16 /* Verbs header. */
17 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic ignored "-Wpedantic"
20 #endif
21 #include <infiniband/verbs.h>
22 #ifdef PEDANTIC
23 #pragma GCC diagnostic error "-Wpedantic"
24 #endif
25
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_ethdev_pci.h>
29 #include <rte_pci.h>
30 #include <rte_bus_pci.h>
31 #include <rte_common.h>
32 #include <rte_kvargs.h>
33 #include <rte_rwlock.h>
34 #include <rte_spinlock.h>
35 #include <rte_string_fns.h>
36 #include <rte_alarm.h>
37
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41 #include <mlx5_common_os.h>
42 #include <mlx5_common_mp.h>
43
44 #include "mlx5_defs.h"
45 #include "mlx5.h"
46 #include "mlx5_utils.h"
47 #include "mlx5_rxtx.h"
48 #include "mlx5_autoconf.h"
49 #include "mlx5_mr.h"
50 #include "mlx5_flow.h"
51 #include "rte_pmd_mlx5.h"
52
53 /* Device parameter to enable RX completion queue compression. */
54 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
55
56 /* Device parameter to enable RX completion entry padding to 128B. */
57 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
58
59 /* Device parameter to enable padding Rx packet to cacheline size. */
60 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
61
62 /* Device parameter to enable Multi-Packet Rx queue. */
63 #define MLX5_RX_MPRQ_EN "mprq_en"
64
65 /* Device parameter to configure log 2 of the number of strides for MPRQ. */
66 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
67
68 /* Device parameter to configure log 2 of the stride size for MPRQ. */
69 #define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"
70
71 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */
72 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
73
74 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
75 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
76
77 /* Device parameter to configure inline send. Deprecated, ignored. */
78 #define MLX5_TXQ_INLINE "txq_inline"
79
80 /* Device parameter to limit packet size to inline with ordinary SEND. */
81 #define MLX5_TXQ_INLINE_MAX "txq_inline_max"
82
83 /* Device parameter to configure minimal data size to inline. */
84 #define MLX5_TXQ_INLINE_MIN "txq_inline_min"
85
86 /* Device parameter to limit packet size to inline with Enhanced MPW. */
87 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
88
89 /*
90  * Device parameter to configure the number of TX queues threshold for
91  * enabling inline send.
92  */
93 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
94
95 /*
96  * Device parameter to configure the number of TX queues threshold for
97  * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
98  */
99 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
100
101 /* Device parameter to enable multi-packet send WQEs. */
102 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
103
104 /*
105  * Device parameter to force doorbell register mapping
106  * to non-cached region, eliminating the extra write memory barrier.
107  */
108 #define MLX5_TX_DB_NC "tx_db_nc"
109
110 /*
111  * Device parameter to include 2 dsegs in the title WQEBB.
112  * Deprecated, ignored.
113  */
114 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
115
116 /*
117  * Device parameter to limit the size of inlining packet.
118  * Deprecated, ignored.
119  */
120 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
121
122 /*
123  * Device parameter to enable hardware Tx vector.
124  * Deprecated, ignored (no vectorized Tx routines anymore).
125  */
126 #define MLX5_TX_VEC_EN "tx_vec_en"
127
128 /* Device parameter to enable hardware Rx vector. */
129 #define MLX5_RX_VEC_EN "rx_vec_en"
130
131 /* Allow L3 VXLAN flow creation. */
132 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
133
134 /* Activate DV E-Switch flow steering. */
135 #define MLX5_DV_ESW_EN "dv_esw_en"
136
137 /* Activate DV flow steering. */
138 #define MLX5_DV_FLOW_EN "dv_flow_en"
139
140 /* Enable extensive flow metadata support. */
141 #define MLX5_DV_XMETA_EN "dv_xmeta_en"
142
143 /* Device parameter to let the user manage the LACP traffic of a bonded device. */
144 #define MLX5_LACP_BY_USER "lacp_by_user"
145
146 /* Activate Netlink support in VF mode. */
147 #define MLX5_VF_NL_EN "vf_nl_en"
148
149 /* Enable extending memsegs when creating a MR. */
150 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
151
152 /* Select port representors to instantiate. */
153 #define MLX5_REPRESENTOR "representor"
154
155 /* Device parameter to configure the maximum number of dump files per queue. */
156 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
157
158 /* Configure timeout of LRO session (in microseconds). */
159 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
160
161 /*
162  * Device parameter to configure the total data buffer size for a single
163  * hairpin queue (logarithm value).
164  */
165 #define MLX5_HP_BUF_SIZE "hp_buf_log_sz"
166
167 /* Flow memory reclaim mode. */
168 #define MLX5_RECLAIM_MEM "reclaim_mem_mode"
169
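/*
 * Illustrative note (not part of the original file): the parameters above are
 * passed as comma-separated key=value pairs in the mlx5 device argument
 * string, e.g. with the EAL device whitelist option (the PCI address below is
 * hypothetical):
 *
 *   -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=256,dv_flow_en=1
 *
 * Each value is parsed by mlx5_args()/mlx5_args_check() below with
 * strtoul(val, NULL, 0), so decimal, octal and hexadecimal forms are accepted.
 */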
170 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
171
172 /* Shared memory between primary and secondary processes. */
173 struct mlx5_shared_data *mlx5_shared_data;
174
175 /* Spinlock for mlx5_shared_data allocation. */
176 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
177
178 /* Process local data for secondary processes. */
179 static struct mlx5_local_data mlx5_local_data;
180
181 static LIST_HEAD(, mlx5_dev_ctx_shared) mlx5_dev_ctx_list =
182                                                 LIST_HEAD_INITIALIZER();
183 static pthread_mutex_t mlx5_dev_ctx_list_mutex = PTHREAD_MUTEX_INITIALIZER;
184
185 static const struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
186 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
187         {
188                 .size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
189                 .trunk_size = 64,
190                 .grow_trunk = 3,
191                 .grow_shift = 2,
192                 .need_lock = 0,
193                 .release_mem_en = 1,
194                 .malloc = rte_malloc_socket,
195                 .free = rte_free,
196                 .type = "mlx5_encap_decap_ipool",
197         },
198         {
199                 .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
200                 .trunk_size = 64,
201                 .grow_trunk = 3,
202                 .grow_shift = 2,
203                 .need_lock = 0,
204                 .release_mem_en = 1,
205                 .malloc = rte_malloc_socket,
206                 .free = rte_free,
207                 .type = "mlx5_push_vlan_ipool",
208         },
209         {
210                 .size = sizeof(struct mlx5_flow_dv_tag_resource),
211                 .trunk_size = 64,
212                 .grow_trunk = 3,
213                 .grow_shift = 2,
214                 .need_lock = 0,
215                 .release_mem_en = 1,
216                 .malloc = rte_malloc_socket,
217                 .free = rte_free,
218                 .type = "mlx5_tag_ipool",
219         },
220         {
221                 .size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
222                 .trunk_size = 64,
223                 .grow_trunk = 3,
224                 .grow_shift = 2,
225                 .need_lock = 0,
226                 .release_mem_en = 1,
227                 .malloc = rte_malloc_socket,
228                 .free = rte_free,
229                 .type = "mlx5_port_id_ipool",
230         },
231         {
232                 .size = sizeof(struct mlx5_flow_tbl_data_entry),
233                 .trunk_size = 64,
234                 .grow_trunk = 3,
235                 .grow_shift = 2,
236                 .need_lock = 0,
237                 .release_mem_en = 1,
238                 .malloc = rte_malloc_socket,
239                 .free = rte_free,
240                 .type = "mlx5_jump_ipool",
241         },
242 #endif
243         {
244                 .size = sizeof(struct mlx5_flow_meter),
245                 .trunk_size = 64,
246                 .grow_trunk = 3,
247                 .grow_shift = 2,
248                 .need_lock = 0,
249                 .release_mem_en = 1,
250                 .malloc = rte_malloc_socket,
251                 .free = rte_free,
252                 .type = "mlx5_meter_ipool",
253         },
254         {
255                 .size = sizeof(struct mlx5_flow_mreg_copy_resource),
256                 .trunk_size = 64,
257                 .grow_trunk = 3,
258                 .grow_shift = 2,
259                 .need_lock = 0,
260                 .release_mem_en = 1,
261                 .malloc = rte_malloc_socket,
262                 .free = rte_free,
263                 .type = "mlx5_mcp_ipool",
264         },
265         {
266                 .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
267                 .trunk_size = 64,
268                 .grow_trunk = 3,
269                 .grow_shift = 2,
270                 .need_lock = 0,
271                 .release_mem_en = 1,
272                 .malloc = rte_malloc_socket,
273                 .free = rte_free,
274                 .type = "mlx5_hrxq_ipool",
275         },
276         {
277                 /*
278                  * MLX5_IPOOL_MLX5_FLOW size varies for DV and VERBS flows.
279                  * It is set at run time according to the PCI function configuration.
280                  */
281                 .size = 0,
282                 .trunk_size = 64,
283                 .grow_trunk = 3,
284                 .grow_shift = 2,
285                 .need_lock = 0,
286                 .release_mem_en = 1,
287                 .malloc = rte_malloc_socket,
288                 .free = rte_free,
289                 .type = "mlx5_flow_handle_ipool",
290         },
291         {
292                 .size = sizeof(struct rte_flow),
293                 .trunk_size = 4096,
294                 .need_lock = 1,
295                 .release_mem_en = 1,
296                 .malloc = rte_malloc_socket,
297                 .free = rte_free,
298                 .type = "rte_flow_ipool",
299         },
300 };
301
302
303 #define MLX5_FLOW_MIN_ID_POOL_SIZE 512
304 #define MLX5_ID_GENERATION_ARRAY_FACTOR 16
305
306 #define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
307
308 /**
309  * Allocate ID pool structure.
310  *
311  * @param[in] max_id
312  *   The maximum ID that can be allocated from the pool.
313  *
314  * @return
315  *   Pointer to pool object, NULL value otherwise.
316  */
317 struct mlx5_flow_id_pool *
318 mlx5_flow_id_pool_alloc(uint32_t max_id)
319 {
320         struct mlx5_flow_id_pool *pool;
321         void *mem;
322
323         pool = rte_zmalloc("id pool allocation", sizeof(*pool),
324                            RTE_CACHE_LINE_SIZE);
325         if (!pool) {
326                 DRV_LOG(ERR, "can't allocate id pool");
327                 rte_errno  = ENOMEM;
328                 return NULL;
329         }
330         mem = rte_zmalloc("", MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
331                           RTE_CACHE_LINE_SIZE);
332         if (!mem) {
333                 DRV_LOG(ERR, "can't allocate mem for id pool");
334                 rte_errno  = ENOMEM;
335                 goto error;
336         }
337         pool->free_arr = mem;
338         pool->curr = pool->free_arr;
339         pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
340         pool->base_index = 0;
341         pool->max_id = max_id;
342         return pool;
343 error:
344         rte_free(pool);
345         return NULL;
346 }
347
348 /**
349  * Release ID pool structure.
350  *
351  * @param[in] pool
352  *   Pointer to flow id pool object to free.
353  */
354 void
355 mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
356 {
357         rte_free(pool->free_arr);
358         rte_free(pool);
359 }
360
361 /**
362  * Generate ID.
363  *
364  * @param[in] pool
365  *   Pointer to flow id pool.
366  * @param[out] id
367  *   The generated ID.
368  *
369  * @return
370  *   0 on success, error value otherwise.
371  */
372 uint32_t
373 mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
374 {
375         if (pool->curr == pool->free_arr) {
376                 if (pool->base_index == pool->max_id) {
377                         rte_errno  = ENOMEM;
378                         DRV_LOG(ERR, "no free id");
379                         return -rte_errno;
380                 }
381                 *id = ++pool->base_index;
382                 return 0;
383         }
384         *id = *(--pool->curr);
385         return 0;
386 }
387
388 /**
389  * Release ID.
390  *
391  * @param[in] pool
392  *   Pointer to flow id pool.
393  * @param[in] id
394  *   The ID to release.
395  *
396  * @return
397  *   0 on success, error value otherwise.
398  */
399 uint32_t
400 mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
401 {
402         uint32_t size;
403         uint32_t size2;
404         void *mem;
405
406         if (pool->curr == pool->last) {
407                 size = pool->curr - pool->free_arr;
408                 size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
409                 MLX5_ASSERT(size2 > size);
410                 mem = rte_malloc("", size2 * sizeof(uint32_t), 0);
411                 if (!mem) {
412                         DRV_LOG(ERR, "can't allocate mem for id pool");
413                         rte_errno  = ENOMEM;
414                         return -rte_errno;
415                 }
416                 memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
417                 rte_free(pool->free_arr);
418                 pool->free_arr = mem;
419                 pool->curr = pool->free_arr + size;
420                 pool->last = pool->free_arr + size2;
421         }
422         *pool->curr = id;
423         pool->curr++;
424         return 0;
425 }
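/*
 * Minimal usage sketch for the flow ID pool API above (illustration only,
 * not driver code):
 *
 *   struct mlx5_flow_id_pool *pool = mlx5_flow_id_pool_alloc(UINT32_MAX);
 *   uint32_t id;
 *
 *   if (pool != NULL && mlx5_flow_id_get(pool, &id) == 0) {
 *           ... use id ...
 *           mlx5_flow_id_release(pool, id);
 *   }
 *   if (pool != NULL)
 *           mlx5_flow_id_pool_release(pool);
 *
 * IDs start from 1 (base_index is pre-incremented) and released IDs are
 * recycled from the free array before new ones are generated.
 */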
426
427 /**
428  * Initialize the shared aging list information per port.
429  *
430  * @param[in] sh
431  *   Pointer to mlx5_dev_ctx_shared object.
432  */
433 static void
434 mlx5_flow_aging_init(struct mlx5_dev_ctx_shared *sh)
435 {
436         uint32_t i;
437         struct mlx5_age_info *age_info;
438
439         for (i = 0; i < sh->max_port; i++) {
440                 age_info = &sh->port[i].age_info;
441                 age_info->flags = 0;
442                 TAILQ_INIT(&age_info->aged_counters);
443                 rte_spinlock_init(&age_info->aged_sl);
444                 MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
445         }
446 }
447
448 /**
449  * Initialize the counters management structure.
450  *
451  * @param[in] sh
452  *   Pointer to mlx5_dev_ctx_shared object.
453  */
454 static void
455 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
456 {
457         int i;
458
459         memset(&sh->cmng, 0, sizeof(sh->cmng));
460         TAILQ_INIT(&sh->cmng.flow_counters);
461         for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
462                 sh->cmng.ccont[i].min_id = MLX5_CNT_BATCH_OFFSET;
463                 sh->cmng.ccont[i].max_id = -1;
464                 sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
465                 TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
466                 rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
467                 TAILQ_INIT(&sh->cmng.ccont[i].counters);
468                 rte_spinlock_init(&sh->cmng.ccont[i].csl);
469         }
470 }
471
472 /**
473  * Destroy all the resources allocated for a counter memory management.
474  *
475  * @param[in] mng
476  *   Pointer to the memory management structure.
477  */
478 static void
479 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
480 {
481         uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
482
483         LIST_REMOVE(mng, next);
484         claim_zero(mlx5_devx_cmd_destroy(mng->dm));
485         claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
486         rte_free(mem);
487 }
488
489 /**
490  * Close and release all the resources of the counters management.
491  *
492  * @param[in] sh
493  *   Pointer to mlx5_dev_ctx_shared object to free.
494  */
495 static void
496 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
497 {
498         struct mlx5_counter_stats_mem_mng *mng;
499         int i;
500         int j;
501         int retries = 1024;
502
503         rte_errno = 0;
504         while (--retries) {
505                 rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
506                 if (rte_errno != EINPROGRESS)
507                         break;
508                 rte_pause();
509         }
510         for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
511                 struct mlx5_flow_counter_pool *pool;
512                 uint32_t batch = !!(i > 1);
513
514                 if (!sh->cmng.ccont[i].pools)
515                         continue;
516                 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
517                 while (pool) {
518                         if (batch && pool->min_dcs)
519                                 claim_zero(mlx5_devx_cmd_destroy
520                                                                (pool->min_dcs));
521                         for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
522                                 if (MLX5_POOL_GET_CNT(pool, j)->action)
523                                         claim_zero
524                                          (mlx5_glue->destroy_flow_action
525                                           (MLX5_POOL_GET_CNT
526                                           (pool, j)->action));
527                                 if (!batch && MLX5_GET_POOL_CNT_EXT
528                                     (pool, j)->dcs)
529                                         claim_zero(mlx5_devx_cmd_destroy
530                                                    (MLX5_GET_POOL_CNT_EXT
531                                                     (pool, j)->dcs));
532                         }
533                         TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
534                         rte_free(pool);
535                         pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
536                 }
537                 rte_free(sh->cmng.ccont[i].pools);
538         }
539         mng = LIST_FIRST(&sh->cmng.mem_mngs);
540         while (mng) {
541                 mlx5_flow_destroy_counter_stat_mem_mng(mng);
542                 mng = LIST_FIRST(&sh->cmng.mem_mngs);
543         }
544         memset(&sh->cmng, 0, sizeof(sh->cmng));
545 }
546
547 /**
548  * Initialize the flow resources' indexed mempool.
549  *
550  * @param[in] sh
551  *   Pointer to mlx5_dev_ctx_shared object.
552  * @param[in] config
553  *   Pointer to user dev config.
554  */
555 static void
556 mlx5_flow_ipool_create(struct mlx5_dev_ctx_shared *sh,
557                        const struct mlx5_dev_config *config)
558 {
559         uint8_t i;
560         struct mlx5_indexed_pool_config cfg;
561
562         for (i = 0; i < MLX5_IPOOL_MAX; ++i) {
563                 cfg = mlx5_ipool_cfg[i];
564                 switch (i) {
565                 default:
566                         break;
567                 /*
568                  * Set MLX5_IPOOL_MLX5_FLOW ipool size
569                  * according to PCI function flow configuration.
570                  */
571                 case MLX5_IPOOL_MLX5_FLOW:
572                         cfg.size = config->dv_flow_en ?
573                                 sizeof(struct mlx5_flow_handle) :
574                                 MLX5_FLOW_HANDLE_VERBS_SIZE;
575                         break;
576                 }
577                 if (config->reclaim_mode)
578                         cfg.release_mem_en = 1;
579                 sh->ipool[i] = mlx5_ipool_create(&cfg);
580         }
581 }
582
583 /**
584  * Release the flow resources' indexed mempool.
585  *
586  * @param[in] sh
587  *   Pointer to mlx5_dev_ctx_shared object.
588  */
589 static void
590 mlx5_flow_ipool_destroy(struct mlx5_dev_ctx_shared *sh)
591 {
592         uint8_t i;
593
594         for (i = 0; i < MLX5_IPOOL_MAX; ++i)
595                 mlx5_ipool_destroy(sh->ipool[i]);
596 }
597
598 /**
599  * Allocate shared device context. If there is a multiport device the
600  * master and representors will share this context; if there is a single
601  * port dedicated device, the context will be used by the given port
602  * only, due to unification.
603  *
604  * The routine first searches for the context by the specified device name;
605  * if found, the shared context is assumed and its reference counter is
606  * incremented. If no context is found, a new one is created and initialized
607  * with the specified device context and parameters.
608  *
609  * @param[in] spawn
610  *   Pointer to the device attributes (name, port, etc).
611  * @param[in] config
612  *   Pointer to device configuration structure.
613  *
614  * @return
615  *   Pointer to mlx5_dev_ctx_shared object on success,
616  *   otherwise NULL and rte_errno is set.
617  */
618 struct mlx5_dev_ctx_shared *
619 mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
620                            const struct mlx5_dev_config *config)
621 {
622         struct mlx5_dev_ctx_shared *sh;
623         int err = 0;
624         uint32_t i;
625         struct mlx5_devx_tis_attr tis_attr = { 0 };
626
627         MLX5_ASSERT(spawn);
628         /* Secondary process should not create the shared context. */
629         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
630         pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
631         /* Search for IB context by device name. */
632         LIST_FOREACH(sh, &mlx5_dev_ctx_list, next) {
633                 if (!strcmp(sh->ibdev_name,
634                         mlx5_os_get_dev_device_name(spawn->phys_dev))) {
635                         sh->refcnt++;
636                         goto exit;
637                 }
638         }
639         /* No device found, we have to create new shared context. */
640         MLX5_ASSERT(spawn->max_port);
641         sh = rte_zmalloc("ethdev shared ib context",
642                          sizeof(struct mlx5_dev_ctx_shared) +
643                          spawn->max_port *
644                          sizeof(struct mlx5_dev_shared_port),
645                          RTE_CACHE_LINE_SIZE);
646         if (!sh) {
647                 DRV_LOG(ERR, "shared context allocation failure");
648                 rte_errno  = ENOMEM;
649                 goto exit;
650         }
651         err = mlx5_os_open_device(spawn, config, sh);
652         if (!sh->ctx)
653                 goto error;
654         err = mlx5_os_get_dev_attr(sh->ctx, &sh->device_attr);
655         if (err) {
656                 DRV_LOG(DEBUG, "mlx5_os_get_dev_attr() failed");
657                 goto error;
658         }
659         sh->refcnt = 1;
660         sh->max_port = spawn->max_port;
661         strncpy(sh->ibdev_name, mlx5_os_get_ctx_device_name(sh->ctx),
662                 sizeof(sh->ibdev_name) - 1);
663         strncpy(sh->ibdev_path, mlx5_os_get_ctx_device_path(sh->ctx),
664                 sizeof(sh->ibdev_path) - 1);
665         /*
666          * Setting port_id to the maximum disallowed value means
667          * there is no interrupt subhandler installed for
668          * the given port index i.
669          */
670         for (i = 0; i < sh->max_port; i++) {
671                 sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
672                 sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
673         }
674         sh->pd = mlx5_glue->alloc_pd(sh->ctx);
675         if (sh->pd == NULL) {
676                 DRV_LOG(ERR, "PD allocation failure");
677                 err = ENOMEM;
678                 goto error;
679         }
680         if (sh->devx) {
681                 err = mlx5_os_get_pdn(sh->pd, &sh->pdn);
682                 if (err) {
683                         DRV_LOG(ERR, "Fail to extract pdn from PD");
684                         goto error;
685                 }
686                 sh->td = mlx5_devx_cmd_create_td(sh->ctx);
687                 if (!sh->td) {
688                         DRV_LOG(ERR, "TD allocation failure");
689                         err = ENOMEM;
690                         goto error;
691                 }
692                 tis_attr.transport_domain = sh->td->id;
693                 sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
694                 if (!sh->tis) {
695                         DRV_LOG(ERR, "TIS allocation failure");
696                         err = ENOMEM;
697                         goto error;
698                 }
699         }
700         sh->flow_id_pool = mlx5_flow_id_pool_alloc
701                                         ((1 << HAIRPIN_FLOW_ID_BITS) - 1);
702         if (!sh->flow_id_pool) {
703                 DRV_LOG(ERR, "can't create flow id pool");
704                 err = ENOMEM;
705                 goto error;
706         }
707         /*
708          * Once the device is added to the list of memory event
709          * callback, its global MR cache table cannot be expanded
710          * on the fly because of deadlock. If it overflows, lookup
711          * should be done by searching MR list linearly, which is slow.
712          *
713          * At this point the device is not added to the memory
714          * event list yet, context is just being created.
715          */
716         err = mlx5_mr_btree_init(&sh->share_cache.cache,
717                                  MLX5_MR_BTREE_CACHE_N * 2,
718                                  spawn->pci_dev->device.numa_node);
719         if (err) {
720                 err = rte_errno;
721                 goto error;
722         }
723         mlx5_os_set_reg_mr_cb(&sh->share_cache.reg_mr_cb,
724                               &sh->share_cache.dereg_mr_cb);
725         mlx5_os_dev_shared_handler_install(sh);
726         sh->cnt_id_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_DWORD);
727         if (!sh->cnt_id_tbl) {
728                 err = rte_errno;
729                 goto error;
730         }
731         mlx5_flow_aging_init(sh);
732         mlx5_flow_counters_mng_init(sh);
733         mlx5_flow_ipool_create(sh, config);
734         /* Add device to memory callback list. */
735         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
736         LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
737                          sh, mem_event_cb);
738         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
739         /* Add context to the global device list. */
740         LIST_INSERT_HEAD(&mlx5_dev_ctx_list, sh, next);
741 exit:
742         pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
743         return sh;
744 error:
745         pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
746         MLX5_ASSERT(sh);
747         if (sh->cnt_id_tbl) {
748                 mlx5_l3t_destroy(sh->cnt_id_tbl);
749                 sh->cnt_id_tbl = NULL;
750         }
751         if (sh->tis)
752                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
753         if (sh->td)
754                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
755         if (sh->pd)
756                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
757         if (sh->ctx)
758                 claim_zero(mlx5_glue->close_device(sh->ctx));
759         if (sh->flow_id_pool)
760                 mlx5_flow_id_pool_release(sh->flow_id_pool);
761         rte_free(sh);
762         MLX5_ASSERT(err > 0);
763         rte_errno = err;
764         return NULL;
765 }
766
767 /**
768  * Free shared IB device context. Decrement the reference counter and,
769  * if it reaches zero, free all allocated resources and close handles.
770  *
771  * @param[in] sh
772  *   Pointer to mlx5_dev_ctx_shared object to free
773  */
774 void
775 mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
776 {
777         pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
778 #ifdef RTE_LIBRTE_MLX5_DEBUG
779         /* Check the object presence in the list. */
780         struct mlx5_dev_ctx_shared *lctx;
781
782         LIST_FOREACH(lctx, &mlx5_dev_ctx_list, next)
783                 if (lctx == sh)
784                         break;
785         MLX5_ASSERT(lctx);
786         if (lctx != sh) {
787                 DRV_LOG(ERR, "Freeing non-existing shared IB context");
788                 goto exit;
789         }
790 #endif
791         MLX5_ASSERT(sh);
792         MLX5_ASSERT(sh->refcnt);
793         /* Secondary process should not free the shared context. */
794         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
795         if (--sh->refcnt)
796                 goto exit;
797         /* Remove from memory callback device list. */
798         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
799         LIST_REMOVE(sh, mem_event_cb);
800         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
801         /* Release created Memory Regions. */
802         mlx5_mr_release_cache(&sh->share_cache);
803         /* Remove context from the global device list. */
804         LIST_REMOVE(sh, next);
805         /*
806          *  Ensure there is no async event handler installed.
807          *  Only primary process handles async device events.
808          */
809         mlx5_flow_counters_mng_close(sh);
810         mlx5_flow_ipool_destroy(sh);
811         mlx5_os_dev_shared_handler_uninstall(sh);
812         if (sh->cnt_id_tbl) {
813                 mlx5_l3t_destroy(sh->cnt_id_tbl);
814                 sh->cnt_id_tbl = NULL;
815         }
816         if (sh->pd)
817                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
818         if (sh->tis)
819                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
820         if (sh->td)
821                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
822         if (sh->ctx)
823                 claim_zero(mlx5_glue->close_device(sh->ctx));
824         if (sh->flow_id_pool)
825                 mlx5_flow_id_pool_release(sh->flow_id_pool);
826         rte_free(sh);
827 exit:
828         pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
829 }
830
831 /**
832  * Destroy table hash list and all the root entries per domain.
833  *
834  * @param[in] priv
835  *   Pointer to the private device data structure.
836  */
837 void
838 mlx5_free_table_hash_list(struct mlx5_priv *priv)
839 {
840         struct mlx5_dev_ctx_shared *sh = priv->sh;
841         struct mlx5_flow_tbl_data_entry *tbl_data;
842         union mlx5_flow_tbl_key table_key = {
843                 {
844                         .table_id = 0,
845                         .reserved = 0,
846                         .domain = 0,
847                         .direction = 0,
848                 }
849         };
850         struct mlx5_hlist_entry *pos;
851
852         if (!sh->flow_tbls)
853                 return;
854         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
855         if (pos) {
856                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
857                                         entry);
858                 MLX5_ASSERT(tbl_data);
859                 mlx5_hlist_remove(sh->flow_tbls, pos);
860                 rte_free(tbl_data);
861         }
862         table_key.direction = 1;
863         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
864         if (pos) {
865                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
866                                         entry);
867                 MLX5_ASSERT(tbl_data);
868                 mlx5_hlist_remove(sh->flow_tbls, pos);
869                 rte_free(tbl_data);
870         }
871         table_key.direction = 0;
872         table_key.domain = 1;
873         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
874         if (pos) {
875                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
876                                         entry);
877                 MLX5_ASSERT(tbl_data);
878                 mlx5_hlist_remove(sh->flow_tbls, pos);
879                 rte_free(tbl_data);
880         }
881         mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
882 }
883
884 /**
885  * Initialize flow table hash list and create the root table entry
886  * for each domain.
887  *
888  * @param[in] priv
889  *   Pointer to the private device data structure.
890  *
891  * @return
892  *   Zero on success, positive error code otherwise.
893  */
894 int
895 mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
896 {
897         struct mlx5_dev_ctx_shared *sh = priv->sh;
898         char s[MLX5_HLIST_NAMESIZE];
899         int err = 0;
900
901         MLX5_ASSERT(sh);
902         snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
903         sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
904         if (!sh->flow_tbls) {
905                 DRV_LOG(ERR, "flow tables with hash creation failed.");
906                 err = ENOMEM;
907                 return err;
908         }
909 #ifndef HAVE_MLX5DV_DR
910         /*
911          * In case we do not have DR support, the zero tables should be created
912          * because DV expects to see them even if they cannot be created by
913          * RDMA-CORE.
914          */
915         union mlx5_flow_tbl_key table_key = {
916                 {
917                         .table_id = 0,
918                         .reserved = 0,
919                         .domain = 0,
920                         .direction = 0,
921                 }
922         };
923         struct mlx5_flow_tbl_data_entry *tbl_data = rte_zmalloc(NULL,
924                                                           sizeof(*tbl_data), 0);
925
926         if (!tbl_data) {
927                 err = ENOMEM;
928                 goto error;
929         }
930         tbl_data->entry.key = table_key.v64;
931         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
932         if (err)
933                 goto error;
934         rte_atomic32_init(&tbl_data->tbl.refcnt);
935         rte_atomic32_inc(&tbl_data->tbl.refcnt);
936         table_key.direction = 1;
937         tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
938         if (!tbl_data) {
939                 err = ENOMEM;
940                 goto error;
941         }
942         tbl_data->entry.key = table_key.v64;
943         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
944         if (err)
945                 goto error;
946         rte_atomic32_init(&tbl_data->tbl.refcnt);
947         rte_atomic32_inc(&tbl_data->tbl.refcnt);
948         table_key.direction = 0;
949         table_key.domain = 1;
950         tbl_data = rte_zmalloc(NULL, sizeof(*tbl_data), 0);
951         if (!tbl_data) {
952                 err = ENOMEM;
953                 goto error;
954         }
955         tbl_data->entry.key = table_key.v64;
956         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
957         if (err)
958                 goto error;
959         rte_atomic32_init(&tbl_data->tbl.refcnt);
960         rte_atomic32_inc(&tbl_data->tbl.refcnt);
961         return err;
962 error:
963         mlx5_free_table_hash_list(priv);
964 #endif /* HAVE_MLX5DV_DR */
965         return err;
966 }
967
968 /**
969  * Initialize shared data between primary and secondary process.
970  *
971  * A memzone is reserved by primary process and secondary processes attach to
972  * the memzone.
973  *
974  * @return
975  *   0 on success, a negative errno value otherwise and rte_errno is set.
976  */
977 static int
978 mlx5_init_shared_data(void)
979 {
980         const struct rte_memzone *mz;
981         int ret = 0;
982
983         rte_spinlock_lock(&mlx5_shared_data_lock);
984         if (mlx5_shared_data == NULL) {
985                 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
986                         /* Allocate shared memory. */
987                         mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
988                                                  sizeof(*mlx5_shared_data),
989                                                  SOCKET_ID_ANY, 0);
990                         if (mz == NULL) {
991                                 DRV_LOG(ERR,
992                                         "Cannot allocate mlx5 shared data");
993                                 ret = -rte_errno;
994                                 goto error;
995                         }
996                         mlx5_shared_data = mz->addr;
997                         memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
998                         rte_spinlock_init(&mlx5_shared_data->lock);
999                 } else {
1000                         /* Lookup allocated shared memory. */
1001                         mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
1002                         if (mz == NULL) {
1003                                 DRV_LOG(ERR,
1004                                         "Cannot attach mlx5 shared data");
1005                                 ret = -rte_errno;
1006                                 goto error;
1007                         }
1008                         mlx5_shared_data = mz->addr;
1009                         memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
1010                 }
1011         }
1012 error:
1013         rte_spinlock_unlock(&mlx5_shared_data_lock);
1014         return ret;
1015 }
1016
1017 /**
1018  * Retrieve integer value from environment variable.
1019  *
1020  * @param[in] name
1021  *   Environment variable name.
1022  *
1023  * @return
1024  *   Integer value, 0 if the variable is not set.
1025  */
1026 int
1027 mlx5_getenv_int(const char *name)
1028 {
1029         const char *val = getenv(name);
1030
1031         if (val == NULL)
1032                 return 0;
1033         return atoi(val);
1034 }
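/*
 * Example (illustrative): mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING") returns
 * the integer value of that environment variable, or 0 when it is unset; the
 * variable name here is only an example of how this helper is called.
 */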
1035
1036 /**
1037  * DPDK callback to add a UDP tunnel port.
1038  *
1039  * @param[in] dev
1040  *   A pointer to eth_dev
1041  * @param[in] udp_tunnel
1042  *   A pointer to udp tunnel
1043  *
1044  * @return
1045  *   0 on valid udp ports and tunnels, -ENOTSUP otherwise.
1046  */
1047 int
1048 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
1049                          struct rte_eth_udp_tunnel *udp_tunnel)
1050 {
1051         MLX5_ASSERT(udp_tunnel != NULL);
1052         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
1053             udp_tunnel->udp_port == 4789)
1054                 return 0;
1055         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
1056             udp_tunnel->udp_port == 4790)
1057                 return 0;
1058         return -ENOTSUP;
1059 }
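/*
 * Caller-side sketch (illustration only): the callback above accepts only the
 * default IANA-assigned ports, so a request such as
 *
 *   struct rte_eth_udp_tunnel tunnel = {
 *           .udp_port = 4789,
 *           .prot_type = RTE_TUNNEL_TYPE_VXLAN,
 *   };
 *   ret = rte_eth_dev_udp_tunnel_port_add(port_id, &tunnel);
 *
 * succeeds, while any non-default UDP port is rejected with -ENOTSUP.
 */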
1060
1061 /**
1062  * Initialize process private data structure.
1063  *
1064  * @param dev
1065  *   Pointer to Ethernet device structure.
1066  *
1067  * @return
1068  *   0 on success, a negative errno value otherwise and rte_errno is set.
1069  */
1070 int
1071 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1072 {
1073         struct mlx5_priv *priv = dev->data->dev_private;
1074         struct mlx5_proc_priv *ppriv;
1075         size_t ppriv_size;
1076
1077         /*
1078          * UAR register table follows the process private structure. BlueFlame
1079          * registers for Tx queues are stored in the table.
1080          */
1081         ppriv_size =
1082                 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1083         ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1084                                   RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1085         if (!ppriv) {
1086                 rte_errno = ENOMEM;
1087                 return -rte_errno;
1088         }
1089         ppriv->uar_table_sz = ppriv_size;
1090         dev->process_private = ppriv;
1091         return 0;
1092 }
1093
1094 /**
1095  * Un-initialize process private data structure.
1096  *
1097  * @param dev
1098  *   Pointer to Ethernet device structure.
1099  */
1100 static void
1101 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1102 {
1103         if (!dev->process_private)
1104                 return;
1105         rte_free(dev->process_private);
1106         dev->process_private = NULL;
1107 }
1108
1109 /**
1110  * DPDK callback to close the device.
1111  *
1112  * Destroy all queues and objects, free memory.
1113  *
1114  * @param dev
1115  *   Pointer to Ethernet device structure.
1116  */
1117 void
1118 mlx5_dev_close(struct rte_eth_dev *dev)
1119 {
1120         struct mlx5_priv *priv = dev->data->dev_private;
1121         unsigned int i;
1122         int ret;
1123
1124         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1125                 /* Check if process_private released. */
1126                 if (!dev->process_private)
1127                         return;
1128                 mlx5_tx_uar_uninit_secondary(dev);
1129                 mlx5_proc_priv_uninit(dev);
1130                 rte_eth_dev_release_port(dev);
1131                 return;
1132         }
1133         if (!priv->sh)
1134                 return;
1135         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1136                 dev->data->port_id,
1137                 ((priv->sh->ctx != NULL) ?
1138                 mlx5_os_get_ctx_device_name(priv->sh->ctx) : ""));
1139         /*
1140          * If default mreg copy action is removed at the stop stage,
1141          * the search will return none and nothing will be done anymore.
1142          */
1143         mlx5_flow_stop_default(dev);
1144         mlx5_traffic_disable(dev);
1145         /*
1146          * If all the flows are already flushed in the device stop stage,
1147          * then this will return directly without any action.
1148          */
1149         mlx5_flow_list_flush(dev, &priv->flows, true);
1150         mlx5_flow_meter_flush(dev, NULL);
1151         /* Free the intermediate buffers for flow creation. */
1152         mlx5_flow_free_intermediate(dev);
1153         /* Prevent crashes when queues are still in use. */
1154         dev->rx_pkt_burst = removed_rx_burst;
1155         dev->tx_pkt_burst = removed_tx_burst;
1156         rte_wmb();
1157         /* Disable datapath on secondary process. */
1158         mlx5_mp_req_stop_rxtx(dev);
1159         if (priv->rxqs != NULL) {
1160                 /* XXX race condition if mlx5_rx_burst() is still running. */
1161                 usleep(1000);
1162                 for (i = 0; (i != priv->rxqs_n); ++i)
1163                         mlx5_rxq_release(dev, i);
1164                 priv->rxqs_n = 0;
1165                 priv->rxqs = NULL;
1166         }
1167         if (priv->txqs != NULL) {
1168                 /* XXX race condition if mlx5_tx_burst() is still running. */
1169                 usleep(1000);
1170                 for (i = 0; (i != priv->txqs_n); ++i)
1171                         mlx5_txq_release(dev, i);
1172                 priv->txqs_n = 0;
1173                 priv->txqs = NULL;
1174         }
1175         mlx5_proc_priv_uninit(dev);
1176         if (priv->mreg_cp_tbl)
1177                 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1178         mlx5_mprq_free_mp(dev);
1179         mlx5_os_free_shared_dr(priv);
1180         if (priv->rss_conf.rss_key != NULL)
1181                 rte_free(priv->rss_conf.rss_key);
1182         if (priv->reta_idx != NULL)
1183                 rte_free(priv->reta_idx);
1184         if (priv->config.vf)
1185                 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1186                                        dev->data->mac_addrs,
1187                                        MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1188         if (priv->nl_socket_route >= 0)
1189                 close(priv->nl_socket_route);
1190         if (priv->nl_socket_rdma >= 0)
1191                 close(priv->nl_socket_rdma);
1192         if (priv->vmwa_context)
1193                 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1194         ret = mlx5_hrxq_verify(dev);
1195         if (ret)
1196                 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1197                         dev->data->port_id);
1198         ret = mlx5_ind_table_obj_verify(dev);
1199         if (ret)
1200                 DRV_LOG(WARNING, "port %u some indirection table still remain",
1201                         dev->data->port_id);
1202         ret = mlx5_rxq_obj_verify(dev);
1203         if (ret)
1204                 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1205                         dev->data->port_id);
1206         ret = mlx5_rxq_verify(dev);
1207         if (ret)
1208                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1209                         dev->data->port_id);
1210         ret = mlx5_txq_obj_verify(dev);
1211         if (ret)
1212                 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1213                         dev->data->port_id);
1214         ret = mlx5_txq_verify(dev);
1215         if (ret)
1216                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1217                         dev->data->port_id);
1218         ret = mlx5_flow_verify(dev);
1219         if (ret)
1220                 DRV_LOG(WARNING, "port %u some flows still remain",
1221                         dev->data->port_id);
1222         /*
1223          * Free the shared context in last turn, because the cleanup
1224          * routines above may use some shared fields, like
1225  * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving the
1226          * ifindex if Netlink fails.
1227          */
1228         mlx5_free_shared_dev_ctx(priv->sh);
1229         if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1230                 unsigned int c = 0;
1231                 uint16_t port_id;
1232
1233                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1234                         struct mlx5_priv *opriv =
1235                                 rte_eth_devices[port_id].data->dev_private;
1236
1237                         if (!opriv ||
1238                             opriv->domain_id != priv->domain_id ||
1239                             &rte_eth_devices[port_id] == dev)
1240                                 continue;
1241                         ++c;
1242                         break;
1243                 }
1244                 if (!c)
1245                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1246         }
1247         memset(priv, 0, sizeof(*priv));
1248         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1249         /*
1250          * Reset mac_addrs to NULL such that it is not freed as part of
1251          * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1252          * it is freed when dev_private is freed.
1253          */
1254         dev->data->mac_addrs = NULL;
1255 }
1256
1257 /**
1258  * Verify and store value for device argument.
1259  *
1260  * @param[in] key
1261  *   Key argument to verify.
1262  * @param[in] val
1263  *   Value associated with key.
1264  * @param opaque
1265  *   User data.
1266  *
1267  * @return
1268  *   0 on success, a negative errno value otherwise and rte_errno is set.
1269  */
1270 static int
1271 mlx5_args_check(const char *key, const char *val, void *opaque)
1272 {
1273         struct mlx5_dev_config *config = opaque;
1274         unsigned long tmp;
1275
1276         /* No-op, port representors are processed in mlx5_dev_spawn(). */
1277         if (!strcmp(MLX5_REPRESENTOR, key))
1278                 return 0;
1279         errno = 0;
1280         tmp = strtoul(val, NULL, 0);
1281         if (errno) {
1282                 rte_errno = errno;
1283                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1284                 return -rte_errno;
1285         }
1286         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1287                 config->cqe_comp = !!tmp;
1288         } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1289                 config->cqe_pad = !!tmp;
1290         } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1291                 config->hw_padding = !!tmp;
1292         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1293                 config->mprq.enabled = !!tmp;
1294         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1295                 config->mprq.stride_num_n = tmp;
1296         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1297                 config->mprq.stride_size_n = tmp;
1298         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1299                 config->mprq.max_memcpy_len = tmp;
1300         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1301                 config->mprq.min_rxqs_num = tmp;
1302         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1303                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1304                                  " converted to txq_inline_max", key);
1305                 config->txq_inline_max = tmp;
1306         } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1307                 config->txq_inline_max = tmp;
1308         } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1309                 config->txq_inline_min = tmp;
1310         } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1311                 config->txq_inline_mpw = tmp;
1312         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1313                 config->txqs_inline = tmp;
1314         } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1315                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1316         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1317                 config->mps = !!tmp;
1318         } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1319                 if (tmp != MLX5_TXDB_CACHED &&
1320                     tmp != MLX5_TXDB_NCACHED &&
1321                     tmp != MLX5_TXDB_HEURISTIC) {
1322                         DRV_LOG(ERR, "invalid Tx doorbell "
1323                                      "mapping parameter");
1324                         rte_errno = EINVAL;
1325                         return -rte_errno;
1326                 }
1327                 config->dbnc = tmp;
1328         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1329                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1330         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1331                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1332                                  " converted to txq_inline_mpw", key);
1333                 config->txq_inline_mpw = tmp;
1334         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1335                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1336         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1337                 config->rx_vec_en = !!tmp;
1338         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1339                 config->l3_vxlan_en = !!tmp;
1340         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1341                 config->vf_nl_en = !!tmp;
1342         } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1343                 config->dv_esw_en = !!tmp;
1344         } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1345                 config->dv_flow_en = !!tmp;
1346         } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1347                 if (tmp != MLX5_XMETA_MODE_LEGACY &&
1348                     tmp != MLX5_XMETA_MODE_META16 &&
1349                     tmp != MLX5_XMETA_MODE_META32) {
1350                         DRV_LOG(ERR, "invalid extensive "
1351                                      "metadata parameter");
1352                         rte_errno = EINVAL;
1353                         return -rte_errno;
1354                 }
1355                 config->dv_xmeta_en = tmp;
1356         } else if (strcmp(MLX5_LACP_BY_USER, key) == 0) {
1357                 config->lacp_by_user = !!tmp;
1358         } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1359                 config->mr_ext_memseg_en = !!tmp;
1360         } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1361                 config->max_dump_files_num = tmp;
1362         } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1363                 config->lro.timeout = tmp;
1364         } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1365                 DRV_LOG(DEBUG, "class argument is %s.", val);
1366         } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1367                 config->log_hp_size = tmp;
1368         } else if (strcmp(MLX5_RECLAIM_MEM, key) == 0) {
1369                 if (tmp != MLX5_RCM_NONE &&
1370                     tmp != MLX5_RCM_LIGHT &&
1371                     tmp != MLX5_RCM_AGGR) {
1372                         DRV_LOG(ERR, "Unrecognize %s: \"%s\"", key, val);
1373                         rte_errno = EINVAL;
1374                         return -rte_errno;
1375                 }
1376                 config->reclaim_mode = tmp;
1377         } else {
1378                 DRV_LOG(WARNING, "%s: unknown parameter", key);
1379                 rte_errno = EINVAL;
1380                 return -rte_errno;
1381         }
1382         return 0;
1383 }
1384
1385 /**
1386  * Parse device parameters.
1387  *
1388  * @param config
1389  *   Pointer to device configuration structure.
1390  * @param devargs
1391  *   Device arguments structure.
1392  *
1393  * @return
1394  *   0 on success, a negative errno value otherwise and rte_errno is set.
1395  */
1396 int
1397 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1398 {
1399         const char **params = (const char *[]){
1400                 MLX5_RXQ_CQE_COMP_EN,
1401                 MLX5_RXQ_CQE_PAD_EN,
1402                 MLX5_RXQ_PKT_PAD_EN,
1403                 MLX5_RX_MPRQ_EN,
1404                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1405                 MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1406                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1407                 MLX5_RXQS_MIN_MPRQ,
1408                 MLX5_TXQ_INLINE,
1409                 MLX5_TXQ_INLINE_MIN,
1410                 MLX5_TXQ_INLINE_MAX,
1411                 MLX5_TXQ_INLINE_MPW,
1412                 MLX5_TXQS_MIN_INLINE,
1413                 MLX5_TXQS_MAX_VEC,
1414                 MLX5_TXQ_MPW_EN,
1415                 MLX5_TXQ_MPW_HDR_DSEG_EN,
1416                 MLX5_TXQ_MAX_INLINE_LEN,
1417                 MLX5_TX_DB_NC,
1418                 MLX5_TX_VEC_EN,
1419                 MLX5_RX_VEC_EN,
1420                 MLX5_L3_VXLAN_EN,
1421                 MLX5_VF_NL_EN,
1422                 MLX5_DV_ESW_EN,
1423                 MLX5_DV_FLOW_EN,
1424                 MLX5_DV_XMETA_EN,
1425                 MLX5_LACP_BY_USER,
1426                 MLX5_MR_EXT_MEMSEG_EN,
1427                 MLX5_REPRESENTOR,
1428                 MLX5_MAX_DUMP_FILES_NUM,
1429                 MLX5_LRO_TIMEOUT_USEC,
1430                 MLX5_CLASS_ARG_NAME,
1431                 MLX5_HP_BUF_SIZE,
1432                 MLX5_RECLAIM_MEM,
1433                 NULL,
1434         };
1435         struct rte_kvargs *kvlist;
1436         int ret = 0;
1437         int i;
1438
1439         if (devargs == NULL)
1440                 return 0;
1441         /* Following UGLY cast is done to pass checkpatch. */
1442         kvlist = rte_kvargs_parse(devargs->args, params);
1443         if (kvlist == NULL) {
1444                 rte_errno = EINVAL;
1445                 return -rte_errno;
1446         }
1447         /* Process parameters. */
1448         for (i = 0; (params[i] != NULL); ++i) {
1449                 if (rte_kvargs_count(kvlist, params[i])) {
1450                         ret = rte_kvargs_process(kvlist, params[i],
1451                                                  mlx5_args_check, config);
1452                         if (ret) {
1453                                 rte_errno = EINVAL;
1454                                 rte_kvargs_free(kvlist);
1455                                 return -rte_errno;
1456                         }
1457                 }
1458         }
1459         rte_kvargs_free(kvlist);
1460         return 0;
1461 }
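/*
 * Typical invocation sketch (illustration only, the actual call sites live in
 * the OS-specific spawn code): parse the probed device's key=value arguments
 * into a pre-initialized configuration before applying defaults.
 *
 *   struct mlx5_dev_config config = { .txq_inline_min = MLX5_ARG_UNSET };
 *
 *   if (mlx5_args(&config, pci_dev->device.devargs) != 0)
 *           return -rte_errno;
 */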
1462
1463 /**
1464  * PMD global initialization.
1465  *
1466  * Independent of any individual device, this function initializes global
1467  * per-PMD data structures, distinguishing between primary and secondary
1468  * processes. Hence, it is performed once per process.
1469  *
1470  * @return
1471  *   0 on success, a negative errno value otherwise and rte_errno is set.
1472  */
1473 int
1474 mlx5_init_once(void)
1475 {
1476         struct mlx5_shared_data *sd;
1477         struct mlx5_local_data *ld = &mlx5_local_data;
1478         int ret = 0;
1479
1480         if (mlx5_init_shared_data())
1481                 return -rte_errno;
1482         sd = mlx5_shared_data;
1483         MLX5_ASSERT(sd);
1484         rte_spinlock_lock(&sd->lock);
1485         switch (rte_eal_process_type()) {
1486         case RTE_PROC_PRIMARY:
1487                 if (sd->init_done)
1488                         break;
1489                 LIST_INIT(&sd->mem_event_cb_list);
1490                 rte_rwlock_init(&sd->mem_event_rwlock);
1491                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
1492                                                 mlx5_mr_mem_event_cb, NULL);
1493                 ret = mlx5_mp_init_primary(MLX5_MP_NAME,
1494                                            mlx5_mp_primary_handle);
1495                 if (ret)
1496                         goto out;
1497                 sd->init_done = true;
1498                 break;
1499         case RTE_PROC_SECONDARY:
1500                 if (ld->init_done)
1501                         break;
1502                 ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
1503                                              mlx5_mp_secondary_handle);
1504                 if (ret)
1505                         goto out;
1506                 ++sd->secondary_cnt;
1507                 ld->init_done = true;
1508                 break;
1509         default:
1510                 break;
1511         }
1512 out:
1513         rte_spinlock_unlock(&sd->lock);
1514         return ret;
1515 }
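/*
 * Illustrative note (application names are examples only): a primary
 * process such as testpmd takes the RTE_PROC_PRIMARY branch above,
 * registering the memory event callback and the primary-side
 * multi-process IPC, while a secondary process such as the pdump tool
 * attached to the same file prefix only initializes the secondary-side
 * IPC and increments secondary_cnt.
 */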
1516
1517 /**
1518  * Configures the minimal amount of data to inline into WQE
1519  * while sending packets.
1520  *
1521  * - the txq_inline_min devarg has the highest priority, if this
1522  *   key is specified in devargs;
1523  * - if DevX is enabled, the inline mode is queried from the
1524  *   device (HCA attributes and NIC vport context if needed);
1525  * - otherwise, L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx
1526  *   and none (0 bytes) for other NICs.
1527  *
1528  * @param spawn
1529  *   Verbs device parameters (name, port, switch_info) to spawn.
1530  * @param config
1531  *   Device configuration parameters.
1532  */
1533 void
1534 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
1535                     struct mlx5_dev_config *config)
1536 {
1537         if (config->txq_inline_min != MLX5_ARG_UNSET) {
1538                 /* Application defines size of inlined data explicitly. */
1539                 switch (spawn->pci_dev->id.device_id) {
1540                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1541                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1542                         if (config->txq_inline_min <
1543                                        (int)MLX5_INLINE_HSIZE_L2) {
1544                                 DRV_LOG(DEBUG,
1545                                         "txq_inline_min aligned to minimal"
1546                                         " value %d required by ConnectX-4",
1547                                         (int)MLX5_INLINE_HSIZE_L2);
1548                                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1549                         }
1550                         break;
1551                 }
1552                 goto exit;
1553         }
1554         if (config->hca_attr.eth_net_offloads) {
1555                 /* We have DevX enabled, inline mode queried successfully. */
1556                 switch (config->hca_attr.wqe_inline_mode) {
1557                 case MLX5_CAP_INLINE_MODE_L2:
1558                         /* outer L2 header must be inlined. */
1559                         config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1560                         goto exit;
1561                 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
1562                         /* No inline data are required by NIC. */
1563                         config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1564                         config->hw_vlan_insert =
1565                                 config->hca_attr.wqe_vlan_insert;
1566                         DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
1567                         goto exit;
1568                 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
1569                         /* inline mode is defined by NIC vport context. */
1570                         if (!config->hca_attr.eth_virt)
1571                                 break;
1572                         switch (config->hca_attr.vport_inline_mode) {
1573                         case MLX5_INLINE_MODE_NONE:
1574                                 config->txq_inline_min =
1575                                         MLX5_INLINE_HSIZE_NONE;
1576                                 goto exit;
1577                         case MLX5_INLINE_MODE_L2:
1578                                 config->txq_inline_min =
1579                                         MLX5_INLINE_HSIZE_L2;
1580                                 goto exit;
1581                         case MLX5_INLINE_MODE_IP:
1582                                 config->txq_inline_min =
1583                                         MLX5_INLINE_HSIZE_L3;
1584                                 goto exit;
1585                         case MLX5_INLINE_MODE_TCP_UDP:
1586                                 config->txq_inline_min =
1587                                         MLX5_INLINE_HSIZE_L4;
1588                                 goto exit;
1589                         case MLX5_INLINE_MODE_INNER_L2:
1590                                 config->txq_inline_min =
1591                                         MLX5_INLINE_HSIZE_INNER_L2;
1592                                 goto exit;
1593                         case MLX5_INLINE_MODE_INNER_IP:
1594                                 config->txq_inline_min =
1595                                         MLX5_INLINE_HSIZE_INNER_L3;
1596                                 goto exit;
1597                         case MLX5_INLINE_MODE_INNER_TCP_UDP:
1598                                 config->txq_inline_min =
1599                                         MLX5_INLINE_HSIZE_INNER_L4;
1600                                 goto exit;
1601                         }
1602                 }
1603         }
1604         /*
1605          * We get here if we are unable to deduce the
1606          * inline data size with DevX. Fall back to the
1607          * PCI ID to recognize older NICs.
1608          */
1609         switch (spawn->pci_dev->id.device_id) {
1610         case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1611         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1612         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
1613         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
1614                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1615                 config->hw_vlan_insert = 0;
1616                 break;
1617         case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
1618         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
1619         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
1620         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
1621                 /*
1622                  * These NICs support VLAN insertion from WQE and
1623                  * report the wqe_vlan_insert flag. However, this feature
1624                  * has a bug that may break PFC control, so disable it.
1625                  */
1626                 config->hw_vlan_insert = 0;
1627                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1628                 break;
1629         default:
1630                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1631                 break;
1632         }
1633 exit:
1634         DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
1635 }
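/*
 * Illustrative example (the devarg value is an assumption): with
 * "txq_inline_min=6" on a ConnectX-4 port, the first branch above raises
 * the value to MLX5_INLINE_HSIZE_L2 (the 18-byte L2 header), which this
 * code treats as the ConnectX-4 required minimum. Without the devarg, a
 * ConnectX-5 probed without DevX offload attributes falls through to the
 * PCI-ID switch and ends up with MLX5_INLINE_HSIZE_NONE and
 * hw_vlan_insert disabled.
 */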
1636
1637 /**
1638  * Configures the metadata mask fields in the shared context.
1639  *
1640  * @param [in] dev
1641  *   Pointer to Ethernet device.
1642  */
1643 void
1644 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
1645 {
1646         struct mlx5_priv *priv = dev->data->dev_private;
1647         struct mlx5_dev_ctx_shared *sh = priv->sh;
1648         uint32_t meta, mark, reg_c0;
1649
1650         reg_c0 = ~priv->vport_meta_mask;
1651         switch (priv->config.dv_xmeta_en) {
1652         case MLX5_XMETA_MODE_LEGACY:
1653                 meta = UINT32_MAX;
1654                 mark = MLX5_FLOW_MARK_MASK;
1655                 break;
1656         case MLX5_XMETA_MODE_META16:
1657                 meta = reg_c0 >> rte_bsf32(reg_c0);
1658                 mark = MLX5_FLOW_MARK_MASK;
1659                 break;
1660         case MLX5_XMETA_MODE_META32:
1661                 meta = UINT32_MAX;
1662                 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
1663                 break;
1664         default:
1665                 meta = 0;
1666                 mark = 0;
1667                 MLX5_ASSERT(false);
1668                 break;
1669         }
1670         if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
1671                 DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
1672                                  sh->dv_mark_mask, mark);
1673         else
1674                 sh->dv_mark_mask = mark;
1675         if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
1676                 DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
1677                                  sh->dv_meta_mask, meta);
1678         else
1679                 sh->dv_meta_mask = meta;
1680         if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
1681                 DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
1682                                  sh->dv_regc0_mask, reg_c0);
1683         else
1684                 sh->dv_regc0_mask = reg_c0;
1685         DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
1686         DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
1687         DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
1688         DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
1689 }
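/*
 * Worked example (the mask value is an assumption): with
 * MLX5_XMETA_MODE_META16 and priv->vport_meta_mask == 0xffff0000, the
 * E-Switch reserves the upper 16 bits of REG_C_0, so reg_c0 becomes
 * 0x0000ffff, rte_bsf32(reg_c0) is 0 and the usable META mask is 0xffff,
 * while MARK keeps the full MLX5_FLOW_MARK_MASK. The warnings above only
 * fire when a sibling port already installed a different mask on the
 * shared context.
 */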
1690
1691 int
1692 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
1693 {
1694         static const char *const dynf_names[] = {
1695                 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
1696                 RTE_MBUF_DYNFLAG_METADATA_NAME
1697         };
1698         unsigned int i;
1699
1700         if (n < RTE_DIM(dynf_names))
1701                 return -ENOMEM;
1702         for (i = 0; i < RTE_DIM(dynf_names); i++) {
1703                 if (names[i] == NULL)
1704                         return -EINVAL;
1705                 strcpy(names[i], dynf_names[i]);
1706         }
1707         return RTE_DIM(dynf_names);
1708 }
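/*
 * Illustrative application-side sketch (not part of the driver):
 * querying the dynamic mbuf flag names exported above and resolving
 * them to flag bits; buffer sizing assumes RTE_MBUF_DYN_NAMESIZE from
 * rte_mbuf_dyn.h.
 *
 *   char buf[2][RTE_MBUF_DYN_NAMESIZE];
 *   char *names[2] = { buf[0], buf[1] };
 *   int i, num = rte_pmd_mlx5_get_dyn_flag_names(names, 2);
 *
 *   for (i = 0; i < num; i++) {
 *       int bit = rte_mbuf_dynflag_lookup(names[i], NULL);
 *       if (bit >= 0)
 *           printf("%s -> bit %d\n", names[i], bit);
 *   }
 */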
1709
1710 /**
1711  * Check sibling device configurations.
1712  *
1713  * Sibling devices sharing the same Infiniband device context
1714  * (representors, bonding members) should have compatible configurations.
1715  *
1716  * @param priv
1717  *   Pointer to private device data structure.
1718  * @param config
1719  *   Pointer to the device configuration to be checked.
1720  *
1721  * @return
1722  *   0 on success, EINVAL otherwise and rte_errno is set.
1723  */
1724 int
1725 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
1726                               struct mlx5_dev_config *config)
1727 {
1728         struct mlx5_dev_ctx_shared *sh = priv->sh;
1729         struct mlx5_dev_config *sh_conf = NULL;
1730         uint16_t port_id;
1731
1732         MLX5_ASSERT(sh);
1733         /* Nothing to compare for the single/first device. */
1734         if (sh->refcnt == 1)
1735                 return 0;
1736         /* Find the device with shared context. */
1737         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1738                 struct mlx5_priv *opriv =
1739                         rte_eth_devices[port_id].data->dev_private;
1740
1741                 if (opriv && opriv != priv && opriv->sh == sh) {
1742                         sh_conf = &opriv->config;
1743                         break;
1744                 }
1745         }
1746         if (!sh_conf)
1747                 return 0;
1748         if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
1749                 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
1750                              " for shared %s context", sh->ibdev_name);
1751                 rte_errno = EINVAL;
1752                 return rte_errno;
1753         }
1754         if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
1755                 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
1756                              " for shared %s context", sh->ibdev_name);
1757                 rte_errno = EINVAL;
1758                 return rte_errno;
1759         }
1760         return 0;
1761 }
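/*
 * Illustrative example (the PCI address is an assumption): ports sharing
 * one device context, e.g. a master port and its representors probed
 * with "0000:03:00.0,representor=[0-3],dv_xmeta_en=1", must agree on
 * dv_flow_en and dv_xmeta_en; any mismatch makes the checks above fail
 * the probe with EINVAL.
 */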
1762
1763 /**
1764  * Look for the Ethernet device belonging to the mlx5 driver.
1765  *
1766  * @param[in] port_id
1767  *   port_id to start looking for device.
1768  * @param[in] pci_dev
1769  *   Pointer to the hint PCI device. When the device is being probed,
1770  *   its siblings (master and preceding representors) might not have
1771  *   a driver assigned yet, because mlx5_os_pci_probe() is not completed;
1772  *   in this case matching on the hint PCI device may be used to detect
1773  *   a sibling device.
1774  *
1775  * @return
1776  *   port_id of found device, RTE_MAX_ETHPORTS if not found.
1777  */
1778 uint16_t
1779 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
1780 {
1781         while (port_id < RTE_MAX_ETHPORTS) {
1782                 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
1783
1784                 if (dev->state != RTE_ETH_DEV_UNUSED &&
1785                     dev->device &&
1786                     (dev->device == &pci_dev->device ||
1787                      (dev->device->driver &&
1788                      dev->device->driver->name &&
1789                      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
1790                         break;
1791                 port_id++;
1792         }
1793         if (port_id >= RTE_MAX_ETHPORTS)
1794                 return RTE_MAX_ETHPORTS;
1795         return port_id;
1796 }
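/*
 * Illustrative usage sketch (assumption): callers iterate the sibling
 * ports of a PCI device with a loop equivalent to the
 * MLX5_ETH_FOREACH_DEV() macro used above:
 *
 *   uint16_t port_id;
 *
 *   for (port_id = mlx5_eth_find_next(0, pci_dev);
 *        port_id < RTE_MAX_ETHPORTS;
 *        port_id = mlx5_eth_find_next(port_id + 1, pci_dev))
 *       printf("mlx5 sibling port %u\n", port_id);
 */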
1797
1798 /**
1799  * DPDK callback to remove a PCI device.
1800  *
1801  * This function removes all Ethernet devices belonging to a given PCI device.
1802  *
1803  * @param[in] pci_dev
1804  *   Pointer to the PCI device.
1805  *
1806  * @return
1807  *   0 on success, the function cannot fail.
1808  */
1809 static int
1810 mlx5_pci_remove(struct rte_pci_device *pci_dev)
1811 {
1812         uint16_t port_id;
1813
1814         RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) {
1815                 /*
1816                  * mlx5_dev_close() is not registered to secondary process,
1817                  * call the close function explicitly for secondary process.
1818                  */
1819                 if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1820                         mlx5_dev_close(&rte_eth_devices[port_id]);
1821                 else
1822                         rte_eth_dev_close(port_id);
1823         }
1824         return 0;
1825 }
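/*
 * Illustrative note: this callback runs on hot-unplug or on an explicit
 * rte_dev_remove() of the PCI device; every ethdev port spawned from it
 * (master and representors alike) is closed, with the secondary-process
 * case handled explicitly above.
 */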
1826
1827 static const struct rte_pci_id mlx5_pci_id_map[] = {
1828         {
1829                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1830                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
1831         },
1832         {
1833                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1834                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
1835         },
1836         {
1837                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1838                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
1839         },
1840         {
1841                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1842                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
1843         },
1844         {
1845                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1846                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
1847         },
1848         {
1849                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1850                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
1851         },
1852         {
1853                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1854                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
1855         },
1856         {
1857                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1858                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
1859         },
1860         {
1861                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1862                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
1863         },
1864         {
1865                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1866                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
1867         },
1868         {
1869                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1870                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
1871         },
1872         {
1873                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1874                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
1875         },
1876         {
1877                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1878                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
1879         },
1880         {
1881                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1882                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
1883         },
1884         {
1885                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1886                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
1887         },
1888         {
1889                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1890                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6LX)
1891         },
1892         {
1893                 .vendor_id = 0
1894         }
1895 };
1896
1897 struct rte_pci_driver mlx5_driver = {
1898         .driver = {
1899                 .name = MLX5_DRIVER_NAME
1900         },
1901         .id_table = mlx5_pci_id_map,
1902         .probe = mlx5_os_pci_probe,
1903         .remove = mlx5_pci_remove,
1904         .dma_map = mlx5_dma_map,
1905         .dma_unmap = mlx5_dma_unmap,
1906         .drv_flags = PCI_DRV_FLAGS,
1907 };
1908
1909 /* Initialize driver log type. */
1910 RTE_LOG_REGISTER(mlx5_logtype, pmd.net.mlx5, NOTICE)
1911
1912 /**
1913  * Driver initialization routine.
1914  */
1915 RTE_INIT(rte_mlx5_pmd_init)
1916 {
1917         /* Build the static tables for Verbs conversion. */
1918         mlx5_set_ptype_table();
1919         mlx5_set_cksum_table();
1920         mlx5_set_swp_types_table();
1921         if (mlx5_glue)
1922                 rte_pci_register(&mlx5_driver);
1923 }
1924
1925 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
1926 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
1927 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");