net/mlx5: convert control path memory to unified malloc
[dpdk.git] / drivers / net / mlx5 / mlx5.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <unistd.h>
8 #include <string.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <errno.h>
12 #include <net/if.h>
13 #include <sys/mman.h>
14 #include <linux/rtnetlink.h>
15
16 /* Verbs header. */
17 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic ignored "-Wpedantic"
20 #endif
21 #include <infiniband/verbs.h>
22 #ifdef PEDANTIC
23 #pragma GCC diagnostic error "-Wpedantic"
24 #endif
25
26 #include <rte_malloc.h>
27 #include <rte_ethdev_driver.h>
28 #include <rte_ethdev_pci.h>
29 #include <rte_pci.h>
30 #include <rte_bus_pci.h>
31 #include <rte_common.h>
32 #include <rte_kvargs.h>
33 #include <rte_rwlock.h>
34 #include <rte_spinlock.h>
35 #include <rte_string_fns.h>
36 #include <rte_alarm.h>
37
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41 #include <mlx5_common_os.h>
42 #include <mlx5_common_mp.h>
43 #include <mlx5_malloc.h>
44
45 #include "mlx5_defs.h"
46 #include "mlx5.h"
47 #include "mlx5_utils.h"
48 #include "mlx5_rxtx.h"
49 #include "mlx5_autoconf.h"
50 #include "mlx5_mr.h"
51 #include "mlx5_flow.h"
52 #include "rte_pmd_mlx5.h"
53
54 /* Device parameter to enable RX completion queue compression. */
55 #define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"
56
57 /* Device parameter to enable RX completion entry padding to 128B. */
58 #define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"
59
60 /* Device parameter to enable padding Rx packet to cacheline size. */
61 #define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"
62
63 /* Device parameter to enable Multi-Packet Rx queue. */
64 #define MLX5_RX_MPRQ_EN "mprq_en"
65
66 /* Device parameter to configure log 2 of the number of strides for MPRQ. */
67 #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"
68
69 /* Device parameter to configure log 2 of the stride size for MPRQ. */
70 #define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size"
71
72 /* Device parameter to limit the size of memcpy'd packet for MPRQ. */
73 #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"
74
75 /* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
76 #define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"
77
78 /* Device parameter to configure inline send. Deprecated, ignored.*/
79 #define MLX5_TXQ_INLINE "txq_inline"
80
81 /* Device parameter to limit packet size to inline with ordinary SEND. */
82 #define MLX5_TXQ_INLINE_MAX "txq_inline_max"
83
84 /* Device parameter to configure minimal data size to inline. */
85 #define MLX5_TXQ_INLINE_MIN "txq_inline_min"
86
87 /* Device parameter to limit packet size to inline with Enhanced MPW. */
88 #define MLX5_TXQ_INLINE_MPW "txq_inline_mpw"
89
90 /*
91  * Device parameter to configure the number of TX queues threshold for
92  * enabling inline send.
93  */
94 #define MLX5_TXQS_MIN_INLINE "txqs_min_inline"
95
96 /*
97  * Device parameter to configure the number of TX queues threshold for
98  * enabling vectorized Tx, deprecated, ignored (no vectorized Tx routines).
99  */
100 #define MLX5_TXQS_MAX_VEC "txqs_max_vec"
101
102 /* Device parameter to enable multi-packet send WQEs. */
103 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
104
105 /*
106  * Device parameter to force doorbell register mapping
107  * to non-cached region, eliminating the extra write memory barrier.
108  */
109 #define MLX5_TX_DB_NC "tx_db_nc"
110
111 /*
112  * Device parameter to include 2 dsegs in the title WQEBB.
113  * Deprecated, ignored.
114  */
115 #define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
116
117 /*
118  * Device parameter to limit the size of inlining packet.
119  * Deprecated, ignored.
120  */
121 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
122
123 /*
124  * Device parameter to enable Tx scheduling on timestamps
125  * and specify the packet pacing granularity in nanoseconds.
126  */
127 #define MLX5_TX_PP "tx_pp"
128
129 /*
130  * Device parameter to specify the skew in nanoseconds on the Tx datapath;
131  * it represents the time between the SQ start WQE processing and
132  * the actual packet data appearing on the wire.
133  */
134 #define MLX5_TX_SKEW "tx_skew"
135
136 /*
137  * Device parameter to enable hardware Tx vector.
138  * Deprecated, ignored (no vectorized Tx routines anymore).
139  */
140 #define MLX5_TX_VEC_EN "tx_vec_en"
141
142 /* Device parameter to enable hardware Rx vector. */
143 #define MLX5_RX_VEC_EN "rx_vec_en"
144
145 /* Allow L3 VXLAN flow creation. */
146 #define MLX5_L3_VXLAN_EN "l3_vxlan_en"
147
148 /* Activate DV E-Switch flow steering. */
149 #define MLX5_DV_ESW_EN "dv_esw_en"
150
151 /* Activate DV flow steering. */
152 #define MLX5_DV_FLOW_EN "dv_flow_en"
153
154 /* Enable extensive flow metadata support. */
155 #define MLX5_DV_XMETA_EN "dv_xmeta_en"
156
157 /* Device parameter to let the user manage the LACP traffic of the bonded device. */
158 #define MLX5_LACP_BY_USER "lacp_by_user"
159
160 /* Activate Netlink support in VF mode. */
161 #define MLX5_VF_NL_EN "vf_nl_en"
162
163 /* Enable extending memsegs when creating a MR. */
164 #define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"
165
166 /* Select port representors to instantiate. */
167 #define MLX5_REPRESENTOR "representor"
168
169 /* Device parameter to configure the maximum number of dump files per queue. */
170 #define MLX5_MAX_DUMP_FILES_NUM "max_dump_files_num"
171
172 /* Configure timeout of LRO session (in microseconds). */
173 #define MLX5_LRO_TIMEOUT_USEC "lro_timeout_usec"
174
175 /*
176  * Device parameter to configure the total data buffer size for a single
177  * hairpin queue (logarithm value).
178  */
179 #define MLX5_HP_BUF_SIZE "hp_buf_log_sz"
180
181 /* Flow memory reclaim mode. */
182 #define MLX5_RECLAIM_MEM "reclaim_mem_mode"
183
184 /* The default memory allocator used in PMD. */
185 #define MLX5_SYS_MEM_EN "sys_mem_en"
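
/*
 * The keys above are run-time device arguments. As an illustrative example
 * (the PCI address below is hypothetical), they are passed together with the
 * device on the EAL command line, e.g.:
 *
 *   -w 0000:03:00.0,rxq_cqe_comp_en=1,mprq_en=1,txq_inline_max=256
 *
 * Each key/value pair is parsed and validated by mlx5_args_check() below.
 */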
186
187 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
188
189 /* Shared memory between primary and secondary processes. */
190 struct mlx5_shared_data *mlx5_shared_data;
191
192 /* Spinlock for mlx5_shared_data allocation. */
193 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
194
195 /* Process local data for secondary processes. */
196 static struct mlx5_local_data mlx5_local_data;
197
198 static LIST_HEAD(, mlx5_dev_ctx_shared) mlx5_dev_ctx_list =
199                                                 LIST_HEAD_INITIALIZER();
200 static pthread_mutex_t mlx5_dev_ctx_list_mutex = PTHREAD_MUTEX_INITIALIZER;
201
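/*
 * Default per-resource indexed pool configuration used by the flow engine.
 * Entries are allocated in trunks of .trunk_size elements; .grow_trunk and
 * .grow_shift control how trunk capacity grows on resize, .release_mem_en
 * lets a trunk return its memory once all entries are freed, and
 * .malloc/.free route allocations through the unified mlx5 allocator.
 */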
202 static const struct mlx5_indexed_pool_config mlx5_ipool_cfg[] = {
203 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
204         {
205                 .size = sizeof(struct mlx5_flow_dv_encap_decap_resource),
206                 .trunk_size = 64,
207                 .grow_trunk = 3,
208                 .grow_shift = 2,
209                 .need_lock = 0,
210                 .release_mem_en = 1,
211                 .malloc = mlx5_malloc,
212                 .free = mlx5_free,
213                 .type = "mlx5_encap_decap_ipool",
214         },
215         {
216                 .size = sizeof(struct mlx5_flow_dv_push_vlan_action_resource),
217                 .trunk_size = 64,
218                 .grow_trunk = 3,
219                 .grow_shift = 2,
220                 .need_lock = 0,
221                 .release_mem_en = 1,
222                 .malloc = mlx5_malloc,
223                 .free = mlx5_free,
224                 .type = "mlx5_push_vlan_ipool",
225         },
226         {
227                 .size = sizeof(struct mlx5_flow_dv_tag_resource),
228                 .trunk_size = 64,
229                 .grow_trunk = 3,
230                 .grow_shift = 2,
231                 .need_lock = 0,
232                 .release_mem_en = 1,
233                 .malloc = mlx5_malloc,
234                 .free = mlx5_free,
235                 .type = "mlx5_tag_ipool",
236         },
237         {
238                 .size = sizeof(struct mlx5_flow_dv_port_id_action_resource),
239                 .trunk_size = 64,
240                 .grow_trunk = 3,
241                 .grow_shift = 2,
242                 .need_lock = 0,
243                 .release_mem_en = 1,
244                 .malloc = mlx5_malloc,
245                 .free = mlx5_free,
246                 .type = "mlx5_port_id_ipool",
247         },
248         {
249                 .size = sizeof(struct mlx5_flow_tbl_data_entry),
250                 .trunk_size = 64,
251                 .grow_trunk = 3,
252                 .grow_shift = 2,
253                 .need_lock = 0,
254                 .release_mem_en = 1,
255                 .malloc = mlx5_malloc,
256                 .free = mlx5_free,
257                 .type = "mlx5_jump_ipool",
258         },
259 #endif
260         {
261                 .size = sizeof(struct mlx5_flow_meter),
262                 .trunk_size = 64,
263                 .grow_trunk = 3,
264                 .grow_shift = 2,
265                 .need_lock = 0,
266                 .release_mem_en = 1,
267                 .malloc = mlx5_malloc,
268                 .free = mlx5_free,
269                 .type = "mlx5_meter_ipool",
270         },
271         {
272                 .size = sizeof(struct mlx5_flow_mreg_copy_resource),
273                 .trunk_size = 64,
274                 .grow_trunk = 3,
275                 .grow_shift = 2,
276                 .need_lock = 0,
277                 .release_mem_en = 1,
278                 .malloc = mlx5_malloc,
279                 .free = mlx5_free,
280                 .type = "mlx5_mcp_ipool",
281         },
282         {
283                 .size = (sizeof(struct mlx5_hrxq) + MLX5_RSS_HASH_KEY_LEN),
284                 .trunk_size = 64,
285                 .grow_trunk = 3,
286                 .grow_shift = 2,
287                 .need_lock = 0,
288                 .release_mem_en = 1,
289                 .malloc = mlx5_malloc,
290                 .free = mlx5_free,
291                 .type = "mlx5_hrxq_ipool",
292         },
293         {
294                 /*
295                  * MLX5_IPOOL_MLX5_FLOW size varies for DV and VERBS flows.
296                  * It is set at run time according to the PCI function configuration.
297                  */
298                 .size = 0,
299                 .trunk_size = 64,
300                 .grow_trunk = 3,
301                 .grow_shift = 2,
302                 .need_lock = 0,
303                 .release_mem_en = 1,
304                 .malloc = mlx5_malloc,
305                 .free = mlx5_free,
306                 .type = "mlx5_flow_handle_ipool",
307         },
308         {
309                 .size = sizeof(struct rte_flow),
310                 .trunk_size = 4096,
311                 .need_lock = 1,
312                 .release_mem_en = 1,
313                 .malloc = mlx5_malloc,
314                 .free = mlx5_free,
315                 .type = "rte_flow_ipool",
316         },
317 };
318
319
320 #define MLX5_FLOW_MIN_ID_POOL_SIZE 512
321 #define MLX5_ID_GENERATION_ARRAY_FACTOR 16
322
323 #define MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE 4096
324
325 /**
326  * Allocate ID pool structure.
327  *
328  * @param[in] max_id
329  *   The maximum ID that can be allocated from the pool.
330  *
331  * @return
332  *   Pointer to pool object, NULL value otherwise.
333  */
334 struct mlx5_flow_id_pool *
335 mlx5_flow_id_pool_alloc(uint32_t max_id)
336 {
337         struct mlx5_flow_id_pool *pool;
338         void *mem;
339
340         pool = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*pool),
341                            RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
342         if (!pool) {
343                 DRV_LOG(ERR, "can't allocate id pool");
344                 rte_errno  = ENOMEM;
345                 return NULL;
346         }
347         mem = mlx5_malloc(MLX5_MEM_ZERO,
348                           MLX5_FLOW_MIN_ID_POOL_SIZE * sizeof(uint32_t),
349                           RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
350         if (!mem) {
351                 DRV_LOG(ERR, "can't allocate mem for id pool");
352                 rte_errno  = ENOMEM;
353                 goto error;
354         }
355         pool->free_arr = mem;
356         pool->curr = pool->free_arr;
357         pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE;
358         pool->base_index = 0;
359         pool->max_id = max_id;
360         return pool;
361 error:
362         mlx5_free(pool);
363         return NULL;
364 }
365
366 /**
367  * Release ID pool structure.
368  *
369  * @param[in] pool
370  *   Pointer to flow id pool object to free.
371  */
372 void
373 mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool)
374 {
375         mlx5_free(pool->free_arr);
376         mlx5_free(pool);
377 }
378
379 /**
380  * Generate ID.
381  *
382  * @param[in] pool
383  *   Pointer to flow id pool.
384  * @param[out] id
385  *   The generated ID.
386  *
387  * @return
388  *   0 on success, error value otherwise.
389  */
390 uint32_t
391 mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id)
392 {
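        /*
         * Released IDs are reused first: the free array acts as a LIFO stack.
         * When it is empty, a fresh ID is produced by incrementing base_index
         * until max_id is reached.
         */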
393         if (pool->curr == pool->free_arr) {
394                 if (pool->base_index == pool->max_id) {
395                         rte_errno  = ENOMEM;
396                         DRV_LOG(ERR, "no free id");
397                         return -rte_errno;
398                 }
399                 *id = ++pool->base_index;
400                 return 0;
401         }
402         *id = *(--pool->curr);
403         return 0;
404 }
405
406 /**
407  * Release ID.
408  *
409  * @param[in] pool
410  *   Pointer to flow id pool.
411  * @param[in] id
412  *   The ID to release.
413  *
414  * @return
415  *   0 on success, error value otherwise.
416  */
417 uint32_t
418 mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, uint32_t id)
419 {
420         uint32_t size;
421         uint32_t size2;
422         void *mem;
423
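        /*
         * The free array is full of released IDs: grow it by a factor of
         * MLX5_ID_GENERATION_ARRAY_FACTOR and copy the existing entries
         * over before pushing the new one.
         */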
424         if (pool->curr == pool->last) {
425                 size = pool->curr - pool->free_arr;
426                 size2 = size * MLX5_ID_GENERATION_ARRAY_FACTOR;
427                 MLX5_ASSERT(size2 > size);
428                 mem = mlx5_malloc(0, size2 * sizeof(uint32_t), 0,
429                                   SOCKET_ID_ANY);
430                 if (!mem) {
431                         DRV_LOG(ERR, "can't allocate mem for id pool");
432                         rte_errno  = ENOMEM;
433                         return -rte_errno;
434                 }
435                 memcpy(mem, pool->free_arr, size * sizeof(uint32_t));
436                 mlx5_free(pool->free_arr);
437                 pool->free_arr = mem;
438                 pool->curr = pool->free_arr + size;
439                 pool->last = pool->free_arr + size2;
440         }
441         *pool->curr = id;
442         pool->curr++;
443         return 0;
444 }
445
446 /**
447  * Initialize the shared aging list information per port.
448  *
449  * @param[in] sh
450  *   Pointer to mlx5_dev_ctx_shared object.
451  */
452 static void
453 mlx5_flow_aging_init(struct mlx5_dev_ctx_shared *sh)
454 {
455         uint32_t i;
456         struct mlx5_age_info *age_info;
457
458         for (i = 0; i < sh->max_port; i++) {
459                 age_info = &sh->port[i].age_info;
460                 age_info->flags = 0;
461                 TAILQ_INIT(&age_info->aged_counters);
462                 rte_spinlock_init(&age_info->aged_sl);
463                 MLX5_AGE_SET(age_info, MLX5_AGE_TRIGGER);
464         }
465 }
466
467 /**
468  * Initialize the counters management structure.
469  *
470  * @param[in] sh
471  *   Pointer to mlx5_dev_ctx_shared object.
472  */
473 static void
474 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
475 {
476         int i;
477
478         memset(&sh->cmng, 0, sizeof(sh->cmng));
479         TAILQ_INIT(&sh->cmng.flow_counters);
480         for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
481                 sh->cmng.ccont[i].min_id = MLX5_CNT_BATCH_OFFSET;
482                 sh->cmng.ccont[i].max_id = -1;
483                 sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
484                 TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
485                 rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
486                 TAILQ_INIT(&sh->cmng.ccont[i].counters);
487                 rte_spinlock_init(&sh->cmng.ccont[i].csl);
488         }
489 }
490
491 /**
492  * Destroy all the resources allocated for a counter memory management.
493  *
494  * @param[in] mng
495  *   Pointer to the memory management structure.
496  */
497 static void
498 mlx5_flow_destroy_counter_stat_mem_mng(struct mlx5_counter_stats_mem_mng *mng)
499 {
500         uint8_t *mem = (uint8_t *)(uintptr_t)mng->raws[0].data;
501
502         LIST_REMOVE(mng, next);
503         claim_zero(mlx5_devx_cmd_destroy(mng->dm));
504         claim_zero(mlx5_glue->devx_umem_dereg(mng->umem));
505         mlx5_free(mem);
506 }
507
508 /**
509  * Close and release all the resources of the counters management.
510  *
511  * @param[in] sh
512  *   Pointer to mlx5_dev_ctx_shared object to free.
513  */
514 static void
515 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
516 {
517         struct mlx5_counter_stats_mem_mng *mng;
518         int i;
519         int j;
520         int retries = 1024;
521
522         rte_errno = 0;
523         while (--retries) {
524                 rte_eal_alarm_cancel(mlx5_flow_query_alarm, sh);
525                 if (rte_errno != EINPROGRESS)
526                         break;
527                 rte_pause();
528         }
529         for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
530                 struct mlx5_flow_counter_pool *pool;
531                 uint32_t batch = !!(i > 1);
532
533                 if (!sh->cmng.ccont[i].pools)
534                         continue;
535                 pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
536                 while (pool) {
537                         if (batch && pool->min_dcs)
538                                 claim_zero(mlx5_devx_cmd_destroy
539                                                                (pool->min_dcs));
540                         for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
541                                 if (MLX5_POOL_GET_CNT(pool, j)->action)
542                                         claim_zero
543                                          (mlx5_glue->destroy_flow_action
544                                           (MLX5_POOL_GET_CNT
545                                           (pool, j)->action));
546                                 if (!batch && MLX5_GET_POOL_CNT_EXT
547                                     (pool, j)->dcs)
548                                         claim_zero(mlx5_devx_cmd_destroy
549                                                    (MLX5_GET_POOL_CNT_EXT
550                                                     (pool, j)->dcs));
551                         }
552                         TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
553                         mlx5_free(pool);
554                         pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
555                 }
556                 mlx5_free(sh->cmng.ccont[i].pools);
557         }
558         mng = LIST_FIRST(&sh->cmng.mem_mngs);
559         while (mng) {
560                 mlx5_flow_destroy_counter_stat_mem_mng(mng);
561                 mng = LIST_FIRST(&sh->cmng.mem_mngs);
562         }
563         memset(&sh->cmng, 0, sizeof(sh->cmng));
564 }
565
566 /**
567  * Initialize the flow resources' indexed mempool.
568  *
569  * @param[in] sh
570  *   Pointer to mlx5_dev_ctx_shared object.
571  * @param[in] config
572  *   Pointer to user dev config.
573  */
574 static void
575 mlx5_flow_ipool_create(struct mlx5_dev_ctx_shared *sh,
576                        const struct mlx5_dev_config *config)
577 {
578         uint8_t i;
579         struct mlx5_indexed_pool_config cfg;
580
581         for (i = 0; i < MLX5_IPOOL_MAX; ++i) {
582                 cfg = mlx5_ipool_cfg[i];
583                 switch (i) {
584                 default:
585                         break;
586                 /*
587                  * Set MLX5_IPOOL_MLX5_FLOW ipool size
588                  * according to PCI function flow configuration.
589                  */
590                 case MLX5_IPOOL_MLX5_FLOW:
591                         cfg.size = config->dv_flow_en ?
592                                 sizeof(struct mlx5_flow_handle) :
593                                 MLX5_FLOW_HANDLE_VERBS_SIZE;
594                         break;
595                 }
596                 if (config->reclaim_mode)
597                         cfg.release_mem_en = 1;
598                 sh->ipool[i] = mlx5_ipool_create(&cfg);
599         }
600 }
601
602 /**
603  * Release the flow resources' indexed mempool.
604  *
605  * @param[in] sh
606  *   Pointer to mlx5_dev_ctx_shared object.
607  */
608 static void
609 mlx5_flow_ipool_destroy(struct mlx5_dev_ctx_shared *sh)
610 {
611         uint8_t i;
612
613         for (i = 0; i < MLX5_IPOOL_MAX; ++i)
614                 mlx5_ipool_destroy(sh->ipool[i]);
615 }
616
617 /*
618  * Check if dynamic flex parser for eCPRI already exists.
619  *
620  * @param dev
621  *   Pointer to Ethernet device structure.
622  *
623  * @return
624  *   true if exists, false otherwise.
625  */
626 bool
627 mlx5_flex_parser_ecpri_exist(struct rte_eth_dev *dev)
628 {
629         struct mlx5_priv *priv = dev->data->dev_private;
630         struct mlx5_flex_parser_profiles *prf =
631                                 &priv->sh->fp[MLX5_FLEX_PARSER_ECPRI_0];
632
633         return !!prf->obj;
634 }
635
636 /*
637  * Allocation of a flex parser for eCPRI. Once created, the parser-related
638  * resources are held until the device is closed.
639  *
640  * @param dev
641  *   Pointer to Ethernet device structure.
642  *
643  * @return
644  *   0 on success, a negative errno value otherwise and rte_errno is set.
645  */
646 int
647 mlx5_flex_parser_ecpri_alloc(struct rte_eth_dev *dev)
648 {
649         struct mlx5_priv *priv = dev->data->dev_private;
650         struct mlx5_flex_parser_profiles *prf =
651                                 &priv->sh->fp[MLX5_FLEX_PARSER_ECPRI_0];
652         struct mlx5_devx_graph_node_attr node = {
653                 .modify_field_select = 0,
654         };
655         uint32_t ids[8];
656         int ret;
657
658         if (!priv->config.hca_attr.parse_graph_flex_node) {
659                 DRV_LOG(ERR, "Dynamic flex parser is not supported "
660                         "for device %s.", priv->dev_data->name);
661                 return -ENOTSUP;
662         }
663         node.header_length_mode = MLX5_GRAPH_NODE_LEN_FIXED;
664         /* 8 bytes now: 4B common header + 4B message body header. */
665         node.header_length_base_value = 0x8;
666         /* After MAC layer: Ether / VLAN. */
667         node.in[0].arc_parse_graph_node = MLX5_GRAPH_ARC_NODE_MAC;
668         /* The compared value in the L2 layer should be 0xAEFE (eCPRI Ethertype). */
669         node.in[0].compare_condition_value = RTE_ETHER_TYPE_ECPRI;
670         /* Sample #0: type in common header. */
671         node.sample[0].flow_match_sample_en = 1;
672         /* Fixed offset. */
673         node.sample[0].flow_match_sample_offset_mode = 0x0;
674         /* Only the 2nd byte will be used. */
675         node.sample[0].flow_match_sample_field_base_offset = 0x0;
676         /* Sample #1: message payload. */
677         node.sample[1].flow_match_sample_en = 1;
678         /* Fixed offset. */
679         node.sample[1].flow_match_sample_offset_mode = 0x0;
680         /*
681          * Only the first two bytes will be used right now; their offset
682          * starts right after the common header, which is one DW (u32) long.
683          */
684         node.sample[1].flow_match_sample_field_base_offset = sizeof(uint32_t);
685         prf->obj = mlx5_devx_cmd_create_flex_parser(priv->sh->ctx, &node);
686         if (!prf->obj) {
687                 DRV_LOG(ERR, "Failed to create flex parser node object.");
688                 return (rte_errno == 0) ? -ENODEV : -rte_errno;
689         }
690         prf->num = 2;
691         ret = mlx5_devx_cmd_query_parse_samples(prf->obj, ids, prf->num);
692         if (ret) {
693                 DRV_LOG(ERR, "Failed to query sample IDs.");
694                 return (rte_errno == 0) ? -ENODEV : -rte_errno;
695         }
696         prf->offset[0] = 0x0;
697         prf->offset[1] = sizeof(uint32_t);
698         prf->ids[0] = ids[0];
699         prf->ids[1] = ids[1];
700         return 0;
701 }
702
703 /*
704  * Destroy the flex parser node, including the parser itself, input / output
705  * arcs and DW samples. The resources can then be reused.
706  *
707  * @param dev
708  *   Pointer to Ethernet device structure.
709  */
710 static void
711 mlx5_flex_parser_ecpri_release(struct rte_eth_dev *dev)
712 {
713         struct mlx5_priv *priv = dev->data->dev_private;
714         struct mlx5_flex_parser_profiles *prf =
715                                 &priv->sh->fp[MLX5_FLEX_PARSER_ECPRI_0];
716
717         if (prf->obj)
718                 mlx5_devx_cmd_destroy(prf->obj);
719         prf->obj = NULL;
720 }
721
722 /**
723  * Allocate shared device context. If there is a multiport device, the
724  * master and representors will share this context; if there is a single
725  * port dedicated device, the context will be used only by the given
726  * port due to unification.
727  *
728  * The routine first searches the context list for the specified device name;
729  * if found, the shared context is assumed and its reference counter is
730  * incremented. If no context is found, a new one is created and initialized
731  * with the specified device context and parameters.
732  *
733  * @param[in] spawn
734  *   Pointer to the device attributes (name, port, etc).
735  * @param[in] config
736  *   Pointer to device configuration structure.
737  *
738  * @return
739  *   Pointer to mlx5_dev_ctx_shared object on success,
740  *   otherwise NULL and rte_errno is set.
741  */
742 struct mlx5_dev_ctx_shared *
743 mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
744                            const struct mlx5_dev_config *config)
745 {
746         struct mlx5_dev_ctx_shared *sh;
747         int err = 0;
748         uint32_t i;
749         struct mlx5_devx_tis_attr tis_attr = { 0 };
750
751         MLX5_ASSERT(spawn);
752         /* Secondary process should not create the shared context. */
753         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
754         pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
755         /* Search for IB context by device name. */
756         LIST_FOREACH(sh, &mlx5_dev_ctx_list, next) {
757                 if (!strcmp(sh->ibdev_name,
758                         mlx5_os_get_dev_device_name(spawn->phys_dev))) {
759                         sh->refcnt++;
760                         goto exit;
761                 }
762         }
763         /* No device found, we have to create new shared context. */
764         MLX5_ASSERT(spawn->max_port);
765         sh = rte_zmalloc("ethdev shared ib context",
766                          sizeof(struct mlx5_dev_ctx_shared) +
767                          spawn->max_port *
768                          sizeof(struct mlx5_dev_shared_port),
769                          RTE_CACHE_LINE_SIZE);
770         if (!sh) {
771                 DRV_LOG(ERR, "shared context allocation failure");
772                 rte_errno  = ENOMEM;
773                 goto exit;
774         }
775         err = mlx5_os_open_device(spawn, config, sh);
776         if (!sh->ctx)
777                 goto error;
778         err = mlx5_os_get_dev_attr(sh->ctx, &sh->device_attr);
779         if (err) {
780                 DRV_LOG(DEBUG, "mlx5_os_get_dev_attr() failed");
781                 goto error;
782         }
783         sh->refcnt = 1;
784         sh->max_port = spawn->max_port;
785         strncpy(sh->ibdev_name, mlx5_os_get_ctx_device_name(sh->ctx),
786                 sizeof(sh->ibdev_name) - 1);
787         strncpy(sh->ibdev_path, mlx5_os_get_ctx_device_path(sh->ctx),
788                 sizeof(sh->ibdev_path) - 1);
789         /*
790          * Setting port_id to the maximum disallowed value (RTE_MAX_ETHPORTS)
791          * means there is no interrupt subhandler installed for
792          * the given port index i.
793          */
794         for (i = 0; i < sh->max_port; i++) {
795                 sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
796                 sh->port[i].devx_ih_port_id = RTE_MAX_ETHPORTS;
797         }
798         sh->pd = mlx5_glue->alloc_pd(sh->ctx);
799         if (sh->pd == NULL) {
800                 DRV_LOG(ERR, "PD allocation failure");
801                 err = ENOMEM;
802                 goto error;
803         }
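        /*
         * When DevX is supported, create the objects shared by all ports on
         * this device: extract the PD number and create the Transport Domain,
         * TIS and Tx UAR required by DevX-based queue objects.
         */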
804         if (sh->devx) {
805                 err = mlx5_os_get_pdn(sh->pd, &sh->pdn);
806                 if (err) {
807                         DRV_LOG(ERR, "Fail to extract pdn from PD");
808                         goto error;
809                 }
810                 sh->td = mlx5_devx_cmd_create_td(sh->ctx);
811                 if (!sh->td) {
812                         DRV_LOG(ERR, "TD allocation failure");
813                         err = ENOMEM;
814                         goto error;
815                 }
816                 tis_attr.transport_domain = sh->td->id;
817                 sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
818                 if (!sh->tis) {
819                         DRV_LOG(ERR, "TIS allocation failure");
820                         err = ENOMEM;
821                         goto error;
822                 }
823                 sh->tx_uar = mlx5_glue->devx_alloc_uar(sh->ctx, 0);
824                 if (!sh->tx_uar) {
825                         DRV_LOG(ERR, "Failed to allocate DevX UAR.");
826                         err = ENOMEM;
827                         goto error;
828                 }
829         }
830         sh->flow_id_pool = mlx5_flow_id_pool_alloc
831                                         ((1 << HAIRPIN_FLOW_ID_BITS) - 1);
832         if (!sh->flow_id_pool) {
833                 DRV_LOG(ERR, "can't create flow id pool");
834                 err = ENOMEM;
835                 goto error;
836         }
837 #ifndef RTE_ARCH_64
838         /* Initialize UAR access locks for 32bit implementations. */
839         rte_spinlock_init(&sh->uar_lock_cq);
840         for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
841                 rte_spinlock_init(&sh->uar_lock[i]);
842 #endif
843         /*
844          * Once the device is added to the list of memory event
845          * callback, its global MR cache table cannot be expanded
846          * on the fly because of deadlock. If it overflows, lookup
847          * should be done by searching MR list linearly, which is slow.
848          *
849          * At this point the device is not added to the memory
850          * event list yet, context is just being created.
851          */
852         err = mlx5_mr_btree_init(&sh->share_cache.cache,
853                                  MLX5_MR_BTREE_CACHE_N * 2,
854                                  spawn->pci_dev->device.numa_node);
855         if (err) {
856                 err = rte_errno;
857                 goto error;
858         }
859         mlx5_os_set_reg_mr_cb(&sh->share_cache.reg_mr_cb,
860                               &sh->share_cache.dereg_mr_cb);
861         mlx5_os_dev_shared_handler_install(sh);
862         sh->cnt_id_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_DWORD);
863         if (!sh->cnt_id_tbl) {
864                 err = rte_errno;
865                 goto error;
866         }
867         mlx5_flow_aging_init(sh);
868         mlx5_flow_counters_mng_init(sh);
869         mlx5_flow_ipool_create(sh, config);
870         /* Add device to memory callback list. */
871         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
872         LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
873                          sh, mem_event_cb);
874         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
875         /* Add context to the global device list. */
876         LIST_INSERT_HEAD(&mlx5_dev_ctx_list, sh, next);
877 exit:
878         pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
879         return sh;
880 error:
881         pthread_mutex_destroy(&sh->txpp.mutex);
882         pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
883         MLX5_ASSERT(sh);
884         if (sh->cnt_id_tbl) {
885                 mlx5_l3t_destroy(sh->cnt_id_tbl);
886                 sh->cnt_id_tbl = NULL;
887         }
888         if (sh->tx_uar) {
889                 mlx5_glue->devx_free_uar(sh->tx_uar);
890                 sh->tx_uar = NULL;
891         }
892         if (sh->tis)
893                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
894         if (sh->td)
895                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
896         if (sh->pd)
897                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
898         if (sh->ctx)
899                 claim_zero(mlx5_glue->close_device(sh->ctx));
900         if (sh->flow_id_pool)
901                 mlx5_flow_id_pool_release(sh->flow_id_pool);
902         rte_free(sh);
903         MLX5_ASSERT(err > 0);
904         rte_errno = err;
905         return NULL;
906 }
907
908 /**
909  * Free shared IB device context. Decrement counter and if zero free
910  * all allocated resources and close handles.
911  *
912  * @param[in] sh
913  *   Pointer to mlx5_dev_ctx_shared object to free
914  */
915 void
916 mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
917 {
918         pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
919 #ifdef RTE_LIBRTE_MLX5_DEBUG
920         /* Check the object presence in the list. */
921         struct mlx5_dev_ctx_shared *lctx;
922
923         LIST_FOREACH(lctx, &mlx5_dev_ctx_list, next)
924                 if (lctx == sh)
925                         break;
926         MLX5_ASSERT(lctx);
927         if (lctx != sh) {
928                 DRV_LOG(ERR, "Freeing non-existing shared IB context");
929                 goto exit;
930         }
931 #endif
932         MLX5_ASSERT(sh);
933         MLX5_ASSERT(sh->refcnt);
934         /* Secondary process should not free the shared context. */
935         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
936         if (--sh->refcnt)
937                 goto exit;
938         /* Remove from memory callback device list. */
939         rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
940         LIST_REMOVE(sh, mem_event_cb);
941         rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
942         /* Release created Memory Regions. */
943         mlx5_mr_release_cache(&sh->share_cache);
944         /* Remove context from the global device list. */
945         LIST_REMOVE(sh, next);
946         /*
947          *  Ensure there is no async event handler installed.
948          *  Only primary process handles async device events.
949          */
950         mlx5_flow_counters_mng_close(sh);
951         mlx5_flow_ipool_destroy(sh);
952         mlx5_os_dev_shared_handler_uninstall(sh);
953         if (sh->cnt_id_tbl) {
954                 mlx5_l3t_destroy(sh->cnt_id_tbl);
955                 sh->cnt_id_tbl = NULL;
956         }
957         if (sh->tx_uar) {
958                 mlx5_glue->devx_free_uar(sh->tx_uar);
959                 sh->tx_uar = NULL;
960         }
961         if (sh->pd)
962                 claim_zero(mlx5_glue->dealloc_pd(sh->pd));
963         if (sh->tis)
964                 claim_zero(mlx5_devx_cmd_destroy(sh->tis));
965         if (sh->td)
966                 claim_zero(mlx5_devx_cmd_destroy(sh->td));
967         if (sh->ctx)
968                 claim_zero(mlx5_glue->close_device(sh->ctx));
969         if (sh->flow_id_pool)
970                 mlx5_flow_id_pool_release(sh->flow_id_pool);
971         pthread_mutex_destroy(&sh->txpp.mutex);
972         rte_free(sh);
973 exit:
974         pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
975 }
976
977 /**
978  * Destroy table hash list and all the root entries per domain.
979  *
980  * @param[in] priv
981  *   Pointer to the private device data structure.
982  */
983 void
984 mlx5_free_table_hash_list(struct mlx5_priv *priv)
985 {
986         struct mlx5_dev_ctx_shared *sh = priv->sh;
987         struct mlx5_flow_tbl_data_entry *tbl_data;
988         union mlx5_flow_tbl_key table_key = {
989                 {
990                         .table_id = 0,
991                         .reserved = 0,
992                         .domain = 0,
993                         .direction = 0,
994                 }
995         };
996         struct mlx5_hlist_entry *pos;
997
998         if (!sh->flow_tbls)
999                 return;
1000         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
1001         if (pos) {
1002                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
1003                                         entry);
1004                 MLX5_ASSERT(tbl_data);
1005                 mlx5_hlist_remove(sh->flow_tbls, pos);
1006                 mlx5_free(tbl_data);
1007         }
1008         table_key.direction = 1;
1009         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
1010         if (pos) {
1011                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
1012                                         entry);
1013                 MLX5_ASSERT(tbl_data);
1014                 mlx5_hlist_remove(sh->flow_tbls, pos);
1015                 mlx5_free(tbl_data);
1016         }
1017         table_key.direction = 0;
1018         table_key.domain = 1;
1019         pos = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
1020         if (pos) {
1021                 tbl_data = container_of(pos, struct mlx5_flow_tbl_data_entry,
1022                                         entry);
1023                 MLX5_ASSERT(tbl_data);
1024                 mlx5_hlist_remove(sh->flow_tbls, pos);
1025                 mlx5_free(tbl_data);
1026         }
1027         mlx5_hlist_destroy(sh->flow_tbls, NULL, NULL);
1028 }
1029
1030 /**
1031  * Initialize flow table hash list and create the root tables entry
1032  * for each domain.
1033  *
1034  * @param[in] priv
1035  *   Pointer to the private device data structure.
1036  *
1037  * @return
1038  *   Zero on success, positive error code otherwise.
1039  */
1040 int
1041 mlx5_alloc_table_hash_list(struct mlx5_priv *priv)
1042 {
1043         struct mlx5_dev_ctx_shared *sh = priv->sh;
1044         char s[MLX5_HLIST_NAMESIZE];
1045         int err = 0;
1046
1047         MLX5_ASSERT(sh);
1048         snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
1049         sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE);
1050         if (!sh->flow_tbls) {
1051                 DRV_LOG(ERR, "flow tables with hash creation failed.");
1052                 err = ENOMEM;
1053                 return err;
1054         }
1055 #ifndef HAVE_MLX5DV_DR
1056         /*
1057          * In case we do not have DR support, the zero tables should be created
1058          * because DV expects to see them even if they cannot be created by
1059          * RDMA-CORE.
1060          */
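        /*
         * Three root entries are pre-created below: ingress (direction 0),
         * egress (direction 1) and the transfer/FDB domain (domain 1).
         */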
1061         union mlx5_flow_tbl_key table_key = {
1062                 {
1063                         .table_id = 0,
1064                         .reserved = 0,
1065                         .domain = 0,
1066                         .direction = 0,
1067                 }
1068         };
1069         struct mlx5_flow_tbl_data_entry *tbl_data = mlx5_malloc(MLX5_MEM_ZERO,
1070                                                           sizeof(*tbl_data), 0,
1071                                                           SOCKET_ID_ANY);
1072
1073         if (!tbl_data) {
1074                 err = ENOMEM;
1075                 goto error;
1076         }
1077         tbl_data->entry.key = table_key.v64;
1078         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
1079         if (err)
1080                 goto error;
1081         rte_atomic32_init(&tbl_data->tbl.refcnt);
1082         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1083         table_key.direction = 1;
1084         tbl_data = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*tbl_data), 0,
1085                                SOCKET_ID_ANY);
1086         if (!tbl_data) {
1087                 err = ENOMEM;
1088                 goto error;
1089         }
1090         tbl_data->entry.key = table_key.v64;
1091         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
1092         if (err)
1093                 goto error;
1094         rte_atomic32_init(&tbl_data->tbl.refcnt);
1095         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1096         table_key.direction = 0;
1097         table_key.domain = 1;
1098         tbl_data = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*tbl_data), 0,
1099                                SOCKET_ID_ANY);
1100         if (!tbl_data) {
1101                 err = ENOMEM;
1102                 goto error;
1103         }
1104         tbl_data->entry.key = table_key.v64;
1105         err = mlx5_hlist_insert(sh->flow_tbls, &tbl_data->entry);
1106         if (err)
1107                 goto error;
1108         rte_atomic32_init(&tbl_data->tbl.refcnt);
1109         rte_atomic32_inc(&tbl_data->tbl.refcnt);
1110         return err;
1111 error:
1112         mlx5_free_table_hash_list(priv);
1113 #endif /* HAVE_MLX5DV_DR */
1114         return err;
1115 }
1116
1117 /**
1118  * Initialize shared data between primary and secondary process.
1119  *
1120  * A memzone is reserved by primary process and secondary processes attach to
1121  * the memzone.
1122  *
1123  * @return
1124  *   0 on success, a negative errno value otherwise and rte_errno is set.
1125  */
1126 static int
1127 mlx5_init_shared_data(void)
1128 {
1129         const struct rte_memzone *mz;
1130         int ret = 0;
1131
1132         rte_spinlock_lock(&mlx5_shared_data_lock);
1133         if (mlx5_shared_data == NULL) {
1134                 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1135                         /* Allocate shared memory. */
1136                         mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
1137                                                  sizeof(*mlx5_shared_data),
1138                                                  SOCKET_ID_ANY, 0);
1139                         if (mz == NULL) {
1140                                 DRV_LOG(ERR,
1141                                         "Cannot allocate mlx5 shared data");
1142                                 ret = -rte_errno;
1143                                 goto error;
1144                         }
1145                         mlx5_shared_data = mz->addr;
1146                         memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
1147                         rte_spinlock_init(&mlx5_shared_data->lock);
1148                 } else {
1149                         /* Lookup allocated shared memory. */
1150                         mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
1151                         if (mz == NULL) {
1152                                 DRV_LOG(ERR,
1153                                         "Cannot attach mlx5 shared data");
1154                                 ret = -rte_errno;
1155                                 goto error;
1156                         }
1157                         mlx5_shared_data = mz->addr;
1158                         memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
1159                 }
1160         }
1161 error:
1162         rte_spinlock_unlock(&mlx5_shared_data_lock);
1163         return ret;
1164 }
1165
1166 /**
1167  * Retrieve integer value from environment variable.
1168  *
1169  * @param[in] name
1170  *   Environment variable name.
1171  *
1172  * @return
1173  *   Integer value, 0 if the variable is not set.
1174  */
1175 int
1176 mlx5_getenv_int(const char *name)
1177 {
1178         const char *val = getenv(name);
1179
1180         if (val == NULL)
1181                 return 0;
1182         return atoi(val);
1183 }
1184
1185 /**
1186  * DPDK callback to add a UDP tunnel port.
1187  *
1188  * @param[in] dev
1189  *   A pointer to eth_dev
1190  * @param[in] udp_tunnel
1191  *   A pointer to udp tunnel
1192  *
1193  * @return
1194  *   0 on valid UDP ports and tunnels, -ENOTSUP otherwise.
1195  */
1196 int
1197 mlx5_udp_tunnel_port_add(struct rte_eth_dev *dev __rte_unused,
1198                          struct rte_eth_udp_tunnel *udp_tunnel)
1199 {
1200         MLX5_ASSERT(udp_tunnel != NULL);
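        /*
         * Only the IANA-assigned default ports are recognized:
         * 4789 for VXLAN and 4790 for VXLAN-GPE.
         */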
1201         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN &&
1202             udp_tunnel->udp_port == 4789)
1203                 return 0;
1204         if (udp_tunnel->prot_type == RTE_TUNNEL_TYPE_VXLAN_GPE &&
1205             udp_tunnel->udp_port == 4790)
1206                 return 0;
1207         return -ENOTSUP;
1208 }
1209
1210 /**
1211  * Initialize process private data structure.
1212  *
1213  * @param dev
1214  *   Pointer to Ethernet device structure.
1215  *
1216  * @return
1217  *   0 on success, a negative errno value otherwise and rte_errno is set.
1218  */
1219 int
1220 mlx5_proc_priv_init(struct rte_eth_dev *dev)
1221 {
1222         struct mlx5_priv *priv = dev->data->dev_private;
1223         struct mlx5_proc_priv *ppriv;
1224         size_t ppriv_size;
1225
1226         /*
1227          * UAR register table follows the process private structure. BlueFlame
1228          * registers for Tx queues are stored in the table.
1229          */
1230         ppriv_size =
1231                 sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
1232         ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
1233                                   RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1234         if (!ppriv) {
1235                 rte_errno = ENOMEM;
1236                 return -rte_errno;
1237         }
1238         ppriv->uar_table_sz = ppriv_size;
1239         dev->process_private = ppriv;
1240         return 0;
1241 }
1242
1243 /**
1244  * Un-initialize process private data structure.
1245  *
1246  * @param dev
1247  *   Pointer to Ethernet device structure.
1248  */
1249 static void
1250 mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
1251 {
1252         if (!dev->process_private)
1253                 return;
1254         rte_free(dev->process_private);
1255         dev->process_private = NULL;
1256 }
1257
1258 /**
1259  * DPDK callback to close the device.
1260  *
1261  * Destroy all queues and objects, free memory.
1262  *
1263  * @param dev
1264  *   Pointer to Ethernet device structure.
1265  */
1266 void
1267 mlx5_dev_close(struct rte_eth_dev *dev)
1268 {
1269         struct mlx5_priv *priv = dev->data->dev_private;
1270         unsigned int i;
1271         int ret;
1272
1273         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1274                 /* Check if process_private released. */
1275                 if (!dev->process_private)
1276                         return;
1277                 mlx5_tx_uar_uninit_secondary(dev);
1278                 mlx5_proc_priv_uninit(dev);
1279                 rte_eth_dev_release_port(dev);
1280                 return;
1281         }
1282         if (!priv->sh)
1283                 return;
1284         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
1285                 dev->data->port_id,
1286                 ((priv->sh->ctx != NULL) ?
1287                 mlx5_os_get_ctx_device_name(priv->sh->ctx) : ""));
1288         /*
1289          * If default mreg copy action is removed at the stop stage,
1290          * the search will return nothing and no further action will be taken.
1291          */
1292         mlx5_flow_stop_default(dev);
1293         mlx5_traffic_disable(dev);
1294         /*
1295          * If all the flows are already flushed in the device stop stage,
1296          * then this will return directly without any action.
1297          */
1298         mlx5_flow_list_flush(dev, &priv->flows, true);
1299         mlx5_flow_meter_flush(dev, NULL);
1300         /* Free the intermediate buffers for flow creation. */
1301         mlx5_flow_free_intermediate(dev);
1302         /* Prevent crashes when queues are still in use. */
1303         dev->rx_pkt_burst = removed_rx_burst;
1304         dev->tx_pkt_burst = removed_tx_burst;
1305         rte_wmb();
1306         /* Disable datapath on secondary process. */
1307         mlx5_mp_req_stop_rxtx(dev);
1308         /* Free the eCPRI flex parser resource. */
1309         mlx5_flex_parser_ecpri_release(dev);
1310         if (priv->rxqs != NULL) {
1311                 /* XXX race condition if mlx5_rx_burst() is still running. */
1312                 usleep(1000);
1313                 for (i = 0; (i != priv->rxqs_n); ++i)
1314                         mlx5_rxq_release(dev, i);
1315                 priv->rxqs_n = 0;
1316                 priv->rxqs = NULL;
1317         }
1318         if (priv->txqs != NULL) {
1319                 /* XXX race condition if mlx5_tx_burst() is still running. */
1320                 usleep(1000);
1321                 for (i = 0; (i != priv->txqs_n); ++i)
1322                         mlx5_txq_release(dev, i);
1323                 priv->txqs_n = 0;
1324                 priv->txqs = NULL;
1325         }
1326         mlx5_proc_priv_uninit(dev);
1327         if (priv->mreg_cp_tbl)
1328                 mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1329         mlx5_mprq_free_mp(dev);
1330         mlx5_os_free_shared_dr(priv);
1331         if (priv->rss_conf.rss_key != NULL)
1332                 mlx5_free(priv->rss_conf.rss_key);
1333         if (priv->reta_idx != NULL)
1334                 mlx5_free(priv->reta_idx);
1335         if (priv->config.vf)
1336                 mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
1337                                        dev->data->mac_addrs,
1338                                        MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
1339         if (priv->nl_socket_route >= 0)
1340                 close(priv->nl_socket_route);
1341         if (priv->nl_socket_rdma >= 0)
1342                 close(priv->nl_socket_rdma);
1343         if (priv->vmwa_context)
1344                 mlx5_vlan_vmwa_exit(priv->vmwa_context);
1345         ret = mlx5_hrxq_verify(dev);
1346         if (ret)
1347                 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
1348                         dev->data->port_id);
1349         ret = mlx5_ind_table_obj_verify(dev);
1350         if (ret)
1351                 DRV_LOG(WARNING, "port %u some indirection table still remain",
1352                         dev->data->port_id);
1353         ret = mlx5_rxq_obj_verify(dev);
1354         if (ret)
1355                 DRV_LOG(WARNING, "port %u some Rx queue objects still remain",
1356                         dev->data->port_id);
1357         ret = mlx5_rxq_verify(dev);
1358         if (ret)
1359                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
1360                         dev->data->port_id);
1361         ret = mlx5_txq_obj_verify(dev);
1362         if (ret)
1363                 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
1364                         dev->data->port_id);
1365         ret = mlx5_txq_verify(dev);
1366         if (ret)
1367                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
1368                         dev->data->port_id);
1369         ret = mlx5_flow_verify(dev);
1370         if (ret)
1371                 DRV_LOG(WARNING, "port %u some flows still remain",
1372                         dev->data->port_id);
1373         /*
1374          * Free the shared context in last turn, because the cleanup
1375          * routines above may use some shared fields, like
1376  * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
1377          * ifindex if Netlink fails.
1378          */
1379         mlx5_free_shared_dev_ctx(priv->sh);
1380         if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
1381                 unsigned int c = 0;
1382                 uint16_t port_id;
1383
1384                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1385                         struct mlx5_priv *opriv =
1386                                 rte_eth_devices[port_id].data->dev_private;
1387
1388                         if (!opriv ||
1389                             opriv->domain_id != priv->domain_id ||
1390                             &rte_eth_devices[port_id] == dev)
1391                                 continue;
1392                         ++c;
1393                         break;
1394                 }
1395                 if (!c)
1396                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1397         }
1398         memset(priv, 0, sizeof(*priv));
1399         priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
1400         /*
1401          * Reset mac_addrs to NULL such that it is not freed as part of
1402          * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
1403          * it is freed when dev_private is freed.
1404          */
1405         dev->data->mac_addrs = NULL;
1406 }
1407
1408 /**
1409  * Verify and store value for device argument.
1410  *
1411  * @param[in] key
1412  *   Key argument to verify.
1413  * @param[in] val
1414  *   Value associated with key.
1415  * @param opaque
1416  *   User data.
1417  *
1418  * @return
1419  *   0 on success, a negative errno value otherwise and rte_errno is set.
1420  */
1421 static int
1422 mlx5_args_check(const char *key, const char *val, void *opaque)
1423 {
1424         struct mlx5_dev_config *config = opaque;
1425         unsigned long mod;
1426         signed long tmp;
1427
1428         /* No-op, port representors are processed in mlx5_dev_spawn(). */
1429         if (!strcmp(MLX5_REPRESENTOR, key))
1430                 return 0;
1431         errno = 0;
1432         tmp = strtol(val, NULL, 0);
1433         if (errno) {
1434                 rte_errno = errno;
1435                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
1436                 return -rte_errno;
1437         }
1438         if (tmp < 0 && strcmp(MLX5_TX_PP, key) && strcmp(MLX5_TX_SKEW, key)) {
1439                 /* Negative values are acceptable for some keys only. */
1440                 rte_errno = EINVAL;
1441                 DRV_LOG(WARNING, "%s: invalid negative value \"%s\"", key, val);
1442                 return -rte_errno;
1443         }
1444         mod = tmp >= 0 ? tmp : -tmp;
1445         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
1446                 config->cqe_comp = !!tmp;
1447         } else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
1448                 config->cqe_pad = !!tmp;
1449         } else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
1450                 config->hw_padding = !!tmp;
1451         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
1452                 config->mprq.enabled = !!tmp;
1453         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
1454                 config->mprq.stride_num_n = tmp;
1455         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) {
1456                 config->mprq.stride_size_n = tmp;
1457         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
1458                 config->mprq.max_memcpy_len = tmp;
1459         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
1460                 config->mprq.min_rxqs_num = tmp;
1461         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
1462                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1463                                  " converted to txq_inline_max", key);
1464                 config->txq_inline_max = tmp;
1465         } else if (strcmp(MLX5_TXQ_INLINE_MAX, key) == 0) {
1466                 config->txq_inline_max = tmp;
1467         } else if (strcmp(MLX5_TXQ_INLINE_MIN, key) == 0) {
1468                 config->txq_inline_min = tmp;
1469         } else if (strcmp(MLX5_TXQ_INLINE_MPW, key) == 0) {
1470                 config->txq_inline_mpw = tmp;
1471         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
1472                 config->txqs_inline = tmp;
1473         } else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
1474                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1475         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
1476                 config->mps = !!tmp;
1477         } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
1478                 if (tmp != MLX5_TXDB_CACHED &&
1479                     tmp != MLX5_TXDB_NCACHED &&
1480                     tmp != MLX5_TXDB_HEURISTIC) {
1481                         DRV_LOG(ERR, "invalid Tx doorbell "
1482                                      "mapping parameter");
1483                         rte_errno = EINVAL;
1484                         return -rte_errno;
1485                 }
1486                 config->dbnc = tmp;
1487         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
1488                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1489         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
1490                 DRV_LOG(WARNING, "%s: deprecated parameter,"
1491                                  " converted to txq_inline_mpw", key);
1492                 config->txq_inline_mpw = tmp;
1493         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
1494                 DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
1495         } else if (strcmp(MLX5_TX_PP, key) == 0) {
1496                 if (!mod) {
1497                         DRV_LOG(ERR, "Zero Tx packet pacing parameter");
1498                         rte_errno = EINVAL;
1499                         return -rte_errno;
1500                 }
1501                 config->tx_pp = tmp;
1502         } else if (strcmp(MLX5_TX_SKEW, key) == 0) {
1503                 config->tx_skew = tmp;
1504         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
1505                 config->rx_vec_en = !!tmp;
1506         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
1507                 config->l3_vxlan_en = !!tmp;
1508         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
1509                 config->vf_nl_en = !!tmp;
1510         } else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
1511                 config->dv_esw_en = !!tmp;
1512         } else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
1513                 config->dv_flow_en = !!tmp;
1514         } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
1515                 if (tmp != MLX5_XMETA_MODE_LEGACY &&
1516                     tmp != MLX5_XMETA_MODE_META16 &&
1517                     tmp != MLX5_XMETA_MODE_META32) {
1518                         DRV_LOG(ERR, "invalid extensive "
1519                                      "metadata parameter");
1520                         rte_errno = EINVAL;
1521                         return -rte_errno;
1522                 }
1523                 config->dv_xmeta_en = tmp;
1524         } else if (strcmp(MLX5_LACP_BY_USER, key) == 0) {
1525                 config->lacp_by_user = !!tmp;
1526         } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
1527                 config->mr_ext_memseg_en = !!tmp;
1528         } else if (strcmp(MLX5_MAX_DUMP_FILES_NUM, key) == 0) {
1529                 config->max_dump_files_num = tmp;
1530         } else if (strcmp(MLX5_LRO_TIMEOUT_USEC, key) == 0) {
1531                 config->lro.timeout = tmp;
1532         } else if (strcmp(MLX5_CLASS_ARG_NAME, key) == 0) {
1533                 DRV_LOG(DEBUG, "class argument is %s.", val);
1534         } else if (strcmp(MLX5_HP_BUF_SIZE, key) == 0) {
1535                 config->log_hp_size = tmp;
1536         } else if (strcmp(MLX5_RECLAIM_MEM, key) == 0) {
1537                 if (tmp != MLX5_RCM_NONE &&
1538                     tmp != MLX5_RCM_LIGHT &&
1539                     tmp != MLX5_RCM_AGGR) {
1540                         DRV_LOG(ERR, "Unrecognized %s: \"%s\"", key, val);
1541                         rte_errno = EINVAL;
1542                         return -rte_errno;
1543                 }
1544                 config->reclaim_mode = tmp;
1545         } else if (strcmp(MLX5_SYS_MEM_EN, key) == 0) {
1546                 config->sys_mem_en = !!tmp;
1547         } else {
1548                 DRV_LOG(WARNING, "%s: unknown parameter", key);
1549                 rte_errno = EINVAL;
1550                 return -rte_errno;
1551         }
1552         return 0;
1553 }
1554
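/*
 * Editor's illustrative sketch (not part of the driver): the sign rule applied
 * by mlx5_args_check() above, shown in isolation. Values are parsed with
 * strtol() and negatives are rejected for every key except the Tx scheduling
 * ones (tx_pp, tx_skew). The helper name and key strings below are
 * hypothetical; <errno.h>, <stdlib.h> and <string.h> are already included by
 * this file.
 */
static int
example_check_sign(const char *key, const char *val)
{
	long tmp;

	errno = 0;
	tmp = strtol(val, NULL, 0);
	if (errno)
		return -errno;
	/* Negative values are meaningful only for tx_pp/tx_skew. */
	if (tmp < 0 && strcmp(key, "tx_pp") && strcmp(key, "tx_skew"))
		return -EINVAL;
	return 0;
}

/*
 * Examples: example_check_sign("txq_inline_max", "256") -> 0,
 *           example_check_sign("tx_skew", "-100")       -> 0,
 *           example_check_sign("rxq_cqe_comp_en", "-1") -> -EINVAL.
 */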
1555 /**
1556  * Parse device parameters.
1557  *
1558  * @param config
1559  *   Pointer to device configuration structure.
1560  * @param devargs
1561  *   Device arguments structure.
1562  *
1563  * @return
1564  *   0 on success, a negative errno value otherwise and rte_errno is set.
1565  */
1566 int
1567 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
1568 {
1569         const char **params = (const char *[]){
1570                 MLX5_RXQ_CQE_COMP_EN,
1571                 MLX5_RXQ_CQE_PAD_EN,
1572                 MLX5_RXQ_PKT_PAD_EN,
1573                 MLX5_RX_MPRQ_EN,
1574                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
1575                 MLX5_RX_MPRQ_LOG_STRIDE_SIZE,
1576                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
1577                 MLX5_RXQS_MIN_MPRQ,
1578                 MLX5_TXQ_INLINE,
1579                 MLX5_TXQ_INLINE_MIN,
1580                 MLX5_TXQ_INLINE_MAX,
1581                 MLX5_TXQ_INLINE_MPW,
1582                 MLX5_TXQS_MIN_INLINE,
1583                 MLX5_TXQS_MAX_VEC,
1584                 MLX5_TXQ_MPW_EN,
1585                 MLX5_TXQ_MPW_HDR_DSEG_EN,
1586                 MLX5_TXQ_MAX_INLINE_LEN,
1587                 MLX5_TX_DB_NC,
1588                 MLX5_TX_PP,
1589                 MLX5_TX_SKEW,
1590                 MLX5_TX_VEC_EN,
1591                 MLX5_RX_VEC_EN,
1592                 MLX5_L3_VXLAN_EN,
1593                 MLX5_VF_NL_EN,
1594                 MLX5_DV_ESW_EN,
1595                 MLX5_DV_FLOW_EN,
1596                 MLX5_DV_XMETA_EN,
1597                 MLX5_LACP_BY_USER,
1598                 MLX5_MR_EXT_MEMSEG_EN,
1599                 MLX5_REPRESENTOR,
1600                 MLX5_MAX_DUMP_FILES_NUM,
1601                 MLX5_LRO_TIMEOUT_USEC,
1602                 MLX5_CLASS_ARG_NAME,
1603                 MLX5_HP_BUF_SIZE,
1604                 MLX5_RECLAIM_MEM,
1605                 MLX5_SYS_MEM_EN,
1606                 NULL,
1607         };
1608         struct rte_kvargs *kvlist;
1609         int ret = 0;
1610         int i;
1611
1612         if (devargs == NULL)
1613                 return 0;
1614         /* The UGLY cast in the params declaration above is done to pass checkpatch. */
1615         kvlist = rte_kvargs_parse(devargs->args, params);
1616         if (kvlist == NULL) {
1617                 rte_errno = EINVAL;
1618                 return -rte_errno;
1619         }
1620         /* Process parameters. */
1621         for (i = 0; (params[i] != NULL); ++i) {
1622                 if (rte_kvargs_count(kvlist, params[i])) {
1623                         ret = rte_kvargs_process(kvlist, params[i],
1624                                                  mlx5_args_check, config);
1625                         if (ret) {
1626                                 rte_errno = EINVAL;
1627                                 rte_kvargs_free(kvlist);
1628                                 return -rte_errno;
1629                         }
1630                 }
1631         }
1632         rte_kvargs_free(kvlist);
1633         return 0;
1634 }
1635
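/*
 * Editor's usage sketch: how a "key=value,key=value" devargs string flows
 * through the rte_kvargs API, mirroring mlx5_args() above. The handler, key
 * list and devargs string here are hypothetical; rte_kvargs_parse(),
 * rte_kvargs_count(), rte_kvargs_process() and rte_kvargs_free() are the real
 * library calls used by the driver.
 */
static int
example_print_pair(const char *key, const char *val, void *opaque)
{
	RTE_SET_USED(opaque);
	DRV_LOG(DEBUG, "parsed %s = %s", key, val);
	return 0;
}

static int
example_parse_devargs(void)
{
	static const char *const keys[] = { "mprq_en", "rxqs_min_mprq", NULL };
	struct rte_kvargs *kvlist;
	int i, ret = 0;

	kvlist = rte_kvargs_parse("mprq_en=1,rxqs_min_mprq=2", keys);
	if (kvlist == NULL)
		return -EINVAL;
	for (i = 0; keys[i] != NULL; ++i)
		if (rte_kvargs_count(kvlist, keys[i]))
			ret |= rte_kvargs_process(kvlist, keys[i],
						  example_print_pair, NULL);
	rte_kvargs_free(kvlist);
	return ret;
}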
1636 /**
1637  * PMD global initialization.
1638  *
1639  * Independent of any individual device, this function initializes global
1640  * per-PMD data structures, distinguishing primary and secondary processes.
1641  * Hence, the initialization is performed once per process.
1642  *
1643  * @return
1644  *   0 on success, a negative errno value otherwise and rte_errno is set.
1645  */
1646 int
1647 mlx5_init_once(void)
1648 {
1649         struct mlx5_shared_data *sd;
1650         struct mlx5_local_data *ld = &mlx5_local_data;
1651         int ret = 0;
1652
1653         if (mlx5_init_shared_data())
1654                 return -rte_errno;
1655         sd = mlx5_shared_data;
1656         MLX5_ASSERT(sd);
1657         rte_spinlock_lock(&sd->lock);
1658         switch (rte_eal_process_type()) {
1659         case RTE_PROC_PRIMARY:
1660                 if (sd->init_done)
1661                         break;
1662                 LIST_INIT(&sd->mem_event_cb_list);
1663                 rte_rwlock_init(&sd->mem_event_rwlock);
1664                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
1665                                                 mlx5_mr_mem_event_cb, NULL);
1666                 ret = mlx5_mp_init_primary(MLX5_MP_NAME,
1667                                            mlx5_mp_primary_handle);
1668                 if (ret)
1669                         goto out;
1670                 sd->init_done = true;
1671                 break;
1672         case RTE_PROC_SECONDARY:
1673                 if (ld->init_done)
1674                         break;
1675                 ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
1676                                              mlx5_mp_secondary_handle);
1677                 if (ret)
1678                         goto out;
1679                 ++sd->secondary_cnt;
1680                 ld->init_done = true;
1681                 break;
1682         default:
1683                 break;
1684         }
1685 out:
1686         rte_spinlock_unlock(&sd->lock);
1687         return ret;
1688 }
1689
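/*
 * Editor's sketch of the generic once-per-process pattern used above
 * (structure and field names here are hypothetical): shared state is guarded
 * by a spinlock, a shared flag tracks primary-process-wide initialization and
 * a local flag tracks per-process initialization in secondary processes.
 */
struct example_shared {
	rte_spinlock_t lock;
	int init_done;		/* Set once by the primary process. */
};

struct example_local {
	int init_done;		/* Set once in each secondary process. */
};

static int
example_init_once(struct example_shared *sd, struct example_local *ld)
{
	int ret = 0;	/* Would hold IPC/init status in the real flow. */

	rte_spinlock_lock(&sd->lock);
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		if (!sd->init_done) {
			/* Register callbacks, set up IPC, ... */
			sd->init_done = 1;
		}
		break;
	case RTE_PROC_SECONDARY:
		if (!ld->init_done) {
			/* Attach to resources owned by the primary. */
			ld->init_done = 1;
		}
		break;
	default:
		break;
	}
	rte_spinlock_unlock(&sd->lock);
	return ret;
}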
1690 /**
1691  * Configures the minimal amount of data to inline into a WQE
1692  * while sending packets.
1693  *
1694  * - the txq_inline_min devarg has the highest priority, if this
1695  *   key is specified in devargs;
1696  * - otherwise, if DevX is enabled, the inline mode is queried from the
1697  *   device (HCA attributes and NIC vport context if needed);
1698  * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/ConnectX-4 Lx
1699  *   and none (0 bytes) for other NICs.
1700  *
1701  * @param spawn
1702  *   Verbs device parameters (name, port, switch_info) to spawn.
1703  * @param config
1704  *   Device configuration parameters.
1705  */
1706 void
1707 mlx5_set_min_inline(struct mlx5_dev_spawn_data *spawn,
1708                     struct mlx5_dev_config *config)
1709 {
1710         if (config->txq_inline_min != MLX5_ARG_UNSET) {
1711                 /* Application defines size of inlined data explicitly. */
1712                 switch (spawn->pci_dev->id.device_id) {
1713                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1714                 case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1715                         if (config->txq_inline_min <
1716                                        (int)MLX5_INLINE_HSIZE_L2) {
1717                                 DRV_LOG(DEBUG,
1718                                         "txq_inline_min aligned to minimal"
1719                                         " ConnectX-4 required value %d",
1720                                         (int)MLX5_INLINE_HSIZE_L2);
1721                                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1722                         }
1723                         break;
1724                 }
1725                 goto exit;
1726         }
1727         if (config->hca_attr.eth_net_offloads) {
1728                 /* We have DevX enabled, inline mode queried successfully. */
1729                 switch (config->hca_attr.wqe_inline_mode) {
1730                 case MLX5_CAP_INLINE_MODE_L2:
1731                         /* outer L2 header must be inlined. */
1732                         config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1733                         goto exit;
1734                 case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
1735                         /* No inline data are required by NIC. */
1736                         config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1737                         config->hw_vlan_insert =
1738                                 config->hca_attr.wqe_vlan_insert;
1739                         DRV_LOG(DEBUG, "Tx VLAN insertion is supported");
1740                         goto exit;
1741                 case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
1742                         /* inline mode is defined by NIC vport context. */
1743                         if (!config->hca_attr.eth_virt)
1744                                 break;
1745                         switch (config->hca_attr.vport_inline_mode) {
1746                         case MLX5_INLINE_MODE_NONE:
1747                                 config->txq_inline_min =
1748                                         MLX5_INLINE_HSIZE_NONE;
1749                                 goto exit;
1750                         case MLX5_INLINE_MODE_L2:
1751                                 config->txq_inline_min =
1752                                         MLX5_INLINE_HSIZE_L2;
1753                                 goto exit;
1754                         case MLX5_INLINE_MODE_IP:
1755                                 config->txq_inline_min =
1756                                         MLX5_INLINE_HSIZE_L3;
1757                                 goto exit;
1758                         case MLX5_INLINE_MODE_TCP_UDP:
1759                                 config->txq_inline_min =
1760                                         MLX5_INLINE_HSIZE_L4;
1761                                 goto exit;
1762                         case MLX5_INLINE_MODE_INNER_L2:
1763                                 config->txq_inline_min =
1764                                         MLX5_INLINE_HSIZE_INNER_L2;
1765                                 goto exit;
1766                         case MLX5_INLINE_MODE_INNER_IP:
1767                                 config->txq_inline_min =
1768                                         MLX5_INLINE_HSIZE_INNER_L3;
1769                                 goto exit;
1770                         case MLX5_INLINE_MODE_INNER_TCP_UDP:
1771                                 config->txq_inline_min =
1772                                         MLX5_INLINE_HSIZE_INNER_L4;
1773                                 goto exit;
1774                         }
1775                 }
1776         }
1777         /*
1778          * We get here if we are unable to deduce
1779          * inline data size with DevX. Try the PCI ID
1780          * to identify older NICs.
1781          */
1782         switch (spawn->pci_dev->id.device_id) {
1783         case PCI_DEVICE_ID_MELLANOX_CONNECTX4:
1784         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1785         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
1786         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
1787                 config->txq_inline_min = MLX5_INLINE_HSIZE_L2;
1788                 config->hw_vlan_insert = 0;
1789                 break;
1790         case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
1791         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
1792         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
1793         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
1794                 /*
1795                  * These NICs support VLAN insertion from WQE and
1796                  * report the wqe_vlan_insert flag. However, there is a known
1797                  * bug that may break PFC control, so the feature is disabled.
1798                  */
1799                 config->hw_vlan_insert = 0;
1800                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1801                 break;
1802         default:
1803                 config->txq_inline_min = MLX5_INLINE_HSIZE_NONE;
1804                 break;
1805         }
1806 exit:
1807         DRV_LOG(DEBUG, "min tx inline configured: %d", config->txq_inline_min);
1808 }
1809
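/*
 * Editor's note with a hedged example: because the txq_inline_min devarg takes
 * the highest precedence above, an application can force the minimal inline
 * size when attaching the device. The PCI address below is hypothetical;
 * rte_dev_probe() (declared in rte_dev.h) is the standard EAL hotplug call and
 * the string is equivalent to passing the same devargs on the EAL command
 * line.
 */
static int
example_probe_with_min_inline(void)
{
	return rte_dev_probe("0000:03:00.0,txq_inline_min=0");
}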
1810 /**
1811  * Configures the metadata mask fields in the shared context.
1812  *
1813  * @param [in] dev
1814  *   Pointer to Ethernet device.
1815  */
1816 void
1817 mlx5_set_metadata_mask(struct rte_eth_dev *dev)
1818 {
1819         struct mlx5_priv *priv = dev->data->dev_private;
1820         struct mlx5_dev_ctx_shared *sh = priv->sh;
1821         uint32_t meta, mark, reg_c0;
1822
1823         reg_c0 = ~priv->vport_meta_mask;
1824         switch (priv->config.dv_xmeta_en) {
1825         case MLX5_XMETA_MODE_LEGACY:
1826                 meta = UINT32_MAX;
1827                 mark = MLX5_FLOW_MARK_MASK;
1828                 break;
1829         case MLX5_XMETA_MODE_META16:
1830                 meta = reg_c0 >> rte_bsf32(reg_c0);
1831                 mark = MLX5_FLOW_MARK_MASK;
1832                 break;
1833         case MLX5_XMETA_MODE_META32:
1834                 meta = UINT32_MAX;
1835                 mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
1836                 break;
1837         default:
1838                 meta = 0;
1839                 mark = 0;
1840                 MLX5_ASSERT(false);
1841                 break;
1842         }
1843         if (sh->dv_mark_mask && sh->dv_mark_mask != mark)
1844                 DRV_LOG(WARNING, "metadata MARK mask mismatch %08X:%08X",
1845                                  sh->dv_mark_mask, mark);
1846         else
1847                 sh->dv_mark_mask = mark;
1848         if (sh->dv_meta_mask && sh->dv_meta_mask != meta)
1849                 DRV_LOG(WARNING, "metadata META mask mismatch %08X:%08X",
1850                                  sh->dv_meta_mask, meta);
1851         else
1852                 sh->dv_meta_mask = meta;
1853         if (sh->dv_regc0_mask && sh->dv_regc0_mask != reg_c0)
1854                 DRV_LOG(WARNING, "metadata reg_c0 mask mismatch %08X:%08X",
1855                                  sh->dv_regc0_mask, reg_c0);
1856         else
1857                 sh->dv_regc0_mask = reg_c0;
1858         DRV_LOG(DEBUG, "metadata mode %u", priv->config.dv_xmeta_en);
1859         DRV_LOG(DEBUG, "metadata MARK mask %08X", sh->dv_mark_mask);
1860         DRV_LOG(DEBUG, "metadata META mask %08X", sh->dv_meta_mask);
1861         DRV_LOG(DEBUG, "metadata reg_c0 mask %08X", sh->dv_regc0_mask);
1862 }
1863
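/*
 * Editor's worked example of the META16 computation above, with a hypothetical
 * vport mask: if the vport owns the low 16 bits of REG_C_0, the remaining bits
 * are shifted down to bit 0 to form the META mask. rte_bsf32() (from
 * rte_common.h, already included) returns the index of the least significant
 * set bit.
 */
static void
example_meta16_mask(void)
{
	uint32_t vport_meta_mask = 0x0000ffff;		/* Hypothetical. */
	uint32_t reg_c0 = ~vport_meta_mask;		/* 0xffff0000 left for META. */
	uint32_t meta = reg_c0 >> rte_bsf32(reg_c0);	/* Shift by 16 -> 0x0000ffff. */

	DRV_LOG(DEBUG, "example META mask %08X", meta);
}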
1864 int
1865 rte_pmd_mlx5_get_dyn_flag_names(char *names[], unsigned int n)
1866 {
1867         static const char *const dynf_names[] = {
1868                 RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
1869                 RTE_MBUF_DYNFLAG_METADATA_NAME,
1870                 RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME
1871         };
1872         unsigned int i;
1873
1874         if (n < RTE_DIM(dynf_names))
1875                 return -ENOMEM;
1876         for (i = 0; i < RTE_DIM(dynf_names); i++) {
1877                 if (names[i] == NULL)
1878                         return -EINVAL;
1879                 strcpy(names[i], dynf_names[i]);
1880         }
1881         return RTE_DIM(dynf_names);
1882 }
1883
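/*
 * Editor's usage sketch for the API above: the caller must pass an array of
 * pre-allocated, writable name buffers. RTE_MBUF_DYN_NAMESIZE (from
 * rte_mbuf_dyn.h) is assumed to be large enough for the three flag names
 * listed above; the helper name is hypothetical.
 */
static void
example_print_dyn_flags(void)
{
	char buf[3][RTE_MBUF_DYN_NAMESIZE];
	char *names[3] = { buf[0], buf[1], buf[2] };
	int i, n;

	n = rte_pmd_mlx5_get_dyn_flag_names(names, RTE_DIM(names));
	for (i = 0; i < n; i++)
		DRV_LOG(INFO, "dynamic flag name: %s", names[i]);
}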
1884 /**
1885  * Check sibling device configurations.
1886  *
1887  * Sibling devices sharing the same Infiniband device context should have
1888  * compatible configurations. This regards representors and bonding devices.
1889  *
1890  * @param priv
1891  *   Pointer to the private data of the device being spawned.
1892  * @param config
1893  *   Pointer to the device configuration to be checked against the sibling's.
1894  *
1895  * @return
1896  *   0 on success (compatible configuration), EINVAL otherwise and rte_errno is set.
1897  */
1898 int
1899 mlx5_dev_check_sibling_config(struct mlx5_priv *priv,
1900                               struct mlx5_dev_config *config)
1901 {
1902         struct mlx5_dev_ctx_shared *sh = priv->sh;
1903         struct mlx5_dev_config *sh_conf = NULL;
1904         uint16_t port_id;
1905
1906         MLX5_ASSERT(sh);
1907         /* Nothing to compare for the single/first device. */
1908         if (sh->refcnt == 1)
1909                 return 0;
1910         /* Find the device with shared context. */
1911         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
1912                 struct mlx5_priv *opriv =
1913                         rte_eth_devices[port_id].data->dev_private;
1914
1915                 if (opriv && opriv != priv && opriv->sh == sh) {
1916                         sh_conf = &opriv->config;
1917                         break;
1918                 }
1919         }
1920         if (!sh_conf)
1921                 return 0;
1922         if (sh_conf->dv_flow_en ^ config->dv_flow_en) {
1923                 DRV_LOG(ERR, "\"dv_flow_en\" configuration mismatch"
1924                              " for shared %s context", sh->ibdev_name);
1925                 rte_errno = EINVAL;
1926                 return rte_errno;
1927         }
1928         if (sh_conf->dv_xmeta_en ^ config->dv_xmeta_en) {
1929                 DRV_LOG(ERR, "\"dv_xmeta_en\" configuration mismatch"
1930                              " for shared %s context", sh->ibdev_name);
1931                 rte_errno = EINVAL;
1932                 return rte_errno;
1933         }
1934         return 0;
1935 }
1936
1937 /**
1938  * Look for the Ethernet device belonging to the mlx5 driver.
1939  *
1940  * @param[in] port_id
1941  *   port_id to start looking for device.
1942  * @param[in] pci_dev
1943  *   Pointer to the hint PCI device. While a device is being probed,
1944  *   its siblings (the master and preceding representors) might not
1945  *   have a driver assigned yet because mlx5_os_pci_probe() has not
1946  *   completed; in this case matching on the hint PCI device can be
1947  *   used to detect sibling devices.
1948  *
1949  * @return
1950  *   port_id of found device, RTE_MAX_ETHPORTS if not found.
1951  */
1952 uint16_t
1953 mlx5_eth_find_next(uint16_t port_id, struct rte_pci_device *pci_dev)
1954 {
1955         while (port_id < RTE_MAX_ETHPORTS) {
1956                 struct rte_eth_dev *dev = &rte_eth_devices[port_id];
1957
1958                 if (dev->state != RTE_ETH_DEV_UNUSED &&
1959                     dev->device &&
1960                     (dev->device == &pci_dev->device ||
1961                      (dev->device->driver &&
1962                      dev->device->driver->name &&
1963                      !strcmp(dev->device->driver->name, MLX5_DRIVER_NAME))))
1964                         break;
1965                 port_id++;
1966         }
1967         if (port_id >= RTE_MAX_ETHPORTS)
1968                 return RTE_MAX_ETHPORTS;
1969         return port_id;
1970 }
1971
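/*
 * Editor's iteration sketch: walking every port that belongs to the same mlx5
 * PCI device with the helper above; this is essentially what the
 * MLX5_ETH_FOREACH_DEV() macro expands to. The function name is hypothetical
 * and the pci_dev pointer is assumed valid.
 */
static void
example_visit_sibling_ports(struct rte_pci_device *pci_dev)
{
	uint16_t port_id;

	for (port_id = mlx5_eth_find_next(0, pci_dev);
	     port_id < RTE_MAX_ETHPORTS;
	     port_id = mlx5_eth_find_next(port_id + 1, pci_dev)) {
		/* rte_eth_devices[port_id] is an mlx5 port on this device. */
	}
}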
1972 /**
1973  * DPDK callback to remove a PCI device.
1974  *
1975  * This function removes all Ethernet devices belonging to a given PCI device.
1976  *
1977  * @param[in] pci_dev
1978  *   Pointer to the PCI device.
1979  *
1980  * @return
1981  *   0 on success, the function cannot fail.
1982  */
1983 static int
1984 mlx5_pci_remove(struct rte_pci_device *pci_dev)
1985 {
1986         uint16_t port_id;
1987
1988         RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) {
1989                 /*
1990                  * mlx5_dev_close() is not registered to secondary process,
1991                  * call the close function explicitly for secondary process.
1992                  */
1993                 if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1994                         mlx5_dev_close(&rte_eth_devices[port_id]);
1995                 else
1996                         rte_eth_dev_close(port_id);
1997         }
1998         return 0;
1999 }
2000
2001 static const struct rte_pci_id mlx5_pci_id_map[] = {
2002         {
2003                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2004                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
2005         },
2006         {
2007                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2008                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
2009         },
2010         {
2011                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2012                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
2013         },
2014         {
2015                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2016                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
2017         },
2018         {
2019                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2020                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
2021         },
2022         {
2023                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2024                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
2025         },
2026         {
2027                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2028                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
2029         },
2030         {
2031                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2032                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
2033         },
2034         {
2035                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2036                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
2037         },
2038         {
2039                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2040                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF)
2041         },
2042         {
2043                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2044                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
2045         },
2046         {
2047                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2048                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
2049         },
2050         {
2051                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2052                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
2053         },
2054         {
2055                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2056                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
2057         },
2058         {
2059                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2060                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
2061         },
2062         {
2063                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
2064                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6LX)
2065         },
2066         {
2067                 .vendor_id = 0
2068         }
2069 };
2070
2071 struct rte_pci_driver mlx5_driver = {
2072         .driver = {
2073                 .name = MLX5_DRIVER_NAME
2074         },
2075         .id_table = mlx5_pci_id_map,
2076         .probe = mlx5_os_pci_probe,
2077         .remove = mlx5_pci_remove,
2078         .dma_map = mlx5_dma_map,
2079         .dma_unmap = mlx5_dma_unmap,
2080         .drv_flags = PCI_DRV_FLAGS,
2081 };
2082
2083 /* Initialize driver log type. */
2084 RTE_LOG_REGISTER(mlx5_logtype, pmd.net.mlx5, NOTICE)
2085
2086 /**
2087  * Driver initialization routine.
2088  */
2089 RTE_INIT(rte_mlx5_pmd_init)
2090 {
2091         /* Build the static tables for Verbs conversion. */
2092         mlx5_set_ptype_table();
2093         mlx5_set_cksum_table();
2094         mlx5_set_swp_types_table();
2095         if (mlx5_glue)
2096                 rte_pci_register(&mlx5_driver);
2097 }
2098
2099 RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
2100 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
2101 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");