net/mlx5: register memory callback only when probing
[dpdk.git] / drivers / net / mlx5 / mlx5.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <dlfcn.h>
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>
17
18 /* Verbs header. */
19 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
20 #ifdef PEDANTIC
21 #pragma GCC diagnostic ignored "-Wpedantic"
22 #endif
23 #include <infiniband/verbs.h>
24 #ifdef PEDANTIC
25 #pragma GCC diagnostic error "-Wpedantic"
26 #endif
27
28 #include <rte_malloc.h>
29 #include <rte_ethdev_driver.h>
30 #include <rte_ethdev_pci.h>
31 #include <rte_pci.h>
32 #include <rte_bus_pci.h>
33 #include <rte_common.h>
34 #include <rte_config.h>
35 #include <rte_eal_memconfig.h>
36 #include <rte_kvargs.h>
37 #include <rte_rwlock.h>
38 #include <rte_spinlock.h>
39
40 #include "mlx5.h"
41 #include "mlx5_utils.h"
42 #include "mlx5_rxtx.h"
43 #include "mlx5_autoconf.h"
44 #include "mlx5_defs.h"
45 #include "mlx5_glue.h"
46 #include "mlx5_mr.h"
47
/*
 * Device parameter (devargs) key names.
 *
 * Each key below is listed in the table built by mlx5_args() and matched
 * by string comparison in mlx5_args_check(), which stores the parsed
 * value into struct mlx5_dev_config.
 */

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. */
#define MLX5_TXQ_INLINE "txq_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/*
 * Device parameter to enable multi-packet send WQEs.
 * A zero value forces MPW off; a non-zero value keeps whatever mode is
 * already stored in the configuration (see mlx5_args_check()).
 */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/* Device parameter to include 2 dsegs in the title WQEBB. */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/* Device parameter to limit the size of inlining packet. */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/* Device parameter to enable hardware Tx vector. */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

/*
 * Fallback definitions of the MPW context flags, used only when the build
 * configuration did not define HAVE_IBV_MLX5_MOD_MPW.
 * NOTE(review): values presumably mirror the library's own flag bits --
 * confirm against the mlx5dv header in use.
 */
#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

/*
 * Fallback definition of the 128B CQE compression flag, used only when
 * HAVE_IBV_MLX5_MOD_CQE_128B_COMP is not defined by the build.
 */
#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif
101
102 static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
103
104 /* Shared memory between primary and secondary processes. */
105 struct mlx5_shared_data *mlx5_shared_data;
106
107 /* Spinlock for mlx5_shared_data allocation. */
108 static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
109
110 /** Driver-specific log messages type. */
111 int mlx5_logtype;
112
113 /**
114  * Prepare shared data between primary and secondary process.
115  */
116 static void
117 mlx5_prepare_shared_data(void)
118 {
119         const struct rte_memzone *mz;
120
121         rte_spinlock_lock(&mlx5_shared_data_lock);
122         if (mlx5_shared_data == NULL) {
123                 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
124                         /* Allocate shared memory. */
125                         mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
126                                                  sizeof(*mlx5_shared_data),
127                                                  SOCKET_ID_ANY, 0);
128                 } else {
129                         /* Lookup allocated shared memory. */
130                         mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
131                 }
132                 if (mz == NULL)
133                         rte_panic("Cannot allocate mlx5 shared data\n");
134                 mlx5_shared_data = mz->addr;
135                 /* Initialize shared data. */
136                 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
137                         LIST_INIT(&mlx5_shared_data->mem_event_cb_list);
138                         rte_rwlock_init(&mlx5_shared_data->mem_event_rwlock);
139                 }
140                 rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
141                                                 mlx5_mr_mem_event_cb, NULL);
142         }
143         rte_spinlock_unlock(&mlx5_shared_data_lock);
144 }
145
/**
 * Retrieve integer value from environment variable.
 *
 * The value is parsed as a decimal number; parsing stops at the first
 * character that is not part of the number, as atoi() would. Unlike
 * atoi(), values that do not fit in an int are clamped to INT_MIN or
 * INT_MAX instead of invoking undefined behavior.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set or holds no number.
 */
int
mlx5_getenv_int(const char *name)
{
        const char *val = getenv(name);
        long num;

        if (val == NULL)
                return 0;
        /* strtol() saturates at LONG_MIN/LONG_MAX on overflow. */
        num = strtol(val, NULL, 10);
        if (num > INT_MAX)
                return INT_MAX;
        if (num < INT_MIN)
                return INT_MIN;
        return (int)num;
}
164
165 /**
166  * Verbs callback to allocate a memory. This function should allocate the space
167  * according to the size provided residing inside a huge page.
168  * Please note that all allocation must respect the alignment from libmlx5
169  * (i.e. currently sysconf(_SC_PAGESIZE)).
170  *
171  * @param[in] size
172  *   The size in bytes of the memory to allocate.
173  * @param[in] data
174  *   A pointer to the callback data.
175  *
176  * @return
177  *   Allocated buffer, NULL otherwise and rte_errno is set.
178  */
179 static void *
180 mlx5_alloc_verbs_buf(size_t size, void *data)
181 {
182         struct priv *priv = data;
183         void *ret;
184         size_t alignment = sysconf(_SC_PAGESIZE);
185         unsigned int socket = SOCKET_ID_ANY;
186
187         if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
188                 const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
189
190                 socket = ctrl->socket;
191         } else if (priv->verbs_alloc_ctx.type ==
192                    MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
193                 const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;
194
195                 socket = ctrl->socket;
196         }
197         assert(data != NULL);
198         ret = rte_malloc_socket(__func__, size, alignment, socket);
199         if (!ret && size)
200                 rte_errno = ENOMEM;
201         return ret;
202 }
203
204 /**
205  * Verbs callback to free a memory.
206  *
207  * @param[in] ptr
208  *   A pointer to the memory to free.
209  * @param[in] data
210  *   A pointer to the callback data.
211  */
212 static void
213 mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
214 {
215         assert(data != NULL);
216         rte_free(ptr);
217 }
218
219 /**
220  * DPDK callback to close the device.
221  *
222  * Destroy all queues and objects, free memory.
223  *
224  * @param dev
225  *   Pointer to Ethernet device structure.
226  */
227 static void
228 mlx5_dev_close(struct rte_eth_dev *dev)
229 {
230         struct priv *priv = dev->data->dev_private;
231         unsigned int i;
232         int ret;
233
234         DRV_LOG(DEBUG, "port %u closing device \"%s\"",
235                 dev->data->port_id,
236                 ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
237         /* In case mlx5_dev_stop() has not been called. */
238         mlx5_dev_interrupt_handler_uninstall(dev);
239         mlx5_traffic_disable(dev);
240         /* Prevent crashes when queues are still in use. */
241         dev->rx_pkt_burst = removed_rx_burst;
242         dev->tx_pkt_burst = removed_tx_burst;
243         if (priv->rxqs != NULL) {
244                 /* XXX race condition if mlx5_rx_burst() is still running. */
245                 usleep(1000);
246                 for (i = 0; (i != priv->rxqs_n); ++i)
247                         mlx5_rxq_release(dev, i);
248                 priv->rxqs_n = 0;
249                 priv->rxqs = NULL;
250         }
251         if (priv->txqs != NULL) {
252                 /* XXX race condition if mlx5_tx_burst() is still running. */
253                 usleep(1000);
254                 for (i = 0; (i != priv->txqs_n); ++i)
255                         mlx5_txq_release(dev, i);
256                 priv->txqs_n = 0;
257                 priv->txqs = NULL;
258         }
259         mlx5_flow_delete_drop_queue(dev);
260         mlx5_mprq_free_mp(dev);
261         mlx5_mr_release(dev);
262         if (priv->pd != NULL) {
263                 assert(priv->ctx != NULL);
264                 claim_zero(mlx5_glue->dealloc_pd(priv->pd));
265                 claim_zero(mlx5_glue->close_device(priv->ctx));
266         } else
267                 assert(priv->ctx == NULL);
268         if (priv->rss_conf.rss_key != NULL)
269                 rte_free(priv->rss_conf.rss_key);
270         if (priv->reta_idx != NULL)
271                 rte_free(priv->reta_idx);
272         if (priv->primary_socket)
273                 mlx5_socket_uninit(dev);
274         if (priv->config.vf)
275                 mlx5_nl_mac_addr_flush(dev);
276         if (priv->nl_socket >= 0)
277                 close(priv->nl_socket);
278         ret = mlx5_hrxq_ibv_verify(dev);
279         if (ret)
280                 DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
281                         dev->data->port_id);
282         ret = mlx5_ind_table_ibv_verify(dev);
283         if (ret)
284                 DRV_LOG(WARNING, "port %u some indirection table still remain",
285                         dev->data->port_id);
286         ret = mlx5_rxq_ibv_verify(dev);
287         if (ret)
288                 DRV_LOG(WARNING, "port %u some Verbs Rx queue still remain",
289                         dev->data->port_id);
290         ret = mlx5_rxq_verify(dev);
291         if (ret)
292                 DRV_LOG(WARNING, "port %u some Rx queues still remain",
293                         dev->data->port_id);
294         ret = mlx5_txq_ibv_verify(dev);
295         if (ret)
296                 DRV_LOG(WARNING, "port %u some Verbs Tx queue still remain",
297                         dev->data->port_id);
298         ret = mlx5_txq_verify(dev);
299         if (ret)
300                 DRV_LOG(WARNING, "port %u some Tx queues still remain",
301                         dev->data->port_id);
302         ret = mlx5_flow_verify(dev);
303         if (ret)
304                 DRV_LOG(WARNING, "port %u some flows still remain",
305                         dev->data->port_id);
306         memset(priv, 0, sizeof(*priv));
307 }
308
309 const struct eth_dev_ops mlx5_dev_ops = {
310         .dev_configure = mlx5_dev_configure,
311         .dev_start = mlx5_dev_start,
312         .dev_stop = mlx5_dev_stop,
313         .dev_set_link_down = mlx5_set_link_down,
314         .dev_set_link_up = mlx5_set_link_up,
315         .dev_close = mlx5_dev_close,
316         .promiscuous_enable = mlx5_promiscuous_enable,
317         .promiscuous_disable = mlx5_promiscuous_disable,
318         .allmulticast_enable = mlx5_allmulticast_enable,
319         .allmulticast_disable = mlx5_allmulticast_disable,
320         .link_update = mlx5_link_update,
321         .stats_get = mlx5_stats_get,
322         .stats_reset = mlx5_stats_reset,
323         .xstats_get = mlx5_xstats_get,
324         .xstats_reset = mlx5_xstats_reset,
325         .xstats_get_names = mlx5_xstats_get_names,
326         .dev_infos_get = mlx5_dev_infos_get,
327         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
328         .vlan_filter_set = mlx5_vlan_filter_set,
329         .rx_queue_setup = mlx5_rx_queue_setup,
330         .tx_queue_setup = mlx5_tx_queue_setup,
331         .rx_queue_release = mlx5_rx_queue_release,
332         .tx_queue_release = mlx5_tx_queue_release,
333         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
334         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
335         .mac_addr_remove = mlx5_mac_addr_remove,
336         .mac_addr_add = mlx5_mac_addr_add,
337         .mac_addr_set = mlx5_mac_addr_set,
338         .set_mc_addr_list = mlx5_set_mc_addr_list,
339         .mtu_set = mlx5_dev_set_mtu,
340         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
341         .vlan_offload_set = mlx5_vlan_offload_set,
342         .reta_update = mlx5_dev_rss_reta_update,
343         .reta_query = mlx5_dev_rss_reta_query,
344         .rss_hash_update = mlx5_rss_hash_update,
345         .rss_hash_conf_get = mlx5_rss_hash_conf_get,
346         .filter_ctrl = mlx5_dev_filter_ctrl,
347         .rx_descriptor_status = mlx5_rx_descriptor_status,
348         .tx_descriptor_status = mlx5_tx_descriptor_status,
349         .rx_queue_intr_enable = mlx5_rx_intr_enable,
350         .rx_queue_intr_disable = mlx5_rx_intr_disable,
351         .is_removed = mlx5_is_removed,
352 };
353
354 static const struct eth_dev_ops mlx5_dev_sec_ops = {
355         .stats_get = mlx5_stats_get,
356         .stats_reset = mlx5_stats_reset,
357         .xstats_get = mlx5_xstats_get,
358         .xstats_reset = mlx5_xstats_reset,
359         .xstats_get_names = mlx5_xstats_get_names,
360         .dev_infos_get = mlx5_dev_infos_get,
361         .rx_descriptor_status = mlx5_rx_descriptor_status,
362         .tx_descriptor_status = mlx5_tx_descriptor_status,
363 };
364
365 /* Available operators in flow isolated mode. */
366 const struct eth_dev_ops mlx5_dev_ops_isolate = {
367         .dev_configure = mlx5_dev_configure,
368         .dev_start = mlx5_dev_start,
369         .dev_stop = mlx5_dev_stop,
370         .dev_set_link_down = mlx5_set_link_down,
371         .dev_set_link_up = mlx5_set_link_up,
372         .dev_close = mlx5_dev_close,
373         .link_update = mlx5_link_update,
374         .stats_get = mlx5_stats_get,
375         .stats_reset = mlx5_stats_reset,
376         .xstats_get = mlx5_xstats_get,
377         .xstats_reset = mlx5_xstats_reset,
378         .xstats_get_names = mlx5_xstats_get_names,
379         .dev_infos_get = mlx5_dev_infos_get,
380         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
381         .vlan_filter_set = mlx5_vlan_filter_set,
382         .rx_queue_setup = mlx5_rx_queue_setup,
383         .tx_queue_setup = mlx5_tx_queue_setup,
384         .rx_queue_release = mlx5_rx_queue_release,
385         .tx_queue_release = mlx5_tx_queue_release,
386         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
387         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
388         .mac_addr_remove = mlx5_mac_addr_remove,
389         .mac_addr_add = mlx5_mac_addr_add,
390         .mac_addr_set = mlx5_mac_addr_set,
391         .set_mc_addr_list = mlx5_set_mc_addr_list,
392         .mtu_set = mlx5_dev_set_mtu,
393         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
394         .vlan_offload_set = mlx5_vlan_offload_set,
395         .filter_ctrl = mlx5_dev_filter_ctrl,
396         .rx_descriptor_status = mlx5_rx_descriptor_status,
397         .tx_descriptor_status = mlx5_tx_descriptor_status,
398         .rx_queue_intr_enable = mlx5_rx_intr_enable,
399         .rx_queue_intr_disable = mlx5_rx_intr_disable,
400         .is_removed = mlx5_is_removed,
401 };
402
403 static struct {
404         struct rte_pci_addr pci_addr; /* associated PCI address */
405         uint32_t ports; /* physical ports bitfield. */
406 } mlx5_dev[32];
407
408 /**
409  * Get device index in mlx5_dev[] from PCI bus address.
410  *
411  * @param[in] pci_addr
412  *   PCI bus address to look for.
413  *
414  * @return
415  *   mlx5_dev[] index on success, -1 on failure.
416  */
417 static int
418 mlx5_dev_idx(struct rte_pci_addr *pci_addr)
419 {
420         unsigned int i;
421         int ret = -1;
422
423         assert(pci_addr != NULL);
424         for (i = 0; (i != RTE_DIM(mlx5_dev)); ++i) {
425                 if ((mlx5_dev[i].pci_addr.domain == pci_addr->domain) &&
426                     (mlx5_dev[i].pci_addr.bus == pci_addr->bus) &&
427                     (mlx5_dev[i].pci_addr.devid == pci_addr->devid) &&
428                     (mlx5_dev[i].pci_addr.function == pci_addr->function))
429                         return i;
430                 if ((mlx5_dev[i].ports == 0) && (ret == -1))
431                         ret = i;
432         }
433         return ret;
434 }
435
436 /**
437  * Verify and store value for device argument.
438  *
439  * @param[in] key
440  *   Key argument to verify.
441  * @param[in] val
442  *   Value associated with key.
443  * @param opaque
444  *   User data.
445  *
446  * @return
447  *   0 on success, a negative errno value otherwise and rte_errno is set.
448  */
449 static int
450 mlx5_args_check(const char *key, const char *val, void *opaque)
451 {
452         struct mlx5_dev_config *config = opaque;
453         unsigned long tmp;
454
455         errno = 0;
456         tmp = strtoul(val, NULL, 0);
457         if (errno) {
458                 rte_errno = errno;
459                 DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
460                 return -rte_errno;
461         }
462         if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
463                 config->cqe_comp = !!tmp;
464         } else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
465                 config->mprq.enabled = !!tmp;
466         } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
467                 config->mprq.stride_num_n = tmp;
468         } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
469                 config->mprq.max_memcpy_len = tmp;
470         } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
471                 config->mprq.min_rxqs_num = tmp;
472         } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
473                 config->txq_inline = tmp;
474         } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
475                 config->txqs_inline = tmp;
476         } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
477                 config->mps = !!tmp ? config->mps : 0;
478         } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
479                 config->mpw_hdr_dseg = !!tmp;
480         } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
481                 config->inline_max_packet_sz = tmp;
482         } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
483                 config->tx_vec_en = !!tmp;
484         } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
485                 config->rx_vec_en = !!tmp;
486         } else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
487                 config->l3_vxlan_en = !!tmp;
488         } else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
489                 config->vf_nl_en = !!tmp;
490         } else {
491                 DRV_LOG(WARNING, "%s: unknown parameter", key);
492                 rte_errno = EINVAL;
493                 return -rte_errno;
494         }
495         return 0;
496 }
497
498 /**
499  * Parse device parameters.
500  *
501  * @param config
502  *   Pointer to device configuration structure.
503  * @param devargs
504  *   Device arguments structure.
505  *
506  * @return
507  *   0 on success, a negative errno value otherwise and rte_errno is set.
508  */
509 static int
510 mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
511 {
512         const char **params = (const char *[]){
513                 MLX5_RXQ_CQE_COMP_EN,
514                 MLX5_RX_MPRQ_EN,
515                 MLX5_RX_MPRQ_LOG_STRIDE_NUM,
516                 MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
517                 MLX5_RXQS_MIN_MPRQ,
518                 MLX5_TXQ_INLINE,
519                 MLX5_TXQS_MIN_INLINE,
520                 MLX5_TXQ_MPW_EN,
521                 MLX5_TXQ_MPW_HDR_DSEG_EN,
522                 MLX5_TXQ_MAX_INLINE_LEN,
523                 MLX5_TX_VEC_EN,
524                 MLX5_RX_VEC_EN,
525                 MLX5_L3_VXLAN_EN,
526                 MLX5_VF_NL_EN,
527                 NULL,
528         };
529         struct rte_kvargs *kvlist;
530         int ret = 0;
531         int i;
532
533         if (devargs == NULL)
534                 return 0;
535         /* Following UGLY cast is done to pass checkpatch. */
536         kvlist = rte_kvargs_parse(devargs->args, params);
537         if (kvlist == NULL)
538                 return 0;
539         /* Process parameters. */
540         for (i = 0; (params[i] != NULL); ++i) {
541                 if (rte_kvargs_count(kvlist, params[i])) {
542                         ret = rte_kvargs_process(kvlist, params[i],
543                                                  mlx5_args_check, config);
544                         if (ret) {
545                                 rte_errno = EINVAL;
546                                 rte_kvargs_free(kvlist);
547                                 return -rte_errno;
548                         }
549                 }
550         }
551         rte_kvargs_free(kvlist);
552         return 0;
553 }
554
/* Tentative definition; mlx5_pci_probe() asserts it was given this driver. */
static struct rte_pci_driver mlx5_driver;

/*
 * Reserved UAR address space for TXQ UAR (hardware doorbell) mapping.
 * This is a process-local resource, kept by both primary and secondary
 * processes so that the range is reserved only once per process.
 * The space has to be available in both primary and secondary processes;
 * TXQ UAR maps to this area using fixed mmap without a double check.
 *
 * Set by mlx5_uar_init_primary() / mlx5_uar_init_secondary() after a
 * successful reservation; NULL until then.
 */
static void *uar_base;
565
566 static int
567 find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,
568                 const struct rte_memseg *ms, void *arg)
569 {
570         void **addr = arg;
571
572         if (*addr == NULL)
573                 *addr = ms->addr;
574         else
575                 *addr = RTE_MIN(*addr, ms->addr);
576
577         return 0;
578 }
579
580 /**
581  * Reserve UAR address space for primary process.
582  *
583  * @param[in] dev
584  *   Pointer to Ethernet device.
585  *
586  * @return
587  *   0 on success, a negative errno value otherwise and rte_errno is set.
588  */
589 static int
590 mlx5_uar_init_primary(struct rte_eth_dev *dev)
591 {
592         struct priv *priv = dev->data->dev_private;
593         void *addr = (void *)0;
594
595         if (uar_base) { /* UAR address space mapped. */
596                 priv->uar_base = uar_base;
597                 return 0;
598         }
599         /* find out lower bound of hugepage segments */
600         rte_memseg_walk(find_lower_va_bound, &addr);
601
602         /* keep distance to hugepages to minimize potential conflicts. */
603         addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
604         /* anonymous mmap, no real memory consumption. */
605         addr = mmap(addr, MLX5_UAR_SIZE,
606                     PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
607         if (addr == MAP_FAILED) {
608                 DRV_LOG(ERR,
609                         "port %u failed to reserve UAR address space, please"
610                         " adjust MLX5_UAR_SIZE or try --base-virtaddr",
611                         dev->data->port_id);
612                 rte_errno = ENOMEM;
613                 return -rte_errno;
614         }
615         /* Accept either same addr or a new addr returned from mmap if target
616          * range occupied.
617          */
618         DRV_LOG(INFO, "port %u reserved UAR address space: %p",
619                 dev->data->port_id, addr);
620         priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */
621         uar_base = addr; /* process local, don't reserve again. */
622         return 0;
623 }
624
625 /**
626  * Reserve UAR address space for secondary process, align with
627  * primary process.
628  *
629  * @param[in] dev
630  *   Pointer to Ethernet device.
631  *
632  * @return
633  *   0 on success, a negative errno value otherwise and rte_errno is set.
634  */
635 static int
636 mlx5_uar_init_secondary(struct rte_eth_dev *dev)
637 {
638         struct priv *priv = dev->data->dev_private;
639         void *addr;
640
641         assert(priv->uar_base);
642         if (uar_base) { /* already reserved. */
643                 assert(uar_base == priv->uar_base);
644                 return 0;
645         }
646         /* anonymous mmap, no real memory consumption. */
647         addr = mmap(priv->uar_base, MLX5_UAR_SIZE,
648                     PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
649         if (addr == MAP_FAILED) {
650                 DRV_LOG(ERR, "port %u UAR mmap failed: %p size: %llu",
651                         dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
652                 rte_errno = ENXIO;
653                 return -rte_errno;
654         }
655         if (priv->uar_base != addr) {
656                 DRV_LOG(ERR,
657                         "port %u UAR address %p size %llu occupied, please"
658                         " adjust MLX5_UAR_OFFSET or try EAL parameter"
659                         " --base-virtaddr",
660                         dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
661                 rte_errno = ENXIO;
662                 return -rte_errno;
663         }
664         uar_base = addr; /* process local, don't reserve again */
665         DRV_LOG(INFO, "port %u reserved UAR address space: %p",
666                 dev->data->port_id, addr);
667         return 0;
668 }
669
670 /**
671  * DPDK callback to register a PCI device.
672  *
673  * This function creates an Ethernet device for each port of a given
674  * PCI device.
675  *
676  * @param[in] pci_drv
677  *   PCI driver structure (mlx5_driver).
678  * @param[in] pci_dev
679  *   PCI device information.
680  *
681  * @return
682  *   0 on success, a negative errno value otherwise and rte_errno is set.
683  */
684 static int
685 mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
686                struct rte_pci_device *pci_dev)
687 {
688         struct ibv_device **list = NULL;
689         struct ibv_device *ibv_dev;
690         int err = 0;
691         struct ibv_context *attr_ctx = NULL;
692         struct ibv_device_attr_ex device_attr;
693         unsigned int vf = 0;
694         unsigned int mps;
695         unsigned int cqe_comp;
696         unsigned int tunnel_en = 0;
697         unsigned int mpls_en = 0;
698         unsigned int swp = 0;
699         unsigned int verb_priorities = 0;
700         unsigned int mprq = 0;
701         unsigned int mprq_min_stride_size_n = 0;
702         unsigned int mprq_max_stride_size_n = 0;
703         unsigned int mprq_min_stride_num_n = 0;
704         unsigned int mprq_max_stride_num_n = 0;
705         int idx;
706         int i;
707         struct mlx5dv_context attrs_out = {0};
708 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
709         struct ibv_counter_set_description cs_desc;
710 #endif
711
712         /* Prepare shared data between primary and secondary process. */
713         mlx5_prepare_shared_data();
714         assert(pci_drv == &mlx5_driver);
715         /* Get mlx5_dev[] index. */
716         idx = mlx5_dev_idx(&pci_dev->addr);
717         if (idx == -1) {
718                 DRV_LOG(ERR, "this driver cannot support any more adapters");
719                 err = ENOMEM;
720                 goto error;
721         }
722         DRV_LOG(DEBUG, "using driver device index %d", idx);
723         /* Save PCI address. */
724         mlx5_dev[idx].pci_addr = pci_dev->addr;
725         list = mlx5_glue->get_device_list(&i);
726         if (list == NULL) {
727                 assert(errno);
728                 err = errno;
729                 if (errno == ENOSYS)
730                         DRV_LOG(ERR,
731                                 "cannot list devices, is ib_uverbs loaded?");
732                 goto error;
733         }
734         assert(i >= 0);
735         /*
736          * For each listed device, check related sysfs entry against
737          * the provided PCI ID.
738          */
739         while (i != 0) {
740                 struct rte_pci_addr pci_addr;
741
742                 --i;
743                 DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
744                 if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
745                         continue;
746                 if ((pci_dev->addr.domain != pci_addr.domain) ||
747                     (pci_dev->addr.bus != pci_addr.bus) ||
748                     (pci_dev->addr.devid != pci_addr.devid) ||
749                     (pci_dev->addr.function != pci_addr.function))
750                         continue;
751                 DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
752                         list[i]->name);
753                 vf = ((pci_dev->id.device_id ==
754                        PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
755                       (pci_dev->id.device_id ==
756                        PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
757                       (pci_dev->id.device_id ==
758                        PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
759                       (pci_dev->id.device_id ==
760                        PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
761                 attr_ctx = mlx5_glue->open_device(list[i]);
762                 rte_errno = errno;
763                 err = rte_errno;
764                 break;
765         }
766         if (attr_ctx == NULL) {
767                 switch (err) {
768                 case 0:
769                         DRV_LOG(ERR,
770                                 "cannot access device, is mlx5_ib loaded?");
771                         err = ENODEV;
772                         break;
773                 case EINVAL:
774                         DRV_LOG(ERR,
775                                 "cannot use device, are drivers up to date?");
776                         break;
777                 }
778                 goto error;
779         }
780         ibv_dev = list[i];
781         DRV_LOG(DEBUG, "device opened");
782 #ifdef HAVE_IBV_MLX5_MOD_SWP
783         attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
784 #endif
785         /*
786          * Multi-packet send is supported by ConnectX-4 Lx PF as well
787          * as all ConnectX-5 devices.
788          */
789 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
790         attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
791 #endif
792 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
793         attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
794 #endif
795         mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
796         if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
797                 if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
798                         DRV_LOG(DEBUG, "enhanced MPW is supported");
799                         mps = MLX5_MPW_ENHANCED;
800                 } else {
801                         DRV_LOG(DEBUG, "MPW is supported");
802                         mps = MLX5_MPW;
803                 }
804         } else {
805                 DRV_LOG(DEBUG, "MPW isn't supported");
806                 mps = MLX5_MPW_DISABLED;
807         }
808 #ifdef HAVE_IBV_MLX5_MOD_SWP
809         if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
810                 swp = attrs_out.sw_parsing_caps.sw_parsing_offloads;
811         DRV_LOG(DEBUG, "SWP support: %u", swp);
812 #endif
813 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
814         if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
815                 struct mlx5dv_striding_rq_caps mprq_caps =
816                         attrs_out.striding_rq_caps;
817
818                 DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
819                         mprq_caps.min_single_stride_log_num_of_bytes);
820                 DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
821                         mprq_caps.max_single_stride_log_num_of_bytes);
822                 DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
823                         mprq_caps.min_single_wqe_log_num_of_strides);
824                 DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
825                         mprq_caps.max_single_wqe_log_num_of_strides);
826                 DRV_LOG(DEBUG, "\tsupported_qpts: %d",
827                         mprq_caps.supported_qpts);
828                 DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
829                 mprq = 1;
830                 mprq_min_stride_size_n =
831                         mprq_caps.min_single_stride_log_num_of_bytes;
832                 mprq_max_stride_size_n =
833                         mprq_caps.max_single_stride_log_num_of_bytes;
834                 mprq_min_stride_num_n =
835                         mprq_caps.min_single_wqe_log_num_of_strides;
836                 mprq_max_stride_num_n =
837                         mprq_caps.max_single_wqe_log_num_of_strides;
838         }
839 #endif
840         if (RTE_CACHE_LINE_SIZE == 128 &&
841             !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
842                 cqe_comp = 0;
843         else
844                 cqe_comp = 1;
845 #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
846         if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
847                 tunnel_en = ((attrs_out.tunnel_offloads_caps &
848                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
849                              (attrs_out.tunnel_offloads_caps &
850                               MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
851         }
852         DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
853                 tunnel_en ? "" : "not ");
854 #else
855         DRV_LOG(WARNING,
856                 "tunnel offloading disabled due to old OFED/rdma-core version");
857 #endif
858 #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
859         mpls_en = ((attrs_out.tunnel_offloads_caps &
860                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
861                    (attrs_out.tunnel_offloads_caps &
862                     MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
863         DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
864                 mpls_en ? "" : "not ");
865 #else
866         DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
867                 " old OFED/rdma-core version or firmware configuration");
868 #endif
869         err = mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr);
870         if (err) {
871                 DEBUG("ibv_query_device_ex() failed");
872                 goto error;
873         }
874         DRV_LOG(INFO, "%u port(s) detected",
875                 device_attr.orig_attr.phys_port_cnt);
876         for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
877                 char name[RTE_ETH_NAME_MAX_LEN];
878                 int len;
879                 uint32_t port = i + 1; /* ports are indexed from one */
880                 uint32_t test = (1 << i);
881                 struct ibv_context *ctx = NULL;
882                 struct ibv_port_attr port_attr;
883                 struct ibv_pd *pd = NULL;
884                 struct priv *priv = NULL;
885                 struct rte_eth_dev *eth_dev = NULL;
886                 struct ibv_device_attr_ex device_attr_ex;
887                 struct ether_addr mac;
888                 struct mlx5_dev_config config = {
889                         .cqe_comp = cqe_comp,
890                         .mps = mps,
891                         .tunnel_en = tunnel_en,
892                         .mpls_en = mpls_en,
893                         .tx_vec_en = 1,
894                         .rx_vec_en = 1,
895                         .mpw_hdr_dseg = 0,
896                         .txq_inline = MLX5_ARG_UNSET,
897                         .txqs_inline = MLX5_ARG_UNSET,
898                         .inline_max_packet_sz = MLX5_ARG_UNSET,
899                         .vf_nl_en = 1,
900                         .swp = !!swp,
901                         .mprq = {
902                                 .enabled = 0, /* Disabled by default. */
903                                 .stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
904                                                         mprq_min_stride_num_n),
905                                 .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN,
906                                 .min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
907                         },
908                 };
909
910                 len = snprintf(name, sizeof(name), PCI_PRI_FMT,
911                          pci_dev->addr.domain, pci_dev->addr.bus,
912                          pci_dev->addr.devid, pci_dev->addr.function);
913                 if (device_attr.orig_attr.phys_port_cnt > 1)
914                         snprintf(name + len, sizeof(name), " port %u", i);
915                 mlx5_dev[idx].ports |= test;
916                 if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
917                         eth_dev = rte_eth_dev_attach_secondary(name);
918                         if (eth_dev == NULL) {
919                                 DRV_LOG(ERR, "can not attach rte ethdev");
920                                 rte_errno = ENOMEM;
921                                 err = rte_errno;
922                                 goto error;
923                         }
924                         eth_dev->device = &pci_dev->device;
925                         eth_dev->dev_ops = &mlx5_dev_sec_ops;
926                         err = mlx5_uar_init_secondary(eth_dev);
927                         if (err) {
928                                 err = rte_errno;
929                                 goto error;
930                         }
931                         /* Receive command fd from primary process */
932                         err = mlx5_socket_connect(eth_dev);
933                         if (err < 0) {
934                                 err = rte_errno;
935                                 goto error;
936                         }
937                         /* Remap UAR for Tx queues. */
938                         err = mlx5_tx_uar_remap(eth_dev, err);
939                         if (err) {
940                                 err = rte_errno;
941                                 goto error;
942                         }
943                         /*
944                          * Ethdev pointer is still required as input since
945                          * the primary device is not accessible from the
946                          * secondary process.
947                          */
948                         eth_dev->rx_pkt_burst =
949                                 mlx5_select_rx_function(eth_dev);
950                         eth_dev->tx_pkt_burst =
951                                 mlx5_select_tx_function(eth_dev);
952                         rte_eth_dev_probing_finish(eth_dev);
953                         continue;
954                 }
955                 DRV_LOG(DEBUG, "using port %u (%08" PRIx32 ")", port, test);
956                 ctx = mlx5_glue->open_device(ibv_dev);
957                 if (ctx == NULL) {
958                         err = ENODEV;
959                         goto port_error;
960                 }
961                 /* Check port status. */
962                 err = mlx5_glue->query_port(ctx, port, &port_attr);
963                 if (err) {
964                         DRV_LOG(ERR, "port query failed: %s", strerror(err));
965                         goto port_error;
966                 }
967                 if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
968                         DRV_LOG(ERR,
969                                 "port %d is not configured in Ethernet mode",
970                                 port);
971                         err = EINVAL;
972                         goto port_error;
973                 }
974                 if (port_attr.state != IBV_PORT_ACTIVE)
975                         DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
976                                 port,
977                                 mlx5_glue->port_state_str(port_attr.state),
978                                 port_attr.state);
979                 /* Allocate protection domain. */
980                 pd = mlx5_glue->alloc_pd(ctx);
981                 if (pd == NULL) {
982                         DRV_LOG(ERR, "PD allocation failure");
983                         err = ENOMEM;
984                         goto port_error;
985                 }
986                 mlx5_dev[idx].ports |= test;
987                 /* from rte_ethdev.c */
988                 priv = rte_zmalloc("ethdev private structure",
989                                    sizeof(*priv),
990                                    RTE_CACHE_LINE_SIZE);
991                 if (priv == NULL) {
992                         DRV_LOG(ERR, "priv allocation failure");
993                         err = ENOMEM;
994                         goto port_error;
995                 }
996                 priv->ctx = ctx;
997                 strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
998                         sizeof(priv->ibdev_path));
999                 priv->device_attr = device_attr;
1000                 priv->port = port;
1001                 priv->pd = pd;
1002                 priv->mtu = ETHER_MTU;
1003                 err = mlx5_args(&config, pci_dev->device.devargs);
1004                 if (err) {
1005                         DRV_LOG(ERR, "failed to process device arguments: %s",
1006                                 strerror(err));
1007                         err = rte_errno;
1008                         goto port_error;
1009                 }
1010                 err = mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex);
1011                 if (err) {
1012                         DRV_LOG(ERR, "ibv_query_device_ex() failed");
1013                         goto port_error;
1014                 }
1015                 config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
1016                                     IBV_DEVICE_RAW_IP_CSUM);
1017                 DRV_LOG(DEBUG, "checksum offloading is %ssupported",
1018                         (config.hw_csum ? "" : "not "));
1019 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
1020                 config.flow_counter_en = !!(device_attr.max_counter_sets);
1021                 mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
1022                 DRV_LOG(DEBUG,
1023                         "counter type = %d, num of cs = %ld, attributes = %d",
1024                         cs_desc.counter_type, cs_desc.num_of_cs,
1025                         cs_desc.attributes);
1026 #endif
1027                 config.ind_table_max_size =
1028                         device_attr_ex.rss_caps.max_rwq_indirection_table_size;
1029                 /* Remove this check once DPDK supports larger/variable
1030                  * indirection tables. */
1031                 if (config.ind_table_max_size >
1032                                 (unsigned int)ETH_RSS_RETA_SIZE_512)
1033                         config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
1034                 DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
1035                         config.ind_table_max_size);
1036                 config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
1037                                          IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
1038                 DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
1039                         (config.hw_vlan_strip ? "" : "not "));
1040
1041                 config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
1042                                          IBV_RAW_PACKET_CAP_SCATTER_FCS);
1043                 DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
1044                         (config.hw_fcs_strip ? "" : "not "));
1045
1046 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
1047                 config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
1048 #endif
1049                 DRV_LOG(DEBUG,
1050                         "hardware Rx end alignment padding is %ssupported",
1051                         (config.hw_padding ? "" : "not "));
1052                 config.vf = vf;
1053                 config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
1054                               (device_attr_ex.tso_caps.supported_qpts &
1055                               (1 << IBV_QPT_RAW_PACKET)));
1056                 if (config.tso)
1057                         config.tso_max_payload_sz =
1058                                         device_attr_ex.tso_caps.max_tso;
1059                 if (config.mps && !mps) {
1060                         DRV_LOG(ERR,
1061                                 "multi-packet send not supported on this device"
1062                                 " (" MLX5_TXQ_MPW_EN ")");
1063                         err = ENOTSUP;
1064                         goto port_error;
1065                 }
1066                 DRV_LOG(INFO, "%s MPS is %s",
1067                         config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
1068                         config.mps != MLX5_MPW_DISABLED ? "enabled" :
1069                         "disabled");
1070                 if (config.cqe_comp && !cqe_comp) {
1071                         DRV_LOG(WARNING, "Rx CQE compression isn't supported");
1072                         config.cqe_comp = 0;
1073                 }
1074                 config.mprq.enabled = config.mprq.enabled && mprq;
1075                 if (config.mprq.enabled) {
1076                         if (config.mprq.stride_num_n > mprq_max_stride_num_n ||
1077                             config.mprq.stride_num_n < mprq_min_stride_num_n) {
1078                                 config.mprq.stride_num_n =
1079                                         RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
1080                                                 mprq_min_stride_num_n);
1081                                 DRV_LOG(WARNING,
1082                                         "the number of strides"
1083                                         " for Multi-Packet RQ is out of range,"
1084                                         " setting default value (%u)",
1085                                         1 << config.mprq.stride_num_n);
1086                         }
1087                         config.mprq.min_stride_size_n = mprq_min_stride_size_n;
1088                         config.mprq.max_stride_size_n = mprq_max_stride_size_n;
1089                 }
1090                 eth_dev = rte_eth_dev_allocate(name);
1091                 if (eth_dev == NULL) {
1092                         DRV_LOG(ERR, "can not allocate rte ethdev");
1093                         err = ENOMEM;
1094                         goto port_error;
1095                 }
1096                 eth_dev->data->dev_private = priv;
1097                 priv->dev_data = eth_dev->data;
1098                 eth_dev->data->mac_addrs = priv->mac;
1099                 eth_dev->device = &pci_dev->device;
1100                 rte_eth_copy_pci_info(eth_dev, pci_dev);
1101                 eth_dev->device->driver = &mlx5_driver.driver;
1102                 err = mlx5_uar_init_primary(eth_dev);
1103                 if (err) {
1104                         err = rte_errno;
1105                         goto port_error;
1106                 }
1107                 /* Configure the first MAC address by default. */
1108                 if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
1109                         DRV_LOG(ERR,
1110                                 "port %u cannot get MAC address, is mlx5_en"
1111                                 " loaded? (errno: %s)",
1112                                 eth_dev->data->port_id, strerror(errno));
1113                         err = ENODEV;
1114                         goto port_error;
1115                 }
1116                 DRV_LOG(INFO,
1117                         "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
1118                         eth_dev->data->port_id,
1119                         mac.addr_bytes[0], mac.addr_bytes[1],
1120                         mac.addr_bytes[2], mac.addr_bytes[3],
1121                         mac.addr_bytes[4], mac.addr_bytes[5]);
1122 #ifndef NDEBUG
1123                 {
1124                         char ifname[IF_NAMESIZE];
1125
1126                         if (mlx5_get_ifname(eth_dev, &ifname) == 0)
1127                                 DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
1128                                         eth_dev->data->port_id, ifname);
1129                         else
1130                                 DRV_LOG(DEBUG, "port %u ifname is unknown",
1131                                         eth_dev->data->port_id);
1132                 }
1133 #endif
1134                 /* Get actual MTU if possible. */
1135                 err = mlx5_get_mtu(eth_dev, &priv->mtu);
1136                 if (err) {
1137                         err = rte_errno;
1138                         goto port_error;
1139                 }
1140                 DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
1141                         priv->mtu);
1142                 /*
1143                  * Initialize burst functions to prevent crashes before link-up.
1144                  */
1145                 eth_dev->rx_pkt_burst = removed_rx_burst;
1146                 eth_dev->tx_pkt_burst = removed_tx_burst;
1147                 eth_dev->dev_ops = &mlx5_dev_ops;
1148                 /* Register MAC address. */
1149                 claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
1150                 priv->nl_socket = -1;
1151                 priv->nl_sn = 0;
1152                 if (vf && config.vf_nl_en) {
1153                         priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
1154                         if (priv->nl_socket < 0)
1155                                 priv->nl_socket = -1;
1156                         mlx5_nl_mac_addr_sync(eth_dev);
1157                 }
1158                 TAILQ_INIT(&priv->flows);
1159                 TAILQ_INIT(&priv->ctrl_flows);
1160                 /* Hint libmlx5 to use PMD allocator for data plane resources */
1161                 struct mlx5dv_ctx_allocators alctr = {
1162                         .alloc = &mlx5_alloc_verbs_buf,
1163                         .free = &mlx5_free_verbs_buf,
1164                         .data = priv,
1165                 };
1166                 mlx5_glue->dv_set_context_attr(ctx,
1167                                                MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
1168                                                (void *)((uintptr_t)&alctr));
1169                 /* Bring Ethernet device up. */
1170                 DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
1171                         eth_dev->data->port_id);
1172                 mlx5_set_link_up(eth_dev);
1173                 /*
1174                  * Even though the interrupt handler is not installed yet,
1175                  * interrupts will still trigger on the asyn_fd from
1176                  * Verbs context returned by ibv_open_device().
1177                  */
1178                 mlx5_link_update(eth_dev, 0);
1179                 /* Store device configuration on private structure. */
1180                 priv->config = config;
1181                 /* Create drop queue. */
1182                 err = mlx5_flow_create_drop_queue(eth_dev);
1183                 if (err) {
1184                         DRV_LOG(ERR, "port %u drop queue allocation failed: %s",
1185                                 eth_dev->data->port_id, strerror(rte_errno));
1186                         err = rte_errno;
1187                         goto port_error;
1188                 }
1189                 /* Supported Verbs flow priority number detection. */
1190                 if (verb_priorities == 0)
1191                         verb_priorities = mlx5_get_max_verbs_prio(eth_dev);
1192                 if (verb_priorities < MLX5_VERBS_FLOW_PRIO_8) {
1193                         DRV_LOG(ERR, "port %u wrong Verbs flow priorities: %u",
1194                                 eth_dev->data->port_id, verb_priorities);
1195                         goto port_error;
1196                 }
1197                 priv->config.max_verbs_prio = verb_priorities;
1198                 /*
1199                  * Once the device is added to the list of memory event
1200                  * callback, its global MR cache table cannot be expanded
1201                  * on the fly because of deadlock. If it overflows, lookup
1202                  * should be done by searching MR list linearly, which is slow.
1203                  */
1204                 err = mlx5_mr_btree_init(&priv->mr.cache,
1205                                          MLX5_MR_BTREE_CACHE_N * 2,
1206                                          eth_dev->device->numa_node);
1207                 if (err) {
1208                         err = rte_errno;
1209                         goto port_error;
1210                 }
1211                 /* Add device to memory callback list. */
1212                 rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
1213                 LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list,
1214                                  priv, mem_event_cb);
1215                 rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
1216                 rte_eth_dev_probing_finish(eth_dev);
1217                 continue;
1218 port_error:
1219                 if (priv)
1220                         rte_free(priv);
1221                 if (pd)
1222                         claim_zero(mlx5_glue->dealloc_pd(pd));
1223                 if (ctx)
1224                         claim_zero(mlx5_glue->close_device(ctx));
1225                 if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
1226                         rte_eth_dev_release_port(eth_dev);
1227                 break;
1228         }
1229         /*
1230          * XXX if something went wrong in the loop above, there is a resource
1231          * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
1232          * long as the dpdk does not provide a way to deallocate a ethdev and a
1233          * way to enumerate the registered ethdevs to free the previous ones.
1234          */
1235         /* no port found, complain */
1236         if (!mlx5_dev[idx].ports) {
1237                 rte_errno = ENODEV;
1238                 err = rte_errno;
1239         }
1240 error:
1241         if (attr_ctx)
1242                 claim_zero(mlx5_glue->close_device(attr_ctx));
1243         if (list)
1244                 mlx5_glue->free_device_list(list);
1245         if (err) {
1246                 rte_errno = err;
1247                 return -rte_errno;
1248         }
1249         return 0;
1250 }
1251
1252 static const struct rte_pci_id mlx5_pci_id_map[] = {
1253         {
1254                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1255                                PCI_DEVICE_ID_MELLANOX_CONNECTX4)
1256         },
1257         {
1258                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1259                                PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
1260         },
1261         {
1262                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1263                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
1264         },
1265         {
1266                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1267                                PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
1268         },
1269         {
1270                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1271                                PCI_DEVICE_ID_MELLANOX_CONNECTX5)
1272         },
1273         {
1274                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1275                                PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
1276         },
1277         {
1278                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1279                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
1280         },
1281         {
1282                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1283                                PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
1284         },
1285         {
1286                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1287                                PCI_DEVICE_ID_MELLANOX_CONNECTX5BF)
1288         },
1289         {
1290                 .vendor_id = 0
1291         }
1292 };
1293
1294 static struct rte_pci_driver mlx5_driver = {
1295         .driver = {
1296                 .name = MLX5_DRIVER_NAME
1297         },
1298         .id_table = mlx5_pci_id_map,
1299         .probe = mlx5_pci_probe,
1300         .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
1301 };
1302
1303 #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS
1304
1305 /**
1306  * Suffix RTE_EAL_PMD_PATH with "-glue".
1307  *
1308  * This function performs a sanity check on RTE_EAL_PMD_PATH before
1309  * suffixing its last component.
1310  *
1311  * @param buf[out]
1312  *   Output buffer, should be large enough otherwise NULL is returned.
1313  * @param size
1314  *   Size of @p out.
1315  *
1316  * @return
1317  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
1318  */
1319 static char *
1320 mlx5_glue_path(char *buf, size_t size)
1321 {
1322         static const char *const bad[] = { "/", ".", "..", NULL };
1323         const char *path = RTE_EAL_PMD_PATH;
1324         size_t len = strlen(path);
1325         size_t off;
1326         int i;
1327
1328         while (len && path[len - 1] == '/')
1329                 --len;
1330         for (off = len; off && path[off - 1] != '/'; --off)
1331                 ;
1332         for (i = 0; bad[i]; ++i)
1333                 if (!strncmp(path + off, bad[i], (int)(len - off)))
1334                         goto error;
1335         i = snprintf(buf, size, "%.*s-glue", (int)len, path);
1336         if (i == -1 || (size_t)i >= size)
1337                 goto error;
1338         return buf;
1339 error:
1340         DRV_LOG(ERR,
1341                 "unable to append \"-glue\" to last component of"
1342                 " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
1343                 " please re-configure DPDK");
1344         return NULL;
1345 }
1346
1347 /**
1348  * Initialization routine for run-time dependency on rdma-core.
1349  */
1350 static int
1351 mlx5_glue_init(void)
1352 {
1353         char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
1354         const char *path[] = {
1355                 /*
1356                  * A basic security check is necessary before trusting
1357                  * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
1358                  */
1359                 (geteuid() == getuid() && getegid() == getgid() ?
1360                  getenv("MLX5_GLUE_PATH") : NULL),
1361                 /*
1362                  * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
1363                  * variant, otherwise let dlopen() look up libraries on its
1364                  * own.
1365                  */
1366                 (*RTE_EAL_PMD_PATH ?
1367                  mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
1368         };
1369         unsigned int i = 0;
1370         void *handle = NULL;
1371         void **sym;
1372         const char *dlmsg;
1373
1374         while (!handle && i != RTE_DIM(path)) {
1375                 const char *end;
1376                 size_t len;
1377                 int ret;
1378
1379                 if (!path[i]) {
1380                         ++i;
1381                         continue;
1382                 }
1383                 end = strpbrk(path[i], ":;");
1384                 if (!end)
1385                         end = path[i] + strlen(path[i]);
1386                 len = end - path[i];
1387                 ret = 0;
1388                 do {
1389                         char name[ret + 1];
1390
1391                         ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
1392                                        (int)len, path[i],
1393                                        (!len || *(end - 1) == '/') ? "" : "/");
1394                         if (ret == -1)
1395                                 break;
1396                         if (sizeof(name) != (size_t)ret + 1)
1397                                 continue;
1398                         DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"",
1399                                 name);
1400                         handle = dlopen(name, RTLD_LAZY);
1401                         break;
1402                 } while (1);
1403                 path[i] = end + 1;
1404                 if (!*end)
1405                         ++i;
1406         }
1407         if (!handle) {
1408                 rte_errno = EINVAL;
1409                 dlmsg = dlerror();
1410                 if (dlmsg)
1411                         DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg);
1412                 goto glue_error;
1413         }
1414         sym = dlsym(handle, "mlx5_glue");
1415         if (!sym || !*sym) {
1416                 rte_errno = EINVAL;
1417                 dlmsg = dlerror();
1418                 if (dlmsg)
1419                         DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg);
1420                 goto glue_error;
1421         }
1422         mlx5_glue = *sym;
1423         return 0;
1424 glue_error:
1425         if (handle)
1426                 dlclose(handle);
1427         DRV_LOG(WARNING,
1428                 "cannot initialize PMD due to missing run-time dependency on"
1429                 " rdma-core libraries (libibverbs, libmlx5)");
1430         return -rte_errno;
1431 }
1432
1433 #endif
1434
1435 /**
1436  * Driver initialization routine.
1437  */
1438 RTE_INIT(rte_mlx5_pmd_init);
1439 static void
1440 rte_mlx5_pmd_init(void)
1441 {
1442         /* Build the static tables for Verbs conversion. */
1443         mlx5_set_ptype_table();
1444         mlx5_set_cksum_table();
1445         mlx5_set_swp_types_table();
1446         /*
1447          * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
1448          * huge pages. Calling ibv_fork_init() during init allows
1449          * applications to use fork() safely for purposes other than
1450          * using this PMD, which is not supported in forked processes.
1451          */
1452         setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
1453         /* Match the size of Rx completion entry to the size of a cacheline. */
1454         if (RTE_CACHE_LINE_SIZE == 128)
1455                 setenv("MLX5_CQE_SIZE", "128", 0);
1456 #ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS
1457         if (mlx5_glue_init())
1458                 return;
1459         assert(mlx5_glue);
1460 #endif
1461 #ifndef NDEBUG
1462         /* Glue structure must not contain any NULL pointers. */
1463         {
1464                 unsigned int i;
1465
1466                 for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
1467                         assert(((const void *const *)mlx5_glue)[i]);
1468         }
1469 #endif
1470         if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
1471                 DRV_LOG(ERR,
1472                         "rdma-core glue \"%s\" mismatch: \"%s\" is required",
1473                         mlx5_glue->version, MLX5_GLUE_VERSION);
1474                 return;
1475         }
1476         mlx5_glue->fork_init();
1477         rte_pci_register(&mlx5_driver);
1478 }
1479
/*
 * PMD metadata for "net_mlx5": exported driver name, supported PCI ID
 * table and the kernel modules expression this driver depends on.
 */
RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");
1483
1484 /** Initialize driver log type. */
1485 RTE_INIT(vdev_netvsc_init_log)
1486 {
1487         mlx5_logtype = rte_log_register("pmd.net.mlx5");
1488         if (mlx5_logtype >= 0)
1489                 rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
1490 }