net/mlx5: use Netlink to add/remove MAC addresses
drivers/net/mlx5/mlx5.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <dlfcn.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_eal_memconfig.h>
#include <rte_kvargs.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_glue.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to configure inline send. */
#define MLX5_TXQ_INLINE "txq_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/* Device parameter to include 2 dsegs in the title WQEBB. */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/* Device parameter to limit the size of packets to inline. */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/* Device parameter to enable hardware Tx vector. */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

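/*
 * Illustrative usage of the parameters above, with a hypothetical PCI
 * address and arbitrary values (any unknown key is rejected by
 * mlx5_args_check() below):
 *
 *   testpmd -w 0000:05:00.0,txq_inline=128,txq_mpw_en=1,rx_vec_en=0 -- -i
 */
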
#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

/** Driver-specific log messages type. */
int mlx5_logtype;

/**
 * Retrieve integer value from environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
        const char *val = getenv(name);

        if (val == NULL)
                return 0;
        return atoi(val);
}

/**
 * Verbs callback to allocate memory. This function should allocate the
 * requested size from space residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
        struct priv *priv = data;
        void *ret;
        size_t alignment = sysconf(_SC_PAGESIZE);
        unsigned int socket = SOCKET_ID_ANY;

        if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
                const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

                socket = ctrl->socket;
        } else if (priv->verbs_alloc_ctx.type ==
                   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
                const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

                socket = ctrl->socket;
        }
        assert(data != NULL);
        ret = rte_malloc_socket(__func__, size, alignment, socket);
        if (!ret && size)
                rte_errno = ENOMEM;
        return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
        assert(data != NULL);
        rte_free(ptr);
}

/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
        struct priv *priv = dev->data->dev_private;
        unsigned int i;
        int ret;

        DRV_LOG(DEBUG, "port %u closing device \"%s\"",
                dev->data->port_id,
                ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
        /* In case mlx5_dev_stop() has not been called. */
        mlx5_dev_interrupt_handler_uninstall(dev);
        mlx5_traffic_disable(dev);
        /* Prevent crashes when queues are still in use. */
        dev->rx_pkt_burst = removed_rx_burst;
        dev->tx_pkt_burst = removed_tx_burst;
        if (priv->rxqs != NULL) {
                /* XXX race condition if mlx5_rx_burst() is still running. */
                usleep(1000);
                for (i = 0; (i != priv->rxqs_n); ++i)
                        mlx5_rxq_release(dev, i);
                priv->rxqs_n = 0;
                priv->rxqs = NULL;
        }
        if (priv->txqs != NULL) {
                /* XXX race condition if mlx5_tx_burst() is still running. */
                usleep(1000);
                for (i = 0; (i != priv->txqs_n); ++i)
                        mlx5_txq_release(dev, i);
                priv->txqs_n = 0;
                priv->txqs = NULL;
        }
        if (priv->pd != NULL) {
                assert(priv->ctx != NULL);
                claim_zero(mlx5_glue->dealloc_pd(priv->pd));
                claim_zero(mlx5_glue->close_device(priv->ctx));
        } else
                assert(priv->ctx == NULL);
        if (priv->rss_conf.rss_key != NULL)
                rte_free(priv->rss_conf.rss_key);
        if (priv->reta_idx != NULL)
                rte_free(priv->reta_idx);
        if (priv->primary_socket)
                mlx5_socket_uninit(dev);
        if (priv->config.vf)
                mlx5_nl_mac_addr_flush(dev);
        if (priv->nl_socket >= 0)
                close(priv->nl_socket);
        ret = mlx5_hrxq_ibv_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some hash Rx queues still remain",
                        dev->data->port_id);
        ret = mlx5_ind_table_ibv_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some indirection tables still remain",
                        dev->data->port_id);
        ret = mlx5_rxq_ibv_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some Verbs Rx queues still remain",
                        dev->data->port_id);
        ret = mlx5_rxq_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some Rx queues still remain",
                        dev->data->port_id);
        ret = mlx5_txq_ibv_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some Verbs Tx queues still remain",
                        dev->data->port_id);
        ret = mlx5_txq_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some Tx queues still remain",
                        dev->data->port_id);
        ret = mlx5_flow_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some flows still remain",
                        dev->data->port_id);
        ret = mlx5_mr_verify(dev);
        if (ret)
                DRV_LOG(WARNING, "port %u some memory regions still remain",
                        dev->data->port_id);
        memset(priv, 0, sizeof(*priv));
}

const struct eth_dev_ops mlx5_dev_ops = {
        .dev_configure = mlx5_dev_configure,
        .dev_start = mlx5_dev_start,
        .dev_stop = mlx5_dev_stop,
        .dev_set_link_down = mlx5_set_link_down,
        .dev_set_link_up = mlx5_set_link_up,
        .dev_close = mlx5_dev_close,
        .promiscuous_enable = mlx5_promiscuous_enable,
        .promiscuous_disable = mlx5_promiscuous_disable,
        .allmulticast_enable = mlx5_allmulticast_enable,
        .allmulticast_disable = mlx5_allmulticast_disable,
        .link_update = mlx5_link_update,
        .stats_get = mlx5_stats_get,
        .stats_reset = mlx5_stats_reset,
        .xstats_get = mlx5_xstats_get,
        .xstats_reset = mlx5_xstats_reset,
        .xstats_get_names = mlx5_xstats_get_names,
        .dev_infos_get = mlx5_dev_infos_get,
        .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
        .vlan_filter_set = mlx5_vlan_filter_set,
        .rx_queue_setup = mlx5_rx_queue_setup,
        .tx_queue_setup = mlx5_tx_queue_setup,
        .rx_queue_release = mlx5_rx_queue_release,
        .tx_queue_release = mlx5_tx_queue_release,
        .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
        .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
        .mac_addr_remove = mlx5_mac_addr_remove,
        .mac_addr_add = mlx5_mac_addr_add,
        .mac_addr_set = mlx5_mac_addr_set,
        .mtu_set = mlx5_dev_set_mtu,
        .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
        .vlan_offload_set = mlx5_vlan_offload_set,
        .reta_update = mlx5_dev_rss_reta_update,
        .reta_query = mlx5_dev_rss_reta_query,
        .rss_hash_update = mlx5_rss_hash_update,
        .rss_hash_conf_get = mlx5_rss_hash_conf_get,
        .filter_ctrl = mlx5_dev_filter_ctrl,
        .rx_descriptor_status = mlx5_rx_descriptor_status,
        .tx_descriptor_status = mlx5_tx_descriptor_status,
        .rx_queue_intr_enable = mlx5_rx_intr_enable,
        .rx_queue_intr_disable = mlx5_rx_intr_disable,
        .is_removed = mlx5_is_removed,
};

static const struct eth_dev_ops mlx5_dev_sec_ops = {
        .stats_get = mlx5_stats_get,
        .stats_reset = mlx5_stats_reset,
        .xstats_get = mlx5_xstats_get,
        .xstats_reset = mlx5_xstats_reset,
        .xstats_get_names = mlx5_xstats_get_names,
        .dev_infos_get = mlx5_dev_infos_get,
        .rx_descriptor_status = mlx5_rx_descriptor_status,
        .tx_descriptor_status = mlx5_tx_descriptor_status,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_dev_ops_isolate = {
        .dev_configure = mlx5_dev_configure,
        .dev_start = mlx5_dev_start,
        .dev_stop = mlx5_dev_stop,
        .dev_set_link_down = mlx5_set_link_down,
        .dev_set_link_up = mlx5_set_link_up,
        .dev_close = mlx5_dev_close,
        .link_update = mlx5_link_update,
        .stats_get = mlx5_stats_get,
        .stats_reset = mlx5_stats_reset,
        .xstats_get = mlx5_xstats_get,
        .xstats_reset = mlx5_xstats_reset,
        .xstats_get_names = mlx5_xstats_get_names,
        .dev_infos_get = mlx5_dev_infos_get,
        .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
        .vlan_filter_set = mlx5_vlan_filter_set,
        .rx_queue_setup = mlx5_rx_queue_setup,
        .tx_queue_setup = mlx5_tx_queue_setup,
        .rx_queue_release = mlx5_rx_queue_release,
        .tx_queue_release = mlx5_tx_queue_release,
        .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
        .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
        .mac_addr_remove = mlx5_mac_addr_remove,
        .mac_addr_add = mlx5_mac_addr_add,
        .mac_addr_set = mlx5_mac_addr_set,
        .mtu_set = mlx5_dev_set_mtu,
        .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
        .vlan_offload_set = mlx5_vlan_offload_set,
        .filter_ctrl = mlx5_dev_filter_ctrl,
        .rx_descriptor_status = mlx5_rx_descriptor_status,
        .tx_descriptor_status = mlx5_tx_descriptor_status,
        .rx_queue_intr_enable = mlx5_rx_intr_enable,
        .rx_queue_intr_disable = mlx5_rx_intr_disable,
        .is_removed = mlx5_is_removed,
};

static struct {
        struct rte_pci_addr pci_addr; /* associated PCI address */
        uint32_t ports; /* physical ports bitfield. */
} mlx5_dev[32];

/**
 * Get device index in mlx5_dev[] from PCI bus address.
 *
 * @param[in] pci_addr
 *   PCI bus address to look for.
 *
 * @return
 *   mlx5_dev[] index on success, -1 on failure.
 */
static int
mlx5_dev_idx(struct rte_pci_addr *pci_addr)
{
        unsigned int i;
        int ret = -1;

        assert(pci_addr != NULL);
        for (i = 0; (i != RTE_DIM(mlx5_dev)); ++i) {
                if ((mlx5_dev[i].pci_addr.domain == pci_addr->domain) &&
                    (mlx5_dev[i].pci_addr.bus == pci_addr->bus) &&
                    (mlx5_dev[i].pci_addr.devid == pci_addr->devid) &&
                    (mlx5_dev[i].pci_addr.function == pci_addr->function))
                        return i;
                if ((mlx5_dev[i].ports == 0) && (ret == -1))
                        ret = i;
        }
        return ret;
}


/**
 * Verify and store value for device argument.
 *
 * @param[in] key
 *   Key argument to verify.
 * @param[in] val
 *   Value associated with key.
 * @param opaque
 *   User data.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
        struct mlx5_dev_config *config = opaque;
        unsigned long tmp;

        errno = 0;
        tmp = strtoul(val, NULL, 0);
        if (errno) {
                rte_errno = errno;
                DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
                return -rte_errno;
        }
        if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
                config->cqe_comp = !!tmp;
        } else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
                config->txq_inline = tmp;
        } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
                config->txqs_inline = tmp;
        } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
                config->mps = !!tmp ? config->mps : 0;
        } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
                config->mpw_hdr_dseg = !!tmp;
        } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
                config->inline_max_packet_sz = tmp;
        } else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
                config->tx_vec_en = !!tmp;
        } else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
                config->rx_vec_en = !!tmp;
        } else {
                DRV_LOG(WARNING, "%s: unknown parameter", key);
                rte_errno = EINVAL;
                return -rte_errno;
        }
        return 0;
}

/**
 * Parse device parameters.
 *
 * @param config
 *   Pointer to device configuration structure.
 * @param devargs
 *   Device arguments structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
{
        const char **params = (const char *[]){
                MLX5_RXQ_CQE_COMP_EN,
                MLX5_TXQ_INLINE,
                MLX5_TXQS_MIN_INLINE,
                MLX5_TXQ_MPW_EN,
                MLX5_TXQ_MPW_HDR_DSEG_EN,
                MLX5_TXQ_MAX_INLINE_LEN,
                MLX5_TX_VEC_EN,
                MLX5_RX_VEC_EN,
                NULL,
        };
        struct rte_kvargs *kvlist;
        int ret = 0;
        int i;

        if (devargs == NULL)
                return 0;
        /* The UGLY compound-literal cast in the params declaration above
         * is done to pass checkpatch. */
        kvlist = rte_kvargs_parse(devargs->args, params);
        if (kvlist == NULL)
                return 0;
        /* Process parameters. */
        for (i = 0; (params[i] != NULL); ++i) {
                if (rte_kvargs_count(kvlist, params[i])) {
                        ret = rte_kvargs_process(kvlist, params[i],
                                                 mlx5_args_check, config);
                        if (ret) {
                                rte_errno = EINVAL;
                                rte_kvargs_free(kvlist);
                                return -rte_errno;
                        }
                }
        }
        rte_kvargs_free(kvlist);
        return 0;
}

static struct rte_pci_driver mlx5_driver;

/*
 * Reserved UAR address space for TXQ UAR (hw doorbell) mapping; a process
 * local resource used by both primary and secondary to avoid duplicate
 * reservation. The space has to be available in both the primary and
 * secondary process, as TXQ UAR maps to this area using fixed mmap
 * without double checking.
 */
static void *uar_base;

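/**
 * Callback for rte_memseg_walk(); records the lowest memory segment start
 * address seen, so the UAR reservation below can be placed underneath all
 * hugepage segments.
 */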
static int
find_lower_va_bound(const struct rte_memseg_list *msl __rte_unused,
                const struct rte_memseg *ms, void *arg)
{
        void **addr = arg;

        if (*addr == NULL)
                *addr = ms->addr;
        else
                *addr = RTE_MIN(*addr, ms->addr);

        return 0;
}

/**
 * Reserve UAR address space for primary process.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_uar_init_primary(struct rte_eth_dev *dev)
{
        struct priv *priv = dev->data->dev_private;
        void *addr = (void *)0;

        if (uar_base) { /* UAR address space mapped. */
                priv->uar_base = uar_base;
                return 0;
        }
        /* find out lower bound of hugepage segments */
        rte_memseg_walk(find_lower_va_bound, &addr);

        /* keep distance to hugepages to minimize potential conflicts. */
        addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
        /* anonymous mmap, no real memory consumption. */
        addr = mmap(addr, MLX5_UAR_SIZE,
                    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED) {
                DRV_LOG(ERR,
                        "port %u failed to reserve UAR address space, please"
                        " adjust MLX5_UAR_SIZE or try --base-virtaddr",
                        dev->data->port_id);
                rte_errno = ENOMEM;
                return -rte_errno;
        }
        /* Accept either the same address or a new one returned by mmap()
         * if the target range is occupied.
         */
        DRV_LOG(INFO, "port %u reserved UAR address space: %p",
                dev->data->port_id, addr);
        priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */
        uar_base = addr; /* process local, don't reserve again. */
        return 0;
}
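
/*
 * The reservation idiom used above, shown standalone for clarity
 * (illustrative sketch, compiled out; the function name is hypothetical):
 * an anonymous PROT_NONE mapping consumes address space but no physical
 * memory, which is enough to keep other mappings away until the real UAR
 * pages are remapped into the range at fixed addresses.
 */
#ifdef MLX5_UAR_RESERVATION_EXAMPLE
static void *
reserve_va_space(void *hint, size_t size)
{
        /* PROT_NONE + MAP_ANONYMOUS: reserves a VA range, consumes no RAM. */
        void *addr = mmap(hint, size, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        return addr == MAP_FAILED ? NULL : addr;
}
#endif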

/**
 * Reserve UAR address space for secondary process, align with
 * primary process.
 *
 * @param[in] dev
 *   Pointer to Ethernet device.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_uar_init_secondary(struct rte_eth_dev *dev)
{
        struct priv *priv = dev->data->dev_private;
        void *addr;

        assert(priv->uar_base);
        if (uar_base) { /* already reserved. */
                assert(uar_base == priv->uar_base);
                return 0;
        }
        /* anonymous mmap, no real memory consumption. */
        addr = mmap(priv->uar_base, MLX5_UAR_SIZE,
                    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr == MAP_FAILED) {
                DRV_LOG(ERR, "port %u UAR mmap failed: %p size: %llu",
                        dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
                rte_errno = ENXIO;
                return -rte_errno;
        }
        if (priv->uar_base != addr) {
                DRV_LOG(ERR,
                        "port %u UAR address %p size %llu occupied, please"
                        " adjust MLX5_UAR_OFFSET or try EAL parameter"
                        " --base-virtaddr",
                        dev->data->port_id, priv->uar_base, MLX5_UAR_SIZE);
                rte_errno = ENXIO;
                return -rte_errno;
        }
        uar_base = addr; /* process local, don't reserve again */
        DRV_LOG(INFO, "port %u reserved UAR address space: %p",
                dev->data->port_id, addr);
        return 0;
}

/**
 * DPDK callback to register a PCI device.
 *
 * This function creates an Ethernet device for each port of a given
 * PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
               struct rte_pci_device *pci_dev)
{
        struct ibv_device **list = NULL;
        struct ibv_device *ibv_dev;
        int err = 0;
        struct ibv_context *attr_ctx = NULL;
        struct ibv_device_attr_ex device_attr;
        unsigned int vf;
        unsigned int mps;
        unsigned int cqe_comp;
        unsigned int tunnel_en = 0;
        int idx;
        int i;
        struct mlx5dv_context attrs_out = {0};
#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
        struct ibv_counter_set_description cs_desc;
#endif

        assert(pci_drv == &mlx5_driver);
        /* Get mlx5_dev[] index. */
        idx = mlx5_dev_idx(&pci_dev->addr);
        if (idx == -1) {
                DRV_LOG(ERR, "this driver cannot support any more adapters");
                err = ENOMEM;
                goto error;
        }
        DRV_LOG(DEBUG, "using driver device index %d", idx);
        /* Save PCI address. */
        mlx5_dev[idx].pci_addr = pci_dev->addr;
        list = mlx5_glue->get_device_list(&i);
        if (list == NULL) {
                assert(errno);
                err = errno;
                if (errno == ENOSYS)
                        DRV_LOG(ERR,
                                "cannot list devices, is ib_uverbs loaded?");
                goto error;
        }
        assert(i >= 0);
        /*
         * For each listed device, check related sysfs entry against
         * the provided PCI ID.
         */
        while (i != 0) {
                struct rte_pci_addr pci_addr;

                --i;
                DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
                if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
                        continue;
                if ((pci_dev->addr.domain != pci_addr.domain) ||
                    (pci_dev->addr.bus != pci_addr.bus) ||
                    (pci_dev->addr.devid != pci_addr.devid) ||
                    (pci_dev->addr.function != pci_addr.function))
                        continue;
                DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
                        list[i]->name);
                vf = ((pci_dev->id.device_id ==
                       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
                      (pci_dev->id.device_id ==
                       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
                      (pci_dev->id.device_id ==
                       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
                      (pci_dev->id.device_id ==
                       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
                attr_ctx = mlx5_glue->open_device(list[i]);
                rte_errno = errno;
                err = rte_errno;
                break;
        }
        if (attr_ctx == NULL) {
                mlx5_glue->free_device_list(list);
                switch (err) {
                case 0:
                        DRV_LOG(ERR,
                                "cannot access device, is mlx5_ib loaded?");
                        err = ENODEV;
                        goto error;
                case EINVAL:
                        DRV_LOG(ERR,
                                "cannot use device, are drivers up to date?");
                        goto error;
                }
        }
        ibv_dev = list[i];
        DRV_LOG(DEBUG, "device opened");
        /*
         * Multi-packet send is supported by ConnectX-4 Lx PF as well
         * as all ConnectX-5 devices.
         */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        attrs_out.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
        mlx5_glue->dv_query_device(attr_ctx, &attrs_out);
        if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
                if (attrs_out.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
                        DRV_LOG(DEBUG, "enhanced MPW is supported");
                        mps = MLX5_MPW_ENHANCED;
                } else {
                        DRV_LOG(DEBUG, "MPW is supported");
                        mps = MLX5_MPW;
                }
        } else {
                DRV_LOG(DEBUG, "MPW isn't supported");
                mps = MLX5_MPW_DISABLED;
        }
        if (RTE_CACHE_LINE_SIZE == 128 &&
            !(attrs_out.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
                cqe_comp = 0;
        else
                cqe_comp = 1;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        if (attrs_out.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
                tunnel_en = ((attrs_out.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
                             (attrs_out.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE));
        }
        DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
                tunnel_en ? "" : "not ");
#else
        DRV_LOG(WARNING,
                "tunnel offloading disabled due to old OFED/rdma-core version");
#endif
        if (mlx5_glue->query_device_ex(attr_ctx, NULL, &device_attr)) {
                err = errno;
                goto error;
        }
        DRV_LOG(INFO, "%u port(s) detected",
                device_attr.orig_attr.phys_port_cnt);
        for (i = 0; i < device_attr.orig_attr.phys_port_cnt; i++) {
                char name[RTE_ETH_NAME_MAX_LEN];
                int len;
                uint32_t port = i + 1; /* ports are indexed from one */
                uint32_t test = (1 << i);
                struct ibv_context *ctx = NULL;
                struct ibv_port_attr port_attr;
                struct ibv_pd *pd = NULL;
                struct priv *priv = NULL;
                struct rte_eth_dev *eth_dev = NULL;
                struct ibv_device_attr_ex device_attr_ex;
                struct ether_addr mac;
                struct mlx5_dev_config config = {
                        .cqe_comp = cqe_comp,
                        .mps = mps,
                        .tunnel_en = tunnel_en,
                        .tx_vec_en = 1,
                        .rx_vec_en = 1,
                        .mpw_hdr_dseg = 0,
                        .txq_inline = MLX5_ARG_UNSET,
                        .txqs_inline = MLX5_ARG_UNSET,
                        .inline_max_packet_sz = MLX5_ARG_UNSET,
                };

                len = snprintf(name, sizeof(name), PCI_PRI_FMT,
                         pci_dev->addr.domain, pci_dev->addr.bus,
                         pci_dev->addr.devid, pci_dev->addr.function);
                if (device_attr.orig_attr.phys_port_cnt > 1)
                        snprintf(name + len, sizeof(name) - len, " port %u", i);
                mlx5_dev[idx].ports |= test;
                if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                        eth_dev = rte_eth_dev_attach_secondary(name);
                        if (eth_dev == NULL) {
                                DRV_LOG(ERR, "can not attach rte ethdev");
                                rte_errno = ENOMEM;
                                err = rte_errno;
                                goto error;
                        }
                        eth_dev->device = &pci_dev->device;
                        eth_dev->dev_ops = &mlx5_dev_sec_ops;
                        err = mlx5_uar_init_secondary(eth_dev);
                        if (err)
                                goto error;
                        /* Receive command fd from primary process */
                        err = mlx5_socket_connect(eth_dev);
                        if (err < 0) {
                                err = rte_errno;
                                goto error;
                        }
                        /* Remap UAR for Tx queues. */
                        err = mlx5_tx_uar_remap(eth_dev, err);
                        if (err)
                                goto error;
                        /*
                         * Ethdev pointer is still required as input since
                         * the primary device is not accessible from the
                         * secondary process.
                         */
                        eth_dev->rx_pkt_burst =
                                mlx5_select_rx_function(eth_dev);
                        eth_dev->tx_pkt_burst =
                                mlx5_select_tx_function(eth_dev);
                        continue;
                }
                DRV_LOG(DEBUG, "using port %u (%08" PRIx32 ")", port, test);
                ctx = mlx5_glue->open_device(ibv_dev);
                if (ctx == NULL) {
                        err = ENODEV;
                        goto port_error;
                }
                /* Check port status. */
                err = mlx5_glue->query_port(ctx, port, &port_attr);
                if (err) {
                        DRV_LOG(ERR, "port query failed: %s", strerror(err));
                        goto port_error;
                }
                if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
                        DRV_LOG(ERR,
                                "port %d is not configured in Ethernet mode",
                                port);
                        err = EINVAL;
                        goto port_error;
                }
                if (port_attr.state != IBV_PORT_ACTIVE)
                        DRV_LOG(DEBUG, "port %d is not active: \"%s\" (%d)",
                                port,
                                mlx5_glue->port_state_str(port_attr.state),
                                port_attr.state);
                /* Allocate protection domain. */
                pd = mlx5_glue->alloc_pd(ctx);
                if (pd == NULL) {
                        DRV_LOG(ERR, "PD allocation failure");
                        err = ENOMEM;
                        goto port_error;
                }
                mlx5_dev[idx].ports |= test;
                /* from rte_ethdev.c */
                priv = rte_zmalloc("ethdev private structure",
                                   sizeof(*priv),
                                   RTE_CACHE_LINE_SIZE);
                if (priv == NULL) {
                        DRV_LOG(ERR, "priv allocation failure");
                        err = ENOMEM;
                        goto port_error;
                }
                priv->ctx = ctx;
                strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
                        sizeof(priv->ibdev_path));
                priv->device_attr = device_attr;
                priv->port = port;
                priv->pd = pd;
                priv->mtu = ETHER_MTU;
                err = mlx5_args(&config, pci_dev->device.devargs);
                if (err) {
                        DRV_LOG(ERR, "failed to process device arguments: %s",
                                strerror(err));
                        goto port_error;
                }
                if (mlx5_glue->query_device_ex(ctx, NULL, &device_attr_ex)) {
                        DRV_LOG(ERR, "ibv_query_device_ex() failed");
                        err = errno;
                        goto port_error;
                }
                config.hw_csum = !!(device_attr_ex.device_cap_flags_ex &
                                    IBV_DEVICE_RAW_IP_CSUM);
                DRV_LOG(DEBUG, "checksum offloading is %ssupported",
                        (config.hw_csum ? "" : "not "));
#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
                config.flow_counter_en = !!(device_attr.max_counter_sets);
                mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
                DRV_LOG(DEBUG,
                        "counter type = %d, num of cs = %ld, attributes = %d",
                        cs_desc.counter_type, cs_desc.num_of_cs,
                        cs_desc.attributes);
#endif
                config.ind_table_max_size =
                        device_attr_ex.rss_caps.max_rwq_indirection_table_size;
                /* Remove this check once DPDK supports larger/variable
                 * indirection tables. */
                if (config.ind_table_max_size >
                                (unsigned int)ETH_RSS_RETA_SIZE_512)
                        config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
                DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
                        config.ind_table_max_size);
                config.hw_vlan_strip = !!(device_attr_ex.raw_packet_caps &
                                         IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
                DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
                        (config.hw_vlan_strip ? "" : "not "));

                config.hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
                                         IBV_RAW_PACKET_CAP_SCATTER_FCS);
                DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
                        (config.hw_fcs_strip ? "" : "not "));

#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
                config.hw_padding = !!device_attr_ex.rx_pad_end_addr_align;
#endif
                DRV_LOG(DEBUG,
                        "hardware Rx end alignment padding is %ssupported",
                        (config.hw_padding ? "" : "not "));
                config.vf = vf;
                config.tso = ((device_attr_ex.tso_caps.max_tso > 0) &&
                              (device_attr_ex.tso_caps.supported_qpts &
                              (1 << IBV_QPT_RAW_PACKET)));
                if (config.tso)
                        config.tso_max_payload_sz =
                                        device_attr_ex.tso_caps.max_tso;
                if (config.mps && !mps) {
                        DRV_LOG(ERR,
                                "multi-packet send not supported on this device"
                                " (" MLX5_TXQ_MPW_EN ")");
                        err = ENOTSUP;
                        goto port_error;
                }
                DRV_LOG(INFO, "%s MPS is %s",
                        config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "",
                        config.mps != MLX5_MPW_DISABLED ? "enabled" :
                        "disabled");
                if (config.cqe_comp && !cqe_comp) {
                        DRV_LOG(WARNING, "Rx CQE compression isn't supported");
                        config.cqe_comp = 0;
                }
                eth_dev = rte_eth_dev_allocate(name);
                if (eth_dev == NULL) {
                        DRV_LOG(ERR, "can not allocate rte ethdev");
                        err = ENOMEM;
                        goto port_error;
                }
                eth_dev->data->dev_private = priv;
                priv->dev = eth_dev;
                eth_dev->data->mac_addrs = priv->mac;
                eth_dev->device = &pci_dev->device;
                rte_eth_copy_pci_info(eth_dev, pci_dev);
                eth_dev->device->driver = &mlx5_driver.driver;
                err = mlx5_uar_init_primary(eth_dev);
                if (err)
                        goto port_error;
                /* Configure the first MAC address by default. */
                if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
                        DRV_LOG(ERR,
                                "port %u cannot get MAC address, is mlx5_en"
                                " loaded? (errno: %s)",
                                eth_dev->data->port_id, strerror(errno));
                        err = ENODEV;
                        goto port_error;
                }
                DRV_LOG(INFO,
                        "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
                        eth_dev->data->port_id,
                        mac.addr_bytes[0], mac.addr_bytes[1],
                        mac.addr_bytes[2], mac.addr_bytes[3],
                        mac.addr_bytes[4], mac.addr_bytes[5]);
#ifndef NDEBUG
                {
                        char ifname[IF_NAMESIZE];

                        if (mlx5_get_ifname(eth_dev, &ifname) == 0)
                                DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
                                        eth_dev->data->port_id, ifname);
                        else
                                DRV_LOG(DEBUG, "port %u ifname is unknown",
                                        eth_dev->data->port_id);
                }
#endif
                /* Get actual MTU if possible. */
                err = mlx5_get_mtu(eth_dev, &priv->mtu);
                if (err)
                        goto port_error;
                DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
                        priv->mtu);
                /*
                 * Initialize burst functions to prevent crashes before link-up.
                 */
                eth_dev->rx_pkt_burst = removed_rx_burst;
                eth_dev->tx_pkt_burst = removed_tx_burst;
                eth_dev->dev_ops = &mlx5_dev_ops;
                /* Register MAC address. */
                claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
                priv->nl_socket = -1;
                priv->nl_sn = 0;
                if (vf) {
                        priv->nl_socket = mlx5_nl_init(RTMGRP_LINK);
                        if (priv->nl_socket < 0)
                                priv->nl_socket = -1;
                        mlx5_nl_mac_addr_sync(eth_dev);
                }
                TAILQ_INIT(&priv->flows);
                TAILQ_INIT(&priv->ctrl_flows);
                /* Hint libmlx5 to use PMD allocator for data plane resources */
                struct mlx5dv_ctx_allocators alctr = {
                        .alloc = &mlx5_alloc_verbs_buf,
                        .free = &mlx5_free_verbs_buf,
                        .data = priv,
                };
                mlx5_glue->dv_set_context_attr(ctx,
                                               MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
                                               (void *)((uintptr_t)&alctr));
                /* Bring Ethernet device up. */
                DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
                        eth_dev->data->port_id);
                mlx5_set_link_up(eth_dev);
                /* Store device configuration on private structure. */
                priv->config = config;
                continue;
port_error:
                if (priv)
                        rte_free(priv);
                if (pd)
                        claim_zero(mlx5_glue->dealloc_pd(pd));
                if (ctx)
                        claim_zero(mlx5_glue->close_device(ctx));
                break;
        }
        /*
         * XXX if something went wrong in the loop above, there is a resource
         * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
         * long as DPDK does not provide a way to deallocate an ethdev and a
         * way to enumerate the registered ethdevs to free the previous ones.
         */
        /* no port found, complain */
        if (!mlx5_dev[idx].ports) {
                rte_errno = ENODEV;
                err = rte_errno;
        }
error:
        if (attr_ctx)
                claim_zero(mlx5_glue->close_device(attr_ctx));
        if (list)
                mlx5_glue->free_device_list(list);
        if (err) {
                rte_errno = err;
                return -rte_errno;
        }
        return 0;
}

static const struct rte_pci_id mlx5_pci_id_map[] = {
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX4)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX4VF)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX4LX)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX5)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX5VF)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX5EX)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                               PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF)
        },
        {
                .vendor_id = 0
        }
};

static struct rte_pci_driver mlx5_driver = {
        .driver = {
                .name = MLX5_DRIVER_NAME
        },
        .id_table = mlx5_pci_id_map,
        .probe = mlx5_pci_probe,
        .drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
};

#ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS

/**
 * Suffix RTE_EAL_PMD_PATH with "-glue".
 *
 * This function performs a sanity check on RTE_EAL_PMD_PATH before
 * suffixing its last component.
 *
 * @param[out] buf
 *   Output buffer, should be large enough otherwise NULL is returned.
 * @param size
 *   Size of @p buf.
 *
 * @return
 *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
 */
static char *
mlx5_glue_path(char *buf, size_t size)
{
        static const char *const bad[] = { "/", ".", "..", NULL };
        const char *path = RTE_EAL_PMD_PATH;
        size_t len = strlen(path);
        size_t off;
        int i;

        while (len && path[len - 1] == '/')
                --len;
        for (off = len; off && path[off - 1] != '/'; --off)
                ;
        for (i = 0; bad[i]; ++i)
                if (!strncmp(path + off, bad[i], (int)(len - off)))
                        goto error;
        i = snprintf(buf, size, "%.*s-glue", (int)len, path);
        if (i == -1 || (size_t)i >= size)
                goto error;
        return buf;
error:
        DRV_LOG(ERR,
                "unable to append \"-glue\" to last component of"
                " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
                " please re-configure DPDK");
        return NULL;
}
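
/*
 * For instance (hypothetical build-time value): with RTE_EAL_PMD_PATH set
 * to "/usr/local/lib/dpdk/pmds", mlx5_glue_path() yields
 * "/usr/local/lib/dpdk/pmds-glue".
 */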

/**
 * Initialization routine for run-time dependency on rdma-core.
 */
static int
mlx5_glue_init(void)
{
        char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
        const char *path[] = {
                /*
                 * A basic security check is necessary before trusting
                 * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
                 */
                (geteuid() == getuid() && getegid() == getgid() ?
                 getenv("MLX5_GLUE_PATH") : NULL),
                /*
                 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
                 * variant, otherwise let dlopen() look up libraries on its
                 * own.
                 */
                (*RTE_EAL_PMD_PATH ?
                 mlx5_glue_path(glue_path, sizeof(glue_path)) : ""),
        };
        unsigned int i = 0;
        void *handle = NULL;
        void **sym;
        const char *dlmsg;

        while (!handle && i != RTE_DIM(path)) {
                const char *end;
                size_t len;
                int ret;

                if (!path[i]) {
                        ++i;
                        continue;
                }
                end = strpbrk(path[i], ":;");
                if (!end)
                        end = path[i] + strlen(path[i]);
                len = end - path[i];
                ret = 0;
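                /*
                 * Two-pass trick: the first iteration uses a 1-byte VLA so
                 * snprintf() only reports the required length, the second
                 * iteration sizes the VLA accordingly and builds the full
                 * library path before dlopen().
                 */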
                do {
                        char name[ret + 1];

                        ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE,
                                       (int)len, path[i],
                                       (!len || *(end - 1) == '/') ? "" : "/");
                        if (ret == -1)
                                break;
                        if (sizeof(name) != (size_t)ret + 1)
                                continue;
                        DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"",
                                name);
                        handle = dlopen(name, RTLD_LAZY);
                        break;
                } while (1);
                path[i] = end + 1;
                if (!*end)
                        ++i;
        }
        if (!handle) {
                rte_errno = EINVAL;
                dlmsg = dlerror();
                if (dlmsg)
                        DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg);
                goto glue_error;
        }
        sym = dlsym(handle, "mlx5_glue");
        if (!sym || !*sym) {
                rte_errno = EINVAL;
                dlmsg = dlerror();
                if (dlmsg)
                        DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg);
                goto glue_error;
        }
        mlx5_glue = *sym;
        return 0;
glue_error:
        if (handle)
                dlclose(handle);
        DRV_LOG(WARNING,
                "cannot initialize PMD due to missing run-time dependency on"
                " rdma-core libraries (libibverbs, libmlx5)");
        return -rte_errno;
}
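
/*
 * Illustrative only: when built with RTE_LIBRTE_MLX5_DLOPEN_DEPS, the glue
 * library lookup can be redirected at run time (the path below is
 * hypothetical):
 *
 *   MLX5_GLUE_PATH=/opt/mlx5-glue testpmd ...
 *
 * The variable is ignored for setuid/setgid binaries (see the check above).
 */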

#endif

/**
 * Driver initialization routine.
 */
RTE_INIT(rte_mlx5_pmd_init);
static void
rte_mlx5_pmd_init(void)
{
        /* Build the static table for ptype conversion. */
        mlx5_set_ptype_table();
        /*
         * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
         * huge pages. Calling ibv_fork_init() during init allows
         * applications to use fork() safely for purposes other than
         * using this PMD, which is not supported in forked processes.
         */
        setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
        /* Match the size of Rx completion entry to the size of a cacheline. */
        if (RTE_CACHE_LINE_SIZE == 128)
                setenv("MLX5_CQE_SIZE", "128", 0);
#ifdef RTE_LIBRTE_MLX5_DLOPEN_DEPS
        if (mlx5_glue_init())
                return;
        assert(mlx5_glue);
#endif
#ifndef NDEBUG
        /* Glue structure must not contain any NULL pointers. */
        {
                unsigned int i;

                for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i)
                        assert(((const void *const *)mlx5_glue)[i]);
        }
#endif
        if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) {
                DRV_LOG(ERR,
                        "rdma-core glue \"%s\" mismatch: \"%s\" is required",
                        mlx5_glue->version, MLX5_GLUE_VERSION);
                return;
        }
        mlx5_glue->fork_init();
        rte_pci_register(&mlx5_driver);
}

RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib");

/** Initialize driver log type. */
RTE_INIT(mlx5_init_log)
{
        mlx5_logtype = rte_log_register("pmd.net.mlx5");
        if (mlx5_logtype >= 0)
                rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE);
}