net/mlx4: fix inner RSS support for broken kernels
[dpdk.git] / drivers / net / mlx4 / mlx4.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2012 6WIND S.A.
3  * Copyright 2012 Mellanox Technologies, Ltd
4  */
5
6 /**
7  * @file
8  * mlx4 driver initialization.
9  */
10
11 #include <assert.h>
12 #include <dlfcn.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <stddef.h>
16 #include <stdint.h>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <unistd.h>
21
22 /* Verbs headers do not support -pedantic. */
23 #ifdef PEDANTIC
24 #pragma GCC diagnostic ignored "-Wpedantic"
25 #endif
26 #include <infiniband/verbs.h>
27 #ifdef PEDANTIC
28 #pragma GCC diagnostic error "-Wpedantic"
29 #endif
30
31 #include <rte_common.h>
32 #include <rte_config.h>
33 #include <rte_dev.h>
34 #include <rte_errno.h>
35 #include <rte_ethdev_driver.h>
36 #include <rte_ethdev_pci.h>
37 #include <rte_ether.h>
38 #include <rte_flow.h>
39 #include <rte_interrupts.h>
40 #include <rte_kvargs.h>
41 #include <rte_malloc.h>
42 #include <rte_mbuf.h>
43
44 #include "mlx4.h"
45 #include "mlx4_glue.h"
46 #include "mlx4_flow.h"
47 #include "mlx4_rxtx.h"
48 #include "mlx4_utils.h"
49
/** Configuration structure for device arguments. */
struct mlx4_conf {
	struct {
		/* Bit i set means physical port i (1-based port i + 1) exists. */
		uint32_t present; /**< Bit-field for existing ports. */
		uint32_t enabled; /**< Bit-field for user-enabled ports. */
	} ports;
};
57
/* Available parameters list (NULL-terminated, consumed by rte_kvargs_parse()). */
const char *pmd_mlx4_init_params[] = {
	MLX4_PMD_PORT_KVARG,
	NULL,
};
63
64 static void mlx4_dev_stop(struct rte_eth_dev *dev);
65
66 /**
67  * DPDK callback for Ethernet device configuration.
68  *
69  * @param dev
70  *   Pointer to Ethernet device structure.
71  *
72  * @return
73  *   0 on success, negative errno value otherwise and rte_errno is set.
74  */
75 static int
76 mlx4_dev_configure(struct rte_eth_dev *dev)
77 {
78         struct priv *priv = dev->data->dev_private;
79         struct rte_flow_error error;
80         int ret;
81
82         /* Prepare internal flow rules. */
83         ret = mlx4_flow_sync(priv, &error);
84         if (ret) {
85                 ERROR("cannot set up internal flow rules (code %d, \"%s\"),"
86                       " flow error type %d, cause %p, message: %s",
87                       -ret, strerror(-ret), error.type, error.cause,
88                       error.message ? error.message : "(unspecified)");
89                 goto exit;
90         }
91         ret = mlx4_intr_install(priv);
92         if (ret)
93                 ERROR("%p: interrupt handler installation failed",
94                       (void *)dev);
95 exit:
96         return ret;
97 }
98
/**
 * DPDK callback to start the device.
 *
 * Simulate device start by initializing common RSS resources and attaching
 * all configured flows.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, negative errno value otherwise and rte_errno is set.
 */
static int
mlx4_dev_start(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	struct rte_flow_error error;
	int ret;

	if (priv->started)
		return 0;
	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
	/* Set early so mlx4_dev_stop() performs a full rollback on error. */
	priv->started = 1;
	ret = mlx4_rss_init(priv);
	if (ret) {
		ERROR("%p: cannot initialize RSS resources: %s",
		      (void *)dev, strerror(-ret));
		goto err;
	}
	ret = mlx4_rxq_intr_enable(priv);
	if (ret) {
		ERROR("%p: interrupt handler installation failed",
		     (void *)dev);
		goto err;
	}
	ret = mlx4_flow_sync(priv, &error);
	if (ret) {
		ERROR("%p: cannot attach flow rules (code %d, \"%s\"),"
		      " flow error type %d, cause %p, message: %s",
		      (void *)dev,
		      -ret, strerror(-ret), error.type, error.cause,
		      error.message ? error.message : "(unspecified)");
		goto err;
	}
	/* Publish real burst functions only once everything is ready. */
	rte_wmb();
	dev->tx_pkt_burst = mlx4_tx_burst;
	dev->rx_pkt_burst = mlx4_rx_burst;
	return 0;
err:
	/* Roll back everything initialized so far. */
	mlx4_dev_stop(dev);
	return ret;
}
151
/**
 * DPDK callback to stop the device.
 *
 * Simulate device stop by detaching all configured flows.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_dev_stop(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;

	if (!priv->started)
		return;
	DEBUG("%p: detaching flows from all RX queues", (void *)dev);
	priv->started = 0;
	/* Unpublish burst functions before tearing down their resources. */
	dev->tx_pkt_burst = mlx4_tx_burst_removed;
	dev->rx_pkt_burst = mlx4_rx_burst_removed;
	rte_wmb();
	/* NULL error pointer: detach failures are intentionally ignored here. */
	mlx4_flow_sync(priv, NULL);
	mlx4_rxq_intr_disable(priv);
	mlx4_rss_deinit(priv);
}
176
177 /**
178  * DPDK callback to close the device.
179  *
180  * Destroy all queues and objects, free memory.
181  *
182  * @param dev
183  *   Pointer to Ethernet device structure.
184  */
185 static void
186 mlx4_dev_close(struct rte_eth_dev *dev)
187 {
188         struct priv *priv = dev->data->dev_private;
189         unsigned int i;
190
191         DEBUG("%p: closing device \"%s\"",
192               (void *)dev,
193               ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
194         dev->rx_pkt_burst = mlx4_rx_burst_removed;
195         dev->tx_pkt_burst = mlx4_tx_burst_removed;
196         rte_wmb();
197         mlx4_flow_clean(priv);
198         mlx4_rss_deinit(priv);
199         for (i = 0; i != dev->data->nb_rx_queues; ++i)
200                 mlx4_rx_queue_release(dev->data->rx_queues[i]);
201         for (i = 0; i != dev->data->nb_tx_queues; ++i)
202                 mlx4_tx_queue_release(dev->data->tx_queues[i]);
203         if (priv->pd != NULL) {
204                 assert(priv->ctx != NULL);
205                 claim_zero(mlx4_glue->dealloc_pd(priv->pd));
206                 claim_zero(mlx4_glue->close_device(priv->ctx));
207         } else
208                 assert(priv->ctx == NULL);
209         mlx4_intr_uninstall(priv);
210         memset(priv, 0, sizeof(*priv));
211 }
212
/** Ethernet device operations implemented by this PMD. */
static const struct eth_dev_ops mlx4_dev_ops = {
	.dev_configure = mlx4_dev_configure,
	.dev_start = mlx4_dev_start,
	.dev_stop = mlx4_dev_stop,
	.dev_set_link_down = mlx4_dev_set_link_down,
	.dev_set_link_up = mlx4_dev_set_link_up,
	.dev_close = mlx4_dev_close,
	.link_update = mlx4_link_update,
	.promiscuous_enable = mlx4_promiscuous_enable,
	.promiscuous_disable = mlx4_promiscuous_disable,
	.allmulticast_enable = mlx4_allmulticast_enable,
	.allmulticast_disable = mlx4_allmulticast_disable,
	.mac_addr_remove = mlx4_mac_addr_remove,
	.mac_addr_add = mlx4_mac_addr_add,
	.mac_addr_set = mlx4_mac_addr_set,
	.stats_get = mlx4_stats_get,
	.stats_reset = mlx4_stats_reset,
	.dev_infos_get = mlx4_dev_infos_get,
	.dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get,
	.vlan_filter_set = mlx4_vlan_filter_set,
	.rx_queue_setup = mlx4_rx_queue_setup,
	.tx_queue_setup = mlx4_tx_queue_setup,
	.rx_queue_release = mlx4_rx_queue_release,
	.tx_queue_release = mlx4_tx_queue_release,
	.flow_ctrl_get = mlx4_flow_ctrl_get,
	.flow_ctrl_set = mlx4_flow_ctrl_set,
	.mtu_set = mlx4_mtu_set,
	.filter_ctrl = mlx4_filter_ctrl,
	.rx_queue_intr_enable = mlx4_rx_intr_enable,
	.rx_queue_intr_disable = mlx4_rx_intr_disable,
	.is_removed = mlx4_is_removed,
};
245
246 /**
247  * Get PCI information from struct ibv_device.
248  *
249  * @param device
250  *   Pointer to Ethernet device structure.
251  * @param[out] pci_addr
252  *   PCI bus address output buffer.
253  *
254  * @return
255  *   0 on success, negative errno value otherwise and rte_errno is set.
256  */
257 static int
258 mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
259                             struct rte_pci_addr *pci_addr)
260 {
261         FILE *file;
262         char line[32];
263         MKSTR(path, "%s/device/uevent", device->ibdev_path);
264
265         file = fopen(path, "rb");
266         if (file == NULL) {
267                 rte_errno = errno;
268                 return -rte_errno;
269         }
270         while (fgets(line, sizeof(line), file) == line) {
271                 size_t len = strlen(line);
272                 int ret;
273
274                 /* Truncate long lines. */
275                 if (len == (sizeof(line) - 1))
276                         while (line[(len - 1)] != '\n') {
277                                 ret = fgetc(file);
278                                 if (ret == EOF)
279                                         break;
280                                 line[(len - 1)] = ret;
281                         }
282                 /* Extract information. */
283                 if (sscanf(line,
284                            "PCI_SLOT_NAME="
285                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
286                            &pci_addr->domain,
287                            &pci_addr->bus,
288                            &pci_addr->devid,
289                            &pci_addr->function) == 4) {
290                         ret = 0;
291                         break;
292                 }
293         }
294         fclose(file);
295         return 0;
296 }
297
298 /**
299  * Verify and store value for device argument.
300  *
301  * @param[in] key
302  *   Key argument to verify.
303  * @param[in] val
304  *   Value associated with key.
305  * @param[in, out] conf
306  *   Shared configuration data.
307  *
308  * @return
309  *   0 on success, negative errno value otherwise and rte_errno is set.
310  */
311 static int
312 mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf)
313 {
314         unsigned long tmp;
315
316         errno = 0;
317         tmp = strtoul(val, NULL, 0);
318         if (errno) {
319                 rte_errno = errno;
320                 WARN("%s: \"%s\" is not a valid integer", key, val);
321                 return -rte_errno;
322         }
323         if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) {
324                 uint32_t ports = rte_log2_u32(conf->ports.present + 1);
325
326                 if (tmp >= ports) {
327                         ERROR("port index %lu outside range [0,%" PRIu32 ")",
328                               tmp, ports);
329                         return -EINVAL;
330                 }
331                 if (!(conf->ports.present & (1 << tmp))) {
332                         rte_errno = EINVAL;
333                         ERROR("invalid port index %lu", tmp);
334                         return -rte_errno;
335                 }
336                 conf->ports.enabled |= 1 << tmp;
337         } else {
338                 rte_errno = EINVAL;
339                 WARN("%s: unknown parameter", key);
340                 return -rte_errno;
341         }
342         return 0;
343 }
344
345 /**
346  * Parse device parameters.
347  *
348  * @param devargs
349  *   Device arguments structure.
350  *
351  * @return
352  *   0 on success, negative errno value otherwise and rte_errno is set.
353  */
354 static int
355 mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
356 {
357         struct rte_kvargs *kvlist;
358         unsigned int arg_count;
359         int ret = 0;
360         int i;
361
362         if (devargs == NULL)
363                 return 0;
364         kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params);
365         if (kvlist == NULL) {
366                 rte_errno = EINVAL;
367                 ERROR("failed to parse kvargs");
368                 return -rte_errno;
369         }
370         /* Process parameters. */
371         for (i = 0; pmd_mlx4_init_params[i]; ++i) {
372                 arg_count = rte_kvargs_count(kvlist, MLX4_PMD_PORT_KVARG);
373                 while (arg_count-- > 0) {
374                         ret = rte_kvargs_process(kvlist,
375                                                  MLX4_PMD_PORT_KVARG,
376                                                  (int (*)(const char *,
377                                                           const char *,
378                                                           void *))
379                                                  mlx4_arg_parse,
380                                                  conf);
381                         if (ret != 0)
382                                 goto free_kvlist;
383                 }
384         }
385 free_kvlist:
386         rte_kvargs_free(kvlist);
387         return ret;
388 }
389
/**
 * Interpret RSS capabilities reported by device.
 *
 * This function returns the set of usable Verbs RSS hash fields, kernel
 * quirks taken into account.
 *
 * @param ctx
 *   Verbs context.
 * @param pd
 *   Verbs protection domain.
 * @param device_attr_ex
 *   Extended device attributes to interpret.
 *
 * @return
 *   Usable RSS hash fields mask in Verbs format.
 */
static uint64_t
mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
		struct ibv_device_attr_ex *device_attr_ex)
{
	uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask;
	struct ibv_cq *cq = NULL;
	struct ibv_wq *wq = NULL;
	struct ibv_rwq_ind_table *ind = NULL;
	struct ibv_qp *qp = NULL;

	if (!hw_rss_sup) {
		/* Fall back to a fixed IPv4/IPv6/TCP-only field mask. */
		WARN("no RSS capabilities reported; disabling support for UDP"
		     " RSS and inner VXLAN RSS");
		return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
			IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
			IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
	}
	if (!(hw_rss_sup & IBV_RX_HASH_INNER))
		return hw_rss_sup;
	/*
	 * Although reported as supported, missing code in some Linux
	 * versions (v4.15, v4.16) prevents the creation of hash QPs with
	 * inner capability.
	 *
	 * There is no choice but to attempt to instantiate a temporary RSS
	 * context in order to confirm its support.
	 */
	/* Each step below runs only if the previous object was created. */
	cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0);
	wq = cq ? mlx4_glue->create_wq
		(ctx,
		 &(struct ibv_wq_init_attr){
			.wq_type = IBV_WQT_RQ,
			.max_wr = 1,
			.max_sge = 1,
			.pd = pd,
			.cq = cq,
		 }) : NULL;
	ind = wq ? mlx4_glue->create_rwq_ind_table
		(ctx,
		 &(struct ibv_rwq_ind_table_init_attr){
			.log_ind_tbl_size = 0,
			.ind_tbl = &wq,
			.comp_mask = 0,
		 }) : NULL;
	qp = ind ? mlx4_glue->create_qp_ex
		(ctx,
		 &(struct ibv_qp_init_attr_ex){
			.comp_mask =
				(IBV_QP_INIT_ATTR_PD |
				 IBV_QP_INIT_ATTR_RX_HASH |
				 IBV_QP_INIT_ATTR_IND_TABLE),
			.qp_type = IBV_QPT_RAW_PACKET,
			.pd = pd,
			.rwq_ind_tbl = ind,
			.rx_hash_conf = {
				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
				.rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
				.rx_hash_key = mlx4_rss_hash_key_default,
				.rx_hash_fields_mask = hw_rss_sup,
			},
		 }) : NULL;
	if (!qp) {
		/* Any failure in the chain above lands here. */
		WARN("disabling unusable inner RSS capability due to kernel"
		     " quirk");
		hw_rss_sup &= ~IBV_RX_HASH_INNER;
	} else {
		claim_zero(mlx4_glue->destroy_qp(qp));
	}
	/* Destroy temporary objects in reverse creation order. */
	if (ind)
		claim_zero(mlx4_glue->destroy_rwq_ind_table(ind));
	if (wq)
		claim_zero(mlx4_glue->destroy_wq(wq));
	if (cq)
		claim_zero(mlx4_glue->destroy_cq(cq));
	return hw_rss_sup;
}
482
483 static struct rte_pci_driver mlx4_driver;
484
/**
 * DPDK callback to register a PCI device.
 *
 * This function creates an Ethernet device for each port of a given
 * PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx4_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, negative errno value otherwise and rte_errno is set.
 */
static int
mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
{
	struct ibv_device **list;
	struct ibv_device *ibv_dev;
	int err = 0;
	struct ibv_context *attr_ctx = NULL;
	struct ibv_device_attr device_attr;
	struct ibv_device_attr_ex device_attr_ex;
	struct mlx4_conf conf = {
		.ports.present = 0,
	};
	unsigned int vf;
	int i;

	(void)pci_drv;
	assert(pci_drv == &mlx4_driver);
	list = mlx4_glue->get_device_list(&i);
	if (list == NULL) {
		rte_errno = errno;
		assert(rte_errno);
		if (rte_errno == ENOSYS)
			ERROR("cannot list devices, is ib_uverbs loaded?");
		return -rte_errno;
	}
	assert(i >= 0);
	/*
	 * For each listed device, check related sysfs entry against
	 * the provided PCI ID.
	 */
	while (i != 0) {
		struct rte_pci_addr pci_addr;

		--i;
		DEBUG("checking device \"%s\"", list[i]->name);
		if (mlx4_ibv_device_to_pci_addr(list[i], &pci_addr))
			continue;
		if ((pci_dev->addr.domain != pci_addr.domain) ||
		    (pci_dev->addr.bus != pci_addr.bus) ||
		    (pci_dev->addr.devid != pci_addr.devid) ||
		    (pci_dev->addr.function != pci_addr.function))
			continue;
		/* Matching PCI ID; detect VF by device ID. */
		vf = (pci_dev->id.device_id ==
		      PCI_DEVICE_ID_MELLANOX_CONNECTX3VF);
		INFO("PCI information matches, using device \"%s\" (VF: %s)",
		     list[i]->name, (vf ? "true" : "false"));
		attr_ctx = mlx4_glue->open_device(list[i]);
		/* Save errno now: later calls may overwrite it. */
		err = errno;
		break;
	}
	if (attr_ctx == NULL) {
		mlx4_glue->free_device_list(list);
		switch (err) {
		case 0:
			/* No matching device was found at all. */
			rte_errno = ENODEV;
			ERROR("cannot access device, is mlx4_ib loaded?");
			return -rte_errno;
		case EINVAL:
			rte_errno = EINVAL;
			ERROR("cannot use device, are drivers up to date?");
			return -rte_errno;
		}
		assert(err > 0);
		rte_errno = err;
		return -rte_errno;
	}
	ibv_dev = list[i];
	DEBUG("device opened");
	if (mlx4_glue->query_device(attr_ctx, &device_attr)) {
		rte_errno = ENODEV;
		goto error;
	}
	INFO("%u port(s) detected", device_attr.phys_port_cnt);
	/* Bits 0..phys_port_cnt-1 mark the ports that exist. */
	conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1;
	if (mlx4_args(pci_dev->device.devargs, &conf)) {
		ERROR("failed to process device arguments");
		rte_errno = EINVAL;
		goto error;
	}
	/* Use all ports when none are defined */
	if (!conf.ports.enabled)
		conf.ports.enabled = conf.ports.present;
	/* Retrieve extended device attributes. */
	if (mlx4_glue->query_device_ex(attr_ctx, NULL, &device_attr_ex)) {
		rte_errno = ENODEV;
		goto error;
	}
	assert(device_attr.max_sge >= MLX4_MAX_SGE);
	/* Create one Ethernet device per enabled physical port. */
	for (i = 0; i < device_attr.phys_port_cnt; i++) {
		uint32_t port = i + 1; /* ports are indexed from one */
		struct ibv_context *ctx = NULL;
		struct ibv_port_attr port_attr;
		struct ibv_pd *pd = NULL;
		struct priv *priv = NULL;
		struct rte_eth_dev *eth_dev = NULL;
		struct ether_addr mac;

		/* If port is not enabled, skip. */
		if (!(conf.ports.enabled & (1 << i)))
			continue;
		DEBUG("using port %u", port);
		/* Each port gets its own Verbs context. */
		ctx = mlx4_glue->open_device(ibv_dev);
		if (ctx == NULL) {
			rte_errno = ENODEV;
			goto port_error;
		}
		/* Check port status. */
		err = mlx4_glue->query_port(ctx, port, &port_attr);
		if (err) {
			rte_errno = err;
			ERROR("port query failed: %s", strerror(rte_errno));
			goto port_error;
		}
		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
			rte_errno = ENOTSUP;
			ERROR("port %d is not configured in Ethernet mode",
			      port);
			goto port_error;
		}
		/* An inactive port is not fatal, only worth a debug note. */
		if (port_attr.state != IBV_PORT_ACTIVE)
			DEBUG("port %d is not active: \"%s\" (%d)",
			      port, mlx4_glue->port_state_str(port_attr.state),
			      port_attr.state);
		/* Make asynchronous FD non-blocking to handle interrupts. */
		if (mlx4_fd_set_non_blocking(ctx->async_fd) < 0) {
			ERROR("cannot make asynchronous FD non-blocking: %s",
			      strerror(rte_errno));
			goto port_error;
		}
		/* Allocate protection domain. */
		pd = mlx4_glue->alloc_pd(ctx);
		if (pd == NULL) {
			rte_errno = ENOMEM;
			ERROR("PD allocation failure");
			goto port_error;
		}
		/* from rte_ethdev.c */
		priv = rte_zmalloc("ethdev private structure",
				   sizeof(*priv),
				   RTE_CACHE_LINE_SIZE);
		if (priv == NULL) {
			rte_errno = ENOMEM;
			ERROR("priv allocation failure");
			goto port_error;
		}
		priv->ctx = ctx;
		priv->device_attr = device_attr;
		priv->port = port;
		priv->pd = pd;
		priv->mtu = ETHER_MTU;
		priv->vf = vf;
		priv->hw_csum = !!(device_attr.device_cap_flags &
				   IBV_DEVICE_RAW_IP_CSUM);
		DEBUG("checksum offloading is %ssupported",
		      (priv->hw_csum ? "" : "not "));
		/* Only ConnectX-3 Pro supports tunneling. */
		priv->hw_csum_l2tun =
			priv->hw_csum &&
			(device_attr.vendor_part_id ==
			 PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
		DEBUG("L2 tunnel checksum offloads are %ssupported",
		      priv->hw_csum_l2tun ? "" : "not ");
		/* Probe actual RSS support, kernel quirks included. */
		priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd,
						   &device_attr_ex);
		DEBUG("supported RSS hash fields mask: %016" PRIx64,
		      priv->hw_rss_sup);
		priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
					IBV_RAW_PACKET_CAP_SCATTER_FCS);
		DEBUG("FCS stripping toggling is %ssupported",
		      priv->hw_fcs_strip ? "" : "not ");
		/* Configure the first MAC address by default. */
		if (mlx4_get_mac(priv, &mac.addr_bytes)) {
			ERROR("cannot get MAC address, is mlx4_en loaded?"
			      " (rte_errno: %s)", strerror(rte_errno));
			goto port_error;
		}
		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
		     priv->port,
		     mac.addr_bytes[0], mac.addr_bytes[1],
		     mac.addr_bytes[2], mac.addr_bytes[3],
		     mac.addr_bytes[4], mac.addr_bytes[5]);
		/* Register MAC address. */
		priv->mac[0] = mac;
#ifndef NDEBUG
		{
			char ifname[IF_NAMESIZE];

			if (mlx4_get_ifname(priv, &ifname) == 0)
				DEBUG("port %u ifname is \"%s\"",
				      priv->port, ifname);
			else
				DEBUG("port %u ifname is unknown", priv->port);
		}
#endif
		/* Get actual MTU if possible. */
		mlx4_mtu_get(priv, &priv->mtu);
		DEBUG("port %u MTU is %u", priv->port, priv->mtu);
		/* from rte_ethdev.c */
		{
			char name[RTE_ETH_NAME_MAX_LEN];

			snprintf(name, sizeof(name), "%s port %u",
				 mlx4_glue->get_device_name(ibv_dev), port);
			eth_dev = rte_eth_dev_allocate(name);
		}
		if (eth_dev == NULL) {
			ERROR("can not allocate rte ethdev");
			rte_errno = ENOMEM;
			goto port_error;
		}
		eth_dev->data->dev_private = priv;
		eth_dev->data->mac_addrs = priv->mac;
		eth_dev->device = &pci_dev->device;
		rte_eth_copy_pci_info(eth_dev, pci_dev);
		eth_dev->device->driver = &mlx4_driver.driver;
		/* Initialize local interrupt handle for current port. */
		priv->intr_handle = (struct rte_intr_handle){
			.fd = -1,
			.type = RTE_INTR_HANDLE_EXT,
		};
		/*
		 * Override ethdev interrupt handle pointer with private
		 * handle instead of that of the parent PCI device used by
		 * default. This prevents it from being shared between all
		 * ports of the same PCI device since each of them is
		 * associated its own Verbs context.
		 *
		 * Rx interrupts in particular require this as the PMD has
		 * no control over the registration of queue interrupts
		 * besides setting up eth_dev->intr_handle, the rest is
		 * handled by rte_intr_rx_ctl().
		 */
		eth_dev->intr_handle = &priv->intr_handle;
		priv->dev = eth_dev;
		eth_dev->dev_ops = &mlx4_dev_ops;
		/* Bring Ethernet device up. */
		DEBUG("forcing Ethernet interface up");
		mlx4_dev_set_link_up(priv->dev);
		/* Update link status once if waiting for LSC. */
		if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
			mlx4_link_update(eth_dev, 0);
		continue;
port_error:
		/* rte_free(NULL) is a no-op; priv may still be NULL here. */
		rte_free(priv);
		if (pd)
			claim_zero(mlx4_glue->dealloc_pd(pd));
		if (ctx)
			claim_zero(mlx4_glue->close_device(ctx));
		if (eth_dev)
			rte_eth_dev_release_port(eth_dev);
		break;
	}
	/*
	 * NOTE(review): on full success, attr_ctx and list are not released
	 * here — confirm whether keeping them allocated is intentional.
	 */
	if (i == device_attr.phys_port_cnt)
		return 0;
	/*
	 * XXX if something went wrong in the loop above, there is a resource
	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
	 * long as the dpdk does not provide a way to deallocate a ethdev and a
	 * way to enumerate the registered ethdevs to free the previous ones.
	 */
error:
	if (attr_ctx)
		claim_zero(mlx4_glue->close_device(attr_ctx));
	if (list)
		mlx4_glue->free_device_list(list);
	assert(rte_errno >= 0);
	return -rte_errno;
}
767
/** Supported PCI IDs: ConnectX-3, ConnectX-3 Pro and ConnectX-3 VF. */
static const struct rte_pci_id mlx4_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX3)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
	},
	{
		/* Sentinel entry terminating the table. */
		.vendor_id = 0
	}
};
785
/** PCI driver structure for this PMD; probe entry point is mlx4_pci_probe(). */
static struct rte_pci_driver mlx4_driver = {
	.driver = {
		.name = MLX4_DRIVER_NAME
	},
	.id_table = mlx4_pci_id_map,
	.probe = mlx4_pci_probe,
	.drv_flags = RTE_PCI_DRV_INTR_LSC |
		     RTE_PCI_DRV_INTR_RMV,
};
795
796 #ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS
797
798 /**
799  * Suffix RTE_EAL_PMD_PATH with "-glue".
800  *
801  * This function performs a sanity check on RTE_EAL_PMD_PATH before
802  * suffixing its last component.
803  *
804  * @param buf[out]
805  *   Output buffer, should be large enough otherwise NULL is returned.
806  * @param size
807  *   Size of @p out.
808  *
809  * @return
810  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
811  */
812 static char *
813 mlx4_glue_path(char *buf, size_t size)
814 {
815         static const char *const bad[] = { "/", ".", "..", NULL };
816         const char *path = RTE_EAL_PMD_PATH;
817         size_t len = strlen(path);
818         size_t off;
819         int i;
820
821         while (len && path[len - 1] == '/')
822                 --len;
823         for (off = len; off && path[off - 1] != '/'; --off)
824                 ;
825         for (i = 0; bad[i]; ++i)
826                 if (!strncmp(path + off, bad[i], (int)(len - off)))
827                         goto error;
828         i = snprintf(buf, size, "%.*s-glue", (int)len, path);
829         if (i == -1 || (size_t)i >= size)
830                 goto error;
831         return buf;
832 error:
833         ERROR("unable to append \"-glue\" to last component of"
834               " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
835               " please re-configure DPDK");
836         return NULL;
837 }
838
/**
 * Initialization routine for run-time dependency on rdma-core.
 *
 * Looks for the rdma-core "glue" plug-in (MLX4_GLUE) through dlopen() in a
 * list of candidate locations and, on success, points mlx4_glue at the
 * structure exported by that library under the "mlx4_glue" symbol.
 *
 * @return
 *   0 on success, a negative value otherwise (rte_errno is set).
 */
static int
mlx4_glue_init(void)
{
	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
	const char *path[] = {
		/*
		 * A basic security check is necessary before trusting
		 * MLX4_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
		 */
		(geteuid() == getuid() && getegid() == getgid() ?
		 getenv("MLX4_GLUE_PATH") : NULL),
		/*
		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
		 * variant, otherwise let dlopen() look up libraries on its
		 * own.
		 */
		(*RTE_EAL_PMD_PATH ?
		 mlx4_glue_path(glue_path, sizeof(glue_path)) : ""),
	};
	unsigned int i = 0;
	void *handle = NULL;
	void **sym;
	const char *dlmsg;

	/* Try each candidate until one library loads or the list ends. */
	while (!handle && i != RTE_DIM(path)) {
		const char *end;
		size_t len;
		int ret;

		if (!path[i]) {
			/* Entry may be NULL (e.g. mlx4_glue_path() failed). */
			++i;
			continue;
		}
		/* Each entry may itself be a ":"/";"-separated list. */
		end = strpbrk(path[i], ":;");
		if (!end)
			end = path[i] + strlen(path[i]);
		len = end - path[i];
		ret = 0;
		do {
			/*
			 * First pass uses a 1-byte VLA so snprintf() only
			 * reports the required length; the next pass
			 * re-declares the VLA with that size and builds
			 * the full file name.
			 */
			char name[ret + 1];

			ret = snprintf(name, sizeof(name), "%.*s%s" MLX4_GLUE,
				       (int)len, path[i],
				       (!len || *(end - 1) == '/') ? "" : "/");
			if (ret == -1)
				break;
			if (sizeof(name) != (size_t)ret + 1)
				continue;
			DEBUG("looking for rdma-core glue as \"%s\"", name);
			handle = dlopen(name, RTLD_LAZY);
			break;
		} while (1);
		/* Advance past the separator; move to next entry at list end. */
		path[i] = end + 1;
		if (!*end)
			++i;
	}
	if (!handle) {
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			WARN("cannot load glue library: %s", dlmsg);
		goto glue_error;
	}
	sym = dlsym(handle, "mlx4_glue");
	if (!sym || !*sym) {
		/* Symbol missing or pointing at a NULL structure. */
		rte_errno = EINVAL;
		dlmsg = dlerror();
		if (dlmsg)
			ERROR("cannot resolve glue symbol: %s", dlmsg);
		goto glue_error;
	}
	mlx4_glue = *sym;
	return 0;
glue_error:
	if (handle)
		dlclose(handle);
	WARN("cannot initialize PMD due to missing run-time"
	     " dependency on rdma-core libraries (libibverbs,"
	     " libmlx4)");
	return -rte_errno;
}
923
924 #endif
925
926 /**
927  * Driver initialization routine.
928  */
929 RTE_INIT(rte_mlx4_pmd_init);
930 static void
931 rte_mlx4_pmd_init(void)
932 {
933         /*
934          * MLX4_DEVICE_FATAL_CLEANUP tells ibv_destroy functions we
935          * want to get success errno value in case of calling them
936          * when the device was removed.
937          */
938         setenv("MLX4_DEVICE_FATAL_CLEANUP", "1", 1);
939         /*
940          * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
941          * huge pages. Calling ibv_fork_init() during init allows
942          * applications to use fork() safely for purposes other than
943          * using this PMD, which is not supported in forked processes.
944          */
945         setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
946 #ifdef RTE_LIBRTE_MLX4_DLOPEN_DEPS
947         if (mlx4_glue_init())
948                 return;
949         assert(mlx4_glue);
950 #endif
951 #ifndef NDEBUG
952         /* Glue structure must not contain any NULL pointers. */
953         {
954                 unsigned int i;
955
956                 for (i = 0; i != sizeof(*mlx4_glue) / sizeof(void *); ++i)
957                         assert(((const void *const *)mlx4_glue)[i]);
958         }
959 #endif
960         if (strcmp(mlx4_glue->version, MLX4_GLUE_VERSION)) {
961                 ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required",
962                       mlx4_glue->version, MLX4_GLUE_VERSION);
963                 return;
964         }
965         mlx4_glue->fork_init();
966         rte_pci_register(&mlx4_driver);
967 }
968
/* Export the PMD name, its PCI ID table and its kernel module dependencies. */
RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx4, mlx4_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx4,
	"* ib_uverbs & mlx4_en & mlx4_core & mlx4_ib");