ad53721f2d8ee5b252c0c04e27ed5d138e894055
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <assert.h>
8 #include <inttypes.h>
9 #include <unistd.h>
10 #include <stdbool.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <stdlib.h>
15 #include <errno.h>
16 #include <dirent.h>
17 #include <net/if.h>
18 #include <sys/ioctl.h>
19 #include <sys/socket.h>
20 #include <netinet/in.h>
21 #include <linux/ethtool.h>
22 #include <linux/sockios.h>
23 #include <fcntl.h>
24 #include <stdalign.h>
25 #include <sys/un.h>
26 #include <time.h>
27
28 #include <rte_atomic.h>
29 #include <rte_ethdev_driver.h>
30 #include <rte_bus_pci.h>
31 #include <rte_mbuf.h>
32 #include <rte_common.h>
33 #include <rte_interrupts.h>
34 #include <rte_malloc.h>
35 #include <rte_string_fns.h>
36 #include <rte_rwlock.h>
37 #include <rte_cycles.h>
38
39 #include "mlx5.h"
40 #include "mlx5_glue.h"
41 #include "mlx5_rxtx.h"
42 #include "mlx5_utils.h"
43
/*
 * Supported speed values found in /usr/include/linux/ethtool.h.
 *
 * Fallback bit masks: each one is only defined here when the matching
 * HAVE_SUPPORTED_* macro is absent (presumably the build system defines it
 * when the system header already provides the symbol -- confirm against
 * the build files).
 */
#ifndef HAVE_SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#endif
#ifndef HAVE_SUPPORTED_40000baseCR4_Full
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#endif
#ifndef HAVE_SUPPORTED_40000baseSR4_Full
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#endif
#ifndef HAVE_SUPPORTED_40000baseLR4_Full
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#endif
#ifndef HAVE_SUPPORTED_56000baseKR4_Full
#define SUPPORTED_56000baseKR4_Full (1 << 27)
#endif
#ifndef HAVE_SUPPORTED_56000baseCR4_Full
#define SUPPORTED_56000baseCR4_Full (1 << 28)
#endif
#ifndef HAVE_SUPPORTED_56000baseSR4_Full
#define SUPPORTED_56000baseSR4_Full (1 << 29)
#endif
#ifndef HAVE_SUPPORTED_56000baseLR4_Full
#define SUPPORTED_56000baseLR4_Full (1 << 30)
#endif
69
/*
 * Add defines in case the running kernel is not the same as user headers.
 *
 * When the user headers do not provide ETHTOOL_GLINKSETTINGS, declare the
 * request structure, the command number and the link mode bit indices
 * locally so the code below still compiles.
 */
#ifndef ETHTOOL_GLINKSETTINGS
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	/* Signed on purpose: the code below negates this field. */
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	/* Flexible array member, sized by the caller at allocation time. */
	uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
/* The *_BIT values are bit indices, not bit masks. */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/* 25G link mode bit indices, defined when HAVE_ETHTOOL_LINK_MODE_25G is not set. */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
/* 50G link mode bit indices, defined when HAVE_ETHTOOL_LINK_MODE_50G is not set. */
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
/* 100G link mode bit indices, defined when HAVE_ETHTOOL_LINK_MODE_100G is not set. */
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
120
121 /**
122  * Get master interface name from private structure.
123  *
124  * @param[in] dev
125  *   Pointer to Ethernet device.
126  * @param[out] ifname
127  *   Interface name output buffer.
128  *
129  * @return
130  *   0 on success, a negative errno value otherwise and rte_errno is set.
131  */
132 int
133 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
134 {
135         DIR *dir;
136         struct dirent *dent;
137         unsigned int dev_type = 0;
138         unsigned int dev_port_prev = ~0u;
139         char match[IF_NAMESIZE] = "";
140
141         assert(ibdev_path);
142         {
143                 MKSTR(path, "%s/device/net", ibdev_path);
144
145                 dir = opendir(path);
146                 if (dir == NULL) {
147                         rte_errno = errno;
148                         return -rte_errno;
149                 }
150         }
151         while ((dent = readdir(dir)) != NULL) {
152                 char *name = dent->d_name;
153                 FILE *file;
154                 unsigned int dev_port;
155                 int r;
156
157                 if ((name[0] == '.') &&
158                     ((name[1] == '\0') ||
159                      ((name[1] == '.') && (name[2] == '\0'))))
160                         continue;
161
162                 MKSTR(path, "%s/device/net/%s/%s",
163                       ibdev_path, name,
164                       (dev_type ? "dev_id" : "dev_port"));
165
166                 file = fopen(path, "rb");
167                 if (file == NULL) {
168                         if (errno != ENOENT)
169                                 continue;
170                         /*
171                          * Switch to dev_id when dev_port does not exist as
172                          * is the case with Linux kernel versions < 3.15.
173                          */
174 try_dev_id:
175                         match[0] = '\0';
176                         if (dev_type)
177                                 break;
178                         dev_type = 1;
179                         dev_port_prev = ~0u;
180                         rewinddir(dir);
181                         continue;
182                 }
183                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
184                 fclose(file);
185                 if (r != 1)
186                         continue;
187                 /*
188                  * Switch to dev_id when dev_port returns the same value for
189                  * all ports. May happen when using a MOFED release older than
190                  * 3.0 with a Linux kernel >= 3.15.
191                  */
192                 if (dev_port == dev_port_prev)
193                         goto try_dev_id;
194                 dev_port_prev = dev_port;
195                 if (dev_port == 0)
196                         strlcpy(match, name, sizeof(match));
197         }
198         closedir(dir);
199         if (match[0] == '\0') {
200                 rte_errno = ENOENT;
201                 return -rte_errno;
202         }
203         strncpy(*ifname, match, sizeof(*ifname));
204         return 0;
205 }
206
207 /**
208  * Get interface name from private structure.
209  *
210  * This is a port representor-aware version of mlx5_get_master_ifname().
211  *
212  * @param[in] dev
213  *   Pointer to Ethernet device.
214  * @param[out] ifname
215  *   Interface name output buffer.
216  *
217  * @return
218  *   0 on success, a negative errno value otherwise and rte_errno is set.
219  */
220 int
221 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
222 {
223         struct mlx5_priv *priv = dev->data->dev_private;
224         unsigned int ifindex;
225
226         assert(priv);
227         assert(priv->sh);
228         ifindex = mlx5_ifindex(dev);
229         if (!ifindex) {
230                 if (!priv->representor)
231                         return mlx5_get_master_ifname(priv->sh->ibdev_path,
232                                                       ifname);
233                 rte_errno = ENXIO;
234                 return -rte_errno;
235         }
236         if (if_indextoname(ifindex, &(*ifname)[0]))
237                 return 0;
238         rte_errno = errno;
239         return -rte_errno;
240 }
241
242 /**
243  * Get the interface index from device name.
244  *
245  * @param[in] dev
246  *   Pointer to Ethernet device.
247  *
248  * @return
249  *   Nonzero interface index on success, zero otherwise and rte_errno is set.
250  */
251 unsigned int
252 mlx5_ifindex(const struct rte_eth_dev *dev)
253 {
254         struct mlx5_priv *priv = dev->data->dev_private;
255         unsigned int ifindex;
256
257         assert(priv);
258         assert(priv->if_index);
259         ifindex = priv->if_index;
260         if (!ifindex)
261                 rte_errno = ENXIO;
262         return ifindex;
263 }
264
265 /**
266  * Perform ifreq ioctl() on associated Ethernet device.
267  *
268  * @param[in] dev
269  *   Pointer to Ethernet device.
270  * @param req
271  *   Request number to pass to ioctl().
272  * @param[out] ifr
273  *   Interface request structure output buffer.
274  *
275  * @return
276  *   0 on success, a negative errno value otherwise and rte_errno is set.
277  */
278 int
279 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
280 {
281         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
282         int ret = 0;
283
284         if (sock == -1) {
285                 rte_errno = errno;
286                 return -rte_errno;
287         }
288         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
289         if (ret)
290                 goto error;
291         ret = ioctl(sock, req, ifr);
292         if (ret == -1) {
293                 rte_errno = errno;
294                 goto error;
295         }
296         close(sock);
297         return 0;
298 error:
299         close(sock);
300         return -rte_errno;
301 }
302
303 /**
304  * Get device MTU.
305  *
306  * @param dev
307  *   Pointer to Ethernet device.
308  * @param[out] mtu
309  *   MTU value output buffer.
310  *
311  * @return
312  *   0 on success, a negative errno value otherwise and rte_errno is set.
313  */
314 int
315 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
316 {
317         struct ifreq request;
318         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
319
320         if (ret)
321                 return ret;
322         *mtu = request.ifr_mtu;
323         return 0;
324 }
325
326 /**
327  * Set device MTU.
328  *
329  * @param dev
330  *   Pointer to Ethernet device.
331  * @param mtu
332  *   MTU value to set.
333  *
334  * @return
335  *   0 on success, a negative errno value otherwise and rte_errno is set.
336  */
337 static int
338 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
339 {
340         struct ifreq request = { .ifr_mtu = mtu, };
341
342         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
343 }
344
345 /**
346  * Set device flags.
347  *
348  * @param dev
349  *   Pointer to Ethernet device.
350  * @param keep
351  *   Bitmask for flags that must remain untouched.
352  * @param flags
353  *   Bitmask for flags to modify.
354  *
355  * @return
356  *   0 on success, a negative errno value otherwise and rte_errno is set.
357  */
358 int
359 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
360 {
361         struct ifreq request;
362         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
363
364         if (ret)
365                 return ret;
366         request.ifr_flags &= keep;
367         request.ifr_flags |= flags & ~keep;
368         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
369 }
370
371 /**
372  * DPDK callback for Ethernet device configuration.
373  *
374  * @param dev
375  *   Pointer to Ethernet device structure.
376  *
377  * @return
378  *   0 on success, a negative errno value otherwise and rte_errno is set.
379  */
380 int
381 mlx5_dev_configure(struct rte_eth_dev *dev)
382 {
383         struct mlx5_priv *priv = dev->data->dev_private;
384         unsigned int rxqs_n = dev->data->nb_rx_queues;
385         unsigned int txqs_n = dev->data->nb_tx_queues;
386         unsigned int i;
387         unsigned int j;
388         unsigned int reta_idx_n;
389         const uint8_t use_app_rss_key =
390                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
391         int ret = 0;
392
393         if (use_app_rss_key &&
394             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
395              MLX5_RSS_HASH_KEY_LEN)) {
396                 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
397                         dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
398                 rte_errno = EINVAL;
399                 return -rte_errno;
400         }
401         priv->rss_conf.rss_key =
402                 rte_realloc(priv->rss_conf.rss_key,
403                             MLX5_RSS_HASH_KEY_LEN, 0);
404         if (!priv->rss_conf.rss_key) {
405                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
406                         dev->data->port_id, rxqs_n);
407                 rte_errno = ENOMEM;
408                 return -rte_errno;
409         }
410         memcpy(priv->rss_conf.rss_key,
411                use_app_rss_key ?
412                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
413                rss_hash_default_key,
414                MLX5_RSS_HASH_KEY_LEN);
415         priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
416         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
417         priv->rxqs = (void *)dev->data->rx_queues;
418         priv->txqs = (void *)dev->data->tx_queues;
419         if (txqs_n != priv->txqs_n) {
420                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
421                         dev->data->port_id, priv->txqs_n, txqs_n);
422                 priv->txqs_n = txqs_n;
423         }
424         if (rxqs_n > priv->config.ind_table_max_size) {
425                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
426                         dev->data->port_id, rxqs_n);
427                 rte_errno = EINVAL;
428                 return -rte_errno;
429         }
430         if (rxqs_n != priv->rxqs_n) {
431                 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
432                         dev->data->port_id, priv->rxqs_n, rxqs_n);
433                 priv->rxqs_n = rxqs_n;
434                 /*
435                  * If the requested number of RX queues is not a power of two,
436                  * use the maximum indirection table size for better balancing.
437                  * The result is always rounded to the next power of two.
438                  */
439                 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
440                                              priv->config.ind_table_max_size :
441                                              rxqs_n));
442                 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
443                 if (ret)
444                         return ret;
445                 /*
446                  * When the number of RX queues is not a power of two,
447                  * the remaining table entries are padded with reused WQs
448                  * and hashes are not spread uniformly.
449                  */
450                 for (i = 0, j = 0; (i != reta_idx_n); ++i) {
451                         (*priv->reta_idx)[i] = j;
452                         if (++j == rxqs_n)
453                                 j = 0;
454                 }
455         }
456         ret = mlx5_proc_priv_init(dev);
457         if (ret)
458                 return ret;
459         return 0;
460 }
461
462 /**
463  * Sets default tuning parameters.
464  *
465  * @param dev
466  *   Pointer to Ethernet device.
467  * @param[out] info
468  *   Info structure output buffer.
469  */
470 static void
471 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
472 {
473         struct mlx5_priv *priv = dev->data->dev_private;
474
475         /* Minimum CPU utilization. */
476         info->default_rxportconf.ring_size = 256;
477         info->default_txportconf.ring_size = 256;
478         info->default_rxportconf.burst_size = 64;
479         info->default_txportconf.burst_size = 64;
480         if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
481                 info->default_rxportconf.nb_queues = 16;
482                 info->default_txportconf.nb_queues = 16;
483                 if (dev->data->nb_rx_queues > 2 ||
484                     dev->data->nb_tx_queues > 2) {
485                         /* Max Throughput. */
486                         info->default_rxportconf.ring_size = 2048;
487                         info->default_txportconf.ring_size = 2048;
488                 }
489         } else {
490                 info->default_rxportconf.nb_queues = 8;
491                 info->default_txportconf.nb_queues = 8;
492                 if (dev->data->nb_rx_queues > 2 ||
493                     dev->data->nb_tx_queues > 2) {
494                         /* Max Throughput. */
495                         info->default_rxportconf.ring_size = 4096;
496                         info->default_txportconf.ring_size = 4096;
497                 }
498         }
499 }
500
501 /**
502  * Sets tx mbuf limiting parameters.
503  *
504  * @param dev
505  *   Pointer to Ethernet device.
506  * @param[out] info
507  *   Info structure output buffer.
508  */
509 static void
510 mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
511 {
512         struct mlx5_priv *priv = dev->data->dev_private;
513         struct mlx5_dev_config *config = &priv->config;
514         unsigned int inlen;
515         uint16_t nb_max;
516
517         inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
518                 MLX5_SEND_DEF_INLINE_LEN :
519                 (unsigned int)config->txq_inline_max;
520         assert(config->txq_inline_min >= 0);
521         inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
522         inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
523                                MLX5_ESEG_MIN_INLINE_SIZE -
524                                MLX5_WQE_CSEG_SIZE -
525                                MLX5_WQE_ESEG_SIZE -
526                                MLX5_WQE_DSEG_SIZE * 2);
527         nb_max = (MLX5_WQE_SIZE_MAX +
528                   MLX5_ESEG_MIN_INLINE_SIZE -
529                   MLX5_WQE_CSEG_SIZE -
530                   MLX5_WQE_ESEG_SIZE -
531                   MLX5_WQE_DSEG_SIZE -
532                   inlen) / MLX5_WSEG_SIZE;
533         info->tx_desc_lim.nb_seg_max = nb_max;
534         info->tx_desc_lim.nb_mtu_seg_max = nb_max;
535 }
536
537 /**
538  * DPDK callback to get information about the device.
539  *
540  * @param dev
541  *   Pointer to Ethernet device structure.
542  * @param[out] info
543  *   Info structure output buffer.
544  */
545 int
546 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
547 {
548         struct mlx5_priv *priv = dev->data->dev_private;
549         struct mlx5_dev_config *config = &priv->config;
550         unsigned int max;
551
552         /* FIXME: we should ask the device for these values. */
553         info->min_rx_bufsize = 32;
554         info->max_rx_pktlen = 65536;
555         /*
556          * Since we need one CQ per QP, the limit is the minimum number
557          * between the two values.
558          */
559         max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
560                       priv->sh->device_attr.orig_attr.max_qp);
561         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
562         if (max >= 65535)
563                 max = 65535;
564         info->max_rx_queues = max;
565         info->max_tx_queues = max;
566         info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
567         info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
568         info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
569                                  info->rx_queue_offload_capa);
570         info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
571         info->if_index = mlx5_ifindex(dev);
572         info->reta_size = priv->reta_idx_n ?
573                 priv->reta_idx_n : config->ind_table_max_size;
574         info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
575         info->speed_capa = priv->link_speed_capa;
576         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
577         mlx5_set_default_params(dev, info);
578         mlx5_set_txlimit_params(dev, info);
579         info->switch_info.name = dev->data->name;
580         info->switch_info.domain_id = priv->domain_id;
581         info->switch_info.port_id = priv->representor_id;
582         if (priv->representor) {
583                 unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
584                 uint16_t port_id[i];
585
586                 i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
587                 while (i--) {
588                         struct mlx5_priv *opriv =
589                                 rte_eth_devices[port_id[i]].data->dev_private;
590
591                         if (!opriv ||
592                             opriv->representor ||
593                             opriv->domain_id != priv->domain_id)
594                                 continue;
595                         /*
596                          * Override switch name with that of the master
597                          * device.
598                          */
599                         info->switch_info.name = opriv->dev_data->name;
600                         break;
601                 }
602         }
603
604         return 0;
605 }
606
607 /**
608  * Get device current raw clock counter
609  *
610  * @param dev
611  *   Pointer to Ethernet device structure.
612  * @param[out] time
613  *   Current raw clock counter of the device.
614  *
615  * @return
616  *   0 if the clock has correctly been read
617  *   The value of errno in case of error
618  */
619 int
620 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
621 {
622         struct mlx5_priv *priv = dev->data->dev_private;
623         struct ibv_context *ctx = priv->sh->ctx;
624         struct ibv_values_ex values;
625         int err = 0;
626
627         values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
628         err = mlx5_glue->query_rt_values_ex(ctx, &values);
629         if (err != 0) {
630                 DRV_LOG(WARNING, "Could not query the clock !");
631                 return err;
632         }
633         *clock = values.raw_clock.tv_nsec;
634         return 0;
635 }
636
637 /**
638  * Get firmware version of a device.
639  *
640  * @param dev
641  *   Ethernet device port.
642  * @param fw_ver
643  *   String output allocated by caller.
644  * @param fw_size
645  *   Size of the output string, including terminating null byte.
646  *
647  * @return
648  *   0 on success, or the size of the non truncated string if too big.
649  */
650 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
651 {
652         struct mlx5_priv *priv = dev->data->dev_private;
653         struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
654         size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
655
656         if (fw_size < size)
657                 return size;
658         if (fw_ver != NULL)
659                 strlcpy(fw_ver, attr->fw_ver, fw_size);
660         return 0;
661 }
662
663 /**
664  * Get supported packet types.
665  *
666  * @param dev
667  *   Pointer to Ethernet device structure.
668  *
669  * @return
670  *   A pointer to the supported Packet types array.
671  */
672 const uint32_t *
673 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
674 {
675         static const uint32_t ptypes[] = {
676                 /* refers to rxq_cq_to_pkt_type() */
677                 RTE_PTYPE_L2_ETHER,
678                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
679                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
680                 RTE_PTYPE_L4_NONFRAG,
681                 RTE_PTYPE_L4_FRAG,
682                 RTE_PTYPE_L4_TCP,
683                 RTE_PTYPE_L4_UDP,
684                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
685                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
686                 RTE_PTYPE_INNER_L4_NONFRAG,
687                 RTE_PTYPE_INNER_L4_FRAG,
688                 RTE_PTYPE_INNER_L4_TCP,
689                 RTE_PTYPE_INNER_L4_UDP,
690                 RTE_PTYPE_UNKNOWN
691         };
692
693         if (dev->rx_pkt_burst == mlx5_rx_burst ||
694             dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
695             dev->rx_pkt_burst == mlx5_rx_burst_vec)
696                 return ptypes;
697         return NULL;
698 }
699
700 /**
701  * Retrieve the master device for representor in the same switch domain.
702  *
703  * @param dev
704  *   Pointer to representor Ethernet device structure.
705  *
706  * @return
707  *   Master device structure  on success, NULL otherwise.
708  */
709
710 static struct rte_eth_dev *
711 mlx5_find_master_dev(struct rte_eth_dev *dev)
712 {
713         struct mlx5_priv *priv;
714         uint16_t port_id;
715         uint16_t domain_id;
716
717         priv = dev->data->dev_private;
718         domain_id = priv->domain_id;
719         assert(priv->representor);
720         RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) {
721                 priv = rte_eth_devices[port_id].data->dev_private;
722                 if (priv &&
723                     priv->master &&
724                     priv->domain_id == domain_id)
725                         return &rte_eth_devices[port_id];
726         }
727         return NULL;
728 }
729
730 /**
731  * DPDK callback to retrieve physical link information.
732  *
733  * @param dev
734  *   Pointer to Ethernet device structure.
735  * @param[out] link
736  *   Storage for current link status.
737  *
738  * @return
739  *   0 on success, a negative errno value otherwise and rte_errno is set.
740  */
741 static int
742 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
743                                struct rte_eth_link *link)
744 {
745         struct mlx5_priv *priv = dev->data->dev_private;
746         struct ethtool_cmd edata = {
747                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
748         };
749         struct ifreq ifr;
750         struct rte_eth_link dev_link;
751         int link_speed = 0;
752         int ret;
753
754         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
755         if (ret) {
756                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
757                         dev->data->port_id, strerror(rte_errno));
758                 return ret;
759         }
760         dev_link = (struct rte_eth_link) {
761                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
762                                 (ifr.ifr_flags & IFF_RUNNING)),
763         };
764         ifr = (struct ifreq) {
765                 .ifr_data = (void *)&edata,
766         };
767         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
768         if (ret) {
769                 if (ret == -ENOTSUP && priv->representor) {
770                         struct rte_eth_dev *master;
771
772                         /*
773                          * For representors we can try to inherit link
774                          * settings from the master device. Actually
775                          * link settings do not make a lot of sense
776                          * for representors due to missing physical
777                          * link. The old kernel drivers supported
778                          * emulated settings query for representors,
779                          * the new ones do not, so we have to add
780                          * this code for compatibility issues.
781                          */
782                         master = mlx5_find_master_dev(dev);
783                         if (master) {
784                                 ifr = (struct ifreq) {
785                                         .ifr_data = (void *)&edata,
786                                 };
787                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
788                         }
789                 }
790                 if (ret) {
791                         DRV_LOG(WARNING,
792                                 "port %u ioctl(SIOCETHTOOL,"
793                                 " ETHTOOL_GSET) failed: %s",
794                                 dev->data->port_id, strerror(rte_errno));
795                         return ret;
796                 }
797         }
798         link_speed = ethtool_cmd_speed(&edata);
799         if (link_speed == -1)
800                 dev_link.link_speed = ETH_SPEED_NUM_NONE;
801         else
802                 dev_link.link_speed = link_speed;
803         priv->link_speed_capa = 0;
804         if (edata.supported & SUPPORTED_Autoneg)
805                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
806         if (edata.supported & (SUPPORTED_1000baseT_Full |
807                                SUPPORTED_1000baseKX_Full))
808                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
809         if (edata.supported & SUPPORTED_10000baseKR_Full)
810                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
811         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
812                                SUPPORTED_40000baseCR4_Full |
813                                SUPPORTED_40000baseSR4_Full |
814                                SUPPORTED_40000baseLR4_Full))
815                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
816         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
817                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
818         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
819                         ETH_LINK_SPEED_FIXED);
820         if (((dev_link.link_speed && !dev_link.link_status) ||
821              (!dev_link.link_speed && dev_link.link_status))) {
822                 rte_errno = EAGAIN;
823                 return -rte_errno;
824         }
825         *link = dev_link;
826         return 0;
827 }
828
829 /**
830  * Retrieve physical link information (unlocked version using new ioctl).
831  *
832  * @param dev
833  *   Pointer to Ethernet device structure.
834  * @param[out] link
835  *   Storage for current link status.
836  *
837  * @return
838  *   0 on success, a negative errno value otherwise and rte_errno is set.
839  */
840 static int
841 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
842                              struct rte_eth_link *link)
843
844 {
845         struct mlx5_priv *priv = dev->data->dev_private;
846         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
847         struct ifreq ifr;
848         struct rte_eth_link dev_link;
849         struct rte_eth_dev *master = NULL;
850         uint64_t sc;
851         int ret;
852
853         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
854         if (ret) {
855                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
856                         dev->data->port_id, strerror(rte_errno));
857                 return ret;
858         }
859         dev_link = (struct rte_eth_link) {
860                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
861                                 (ifr.ifr_flags & IFF_RUNNING)),
862         };
863         ifr = (struct ifreq) {
864                 .ifr_data = (void *)&gcmd,
865         };
866         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
867         if (ret) {
868                 if (ret == -ENOTSUP && priv->representor) {
869                         /*
870                          * For representors we can try to inherit link
871                          * settings from the master device. Actually
872                          * link settings do not make a lot of sense
873                          * for representors due to missing physical
874                          * link. The old kernel drivers supported
875                          * emulated settings query for representors,
876                          * the new ones do not, so we have to add
877                          * this code for compatibility issues.
878                          */
879                         master = mlx5_find_master_dev(dev);
880                         if (master) {
881                                 ifr = (struct ifreq) {
882                                         .ifr_data = (void *)&gcmd,
883                                 };
884                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
885                         }
886                 }
887                 if (ret) {
888                         DRV_LOG(DEBUG,
889                                 "port %u ioctl(SIOCETHTOOL,"
890                                 " ETHTOOL_GLINKSETTINGS) failed: %s",
891                                 dev->data->port_id, strerror(rte_errno));
892                         return ret;
893                 }
894
895         }
896         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
897
898         alignas(struct ethtool_link_settings)
899         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
900                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
901         struct ethtool_link_settings *ecmd = (void *)data;
902
903         *ecmd = gcmd;
904         ifr.ifr_data = (void *)ecmd;
905         ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
906         if (ret) {
907                 DRV_LOG(DEBUG,
908                         "port %u ioctl(SIOCETHTOOL,"
909                         "ETHTOOL_GLINKSETTINGS) failed: %s",
910                         dev->data->port_id, strerror(rte_errno));
911                 return ret;
912         }
913         dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
914                                                             ecmd->speed;
915         sc = ecmd->link_mode_masks[0] |
916                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
917         priv->link_speed_capa = 0;
918         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
919                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
920         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
921                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
922                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
923         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
924                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
925                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
926                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
927         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
928                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
929                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
930         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
931                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
932                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
933                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
934                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
935         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
936                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
937                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
938                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
939                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
940         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
941                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
942                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
943                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
944         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
945                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
946                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
947         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
948                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
949                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
950                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
951                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
952         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
953                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
954         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
955                                   ETH_LINK_SPEED_FIXED);
956         if (((dev_link.link_speed && !dev_link.link_status) ||
957              (!dev_link.link_speed && dev_link.link_status))) {
958                 rte_errno = EAGAIN;
959                 return -rte_errno;
960         }
961         *link = dev_link;
962         return 0;
963 }
964
965 /**
966  * DPDK callback to retrieve physical link information.
967  *
968  * @param dev
969  *   Pointer to Ethernet device structure.
970  * @param wait_to_complete
971  *   Wait for request completion.
972  *
973  * @return
974  *   0 if link status was not updated, positive if it was, a negative errno
975  *   value otherwise and rte_errno is set.
976  */
977 int
978 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
979 {
980         int ret;
981         struct rte_eth_link dev_link;
982         time_t start_time = time(NULL);
983
984         do {
985                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
986                 if (ret == -ENOTSUP)
987                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
988                 if (ret == 0)
989                         break;
990                 /* Handle wait to complete situation. */
991                 if (wait_to_complete && ret == -EAGAIN) {
992                         if (abs((int)difftime(time(NULL), start_time)) <
993                             MLX5_LINK_STATUS_TIMEOUT) {
994                                 usleep(0);
995                                 continue;
996                         } else {
997                                 rte_errno = EBUSY;
998                                 return -rte_errno;
999                         }
1000                 } else if (ret < 0) {
1001                         return ret;
1002                 }
1003         } while (wait_to_complete);
1004         ret = !!memcmp(&dev->data->dev_link, &dev_link,
1005                        sizeof(struct rte_eth_link));
1006         dev->data->dev_link = dev_link;
1007         return ret;
1008 }
1009
1010 /**
1011  * DPDK callback to change the MTU.
1012  *
1013  * @param dev
1014  *   Pointer to Ethernet device structure.
1015  * @param in_mtu
1016  *   New MTU.
1017  *
1018  * @return
1019  *   0 on success, a negative errno value otherwise and rte_errno is set.
1020  */
1021 int
1022 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1023 {
1024         struct mlx5_priv *priv = dev->data->dev_private;
1025         uint16_t kern_mtu = 0;
1026         int ret;
1027
1028         ret = mlx5_get_mtu(dev, &kern_mtu);
1029         if (ret)
1030                 return ret;
1031         /* Set kernel interface MTU first. */
1032         ret = mlx5_set_mtu(dev, mtu);
1033         if (ret)
1034                 return ret;
1035         ret = mlx5_get_mtu(dev, &kern_mtu);
1036         if (ret)
1037                 return ret;
1038         if (kern_mtu == mtu) {
1039                 priv->mtu = mtu;
1040                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
1041                         dev->data->port_id, mtu);
1042                 return 0;
1043         }
1044         rte_errno = EAGAIN;
1045         return -rte_errno;
1046 }
1047
1048 /**
1049  * DPDK callback to get flow control status.
1050  *
1051  * @param dev
1052  *   Pointer to Ethernet device structure.
1053  * @param[out] fc_conf
1054  *   Flow control output buffer.
1055  *
1056  * @return
1057  *   0 on success, a negative errno value otherwise and rte_errno is set.
1058  */
1059 int
1060 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1061 {
1062         struct ifreq ifr;
1063         struct ethtool_pauseparam ethpause = {
1064                 .cmd = ETHTOOL_GPAUSEPARAM
1065         };
1066         int ret;
1067
1068         ifr.ifr_data = (void *)&ethpause;
1069         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1070         if (ret) {
1071                 DRV_LOG(WARNING,
1072                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
1073                         " %s",
1074                         dev->data->port_id, strerror(rte_errno));
1075                 return ret;
1076         }
1077         fc_conf->autoneg = ethpause.autoneg;
1078         if (ethpause.rx_pause && ethpause.tx_pause)
1079                 fc_conf->mode = RTE_FC_FULL;
1080         else if (ethpause.rx_pause)
1081                 fc_conf->mode = RTE_FC_RX_PAUSE;
1082         else if (ethpause.tx_pause)
1083                 fc_conf->mode = RTE_FC_TX_PAUSE;
1084         else
1085                 fc_conf->mode = RTE_FC_NONE;
1086         return 0;
1087 }
1088
1089 /**
1090  * DPDK callback to modify flow control parameters.
1091  *
1092  * @param dev
1093  *   Pointer to Ethernet device structure.
1094  * @param[in] fc_conf
1095  *   Flow control parameters.
1096  *
1097  * @return
1098  *   0 on success, a negative errno value otherwise and rte_errno is set.
1099  */
1100 int
1101 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1102 {
1103         struct ifreq ifr;
1104         struct ethtool_pauseparam ethpause = {
1105                 .cmd = ETHTOOL_SPAUSEPARAM
1106         };
1107         int ret;
1108
1109         ifr.ifr_data = (void *)&ethpause;
1110         ethpause.autoneg = fc_conf->autoneg;
1111         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1112             (fc_conf->mode & RTE_FC_RX_PAUSE))
1113                 ethpause.rx_pause = 1;
1114         else
1115                 ethpause.rx_pause = 0;
1116
1117         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1118             (fc_conf->mode & RTE_FC_TX_PAUSE))
1119                 ethpause.tx_pause = 1;
1120         else
1121                 ethpause.tx_pause = 0;
1122         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1123         if (ret) {
1124                 DRV_LOG(WARNING,
1125                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1126                         " failed: %s",
1127                         dev->data->port_id, strerror(rte_errno));
1128                 return ret;
1129         }
1130         return 0;
1131 }
1132
1133 /**
1134  * Get PCI information from struct ibv_device.
1135  *
1136  * @param device
1137  *   Pointer to Ethernet device structure.
1138  * @param[out] pci_addr
1139  *   PCI bus address output buffer.
1140  *
1141  * @return
1142  *   0 on success, a negative errno value otherwise and rte_errno is set.
1143  */
1144 int
1145 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
1146                             struct rte_pci_addr *pci_addr)
1147 {
1148         FILE *file;
1149         char line[32];
1150         MKSTR(path, "%s/device/uevent", device->ibdev_path);
1151
1152         file = fopen(path, "rb");
1153         if (file == NULL) {
1154                 rte_errno = errno;
1155                 return -rte_errno;
1156         }
1157         while (fgets(line, sizeof(line), file) == line) {
1158                 size_t len = strlen(line);
1159                 int ret;
1160
1161                 /* Truncate long lines. */
1162                 if (len == (sizeof(line) - 1))
1163                         while (line[(len - 1)] != '\n') {
1164                                 ret = fgetc(file);
1165                                 if (ret == EOF)
1166                                         break;
1167                                 line[(len - 1)] = ret;
1168                         }
1169                 /* Extract information. */
1170                 if (sscanf(line,
1171                            "PCI_SLOT_NAME="
1172                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1173                            &pci_addr->domain,
1174                            &pci_addr->bus,
1175                            &pci_addr->devid,
1176                            &pci_addr->function) == 4) {
1177                         ret = 0;
1178                         break;
1179                 }
1180         }
1181         fclose(file);
1182         return 0;
1183 }
1184
1185 /**
1186  * Handle asynchronous removal event for entire multiport device.
1187  *
1188  * @param sh
1189  *   Infiniband device shared context.
1190  */
1191 static void
1192 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
1193 {
1194         uint32_t i;
1195
1196         for (i = 0; i < sh->max_port; ++i) {
1197                 struct rte_eth_dev *dev;
1198
1199                 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
1200                         /*
1201                          * Or not existing port either no
1202                          * handler installed for this port.
1203                          */
1204                         continue;
1205                 }
1206                 dev = &rte_eth_devices[sh->port[i].ih_port_id];
1207                 assert(dev);
1208                 if (dev->data->dev_conf.intr_conf.rmv)
1209                         _rte_eth_dev_callback_process
1210                                 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1211         }
1212 }
1213
1214 /**
1215  * Handle shared asynchronous events the NIC (removal event
1216  * and link status change). Supports multiport IB device.
1217  *
1218  * @param cb_arg
1219  *   Callback argument.
1220  */
1221 void
1222 mlx5_dev_interrupt_handler(void *cb_arg)
1223 {
1224         struct mlx5_ibv_shared *sh = cb_arg;
1225         struct ibv_async_event event;
1226
1227         /* Read all message from the IB device and acknowledge them. */
1228         for (;;) {
1229                 struct rte_eth_dev *dev;
1230                 uint32_t tmp;
1231
1232                 if (mlx5_glue->get_async_event(sh->ctx, &event))
1233                         break;
1234                 /* Retrieve and check IB port index. */
1235                 tmp = (uint32_t)event.element.port_num;
1236                 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
1237                         /*
1238                          * The DEVICE_FATAL event is called once for
1239                          * entire device without port specifying.
1240                          * We should notify all existing ports.
1241                          */
1242                         mlx5_glue->ack_async_event(&event);
1243                         mlx5_dev_interrupt_device_fatal(sh);
1244                         continue;
1245                 }
1246                 assert(tmp && (tmp <= sh->max_port));
1247                 if (!tmp) {
1248                         /* Unsupported devive level event. */
1249                         mlx5_glue->ack_async_event(&event);
1250                         DRV_LOG(DEBUG,
1251                                 "unsupported common event (type %d)",
1252                                 event.event_type);
1253                         continue;
1254                 }
1255                 if (tmp > sh->max_port) {
1256                         /* Invalid IB port index. */
1257                         mlx5_glue->ack_async_event(&event);
1258                         DRV_LOG(DEBUG,
1259                                 "cannot handle an event (type %d)"
1260                                 "due to invalid IB port index (%u)",
1261                                 event.event_type, tmp);
1262                         continue;
1263                 }
1264                 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
1265                         /* No handler installed. */
1266                         mlx5_glue->ack_async_event(&event);
1267                         DRV_LOG(DEBUG,
1268                                 "cannot handle an event (type %d)"
1269                                 "due to no handler installed for port %u",
1270                                 event.event_type, tmp);
1271                         continue;
1272                 }
1273                 /* Retrieve ethernet device descriptor. */
1274                 tmp = sh->port[tmp - 1].ih_port_id;
1275                 dev = &rte_eth_devices[tmp];
1276                 assert(dev);
1277                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1278                      event.event_type == IBV_EVENT_PORT_ERR) &&
1279                         dev->data->dev_conf.intr_conf.lsc) {
1280                         mlx5_glue->ack_async_event(&event);
1281                         if (mlx5_link_update(dev, 0) == -EAGAIN) {
1282                                 usleep(0);
1283                                 continue;
1284                         }
1285                         _rte_eth_dev_callback_process
1286                                 (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1287                         continue;
1288                 }
1289                 DRV_LOG(DEBUG,
1290                         "port %u cannot handle an unknown event (type %d)",
1291                         dev->data->port_id, event.event_type);
1292                 mlx5_glue->ack_async_event(&event);
1293         }
1294 }
1295
1296 /*
1297  * Unregister callback handler safely. The handler may be active
1298  * while we are trying to unregister it, in this case code -EAGAIN
1299  * is returned by rte_intr_callback_unregister(). This routine checks
1300  * the return code and tries to unregister handler again.
1301  *
1302  * @param handle
1303  *   interrupt handle
1304  * @param cb_fn
1305  *   pointer to callback routine
1306  * @cb_arg
1307  *   opaque callback parameter
1308  */
1309 void
1310 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1311                               rte_intr_callback_fn cb_fn, void *cb_arg)
1312 {
1313         /*
1314          * Try to reduce timeout management overhead by not calling
1315          * the timer related routines on the first iteration. If the
1316          * unregistering succeeds on first call there will be no
1317          * timer calls at all.
1318          */
1319         uint64_t twait = 0;
1320         uint64_t start = 0;
1321
1322         do {
1323                 int ret;
1324
1325                 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1326                 if (ret >= 0)
1327                         return;
1328                 if (ret != -EAGAIN) {
1329                         DRV_LOG(INFO, "failed to unregister interrupt"
1330                                       " handler (error: %d)", ret);
1331                         assert(false);
1332                         return;
1333                 }
1334                 if (twait) {
1335                         struct timespec onems;
1336
1337                         /* Wait one millisecond and try again. */
1338                         onems.tv_sec = 0;
1339                         onems.tv_nsec = NS_PER_S / MS_PER_S;
1340                         nanosleep(&onems, 0);
1341                         /* Check whether one second elapsed. */
1342                         if ((rte_get_timer_cycles() - start) <= twait)
1343                                 continue;
1344                 } else {
1345                         /*
1346                          * We get the amount of timer ticks for one second.
1347                          * If this amount elapsed it means we spent one
1348                          * second in waiting. This branch is executed once
1349                          * on first iteration.
1350                          */
1351                         twait = rte_get_timer_hz();
1352                         assert(twait);
1353                 }
1354                 /*
1355                  * Timeout elapsed, show message (once a second) and retry.
1356                  * We have no other acceptable option here, if we ignore
1357                  * the unregistering return code the handler will not
1358                  * be unregistered, fd will be closed and we may get the
1359                  * crush. Hanging and messaging in the loop seems not to be
1360                  * the worst choice.
1361                  */
1362                 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1363                 start = rte_get_timer_cycles();
1364         } while (true);
1365 }
1366
/**
 * Handle DEVX interrupts from the NIC.
 * This function is probably called from the DPDK host thread.
 *
 * When HAVE_IBV_DEVX_ASYNC is defined, every available asynchronous
 * command completion is fetched and forwarded, together with its command
 * status, to mlx5_flow_async_pool_query_handle(). Without it the function
 * does nothing.
 *
 * @param cb_arg
 *   Callback argument; used as the Infiniband device shared context.
 */
void
mlx5_dev_interrupt_handler_devx(void *cb_arg)
{
#ifdef HAVE_IBV_DEVX_ASYNC
        struct mlx5_ibv_shared *sh = cb_arg;
        union {
                struct mlx5dv_devx_async_cmd_hdr cmd_resp;
                uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
                            MLX5_ST_SZ_BYTES(traffic_counter) +
                            sizeof(struct mlx5dv_devx_async_cmd_hdr)];
        } out;
        /* Command output starts right after the async command header. */
        uint8_t *buf = out.buf + sizeof(out.cmd_resp);

        for (;;) {
                if (mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
                                                       &out.cmd_resp,
                                                       sizeof(out.buf)))
                        break;
                mlx5_flow_async_pool_query_handle
                        (sh, (uint64_t)out.cmd_resp.wr_id,
                         mlx5_devx_get_out_command_status(buf));
        }
#else
        (void)cb_arg;
#endif /* HAVE_IBV_DEVX_ASYNC */
}
1398
1399 /**
1400  * Uninstall shared asynchronous device events handler.
1401  * This function is implemented to support event sharing
1402  * between multiple ports of single IB device.
1403  *
1404  * @param dev
1405  *   Pointer to Ethernet device.
1406  */
1407 static void
1408 mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
1409 {
1410         struct mlx5_priv *priv = dev->data->dev_private;
1411         struct mlx5_ibv_shared *sh = priv->sh;
1412
1413         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1414                 return;
1415         pthread_mutex_lock(&sh->intr_mutex);
1416         assert(priv->ibv_port);
1417         assert(priv->ibv_port <= sh->max_port);
1418         assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1419         if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
1420                 goto exit;
1421         assert(sh->port[priv->ibv_port - 1].ih_port_id ==
1422                                         (uint32_t)dev->data->port_id);
1423         assert(sh->intr_cnt);
1424         sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1425         if (!sh->intr_cnt || --sh->intr_cnt)
1426                 goto exit;
1427         mlx5_intr_callback_unregister(&sh->intr_handle,
1428                                      mlx5_dev_interrupt_handler, sh);
1429         sh->intr_handle.fd = 0;
1430         sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1431         if (sh->intr_handle_devx.fd) {
1432                 rte_intr_callback_unregister(&sh->intr_handle_devx,
1433                                              mlx5_dev_interrupt_handler_devx,
1434                                              sh);
1435                 sh->intr_handle_devx.fd = 0;
1436                 sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
1437         }
1438         if (sh->devx_comp) {
1439                 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
1440                 sh->devx_comp = NULL;
1441         }
1442 exit:
1443         pthread_mutex_unlock(&sh->intr_mutex);
1444 }
1445
1446 /**
1447  * Install shared asynchronous device events handler.
1448  * This function is implemented to support event sharing
1449  * between multiple ports of single IB device.
1450  *
1451  * @param dev
1452  *   Pointer to Ethernet device.
1453  */
1454 static void
1455 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
1456 {
1457         struct mlx5_priv *priv = dev->data->dev_private;
1458         struct mlx5_ibv_shared *sh = priv->sh;
1459         int ret;
1460         int flags;
1461
1462         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1463                 return;
1464         pthread_mutex_lock(&sh->intr_mutex);
1465         assert(priv->ibv_port);
1466         assert(priv->ibv_port <= sh->max_port);
1467         assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1468         if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
1469                 /* The handler is already installed for this port. */
1470                 assert(sh->intr_cnt);
1471                 goto exit;
1472         }
1473         sh->port[priv->ibv_port - 1].ih_port_id = (uint32_t)dev->data->port_id;
1474         if (sh->intr_cnt) {
1475                 sh->intr_cnt++;
1476                 goto exit;
1477         }
1478         /* No shared handler installed. */
1479         assert(sh->ctx->async_fd > 0);
1480         flags = fcntl(sh->ctx->async_fd, F_GETFL);
1481         ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1482         if (ret) {
1483                 DRV_LOG(INFO, "failed to change file descriptor"
1484                               " async event queue");
1485                 goto error;
1486         }
1487         sh->intr_handle.fd = sh->ctx->async_fd;
1488         sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
1489         rte_intr_callback_register(&sh->intr_handle,
1490                                    mlx5_dev_interrupt_handler, sh);
1491         if (priv->config.devx) {
1492 #ifndef HAVE_IBV_DEVX_ASYNC
1493                 goto error_unregister;
1494 #else
1495                 sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
1496                 if (sh->devx_comp) {
1497                         flags = fcntl(sh->devx_comp->fd, F_GETFL);
1498                         ret = fcntl(sh->devx_comp->fd, F_SETFL,
1499                                     flags | O_NONBLOCK);
1500                         if (ret) {
1501                                 DRV_LOG(INFO, "failed to change file descriptor"
1502                                               " devx async event queue");
1503                                 goto error_unregister;
1504                         }
1505                         sh->intr_handle_devx.fd = sh->devx_comp->fd;
1506                         sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
1507                         rte_intr_callback_register
1508                                 (&sh->intr_handle_devx,
1509                                  mlx5_dev_interrupt_handler_devx, sh);
1510                 } else {
1511                         DRV_LOG(INFO, "failed to create devx async command "
1512                                 "completion");
1513                         goto error_unregister;
1514                 }
1515 #endif /* HAVE_IBV_DEVX_ASYNC */
1516         }
1517         sh->intr_cnt++;
1518         goto exit;
1519 error_unregister:
1520         rte_intr_callback_unregister(&sh->intr_handle,
1521                                      mlx5_dev_interrupt_handler, sh);
1522 error:
1523         /* Indicate there will be no interrupts. */
1524         dev->data->dev_conf.intr_conf.lsc = 0;
1525         dev->data->dev_conf.intr_conf.rmv = 0;
1526         sh->intr_handle.fd = 0;
1527         sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1528         sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1529 exit:
1530         pthread_mutex_unlock(&sh->intr_mutex);
1531 }
1532
/**
 * Uninstall interrupt handler.
 *
 * Delegates to mlx5_dev_shared_handler_uninstall().
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
{
        /* Only the shared (per IB device) handler needs to be removed. */
        mlx5_dev_shared_handler_uninstall(dev);
}
1544
/**
 * Install interrupt handler.
 *
 * Delegates to mlx5_dev_shared_handler_install().
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
{
        /* Only the shared (per IB device) handler needs to be installed. */
        mlx5_dev_shared_handler_install(dev);
}
1556
1557 /**
1558  * DPDK callback to bring the link DOWN.
1559  *
1560  * @param dev
1561  *   Pointer to Ethernet device structure.
1562  *
1563  * @return
1564  *   0 on success, a negative errno value otherwise and rte_errno is set.
1565  */
1566 int
1567 mlx5_set_link_down(struct rte_eth_dev *dev)
1568 {
1569         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1570 }
1571
1572 /**
1573  * DPDK callback to bring the link UP.
1574  *
1575  * @param dev
1576  *   Pointer to Ethernet device structure.
1577  *
1578  * @return
1579  *   0 on success, a negative errno value otherwise and rte_errno is set.
1580  */
1581 int
1582 mlx5_set_link_up(struct rte_eth_dev *dev)
1583 {
1584         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1585 }
1586
1587 /**
1588  * Configure the RX function to use.
1589  *
1590  * @param dev
1591  *   Pointer to private data structure.
1592  *
1593  * @return
1594  *   Pointer to selected Rx burst function.
1595  */
1596 eth_rx_burst_t
1597 mlx5_select_rx_function(struct rte_eth_dev *dev)
1598 {
1599         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1600
1601         assert(dev != NULL);
1602         if (mlx5_check_vec_rx_support(dev) > 0) {
1603                 rx_pkt_burst = mlx5_rx_burst_vec;
1604                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1605                         dev->data->port_id);
1606         } else if (mlx5_mprq_enabled(dev)) {
1607                 rx_pkt_burst = mlx5_rx_burst_mprq;
1608         }
1609         return rx_pkt_burst;
1610 }
1611
1612 /**
1613  * Check if mlx5 device was removed.
1614  *
1615  * @param dev
1616  *   Pointer to Ethernet device structure.
1617  *
1618  * @return
1619  *   1 when device is removed, otherwise 0.
1620  */
1621 int
1622 mlx5_is_removed(struct rte_eth_dev *dev)
1623 {
1624         struct ibv_device_attr device_attr;
1625         struct mlx5_priv *priv = dev->data->dev_private;
1626
1627         if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
1628                 return 1;
1629         return 0;
1630 }
1631
1632 /**
1633  * Get port ID list of mlx5 instances sharing a common device.
1634  *
1635  * @param[in] dev
1636  *   Device to look for.
1637  * @param[out] port_list
1638  *   Result buffer for collected port IDs.
1639  * @param port_list_n
1640  *   Maximum number of entries in result buffer. If 0, @p port_list can be
1641  *   NULL.
1642  *
1643  * @return
1644  *   Number of matching instances regardless of the @p port_list_n
1645  *   parameter, 0 if none were found.
1646  */
1647 unsigned int
1648 mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
1649                     unsigned int port_list_n)
1650 {
1651         uint16_t id;
1652         unsigned int n = 0;
1653
1654         RTE_ETH_FOREACH_DEV_OF(id, dev) {
1655                 if (n < port_list_n)
1656                         port_list[n] = id;
1657                 n++;
1658         }
1659         return n;
1660 }
1661
1662 /**
1663  * Get the E-Switch domain id this port belongs to.
1664  *
1665  * @param[in] port
1666  *   Device port id.
1667  * @param[out] es_domain_id
1668  *   E-Switch domain id.
1669  * @param[out] es_port_id
1670  *   The port id of the port in the E-Switch.
1671  *
1672  * @return
1673  *   0 on success, a negative errno value otherwise and rte_errno is set.
1674  */
1675 int
1676 mlx5_port_to_eswitch_info(uint16_t port,
1677                           uint16_t *es_domain_id, uint16_t *es_port_id)
1678 {
1679         struct rte_eth_dev *dev;
1680         struct mlx5_priv *priv;
1681
1682         if (port >= RTE_MAX_ETHPORTS) {
1683                 rte_errno = EINVAL;
1684                 return -rte_errno;
1685         }
1686         if (!rte_eth_dev_is_valid_port(port)) {
1687                 rte_errno = ENODEV;
1688                 return -rte_errno;
1689         }
1690         dev = &rte_eth_devices[port];
1691         priv = dev->data->dev_private;
1692         if (!(priv->representor || priv->master)) {
1693                 rte_errno = EINVAL;
1694                 return -rte_errno;
1695         }
1696         if (es_domain_id)
1697                 *es_domain_id = priv->domain_id;
1698         if (es_port_id)
1699                 *es_port_id = priv->vport_id;
1700         return 0;
1701 }
1702
1703 /**
1704  * Get switch information associated with network interface.
1705  *
1706  * @param ifindex
1707  *   Network interface index.
1708  * @param[out] info
1709  *   Switch information object, populated in case of success.
1710  *
1711  * @return
1712  *   0 on success, a negative errno value otherwise and rte_errno is set.
1713  */
1714 int
1715 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1716 {
1717         char ifname[IF_NAMESIZE];
1718         char port_name[IF_NAMESIZE];
1719         FILE *file;
1720         struct mlx5_switch_info data = {
1721                 .master = 0,
1722                 .representor = 0,
1723                 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1724                 .port_name = 0,
1725                 .switch_id = 0,
1726         };
1727         DIR *dir;
1728         bool port_switch_id_set = false;
1729         bool device_dir = false;
1730         char c;
1731         int ret;
1732
1733         if (!if_indextoname(ifindex, ifname)) {
1734                 rte_errno = errno;
1735                 return -rte_errno;
1736         }
1737
1738         MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1739               ifname);
1740         MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1741               ifname);
1742         MKSTR(pci_device, "/sys/class/net/%s/device",
1743               ifname);
1744
1745         file = fopen(phys_port_name, "rb");
1746         if (file != NULL) {
1747                 ret = fscanf(file, "%s", port_name);
1748                 fclose(file);
1749                 if (ret == 1)
1750                         mlx5_translate_port_name(port_name, &data);
1751         }
1752         file = fopen(phys_switch_id, "rb");
1753         if (file == NULL) {
1754                 rte_errno = errno;
1755                 return -rte_errno;
1756         }
1757         port_switch_id_set =
1758                 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1759                 c == '\n';
1760         fclose(file);
1761         dir = opendir(pci_device);
1762         if (dir != NULL) {
1763                 closedir(dir);
1764                 device_dir = true;
1765         }
1766         if (port_switch_id_set) {
1767                 /* We have some E-Switch configuration. */
1768                 mlx5_sysfs_check_switch_info(device_dir, &data);
1769         }
1770         *info = data;
1771         assert(!(data.master && data.representor));
1772         if (data.master && data.representor) {
1773                 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1774                              " and as representor", ifindex);
1775                 rte_errno = ENODEV;
1776                 return -rte_errno;
1777         }
1778         return 0;
1779 }
1780
1781 /**
1782  * Analyze gathered port parameters via Netlink to recognize master
1783  * and representor devices for E-Switch configuration.
1784  *
1785  * @param[in] num_vf_set
1786  *   flag of presence of number of VFs port attribute.
1787  * @param[inout] switch_info
1788  *   Port information, including port name as a number and port name
1789  *   type if recognized
1790  *
1791  * @return
1792  *   master and representor flags are set in switch_info according to
1793  *   recognized parameters (if any).
1794  */
1795 void
1796 mlx5_nl_check_switch_info(bool num_vf_set,
1797                           struct mlx5_switch_info *switch_info)
1798 {
1799         switch (switch_info->name_type) {
1800         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1801                 /*
1802                  * Name is not recognized, assume the master,
1803                  * check the number of VFs key presence.
1804                  */
1805                 switch_info->master = num_vf_set;
1806                 break;
1807         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1808                 /*
1809                  * Name is not set, this assumes the legacy naming
1810                  * schema for master, just check if there is a
1811                  * number of VFs key.
1812                  */
1813                 switch_info->master = num_vf_set;
1814                 break;
1815         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1816                 /* New uplink naming schema recognized. */
1817                 switch_info->master = 1;
1818                 break;
1819         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1820                 /* Legacy representors naming schema. */
1821                 switch_info->representor = !num_vf_set;
1822                 break;
1823         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1824                 /* New representors naming schema. */
1825                 switch_info->representor = 1;
1826                 break;
1827         }
1828 }
1829
1830 /**
1831  * Analyze gathered port parameters via sysfs to recognize master
1832  * and representor devices for E-Switch configuration.
1833  *
1834  * @param[in] device_dir
1835  *   flag of presence of "device" directory under port device key.
1836  * @param[inout] switch_info
1837  *   Port information, including port name as a number and port name
1838  *   type if recognized
1839  *
1840  * @return
1841  *   master and representor flags are set in switch_info according to
1842  *   recognized parameters (if any).
1843  */
1844 void
1845 mlx5_sysfs_check_switch_info(bool device_dir,
1846                              struct mlx5_switch_info *switch_info)
1847 {
1848         switch (switch_info->name_type) {
1849         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1850                 /*
1851                  * Name is not recognized, assume the master,
1852                  * check the device directory presence.
1853                  */
1854                 switch_info->master = device_dir;
1855                 break;
1856         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1857                 /*
1858                  * Name is not set, this assumes the legacy naming
1859                  * schema for master, just check if there is
1860                  * a device directory.
1861                  */
1862                 switch_info->master = device_dir;
1863                 break;
1864         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1865                 /* New uplink naming schema recognized. */
1866                 switch_info->master = 1;
1867                 break;
1868         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1869                 /* Legacy representors naming schema. */
1870                 switch_info->representor = !device_dir;
1871                 break;
1872         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1873                 /* New representors naming schema. */
1874                 switch_info->representor = 1;
1875                 break;
1876         }
1877 }
1878
1879 /**
1880  * Extract port name, as a number, from sysfs or netlink information.
1881  *
1882  * @param[in] port_name_in
1883  *   String representing the port name.
1884  * @param[out] port_info_out
1885  *   Port information, including port name as a number and port name
1886  *   type if recognized
1887  *
1888  * @return
1889  *   port_name field set according to recognized name format.
1890  */
1891 void
1892 mlx5_translate_port_name(const char *port_name_in,
1893                          struct mlx5_switch_info *port_info_out)
1894 {
1895         char pf_c1, pf_c2, vf_c1, vf_c2;
1896         char *end;
1897         int sc_items;
1898
1899         /*
1900          * Check for port-name as a string of the form pf0vf0
1901          * (support kernel ver >= 5.0 or OFED ver >= 4.6).
1902          */
1903         sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
1904                           &pf_c1, &pf_c2, &port_info_out->pf_num,
1905                           &vf_c1, &vf_c2, &port_info_out->port_name);
1906         if (sc_items == 6 &&
1907             pf_c1 == 'p' && pf_c2 == 'f' &&
1908             vf_c1 == 'v' && vf_c2 == 'f') {
1909                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
1910                 return;
1911         }
1912         /*
1913          * Check for port-name as a string of the form p0
1914          * (support kernel ver >= 5.0, or OFED ver >= 4.6).
1915          */
1916         sc_items = sscanf(port_name_in, "%c%d",
1917                           &pf_c1, &port_info_out->port_name);
1918         if (sc_items == 2 && pf_c1 == 'p') {
1919                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
1920                 return;
1921         }
1922         /* Check for port-name as a number (support kernel ver < 5.0 */
1923         errno = 0;
1924         port_info_out->port_name = strtol(port_name_in, &end, 0);
1925         if (!errno &&
1926             (size_t)(end - port_name_in) == strlen(port_name_in)) {
1927                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
1928                 return;
1929         }
1930         port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1931         return;
1932 }
1933
1934 /**
1935  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1936  *
1937  * @param dev
1938  *   Pointer to Ethernet device structure.
1939  * @param[out] modinfo
1940  *   Storage for plug-in module EEPROM information.
1941  *
1942  * @return
1943  *   0 on success, a negative errno value otherwise and rte_errno is set.
1944  */
1945 int
1946 mlx5_get_module_info(struct rte_eth_dev *dev,
1947                      struct rte_eth_dev_module_info *modinfo)
1948 {
1949         struct ethtool_modinfo info = {
1950                 .cmd = ETHTOOL_GMODULEINFO,
1951         };
1952         struct ifreq ifr = (struct ifreq) {
1953                 .ifr_data = (void *)&info,
1954         };
1955         int ret = 0;
1956
1957         if (!dev || !modinfo) {
1958                 DRV_LOG(WARNING, "missing argument, cannot get module info");
1959                 rte_errno = EINVAL;
1960                 return -rte_errno;
1961         }
1962         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1963         if (ret) {
1964                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1965                         dev->data->port_id, strerror(rte_errno));
1966                 return ret;
1967         }
1968         modinfo->type = info.type;
1969         modinfo->eeprom_len = info.eeprom_len;
1970         return ret;
1971 }
1972
1973 /**
1974  * DPDK callback to retrieve plug-in module EEPROM data.
1975  *
1976  * @param dev
1977  *   Pointer to Ethernet device structure.
1978  * @param[out] info
1979  *   Storage for plug-in module EEPROM data.
1980  *
1981  * @return
1982  *   0 on success, a negative errno value otherwise and rte_errno is set.
1983  */
1984 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1985                            struct rte_dev_eeprom_info *info)
1986 {
1987         struct ethtool_eeprom *eeprom;
1988         struct ifreq ifr;
1989         int ret = 0;
1990
1991         if (!dev || !info) {
1992                 DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1993                 rte_errno = EINVAL;
1994                 return -rte_errno;
1995         }
1996         eeprom = rte_calloc(__func__, 1,
1997                             (sizeof(struct ethtool_eeprom) + info->length), 0);
1998         if (!eeprom) {
1999                 DRV_LOG(WARNING, "port %u cannot allocate memory for "
2000                         "eeprom data", dev->data->port_id);
2001                 rte_errno = ENOMEM;
2002                 return -rte_errno;
2003         }
2004         eeprom->cmd = ETHTOOL_GMODULEEEPROM;
2005         eeprom->offset = info->offset;
2006         eeprom->len = info->length;
2007         ifr = (struct ifreq) {
2008                 .ifr_data = (void *)eeprom,
2009         };
2010         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
2011         if (ret)
2012                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
2013                         dev->data->port_id, strerror(rte_errno));
2014         else
2015                 rte_memcpy(info->data, eeprom->data, info->length);
2016         rte_free(eeprom);
2017         return ret;
2018 }