471c3f1bdc5a62a3d770a74dafbaca99fd07568e
[dpdk.git] / drivers / net / mlx5 / linux / mlx5_ethdev_os.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <inttypes.h>
8 #include <unistd.h>
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <errno.h>
15 #include <dirent.h>
16 #include <net/if.h>
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
22 #include <fcntl.h>
23 #include <stdalign.h>
24 #include <sys/un.h>
25 #include <time.h>
26
27 #include <ethdev_driver.h>
28 #include <rte_bus_pci.h>
29 #include <rte_mbuf.h>
30 #include <rte_common.h>
31 #include <rte_interrupts.h>
32 #include <rte_malloc.h>
33 #include <rte_string_fns.h>
34 #include <rte_rwlock.h>
35 #include <rte_cycles.h>
36
37 #include <mlx5_glue.h>
38 #include <mlx5_devx_cmds.h>
39 #include <mlx5_common.h>
40 #include <mlx5_malloc.h>
41
42 #include "mlx5.h"
43 #include "mlx5_rxtx.h"
44 #include "mlx5_utils.h"
45
/*
 * Fallbacks for the legacy ethtool SUPPORTED_* speed flags.
 *
 * Each flag is defined here only when the corresponding HAVE_SUPPORTED_*
 * symbol is not defined. The reference values can be found in
 * /usr/include/linux/ethtool.h.
 */

/* 40 Gb/s link modes (bits 23..26). */
#if !defined(HAVE_SUPPORTED_40000baseKR4_Full)
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#endif
#if !defined(HAVE_SUPPORTED_40000baseCR4_Full)
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#endif
#if !defined(HAVE_SUPPORTED_40000baseSR4_Full)
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#endif
#if !defined(HAVE_SUPPORTED_40000baseLR4_Full)
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#endif

/* 56 Gb/s link modes (bits 27..30). */
#if !defined(HAVE_SUPPORTED_56000baseKR4_Full)
#define SUPPORTED_56000baseKR4_Full (1 << 27)
#endif
#if !defined(HAVE_SUPPORTED_56000baseCR4_Full)
#define SUPPORTED_56000baseCR4_Full (1 << 28)
#endif
#if !defined(HAVE_SUPPORTED_56000baseSR4_Full)
#define SUPPORTED_56000baseSR4_Full (1 << 29)
#endif
#if !defined(HAVE_SUPPORTED_56000baseLR4_Full)
#define SUPPORTED_56000baseLR4_Full (1 << 30)
#endif
71
/*
 * Compatibility definitions for the case where the user-space headers
 * are older than the running kernel and lack ETHTOOL_GLINKSETTINGS.
 */
#if !defined(ETHTOOL_GLINKSETTINGS)
/* Local copy of the request layout used with ETHTOOL_GLINKSETTINGS. */
struct ethtool_link_settings {
        uint32_t cmd;
        uint32_t speed;
        uint8_t duplex;
        uint8_t port;
        uint8_t phy_address;
        uint8_t autoneg;
        uint8_t mdio_support;
        uint8_t eth_to_mdix;
        uint8_t eth_tp_mdix_ctrl;
        int8_t link_mode_masks_nwords;
        uint32_t reserved[8];
        uint32_t link_mode_masks[]; /* Trailing variable-length masks. */
};

/* The kernel values can be found in /include/uapi/linux/ethtool.h */
#define ETHTOOL_GLINKSETTINGS 0x0000004c
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif /* ETHTOOL_GLINKSETTINGS */

/* 25 Gb/s link mode bit indexes. */
#if !defined(HAVE_ETHTOOL_LINK_MODE_25G)
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif

/* 50 Gb/s link mode bit indexes. */
#if !defined(HAVE_ETHTOOL_LINK_MODE_50G)
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif

/* 100 Gb/s link mode bit indexes. */
#if !defined(HAVE_ETHTOOL_LINK_MODE_100G)
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif

/*
 * 200 Gb/s link mode bit indexes. The last three are written relative
 * to bit 64 (64 - 64, 65 - 64, 66 - 64) rather than as absolute indexes.
 */
#if !defined(HAVE_ETHTOOL_LINK_MODE_200G)
#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
#endif
130
131 /* Get interface index from SubFunction device name. */
132 int
133 mlx5_auxiliary_get_ifindex(const char *sf_name)
134 {
135         char if_name[IF_NAMESIZE] = { 0 };
136
137         if (mlx5_auxiliary_get_child_name(sf_name, "/net",
138                                           if_name, sizeof(if_name)) != 0)
139                 return -rte_errno;
140         return if_nametoindex(if_name);
141 }
142
143 /**
144  * Get interface name from private structure.
145  *
146  * This is a port representor-aware version of mlx5_get_ifname_sysfs().
147  *
148  * @param[in] dev
149  *   Pointer to Ethernet device.
150  * @param[out] ifname
151  *   Interface name output buffer.
152  *
153  * @return
154  *   0 on success, a negative errno value otherwise and rte_errno is set.
155  */
156 int
157 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
158 {
159         struct mlx5_priv *priv = dev->data->dev_private;
160         unsigned int ifindex;
161
162         MLX5_ASSERT(priv);
163         MLX5_ASSERT(priv->sh);
164         if (priv->master && priv->sh->bond.ifindex > 0) {
165                 memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
166                 return 0;
167         }
168         ifindex = mlx5_ifindex(dev);
169         if (!ifindex) {
170                 if (!priv->representor)
171                         return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
172                                                      *ifname);
173                 rte_errno = ENXIO;
174                 return -rte_errno;
175         }
176         if (if_indextoname(ifindex, &(*ifname)[0]))
177                 return 0;
178         rte_errno = errno;
179         return -rte_errno;
180 }
181
182 /**
183  * Perform ifreq ioctl() on associated netdev ifname.
184  *
185  * @param[in] ifname
186  *   Pointer to netdev name.
187  * @param req
188  *   Request number to pass to ioctl().
189  * @param[out] ifr
190  *   Interface request structure output buffer.
191  *
192  * @return
193  *   0 on success, a negative errno value otherwise and rte_errno is set.
194  */
195 static int
196 mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
197 {
198         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
199         int ret = 0;
200
201         if (sock == -1) {
202                 rte_errno = errno;
203                 return -rte_errno;
204         }
205         rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
206         ret = ioctl(sock, req, ifr);
207         if (ret == -1) {
208                 rte_errno = errno;
209                 goto error;
210         }
211         close(sock);
212         return 0;
213 error:
214         close(sock);
215         return -rte_errno;
216 }
217
218 /**
219  * Perform ifreq ioctl() on associated Ethernet device.
220  *
221  * @param[in] dev
222  *   Pointer to Ethernet device.
223  * @param req
224  *   Request number to pass to ioctl().
225  * @param[out] ifr
226  *   Interface request structure output buffer.
227  *
228  * @return
229  *   0 on success, a negative errno value otherwise and rte_errno is set.
230  */
231 static int
232 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
233 {
234         char ifname[sizeof(ifr->ifr_name)];
235         int ret;
236
237         ret = mlx5_get_ifname(dev, &ifname);
238         if (ret)
239                 return -rte_errno;
240         return mlx5_ifreq_by_ifname(ifname, req, ifr);
241 }
242
243 /**
244  * Get device MTU.
245  *
246  * @param dev
247  *   Pointer to Ethernet device.
248  * @param[out] mtu
249  *   MTU value output buffer.
250  *
251  * @return
252  *   0 on success, a negative errno value otherwise and rte_errno is set.
253  */
254 int
255 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
256 {
257         struct ifreq request;
258         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
259
260         if (ret)
261                 return ret;
262         *mtu = request.ifr_mtu;
263         return 0;
264 }
265
266 /**
267  * Set device MTU.
268  *
269  * @param dev
270  *   Pointer to Ethernet device.
271  * @param mtu
272  *   MTU value to set.
273  *
274  * @return
275  *   0 on success, a negative errno value otherwise and rte_errno is set.
276  */
277 int
278 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
279 {
280         struct ifreq request = { .ifr_mtu = mtu, };
281
282         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
283 }
284
285 /**
286  * Set device flags.
287  *
288  * @param dev
289  *   Pointer to Ethernet device.
290  * @param keep
291  *   Bitmask for flags that must remain untouched.
292  * @param flags
293  *   Bitmask for flags to modify.
294  *
295  * @return
296  *   0 on success, a negative errno value otherwise and rte_errno is set.
297  */
298 static int
299 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
300 {
301         struct ifreq request;
302         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
303
304         if (ret)
305                 return ret;
306         request.ifr_flags &= keep;
307         request.ifr_flags |= flags & ~keep;
308         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
309 }
310
311 /**
312  * Get device current raw clock counter
313  *
314  * @param dev
315  *   Pointer to Ethernet device structure.
316  * @param[out] time
317  *   Current raw clock counter of the device.
318  *
319  * @return
320  *   0 if the clock has correctly been read
321  *   The value of errno in case of error
322  */
323 int
324 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
325 {
326         struct mlx5_priv *priv = dev->data->dev_private;
327         struct ibv_context *ctx = priv->sh->cdev->ctx;
328         struct ibv_values_ex values;
329         int err = 0;
330
331         values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
332         err = mlx5_glue->query_rt_values_ex(ctx, &values);
333         if (err != 0) {
334                 DRV_LOG(WARNING, "Could not query the clock !");
335                 return err;
336         }
337         *clock = values.raw_clock.tv_nsec;
338         return 0;
339 }
340
341 /**
342  * Retrieve the master device for representor in the same switch domain.
343  *
344  * @param dev
345  *   Pointer to representor Ethernet device structure.
346  *
347  * @return
348  *   Master device structure  on success, NULL otherwise.
349  */
350 static struct rte_eth_dev *
351 mlx5_find_master_dev(struct rte_eth_dev *dev)
352 {
353         struct mlx5_priv *priv;
354         uint16_t port_id;
355         uint16_t domain_id;
356
357         priv = dev->data->dev_private;
358         domain_id = priv->domain_id;
359         MLX5_ASSERT(priv->representor);
360         MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
361                 struct mlx5_priv *opriv =
362                         rte_eth_devices[port_id].data->dev_private;
363                 if (opriv &&
364                     opriv->master &&
365                     opriv->domain_id == domain_id &&
366                     opriv->sh == priv->sh)
367                         return &rte_eth_devices[port_id];
368         }
369         return NULL;
370 }
371
372 /**
373  * DPDK callback to retrieve physical link information.
374  *
375  * @param dev
376  *   Pointer to Ethernet device structure.
377  * @param[out] link
378  *   Storage for current link status.
379  *
380  * @return
381  *   0 on success, a negative errno value otherwise and rte_errno is set.
382  */
383 static int
384 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
385                                struct rte_eth_link *link)
386 {
387         struct mlx5_priv *priv = dev->data->dev_private;
388         struct ethtool_cmd edata = {
389                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
390         };
391         struct ifreq ifr;
392         struct rte_eth_link dev_link;
393         int link_speed = 0;
394         int ret;
395
396         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
397         if (ret) {
398                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
399                         dev->data->port_id, strerror(rte_errno));
400                 return ret;
401         }
402         dev_link = (struct rte_eth_link) {
403                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
404                                 (ifr.ifr_flags & IFF_RUNNING)),
405         };
406         ifr = (struct ifreq) {
407                 .ifr_data = (void *)&edata,
408         };
409         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
410         if (ret) {
411                 if (ret == -ENOTSUP && priv->representor) {
412                         struct rte_eth_dev *master;
413
414                         /*
415                          * For representors we can try to inherit link
416                          * settings from the master device. Actually
417                          * link settings do not make a lot of sense
418                          * for representors due to missing physical
419                          * link. The old kernel drivers supported
420                          * emulated settings query for representors,
421                          * the new ones do not, so we have to add
422                          * this code for compatibility issues.
423                          */
424                         master = mlx5_find_master_dev(dev);
425                         if (master) {
426                                 ifr = (struct ifreq) {
427                                         .ifr_data = (void *)&edata,
428                                 };
429                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
430                         }
431                 }
432                 if (ret) {
433                         DRV_LOG(WARNING,
434                                 "port %u ioctl(SIOCETHTOOL,"
435                                 " ETHTOOL_GSET) failed: %s",
436                                 dev->data->port_id, strerror(rte_errno));
437                         return ret;
438                 }
439         }
440         link_speed = ethtool_cmd_speed(&edata);
441         if (link_speed == -1)
442                 dev_link.link_speed = ETH_SPEED_NUM_UNKNOWN;
443         else
444                 dev_link.link_speed = link_speed;
445         priv->link_speed_capa = 0;
446         if (edata.supported & (SUPPORTED_1000baseT_Full |
447                                SUPPORTED_1000baseKX_Full))
448                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
449         if (edata.supported & SUPPORTED_10000baseKR_Full)
450                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
451         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
452                                SUPPORTED_40000baseCR4_Full |
453                                SUPPORTED_40000baseSR4_Full |
454                                SUPPORTED_40000baseLR4_Full))
455                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
456         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
457                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
458         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
459                         ETH_LINK_SPEED_FIXED);
460         *link = dev_link;
461         return 0;
462 }
463
464 /**
465  * Retrieve physical link information (unlocked version using new ioctl).
466  *
467  * @param dev
468  *   Pointer to Ethernet device structure.
469  * @param[out] link
470  *   Storage for current link status.
471  *
472  * @return
473  *   0 on success, a negative errno value otherwise and rte_errno is set.
474  */
475 static int
476 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
477                              struct rte_eth_link *link)
478
479 {
480         struct mlx5_priv *priv = dev->data->dev_private;
481         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
482         struct ifreq ifr;
483         struct rte_eth_link dev_link;
484         struct rte_eth_dev *master = NULL;
485         uint64_t sc;
486         int ret;
487
488         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
489         if (ret) {
490                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
491                         dev->data->port_id, strerror(rte_errno));
492                 return ret;
493         }
494         dev_link = (struct rte_eth_link) {
495                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
496                                 (ifr.ifr_flags & IFF_RUNNING)),
497         };
498         ifr = (struct ifreq) {
499                 .ifr_data = (void *)&gcmd,
500         };
501         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
502         if (ret) {
503                 if (ret == -ENOTSUP && priv->representor) {
504                         /*
505                          * For representors we can try to inherit link
506                          * settings from the master device. Actually
507                          * link settings do not make a lot of sense
508                          * for representors due to missing physical
509                          * link. The old kernel drivers supported
510                          * emulated settings query for representors,
511                          * the new ones do not, so we have to add
512                          * this code for compatibility issues.
513                          */
514                         master = mlx5_find_master_dev(dev);
515                         if (master) {
516                                 ifr = (struct ifreq) {
517                                         .ifr_data = (void *)&gcmd,
518                                 };
519                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
520                         }
521                 }
522                 if (ret) {
523                         DRV_LOG(DEBUG,
524                                 "port %u ioctl(SIOCETHTOOL,"
525                                 " ETHTOOL_GLINKSETTINGS) failed: %s",
526                                 dev->data->port_id, strerror(rte_errno));
527                         return ret;
528                 }
529         }
530         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
531
532         alignas(struct ethtool_link_settings)
533         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
534                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
535         struct ethtool_link_settings *ecmd = (void *)data;
536
537         *ecmd = gcmd;
538         ifr.ifr_data = (void *)ecmd;
539         ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
540         if (ret) {
541                 DRV_LOG(DEBUG,
542                         "port %u ioctl(SIOCETHTOOL,"
543                         "ETHTOOL_GLINKSETTINGS) failed: %s",
544                         dev->data->port_id, strerror(rte_errno));
545                 return ret;
546         }
547         dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
548                                 ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
549         sc = ecmd->link_mode_masks[0] |
550                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
551         priv->link_speed_capa = 0;
552         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
553                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
554                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
555         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
556                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
557                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
558                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
559         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
560                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
561                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
562         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
563                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
564                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
565                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
566                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
567         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
568                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
569                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
570                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
571                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
572         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
573                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
574                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
575                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
576         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
577                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
578                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
579         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
580                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
581                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
582                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
583                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
584         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
585                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
586                 priv->link_speed_capa |= ETH_LINK_SPEED_200G;
587
588         sc = ecmd->link_mode_masks[2] |
589                 ((uint64_t)ecmd->link_mode_masks[3] << 32);
590         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
591                   MLX5_BITSHIFT
592                        (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
593                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
594                 priv->link_speed_capa |= ETH_LINK_SPEED_200G;
595         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
596                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
597         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
598                                   ETH_LINK_SPEED_FIXED);
599         *link = dev_link;
600         return 0;
601 }
602
603 /**
604  * DPDK callback to retrieve physical link information.
605  *
606  * @param dev
607  *   Pointer to Ethernet device structure.
608  * @param wait_to_complete
609  *   Wait for request completion.
610  *
611  * @return
612  *   0 if link status was not updated, positive if it was, a negative errno
613  *   value otherwise and rte_errno is set.
614  */
615 int
616 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
617 {
618         int ret;
619         struct rte_eth_link dev_link;
620         time_t start_time = time(NULL);
621         int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
622
623         do {
624                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
625                 if (ret == -ENOTSUP)
626                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
627                 if (ret == 0)
628                         break;
629                 /* Handle wait to complete situation. */
630                 if ((wait_to_complete || retry) && ret == -EAGAIN) {
631                         if (abs((int)difftime(time(NULL), start_time)) <
632                             MLX5_LINK_STATUS_TIMEOUT) {
633                                 usleep(0);
634                                 continue;
635                         } else {
636                                 rte_errno = EBUSY;
637                                 return -rte_errno;
638                         }
639                 } else if (ret < 0) {
640                         return ret;
641                 }
642         } while (wait_to_complete || retry-- > 0);
643         ret = !!memcmp(&dev->data->dev_link, &dev_link,
644                        sizeof(struct rte_eth_link));
645         dev->data->dev_link = dev_link;
646         return ret;
647 }
648
649 /**
650  * DPDK callback to get flow control status.
651  *
652  * @param dev
653  *   Pointer to Ethernet device structure.
654  * @param[out] fc_conf
655  *   Flow control output buffer.
656  *
657  * @return
658  *   0 on success, a negative errno value otherwise and rte_errno is set.
659  */
660 int
661 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
662 {
663         struct ifreq ifr;
664         struct ethtool_pauseparam ethpause = {
665                 .cmd = ETHTOOL_GPAUSEPARAM
666         };
667         int ret;
668
669         ifr.ifr_data = (void *)&ethpause;
670         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
671         if (ret) {
672                 DRV_LOG(WARNING,
673                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
674                         " %s",
675                         dev->data->port_id, strerror(rte_errno));
676                 return ret;
677         }
678         fc_conf->autoneg = ethpause.autoneg;
679         if (ethpause.rx_pause && ethpause.tx_pause)
680                 fc_conf->mode = RTE_FC_FULL;
681         else if (ethpause.rx_pause)
682                 fc_conf->mode = RTE_FC_RX_PAUSE;
683         else if (ethpause.tx_pause)
684                 fc_conf->mode = RTE_FC_TX_PAUSE;
685         else
686                 fc_conf->mode = RTE_FC_NONE;
687         return 0;
688 }
689
690 /**
691  * DPDK callback to modify flow control parameters.
692  *
693  * @param dev
694  *   Pointer to Ethernet device structure.
695  * @param[in] fc_conf
696  *   Flow control parameters.
697  *
698  * @return
699  *   0 on success, a negative errno value otherwise and rte_errno is set.
700  */
701 int
702 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
703 {
704         struct ifreq ifr;
705         struct ethtool_pauseparam ethpause = {
706                 .cmd = ETHTOOL_SPAUSEPARAM
707         };
708         int ret;
709
710         ifr.ifr_data = (void *)&ethpause;
711         ethpause.autoneg = fc_conf->autoneg;
712         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
713             (fc_conf->mode & RTE_FC_RX_PAUSE))
714                 ethpause.rx_pause = 1;
715         else
716                 ethpause.rx_pause = 0;
717
718         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
719             (fc_conf->mode & RTE_FC_TX_PAUSE))
720                 ethpause.tx_pause = 1;
721         else
722                 ethpause.tx_pause = 0;
723         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
724         if (ret) {
725                 DRV_LOG(WARNING,
726                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
727                         " failed: %s",
728                         dev->data->port_id, strerror(rte_errno));
729                 return ret;
730         }
731         return 0;
732 }
733
734 /**
735  * Handle asynchronous removal event for entire multiport device.
736  *
737  * @param sh
738  *   Infiniband device shared context.
739  */
740 static void
741 mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
742 {
743         uint32_t i;
744
745         for (i = 0; i < sh->max_port; ++i) {
746                 struct rte_eth_dev *dev;
747
748                 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
749                         /*
750                          * Or not existing port either no
751                          * handler installed for this port.
752                          */
753                         continue;
754                 }
755                 dev = &rte_eth_devices[sh->port[i].ih_port_id];
756                 MLX5_ASSERT(dev);
757                 if (dev->data->dev_conf.intr_conf.rmv)
758                         rte_eth_dev_callback_process
759                                 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
760         }
761 }
762
763 /**
764  * Handle shared asynchronous events the NIC (removal event
765  * and link status change). Supports multiport IB device.
766  *
767  * @param cb_arg
768  *   Callback argument.
769  */
770 void
771 mlx5_dev_interrupt_handler(void *cb_arg)
772 {
773         struct mlx5_dev_ctx_shared *sh = cb_arg;
774         struct ibv_async_event event;
775
776         /* Read all message from the IB device and acknowledge them. */
777         for (;;) {
778                 struct rte_eth_dev *dev;
779                 uint32_t tmp;
780
781                 if (mlx5_glue->get_async_event(sh->cdev->ctx, &event))
782                         break;
783                 /* Retrieve and check IB port index. */
784                 tmp = (uint32_t)event.element.port_num;
785                 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
786                         /*
787                          * The DEVICE_FATAL event is called once for
788                          * entire device without port specifying.
789                          * We should notify all existing ports.
790                          */
791                         mlx5_glue->ack_async_event(&event);
792                         mlx5_dev_interrupt_device_fatal(sh);
793                         continue;
794                 }
795                 MLX5_ASSERT(tmp && (tmp <= sh->max_port));
796                 if (!tmp) {
797                         /* Unsupported device level event. */
798                         mlx5_glue->ack_async_event(&event);
799                         DRV_LOG(DEBUG,
800                                 "unsupported common event (type %d)",
801                                 event.event_type);
802                         continue;
803                 }
804                 if (tmp > sh->max_port) {
805                         /* Invalid IB port index. */
806                         mlx5_glue->ack_async_event(&event);
807                         DRV_LOG(DEBUG,
808                                 "cannot handle an event (type %d)"
809                                 "due to invalid IB port index (%u)",
810                                 event.event_type, tmp);
811                         continue;
812                 }
813                 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
814                         /* No handler installed. */
815                         mlx5_glue->ack_async_event(&event);
816                         DRV_LOG(DEBUG,
817                                 "cannot handle an event (type %d)"
818                                 "due to no handler installed for port %u",
819                                 event.event_type, tmp);
820                         continue;
821                 }
822                 /* Retrieve ethernet device descriptor. */
823                 tmp = sh->port[tmp - 1].ih_port_id;
824                 dev = &rte_eth_devices[tmp];
825                 MLX5_ASSERT(dev);
826                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
827                      event.event_type == IBV_EVENT_PORT_ERR) &&
828                         dev->data->dev_conf.intr_conf.lsc) {
829                         mlx5_glue->ack_async_event(&event);
830                         if (mlx5_link_update(dev, 0) == -EAGAIN) {
831                                 usleep(0);
832                                 continue;
833                         }
834                         rte_eth_dev_callback_process
835                                 (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
836                         continue;
837                 }
838                 DRV_LOG(DEBUG,
839                         "port %u cannot handle an unknown event (type %d)",
840                         dev->data->port_id, event.event_type);
841                 mlx5_glue->ack_async_event(&event);
842         }
843 }
844
845 /*
846  * Unregister callback handler safely. The handler may be active
847  * while we are trying to unregister it, in this case code -EAGAIN
848  * is returned by rte_intr_callback_unregister(). This routine checks
849  * the return code and tries to unregister handler again.
850  *
851  * @param handle
852  *   interrupt handle
853  * @param cb_fn
854  *   pointer to callback routine
855  * @cb_arg
856  *   opaque callback parameter
857  */
858 void
859 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
860                               rte_intr_callback_fn cb_fn, void *cb_arg)
861 {
862         /*
863          * Try to reduce timeout management overhead by not calling
864          * the timer related routines on the first iteration. If the
865          * unregistering succeeds on first call there will be no
866          * timer calls at all.
867          */
868         uint64_t twait = 0;
869         uint64_t start = 0;
870
871         do {
872                 int ret;
873
874                 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
875                 if (ret >= 0)
876                         return;
877                 if (ret != -EAGAIN) {
878                         DRV_LOG(INFO, "failed to unregister interrupt"
879                                       " handler (error: %d)", ret);
880                         MLX5_ASSERT(false);
881                         return;
882                 }
883                 if (twait) {
884                         struct timespec onems;
885
886                         /* Wait one millisecond and try again. */
887                         onems.tv_sec = 0;
888                         onems.tv_nsec = NS_PER_S / MS_PER_S;
889                         nanosleep(&onems, 0);
890                         /* Check whether one second elapsed. */
891                         if ((rte_get_timer_cycles() - start) <= twait)
892                                 continue;
893                 } else {
894                         /*
895                          * We get the amount of timer ticks for one second.
896                          * If this amount elapsed it means we spent one
897                          * second in waiting. This branch is executed once
898                          * on first iteration.
899                          */
900                         twait = rte_get_timer_hz();
901                         MLX5_ASSERT(twait);
902                 }
903                 /*
904                  * Timeout elapsed, show message (once a second) and retry.
905                  * We have no other acceptable option here, if we ignore
906                  * the unregistering return code the handler will not
907                  * be unregistered, fd will be closed and we may get the
908                  * crush. Hanging and messaging in the loop seems not to be
909                  * the worst choice.
910                  */
911                 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
912                 start = rte_get_timer_cycles();
913         } while (true);
914 }
915
/**
 * Handle DEVX interrupts from the NIC.
 * This function is probably called from the DPDK host thread.
 *
 * Drains every pending asynchronous command completion and forwards its
 * work request id and command status to the flow counter pool handler.
 * Compiles to a no-op when HAVE_IBV_DEVX_ASYNC is not defined.
 *
 * @param cb_arg
 *   Callback argument, used as the shared device context pointer.
 */
void
mlx5_dev_interrupt_handler_devx(void *cb_arg)
{
#ifdef HAVE_IBV_DEVX_ASYNC
	struct mlx5_dev_ctx_shared *sh = cb_arg;
	/* Completion header immediately followed by the command output. */
	union {
		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
			    MLX5_ST_SZ_BYTES(traffic_counter) +
			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
	} out;
	/* The command output starts right after the completion header. */
	uint8_t *cmd_out = out.buf + sizeof(out.cmd_resp);

	/* A zero return means one more completion was fetched. */
	while (mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
						  &out.cmd_resp,
						  sizeof(out.buf)) == 0) {
		mlx5_flow_async_pool_query_handle
			(sh, (uint64_t)out.cmd_resp.wr_id,
			 mlx5_devx_get_out_command_status(cmd_out));
	}
#else
	(void)cb_arg;
#endif /* HAVE_IBV_DEVX_ASYNC */
}
947
948 /**
949  * DPDK callback to bring the link DOWN.
950  *
951  * @param dev
952  *   Pointer to Ethernet device structure.
953  *
954  * @return
955  *   0 on success, a negative errno value otherwise and rte_errno is set.
956  */
957 int
958 mlx5_set_link_down(struct rte_eth_dev *dev)
959 {
960         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
961 }
962
963 /**
964  * DPDK callback to bring the link UP.
965  *
966  * @param dev
967  *   Pointer to Ethernet device structure.
968  *
969  * @return
970  *   0 on success, a negative errno value otherwise and rte_errno is set.
971  */
972 int
973 mlx5_set_link_up(struct rte_eth_dev *dev)
974 {
975         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
976 }
977
978 /**
979  * Check if mlx5 device was removed.
980  *
981  * @param dev
982  *   Pointer to Ethernet device structure.
983  *
984  * @return
985  *   1 when device is removed, otherwise 0.
986  */
987 int
988 mlx5_is_removed(struct rte_eth_dev *dev)
989 {
990         struct ibv_device_attr device_attr;
991         struct mlx5_priv *priv = dev->data->dev_private;
992
993         if (mlx5_glue->query_device(priv->sh->cdev->ctx, &device_attr) == EIO)
994                 return 1;
995         return 0;
996 }
997
998 /**
999  * Analyze gathered port parameters via sysfs to recognize master
1000  * and representor devices for E-Switch configuration.
1001  *
1002  * @param[in] device_dir
1003  *   flag of presence of "device" directory under port device key.
1004  * @param[inout] switch_info
1005  *   Port information, including port name as a number and port name
1006  *   type if recognized
1007  *
1008  * @return
1009  *   master and representor flags are set in switch_info according to
1010  *   recognized parameters (if any).
1011  */
1012 static void
1013 mlx5_sysfs_check_switch_info(bool device_dir,
1014                              struct mlx5_switch_info *switch_info)
1015 {
1016         switch (switch_info->name_type) {
1017         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1018                 /*
1019                  * Name is not recognized, assume the master,
1020                  * check the device directory presence.
1021                  */
1022                 switch_info->master = device_dir;
1023                 break;
1024         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1025                 /*
1026                  * Name is not set, this assumes the legacy naming
1027                  * schema for master, just check if there is
1028                  * a device directory.
1029                  */
1030                 switch_info->master = device_dir;
1031                 break;
1032         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1033                 /* New uplink naming schema recognized. */
1034                 switch_info->master = 1;
1035                 break;
1036         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1037                 /* Legacy representors naming schema. */
1038                 switch_info->representor = !device_dir;
1039                 break;
1040         case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1041                 /* Fallthrough */
1042         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1043                 /* Fallthrough */
1044         case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1045                 /* New representors naming schema. */
1046                 switch_info->representor = 1;
1047                 break;
1048         default:
1049                 switch_info->master = device_dir;
1050                 break;
1051         }
1052 }
1053
1054 /**
1055  * Get switch information associated with network interface.
1056  *
1057  * @param ifindex
1058  *   Network interface index.
1059  * @param[out] info
1060  *   Switch information object, populated in case of success.
1061  *
1062  * @return
1063  *   0 on success, a negative errno value otherwise and rte_errno is set.
1064  */
1065 int
1066 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1067 {
1068         char ifname[IF_NAMESIZE];
1069         char port_name[IF_NAMESIZE];
1070         FILE *file;
1071         struct mlx5_switch_info data = {
1072                 .master = 0,
1073                 .representor = 0,
1074                 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1075                 .port_name = 0,
1076                 .switch_id = 0,
1077         };
1078         DIR *dir;
1079         bool port_switch_id_set = false;
1080         bool device_dir = false;
1081         char c;
1082         int ret;
1083
1084         if (!if_indextoname(ifindex, ifname)) {
1085                 rte_errno = errno;
1086                 return -rte_errno;
1087         }
1088
1089         MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1090               ifname);
1091         MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1092               ifname);
1093         MKSTR(pci_device, "/sys/class/net/%s/device",
1094               ifname);
1095
1096         file = fopen(phys_port_name, "rb");
1097         if (file != NULL) {
1098                 ret = fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", port_name);
1099                 fclose(file);
1100                 if (ret == 1)
1101                         mlx5_translate_port_name(port_name, &data);
1102         }
1103         file = fopen(phys_switch_id, "rb");
1104         if (file == NULL) {
1105                 rte_errno = errno;
1106                 return -rte_errno;
1107         }
1108         port_switch_id_set =
1109                 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1110                 c == '\n';
1111         fclose(file);
1112         dir = opendir(pci_device);
1113         if (dir != NULL) {
1114                 closedir(dir);
1115                 device_dir = true;
1116         }
1117         if (port_switch_id_set) {
1118                 /* We have some E-Switch configuration. */
1119                 mlx5_sysfs_check_switch_info(device_dir, &data);
1120         }
1121         *info = data;
1122         MLX5_ASSERT(!(data.master && data.representor));
1123         if (data.master && data.representor) {
1124                 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1125                              " and as representor", ifindex);
1126                 rte_errno = ENODEV;
1127                 return -rte_errno;
1128         }
1129         return 0;
1130 }
1131
1132 /**
1133  * Get bond information associated with network interface.
1134  *
1135  * @param pf_ifindex
1136  *   Network interface index of bond slave interface
1137  * @param[out] ifindex
1138  *   Pointer to bond ifindex.
1139  * @param[out] ifname
1140  *   Pointer to bond ifname.
1141  *
1142  * @return
1143  *   0 on success, a negative errno value otherwise and rte_errno is set.
1144  */
1145 int
1146 mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
1147                      char *ifname)
1148 {
1149         char name[IF_NAMESIZE];
1150         FILE *file;
1151         unsigned int index;
1152         int ret;
1153
1154         if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
1155                 rte_errno = errno;
1156                 return -rte_errno;
1157         }
1158         MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
1159         /* read bond ifindex */
1160         file = fopen(bond_if, "rb");
1161         if (file == NULL) {
1162                 rte_errno = errno;
1163                 return -rte_errno;
1164         }
1165         ret = fscanf(file, "%u", &index);
1166         fclose(file);
1167         if (ret <= 0) {
1168                 rte_errno = errno;
1169                 return -rte_errno;
1170         }
1171         if (ifindex)
1172                 *ifindex = index;
1173
1174         /* read bond device name from symbol link */
1175         if (ifname) {
1176                 if (!if_indextoname(index, ifname)) {
1177                         rte_errno = errno;
1178                         return -rte_errno;
1179                 }
1180         }
1181         return 0;
1182 }
1183
1184 /**
1185  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1186  *
1187  * @param dev
1188  *   Pointer to Ethernet device structure.
1189  * @param[out] modinfo
1190  *   Storage for plug-in module EEPROM information.
1191  *
1192  * @return
1193  *   0 on success, a negative errno value otherwise and rte_errno is set.
1194  */
1195 int
1196 mlx5_get_module_info(struct rte_eth_dev *dev,
1197                      struct rte_eth_dev_module_info *modinfo)
1198 {
1199         struct ethtool_modinfo info = {
1200                 .cmd = ETHTOOL_GMODULEINFO,
1201         };
1202         struct ifreq ifr = (struct ifreq) {
1203                 .ifr_data = (void *)&info,
1204         };
1205         int ret = 0;
1206
1207         if (!dev) {
1208                 DRV_LOG(WARNING, "missing argument, cannot get module info");
1209                 rte_errno = EINVAL;
1210                 return -rte_errno;
1211         }
1212         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1213         if (ret) {
1214                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1215                         dev->data->port_id, strerror(rte_errno));
1216                 return ret;
1217         }
1218         modinfo->type = info.type;
1219         modinfo->eeprom_len = info.eeprom_len;
1220         return ret;
1221 }
1222
1223 /**
1224  * DPDK callback to retrieve plug-in module EEPROM data.
1225  *
1226  * @param dev
1227  *   Pointer to Ethernet device structure.
1228  * @param[out] info
1229  *   Storage for plug-in module EEPROM data.
1230  *
1231  * @return
1232  *   0 on success, a negative errno value otherwise and rte_errno is set.
1233  */
1234 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1235                            struct rte_dev_eeprom_info *info)
1236 {
1237         struct ethtool_eeprom *eeprom;
1238         struct ifreq ifr;
1239         int ret = 0;
1240
1241         if (!dev) {
1242                 DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1243                 rte_errno = EINVAL;
1244                 return -rte_errno;
1245         }
1246         eeprom = mlx5_malloc(MLX5_MEM_ZERO,
1247                              (sizeof(struct ethtool_eeprom) + info->length), 0,
1248                              SOCKET_ID_ANY);
1249         if (!eeprom) {
1250                 DRV_LOG(WARNING, "port %u cannot allocate memory for "
1251                         "eeprom data", dev->data->port_id);
1252                 rte_errno = ENOMEM;
1253                 return -rte_errno;
1254         }
1255         eeprom->cmd = ETHTOOL_GMODULEEEPROM;
1256         eeprom->offset = info->offset;
1257         eeprom->len = info->length;
1258         ifr = (struct ifreq) {
1259                 .ifr_data = (void *)eeprom,
1260         };
1261         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1262         if (ret)
1263                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1264                         dev->data->port_id, strerror(rte_errno));
1265         else
1266                 rte_memcpy(info->data, eeprom->data, info->length);
1267         mlx5_free(eeprom);
1268         return ret;
1269 }
1270
1271 /**
1272  * Read device counters table.
1273  *
1274  * @param dev
1275  *   Pointer to Ethernet device.
1276  * @param[in] pf
1277  *   PF index in case of bonding device, -1 otherwise
1278  * @param[out] stats
1279  *   Counters table output buffer.
1280  *
1281  * @return
1282  *   0 on success and stats is filled, negative errno value otherwise and
1283  *   rte_errno is set.
1284  */
1285 static int
1286 _mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
1287 {
1288         struct mlx5_priv *priv = dev->data->dev_private;
1289         struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1290         unsigned int i;
1291         struct ifreq ifr;
1292         unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
1293         unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
1294         struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
1295         int ret;
1296
1297         et_stats->cmd = ETHTOOL_GSTATS;
1298         et_stats->n_stats = xstats_ctrl->stats_n;
1299         ifr.ifr_data = (caddr_t)et_stats;
1300         if (pf >= 0)
1301                 ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
1302                                            SIOCETHTOOL, &ifr);
1303         else
1304                 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1305         if (ret) {
1306                 DRV_LOG(WARNING,
1307                         "port %u unable to read statistic values from device",
1308                         dev->data->port_id);
1309                 return ret;
1310         }
1311         for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
1312                 if (xstats_ctrl->info[i].dev)
1313                         continue;
1314                 stats[i] += (uint64_t)
1315                             et_stats->data[xstats_ctrl->dev_table_idx[i]];
1316         }
1317         return 0;
1318 }
1319
1320 /**
1321  * Read device counters.
1322  *
1323  * @param dev
1324  *   Pointer to Ethernet device.
1325  * @param[out] stats
1326  *   Counters table output buffer.
1327  *
1328  * @return
1329  *   0 on success and stats is filled, negative errno value otherwise and
1330  *   rte_errno is set.
1331  */
1332 int
1333 mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
1334 {
1335         struct mlx5_priv *priv = dev->data->dev_private;
1336         struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1337         int ret = 0, i;
1338
1339         memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
1340         /* Read ifreq counters. */
1341         if (priv->master && priv->pf_bond >= 0) {
1342                 /* Sum xstats from bonding device member ports. */
1343                 for (i = 0; i < priv->sh->bond.n_port; i++) {
1344                         ret = _mlx5_os_read_dev_counters(dev, i, stats);
1345                         if (ret)
1346                                 return ret;
1347                 }
1348         } else {
1349                 ret = _mlx5_os_read_dev_counters(dev, -1, stats);
1350         }
1351         /* Read IB counters. */
1352         for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
1353                 if (!xstats_ctrl->info[i].dev)
1354                         continue;
1355                 ret = mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
1356                                             &stats[i]);
1357                 /* return last xstats counter if fail to read. */
1358                 if (ret != 0)
1359                         xstats_ctrl->xstats[i] = stats[i];
1360                 else
1361                         stats[i] = xstats_ctrl->xstats[i];
1362         }
1363         return ret;
1364 }
1365
1366 /**
1367  * Query the number of statistics provided by ETHTOOL.
1368  *
1369  * @param dev
1370  *   Pointer to Ethernet device.
1371  *
1372  * @return
1373  *   Number of statistics on success, negative errno value otherwise and
1374  *   rte_errno is set.
1375  */
1376 int
1377 mlx5_os_get_stats_n(struct rte_eth_dev *dev)
1378 {
1379         struct mlx5_priv *priv = dev->data->dev_private;
1380         struct ethtool_drvinfo drvinfo;
1381         struct ifreq ifr;
1382         int ret;
1383
1384         drvinfo.cmd = ETHTOOL_GDRVINFO;
1385         ifr.ifr_data = (caddr_t)&drvinfo;
1386         if (priv->master && priv->pf_bond >= 0)
1387                 /* Bonding PF. */
1388                 ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1389                                            SIOCETHTOOL, &ifr);
1390         else
1391                 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1392         if (ret) {
1393                 DRV_LOG(WARNING, "port %u unable to query number of statistics",
1394                         dev->data->port_id);
1395                 return ret;
1396         }
1397         return drvinfo.n_stats;
1398 }
1399
1400 static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
1401         {
1402                 .dpdk_name = "rx_unicast_bytes",
1403                 .ctr_name = "rx_vport_unicast_bytes",
1404         },
1405         {
1406                 .dpdk_name = "rx_multicast_bytes",
1407                 .ctr_name = "rx_vport_multicast_bytes",
1408         },
1409         {
1410                 .dpdk_name = "rx_broadcast_bytes",
1411                 .ctr_name = "rx_vport_broadcast_bytes",
1412         },
1413         {
1414                 .dpdk_name = "rx_unicast_packets",
1415                 .ctr_name = "rx_vport_unicast_packets",
1416         },
1417         {
1418                 .dpdk_name = "rx_multicast_packets",
1419                 .ctr_name = "rx_vport_multicast_packets",
1420         },
1421         {
1422                 .dpdk_name = "rx_broadcast_packets",
1423                 .ctr_name = "rx_vport_broadcast_packets",
1424         },
1425         {
1426                 .dpdk_name = "tx_unicast_bytes",
1427                 .ctr_name = "tx_vport_unicast_bytes",
1428         },
1429         {
1430                 .dpdk_name = "tx_multicast_bytes",
1431                 .ctr_name = "tx_vport_multicast_bytes",
1432         },
1433         {
1434                 .dpdk_name = "tx_broadcast_bytes",
1435                 .ctr_name = "tx_vport_broadcast_bytes",
1436         },
1437         {
1438                 .dpdk_name = "tx_unicast_packets",
1439                 .ctr_name = "tx_vport_unicast_packets",
1440         },
1441         {
1442                 .dpdk_name = "tx_multicast_packets",
1443                 .ctr_name = "tx_vport_multicast_packets",
1444         },
1445         {
1446                 .dpdk_name = "tx_broadcast_packets",
1447                 .ctr_name = "tx_vport_broadcast_packets",
1448         },
1449         {
1450                 .dpdk_name = "rx_wqe_errors",
1451                 .ctr_name = "rx_wqe_err",
1452         },
1453         {
1454                 .dpdk_name = "rx_phy_crc_errors",
1455                 .ctr_name = "rx_crc_errors_phy",
1456         },
1457         {
1458                 .dpdk_name = "rx_phy_in_range_len_errors",
1459                 .ctr_name = "rx_in_range_len_errors_phy",
1460         },
1461         {
1462                 .dpdk_name = "rx_phy_symbol_errors",
1463                 .ctr_name = "rx_symbol_err_phy",
1464         },
1465         {
1466                 .dpdk_name = "tx_phy_errors",
1467                 .ctr_name = "tx_errors_phy",
1468         },
1469         {
1470                 .dpdk_name = "rx_out_of_buffer",
1471                 .ctr_name = "out_of_buffer",
1472                 .dev = 1,
1473         },
1474         {
1475                 .dpdk_name = "tx_phy_packets",
1476                 .ctr_name = "tx_packets_phy",
1477         },
1478         {
1479                 .dpdk_name = "rx_phy_packets",
1480                 .ctr_name = "rx_packets_phy",
1481         },
1482         {
1483                 .dpdk_name = "tx_phy_discard_packets",
1484                 .ctr_name = "tx_discards_phy",
1485         },
1486         {
1487                 .dpdk_name = "rx_phy_discard_packets",
1488                 .ctr_name = "rx_discards_phy",
1489         },
1490         {
1491                 .dpdk_name = "tx_phy_bytes",
1492                 .ctr_name = "tx_bytes_phy",
1493         },
1494         {
1495                 .dpdk_name = "rx_phy_bytes",
1496                 .ctr_name = "rx_bytes_phy",
1497         },
1498         /* Representor only */
1499         {
1500                 .dpdk_name = "rx_vport_packets",
1501                 .ctr_name = "vport_rx_packets",
1502         },
1503         {
1504                 .dpdk_name = "rx_vport_bytes",
1505                 .ctr_name = "vport_rx_bytes",
1506         },
1507         {
1508                 .dpdk_name = "tx_vport_packets",
1509                 .ctr_name = "vport_tx_packets",
1510         },
1511         {
1512                 .dpdk_name = "tx_vport_bytes",
1513                 .ctr_name = "vport_tx_bytes",
1514         },
1515 };
1516
1517 static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
1518
1519 /**
1520  * Init the structures to read device counters.
1521  *
1522  * @param dev
1523  *   Pointer to Ethernet device.
1524  */
1525 void
1526 mlx5_os_stats_init(struct rte_eth_dev *dev)
1527 {
1528         struct mlx5_priv *priv = dev->data->dev_private;
1529         struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1530         struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
1531         unsigned int i;
1532         unsigned int j;
1533         struct ifreq ifr;
1534         struct ethtool_gstrings *strings = NULL;
1535         unsigned int dev_stats_n;
1536         unsigned int str_sz;
1537         int ret;
1538
1539         /* So that it won't aggregate for each init. */
1540         xstats_ctrl->mlx5_stats_n = 0;
1541         ret = mlx5_os_get_stats_n(dev);
1542         if (ret < 0) {
1543                 DRV_LOG(WARNING, "port %u no extended statistics available",
1544                         dev->data->port_id);
1545                 return;
1546         }
1547         dev_stats_n = ret;
1548         /* Allocate memory to grab stat names and values. */
1549         str_sz = dev_stats_n * ETH_GSTRING_LEN;
1550         strings = (struct ethtool_gstrings *)
1551                   mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1552                               SOCKET_ID_ANY);
1553         if (!strings) {
1554                 DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
1555                      dev->data->port_id);
1556                 return;
1557         }
1558         strings->cmd = ETHTOOL_GSTRINGS;
1559         strings->string_set = ETH_SS_STATS;
1560         strings->len = dev_stats_n;
1561         ifr.ifr_data = (caddr_t)strings;
1562         if (priv->master && priv->pf_bond >= 0)
1563                 /* Bonding master. */
1564                 ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1565                                            SIOCETHTOOL, &ifr);
1566         else
1567                 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1568         if (ret) {
1569                 DRV_LOG(WARNING, "port %u unable to get statistic names",
1570                         dev->data->port_id);
1571                 goto free;
1572         }
1573         for (i = 0; i != dev_stats_n; ++i) {
1574                 const char *curr_string = (const char *)
1575                         &strings->data[i * ETH_GSTRING_LEN];
1576
1577                 for (j = 0; j != xstats_n; ++j) {
1578                         if (!strcmp(mlx5_counters_init[j].ctr_name,
1579                                     curr_string)) {
1580                                 unsigned int idx = xstats_ctrl->mlx5_stats_n++;
1581
1582                                 xstats_ctrl->dev_table_idx[idx] = i;
1583                                 xstats_ctrl->info[idx] = mlx5_counters_init[j];
1584                                 break;
1585                         }
1586                 }
1587         }
1588         /* Add dev counters. */
1589         for (i = 0; i != xstats_n; ++i) {
1590                 if (mlx5_counters_init[i].dev) {
1591                         unsigned int idx = xstats_ctrl->mlx5_stats_n++;
1592
1593                         xstats_ctrl->info[idx] = mlx5_counters_init[i];
1594                         xstats_ctrl->hw_stats[idx] = 0;
1595                 }
1596         }
1597         MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
1598         xstats_ctrl->stats_n = dev_stats_n;
1599         /* Copy to base at first time. */
1600         ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
1601         if (ret)
1602                 DRV_LOG(ERR, "port %u cannot read device counters: %s",
1603                         dev->data->port_id, strerror(rte_errno));
1604         mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
1605         stats_ctrl->imissed = 0;
1606 free:
1607         mlx5_free(strings);
1608 }
1609
1610 /**
1611  * Get MAC address by querying netdevice.
1612  *
1613  * @param[in] dev
1614  *   Pointer to Ethernet device.
1615  * @param[out] mac
1616  *   MAC address output buffer.
1617  *
1618  * @return
1619  *   0 on success, a negative errno value otherwise and rte_errno is set.
1620  */
1621 int
1622 mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
1623 {
1624         struct ifreq request;
1625         int ret;
1626
1627         ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
1628         if (ret)
1629                 return ret;
1630         memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1631         return 0;
1632 }