net/mlx5: cleanup header file
[dpdk.git] / drivers / net / mlx5 / linux / mlx5_ethdev_os.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <inttypes.h>
8 #include <unistd.h>
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <errno.h>
15 #include <dirent.h>
16 #include <net/if.h>
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
22 #include <fcntl.h>
23 #include <stdalign.h>
24 #include <sys/un.h>
25 #include <time.h>
26
27 #include <rte_atomic.h>
28 #include <rte_ethdev_driver.h>
29 #include <rte_bus_pci.h>
30 #include <rte_mbuf.h>
31 #include <rte_common.h>
32 #include <rte_interrupts.h>
33 #include <rte_malloc.h>
34 #include <rte_string_fns.h>
35 #include <rte_rwlock.h>
36 #include <rte_cycles.h>
37
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41 #include <mlx5_malloc.h>
42
43 #include "mlx5.h"
44 #include "mlx5_rxtx.h"
45 #include "mlx5_utils.h"
46
/*
 * Supported speed values found in /usr/include/linux/ethtool.h
 *
 * Fallback definitions for SUPPORTED_* flags, i.e. bits of the legacy
 * ethtool_cmd "supported" mask such as those tested in
 * mlx5_link_update_unlocked_gset(). Each flag is defined here only when the
 * matching HAVE_SUPPORTED_* macro is not defined (presumably that macro is
 * provided by the build when the system header already has the flag -
 * TODO confirm).
 */
#ifndef HAVE_SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#endif
#ifndef HAVE_SUPPORTED_40000baseCR4_Full
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#endif
#ifndef HAVE_SUPPORTED_40000baseSR4_Full
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#endif
#ifndef HAVE_SUPPORTED_40000baseLR4_Full
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#endif
#ifndef HAVE_SUPPORTED_56000baseKR4_Full
#define SUPPORTED_56000baseKR4_Full (1 << 27)
#endif
#ifndef HAVE_SUPPORTED_56000baseCR4_Full
#define SUPPORTED_56000baseCR4_Full (1 << 28)
#endif
#ifndef HAVE_SUPPORTED_56000baseSR4_Full
#define SUPPORTED_56000baseSR4_Full (1 << 29)
#endif
#ifndef HAVE_SUPPORTED_56000baseLR4_Full
#define SUPPORTED_56000baseLR4_Full (1 << 30)
#endif
72
/*
 * Add defines in case the running kernel is not the same as user headers.
 *
 * When the user headers do not know ETHTOOL_GLINKSETTINGS, declare the
 * request structure, the command number and the link mode bit indexes
 * locally so that mlx5_link_update_unlocked_gs() can still be built.
 */
#ifndef ETHTOOL_GLINKSETTINGS
struct ethtool_link_settings {
        uint32_t cmd;
        uint32_t speed;
        uint8_t duplex;
        uint8_t port;
        uint8_t phy_address;
        uint8_t autoneg;
        uint8_t mdio_support;
        uint8_t eth_to_mdix;
        uint8_t eth_tp_mdix_ctrl;
        /* Number of 32-bit words per link mode mask. */
        int8_t link_mode_masks_nwords;
        uint32_t reserved[8];
        /* Flexible array member holding the link mode masks. */
        uint32_t link_mode_masks[];
};

/* The kernel values can be found in /include/uapi/linux/ethtool.h */
#define ETHTOOL_GLINKSETTINGS 0x0000004c
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/* Link mode bit indexes for 25G, used when HAVE_ETHTOOL_LINK_MODE_25G is not set. */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
/* Link mode bit indexes for 50G, used when HAVE_ETHTOOL_LINK_MODE_50G is not set. */
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
/* Link mode bit indexes for 100G, used when HAVE_ETHTOOL_LINK_MODE_100G is not set. */
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
/*
 * Link mode bit indexes for 200G, used when HAVE_ETHTOOL_LINK_MODE_200G is
 * not set. The last three values are expressed relative to bit 64 because
 * mlx5_link_update_unlocked_gs() tests them against a second 64-bit word
 * built from link_mode_masks[2] and link_mode_masks[3].
 */
#ifndef HAVE_ETHTOOL_LINK_MODE_200G
#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
#endif
131
132
133 /**
134  * Get interface name from private structure.
135  *
136  * This is a port representor-aware version of mlx5_get_ifname_sysfs().
137  *
138  * @param[in] dev
139  *   Pointer to Ethernet device.
140  * @param[out] ifname
141  *   Interface name output buffer.
142  *
143  * @return
144  *   0 on success, a negative errno value otherwise and rte_errno is set.
145  */
146 int
147 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
148 {
149         struct mlx5_priv *priv = dev->data->dev_private;
150         unsigned int ifindex;
151
152         MLX5_ASSERT(priv);
153         MLX5_ASSERT(priv->sh);
154         ifindex = mlx5_ifindex(dev);
155         if (!ifindex) {
156                 if (!priv->representor)
157                         return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
158                                                      *ifname);
159                 rte_errno = ENXIO;
160                 return -rte_errno;
161         }
162         if (if_indextoname(ifindex, &(*ifname)[0]))
163                 return 0;
164         rte_errno = errno;
165         return -rte_errno;
166 }
167
168 /**
169  * Perform ifreq ioctl() on associated Ethernet device.
170  *
171  * @param[in] dev
172  *   Pointer to Ethernet device.
173  * @param req
174  *   Request number to pass to ioctl().
175  * @param[out] ifr
176  *   Interface request structure output buffer.
177  *
178  * @return
179  *   0 on success, a negative errno value otherwise and rte_errno is set.
180  */
181 static int
182 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
183 {
184         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
185         int ret = 0;
186
187         if (sock == -1) {
188                 rte_errno = errno;
189                 return -rte_errno;
190         }
191         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
192         if (ret)
193                 goto error;
194         ret = ioctl(sock, req, ifr);
195         if (ret == -1) {
196                 rte_errno = errno;
197                 goto error;
198         }
199         close(sock);
200         return 0;
201 error:
202         close(sock);
203         return -rte_errno;
204 }
205
206 /**
207  * Get device MTU.
208  *
209  * @param dev
210  *   Pointer to Ethernet device.
211  * @param[out] mtu
212  *   MTU value output buffer.
213  *
214  * @return
215  *   0 on success, a negative errno value otherwise and rte_errno is set.
216  */
217 int
218 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
219 {
220         struct ifreq request;
221         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
222
223         if (ret)
224                 return ret;
225         *mtu = request.ifr_mtu;
226         return 0;
227 }
228
229 /**
230  * Set device MTU.
231  *
232  * @param dev
233  *   Pointer to Ethernet device.
234  * @param mtu
235  *   MTU value to set.
236  *
237  * @return
238  *   0 on success, a negative errno value otherwise and rte_errno is set.
239  */
240 int
241 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
242 {
243         struct ifreq request = { .ifr_mtu = mtu, };
244
245         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
246 }
247
248 /**
249  * Set device flags.
250  *
251  * @param dev
252  *   Pointer to Ethernet device.
253  * @param keep
254  *   Bitmask for flags that must remain untouched.
255  * @param flags
256  *   Bitmask for flags to modify.
257  *
258  * @return
259  *   0 on success, a negative errno value otherwise and rte_errno is set.
260  */
261 static int
262 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
263 {
264         struct ifreq request;
265         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
266
267         if (ret)
268                 return ret;
269         request.ifr_flags &= keep;
270         request.ifr_flags |= flags & ~keep;
271         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
272 }
273
274 /**
275  * Get device current raw clock counter
276  *
277  * @param dev
278  *   Pointer to Ethernet device structure.
279  * @param[out] time
280  *   Current raw clock counter of the device.
281  *
282  * @return
283  *   0 if the clock has correctly been read
284  *   The value of errno in case of error
285  */
286 int
287 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
288 {
289         struct mlx5_priv *priv = dev->data->dev_private;
290         struct ibv_context *ctx = priv->sh->ctx;
291         struct ibv_values_ex values;
292         int err = 0;
293
294         values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
295         err = mlx5_glue->query_rt_values_ex(ctx, &values);
296         if (err != 0) {
297                 DRV_LOG(WARNING, "Could not query the clock !");
298                 return err;
299         }
300         *clock = values.raw_clock.tv_nsec;
301         return 0;
302 }
303
304 /**
305  * Retrieve the master device for representor in the same switch domain.
306  *
307  * @param dev
308  *   Pointer to representor Ethernet device structure.
309  *
310  * @return
311  *   Master device structure  on success, NULL otherwise.
312  */
313 static struct rte_eth_dev *
314 mlx5_find_master_dev(struct rte_eth_dev *dev)
315 {
316         struct mlx5_priv *priv;
317         uint16_t port_id;
318         uint16_t domain_id;
319
320         priv = dev->data->dev_private;
321         domain_id = priv->domain_id;
322         MLX5_ASSERT(priv->representor);
323         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
324                 struct mlx5_priv *opriv =
325                         rte_eth_devices[port_id].data->dev_private;
326                 if (opriv &&
327                     opriv->master &&
328                     opriv->domain_id == domain_id &&
329                     opriv->sh == priv->sh)
330                         return &rte_eth_devices[port_id];
331         }
332         return NULL;
333 }
334
335 /**
336  * DPDK callback to retrieve physical link information.
337  *
338  * @param dev
339  *   Pointer to Ethernet device structure.
340  * @param[out] link
341  *   Storage for current link status.
342  *
343  * @return
344  *   0 on success, a negative errno value otherwise and rte_errno is set.
345  */
346 static int
347 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
348                                struct rte_eth_link *link)
349 {
350         struct mlx5_priv *priv = dev->data->dev_private;
351         struct ethtool_cmd edata = {
352                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
353         };
354         struct ifreq ifr;
355         struct rte_eth_link dev_link;
356         int link_speed = 0;
357         int ret;
358
359         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
360         if (ret) {
361                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
362                         dev->data->port_id, strerror(rte_errno));
363                 return ret;
364         }
365         dev_link = (struct rte_eth_link) {
366                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
367                                 (ifr.ifr_flags & IFF_RUNNING)),
368         };
369         ifr = (struct ifreq) {
370                 .ifr_data = (void *)&edata,
371         };
372         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
373         if (ret) {
374                 if (ret == -ENOTSUP && priv->representor) {
375                         struct rte_eth_dev *master;
376
377                         /*
378                          * For representors we can try to inherit link
379                          * settings from the master device. Actually
380                          * link settings do not make a lot of sense
381                          * for representors due to missing physical
382                          * link. The old kernel drivers supported
383                          * emulated settings query for representors,
384                          * the new ones do not, so we have to add
385                          * this code for compatibility issues.
386                          */
387                         master = mlx5_find_master_dev(dev);
388                         if (master) {
389                                 ifr = (struct ifreq) {
390                                         .ifr_data = (void *)&edata,
391                                 };
392                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
393                         }
394                 }
395                 if (ret) {
396                         DRV_LOG(WARNING,
397                                 "port %u ioctl(SIOCETHTOOL,"
398                                 " ETHTOOL_GSET) failed: %s",
399                                 dev->data->port_id, strerror(rte_errno));
400                         return ret;
401                 }
402         }
403         link_speed = ethtool_cmd_speed(&edata);
404         if (link_speed == -1)
405                 dev_link.link_speed = ETH_SPEED_NUM_NONE;
406         else
407                 dev_link.link_speed = link_speed;
408         priv->link_speed_capa = 0;
409         if (edata.supported & SUPPORTED_Autoneg)
410                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
411         if (edata.supported & (SUPPORTED_1000baseT_Full |
412                                SUPPORTED_1000baseKX_Full))
413                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
414         if (edata.supported & SUPPORTED_10000baseKR_Full)
415                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
416         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
417                                SUPPORTED_40000baseCR4_Full |
418                                SUPPORTED_40000baseSR4_Full |
419                                SUPPORTED_40000baseLR4_Full))
420                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
421         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
422                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
423         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
424                         ETH_LINK_SPEED_FIXED);
425         if (((dev_link.link_speed && !dev_link.link_status) ||
426              (!dev_link.link_speed && dev_link.link_status))) {
427                 rte_errno = EAGAIN;
428                 return -rte_errno;
429         }
430         *link = dev_link;
431         return 0;
432 }
433
434 /**
435  * Retrieve physical link information (unlocked version using new ioctl).
436  *
437  * @param dev
438  *   Pointer to Ethernet device structure.
439  * @param[out] link
440  *   Storage for current link status.
441  *
442  * @return
443  *   0 on success, a negative errno value otherwise and rte_errno is set.
444  */
445 static int
446 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
447                              struct rte_eth_link *link)
448
449 {
450         struct mlx5_priv *priv = dev->data->dev_private;
451         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
452         struct ifreq ifr;
453         struct rte_eth_link dev_link;
454         struct rte_eth_dev *master = NULL;
455         uint64_t sc;
456         int ret;
457
458         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
459         if (ret) {
460                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
461                         dev->data->port_id, strerror(rte_errno));
462                 return ret;
463         }
464         dev_link = (struct rte_eth_link) {
465                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
466                                 (ifr.ifr_flags & IFF_RUNNING)),
467         };
468         ifr = (struct ifreq) {
469                 .ifr_data = (void *)&gcmd,
470         };
471         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
472         if (ret) {
473                 if (ret == -ENOTSUP && priv->representor) {
474                         /*
475                          * For representors we can try to inherit link
476                          * settings from the master device. Actually
477                          * link settings do not make a lot of sense
478                          * for representors due to missing physical
479                          * link. The old kernel drivers supported
480                          * emulated settings query for representors,
481                          * the new ones do not, so we have to add
482                          * this code for compatibility issues.
483                          */
484                         master = mlx5_find_master_dev(dev);
485                         if (master) {
486                                 ifr = (struct ifreq) {
487                                         .ifr_data = (void *)&gcmd,
488                                 };
489                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
490                         }
491                 }
492                 if (ret) {
493                         DRV_LOG(DEBUG,
494                                 "port %u ioctl(SIOCETHTOOL,"
495                                 " ETHTOOL_GLINKSETTINGS) failed: %s",
496                                 dev->data->port_id, strerror(rte_errno));
497                         return ret;
498                 }
499         }
500         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
501
502         alignas(struct ethtool_link_settings)
503         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
504                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
505         struct ethtool_link_settings *ecmd = (void *)data;
506
507         *ecmd = gcmd;
508         ifr.ifr_data = (void *)ecmd;
509         ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
510         if (ret) {
511                 DRV_LOG(DEBUG,
512                         "port %u ioctl(SIOCETHTOOL,"
513                         "ETHTOOL_GLINKSETTINGS) failed: %s",
514                         dev->data->port_id, strerror(rte_errno));
515                 return ret;
516         }
517         dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
518                                                             ecmd->speed;
519         sc = ecmd->link_mode_masks[0] |
520                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
521         priv->link_speed_capa = 0;
522         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
523                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
524         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
525                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
526                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
527         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
528                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
529                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
530                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
531         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
532                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
533                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
534         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
535                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
536                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
537                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
538                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
539         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
540                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
541                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
542                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
543                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
544         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
545                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
546                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
547                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
548         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
549                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
550                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
551         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
552                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
553                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
554                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
555                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
556         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
557                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
558                 priv->link_speed_capa |= ETH_LINK_SPEED_200G;
559
560         sc = ecmd->link_mode_masks[2] |
561                 ((uint64_t)ecmd->link_mode_masks[3] << 32);
562         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
563                   MLX5_BITSHIFT
564                        (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
565                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
566                 priv->link_speed_capa |= ETH_LINK_SPEED_200G;
567         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
568                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
569         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
570                                   ETH_LINK_SPEED_FIXED);
571         if (((dev_link.link_speed && !dev_link.link_status) ||
572              (!dev_link.link_speed && dev_link.link_status))) {
573                 rte_errno = EAGAIN;
574                 return -rte_errno;
575         }
576         *link = dev_link;
577         return 0;
578 }
579
580 /**
581  * DPDK callback to retrieve physical link information.
582  *
583  * @param dev
584  *   Pointer to Ethernet device structure.
585  * @param wait_to_complete
586  *   Wait for request completion.
587  *
588  * @return
589  *   0 if link status was not updated, positive if it was, a negative errno
590  *   value otherwise and rte_errno is set.
591  */
592 int
593 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
594 {
595         int ret;
596         struct rte_eth_link dev_link;
597         time_t start_time = time(NULL);
598         int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
599
600         do {
601                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
602                 if (ret == -ENOTSUP)
603                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
604                 if (ret == 0)
605                         break;
606                 /* Handle wait to complete situation. */
607                 if ((wait_to_complete || retry) && ret == -EAGAIN) {
608                         if (abs((int)difftime(time(NULL), start_time)) <
609                             MLX5_LINK_STATUS_TIMEOUT) {
610                                 usleep(0);
611                                 continue;
612                         } else {
613                                 rte_errno = EBUSY;
614                                 return -rte_errno;
615                         }
616                 } else if (ret < 0) {
617                         return ret;
618                 }
619         } while (wait_to_complete || retry-- > 0);
620         ret = !!memcmp(&dev->data->dev_link, &dev_link,
621                        sizeof(struct rte_eth_link));
622         dev->data->dev_link = dev_link;
623         return ret;
624 }
625
626 /**
627  * DPDK callback to get flow control status.
628  *
629  * @param dev
630  *   Pointer to Ethernet device structure.
631  * @param[out] fc_conf
632  *   Flow control output buffer.
633  *
634  * @return
635  *   0 on success, a negative errno value otherwise and rte_errno is set.
636  */
637 int
638 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
639 {
640         struct ifreq ifr;
641         struct ethtool_pauseparam ethpause = {
642                 .cmd = ETHTOOL_GPAUSEPARAM
643         };
644         int ret;
645
646         ifr.ifr_data = (void *)&ethpause;
647         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
648         if (ret) {
649                 DRV_LOG(WARNING,
650                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
651                         " %s",
652                         dev->data->port_id, strerror(rte_errno));
653                 return ret;
654         }
655         fc_conf->autoneg = ethpause.autoneg;
656         if (ethpause.rx_pause && ethpause.tx_pause)
657                 fc_conf->mode = RTE_FC_FULL;
658         else if (ethpause.rx_pause)
659                 fc_conf->mode = RTE_FC_RX_PAUSE;
660         else if (ethpause.tx_pause)
661                 fc_conf->mode = RTE_FC_TX_PAUSE;
662         else
663                 fc_conf->mode = RTE_FC_NONE;
664         return 0;
665 }
666
667 /**
668  * DPDK callback to modify flow control parameters.
669  *
670  * @param dev
671  *   Pointer to Ethernet device structure.
672  * @param[in] fc_conf
673  *   Flow control parameters.
674  *
675  * @return
676  *   0 on success, a negative errno value otherwise and rte_errno is set.
677  */
678 int
679 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
680 {
681         struct ifreq ifr;
682         struct ethtool_pauseparam ethpause = {
683                 .cmd = ETHTOOL_SPAUSEPARAM
684         };
685         int ret;
686
687         ifr.ifr_data = (void *)&ethpause;
688         ethpause.autoneg = fc_conf->autoneg;
689         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
690             (fc_conf->mode & RTE_FC_RX_PAUSE))
691                 ethpause.rx_pause = 1;
692         else
693                 ethpause.rx_pause = 0;
694
695         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
696             (fc_conf->mode & RTE_FC_TX_PAUSE))
697                 ethpause.tx_pause = 1;
698         else
699                 ethpause.tx_pause = 0;
700         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
701         if (ret) {
702                 DRV_LOG(WARNING,
703                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
704                         " failed: %s",
705                         dev->data->port_id, strerror(rte_errno));
706                 return ret;
707         }
708         return 0;
709 }
710
711 /**
712  * Handle asynchronous removal event for entire multiport device.
713  *
714  * @param sh
715  *   Infiniband device shared context.
716  */
717 static void
718 mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
719 {
720         uint32_t i;
721
722         for (i = 0; i < sh->max_port; ++i) {
723                 struct rte_eth_dev *dev;
724
725                 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
726                         /*
727                          * Or not existing port either no
728                          * handler installed for this port.
729                          */
730                         continue;
731                 }
732                 dev = &rte_eth_devices[sh->port[i].ih_port_id];
733                 MLX5_ASSERT(dev);
734                 if (dev->data->dev_conf.intr_conf.rmv)
735                         _rte_eth_dev_callback_process
736                                 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
737         }
738 }
739
740 /**
741  * Handle shared asynchronous events the NIC (removal event
742  * and link status change). Supports multiport IB device.
743  *
744  * @param cb_arg
745  *   Callback argument.
746  */
747 void
748 mlx5_dev_interrupt_handler(void *cb_arg)
749 {
750         struct mlx5_dev_ctx_shared *sh = cb_arg;
751         struct ibv_async_event event;
752
753         /* Read all message from the IB device and acknowledge them. */
754         for (;;) {
755                 struct rte_eth_dev *dev;
756                 uint32_t tmp;
757
758                 if (mlx5_glue->get_async_event(sh->ctx, &event))
759                         break;
760                 /* Retrieve and check IB port index. */
761                 tmp = (uint32_t)event.element.port_num;
762                 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
763                         /*
764                          * The DEVICE_FATAL event is called once for
765                          * entire device without port specifying.
766                          * We should notify all existing ports.
767                          */
768                         mlx5_glue->ack_async_event(&event);
769                         mlx5_dev_interrupt_device_fatal(sh);
770                         continue;
771                 }
772                 MLX5_ASSERT(tmp && (tmp <= sh->max_port));
773                 if (!tmp) {
774                         /* Unsupported device level event. */
775                         mlx5_glue->ack_async_event(&event);
776                         DRV_LOG(DEBUG,
777                                 "unsupported common event (type %d)",
778                                 event.event_type);
779                         continue;
780                 }
781                 if (tmp > sh->max_port) {
782                         /* Invalid IB port index. */
783                         mlx5_glue->ack_async_event(&event);
784                         DRV_LOG(DEBUG,
785                                 "cannot handle an event (type %d)"
786                                 "due to invalid IB port index (%u)",
787                                 event.event_type, tmp);
788                         continue;
789                 }
790                 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
791                         /* No handler installed. */
792                         mlx5_glue->ack_async_event(&event);
793                         DRV_LOG(DEBUG,
794                                 "cannot handle an event (type %d)"
795                                 "due to no handler installed for port %u",
796                                 event.event_type, tmp);
797                         continue;
798                 }
799                 /* Retrieve ethernet device descriptor. */
800                 tmp = sh->port[tmp - 1].ih_port_id;
801                 dev = &rte_eth_devices[tmp];
802                 MLX5_ASSERT(dev);
803                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
804                      event.event_type == IBV_EVENT_PORT_ERR) &&
805                         dev->data->dev_conf.intr_conf.lsc) {
806                         mlx5_glue->ack_async_event(&event);
807                         if (mlx5_link_update(dev, 0) == -EAGAIN) {
808                                 usleep(0);
809                                 continue;
810                         }
811                         _rte_eth_dev_callback_process
812                                 (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
813                         continue;
814                 }
815                 DRV_LOG(DEBUG,
816                         "port %u cannot handle an unknown event (type %d)",
817                         dev->data->port_id, event.event_type);
818                 mlx5_glue->ack_async_event(&event);
819         }
820 }
821
822 /*
823  * Unregister callback handler safely. The handler may be active
824  * while we are trying to unregister it, in this case code -EAGAIN
825  * is returned by rte_intr_callback_unregister(). This routine checks
826  * the return code and tries to unregister handler again.
827  *
828  * @param handle
829  *   interrupt handle
830  * @param cb_fn
831  *   pointer to callback routine
832  * @cb_arg
833  *   opaque callback parameter
834  */
835 void
836 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
837                               rte_intr_callback_fn cb_fn, void *cb_arg)
838 {
839         /*
840          * Try to reduce timeout management overhead by not calling
841          * the timer related routines on the first iteration. If the
842          * unregistering succeeds on first call there will be no
843          * timer calls at all.
844          */
845         uint64_t twait = 0;
846         uint64_t start = 0;
847
848         do {
849                 int ret;
850
851                 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
852                 if (ret >= 0)
853                         return;
854                 if (ret != -EAGAIN) {
855                         DRV_LOG(INFO, "failed to unregister interrupt"
856                                       " handler (error: %d)", ret);
857                         MLX5_ASSERT(false);
858                         return;
859                 }
860                 if (twait) {
861                         struct timespec onems;
862
863                         /* Wait one millisecond and try again. */
864                         onems.tv_sec = 0;
865                         onems.tv_nsec = NS_PER_S / MS_PER_S;
866                         nanosleep(&onems, 0);
867                         /* Check whether one second elapsed. */
868                         if ((rte_get_timer_cycles() - start) <= twait)
869                                 continue;
870                 } else {
871                         /*
872                          * We get the amount of timer ticks for one second.
873                          * If this amount elapsed it means we spent one
874                          * second in waiting. This branch is executed once
875                          * on first iteration.
876                          */
877                         twait = rte_get_timer_hz();
878                         MLX5_ASSERT(twait);
879                 }
880                 /*
881                  * Timeout elapsed, show message (once a second) and retry.
882                  * We have no other acceptable option here, if we ignore
883                  * the unregistering return code the handler will not
884                  * be unregistered, fd will be closed and we may get the
885                  * crush. Hanging and messaging in the loop seems not to be
886                  * the worst choice.
887                  */
888                 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
889                 start = rte_get_timer_cycles();
890         } while (true);
891 }
892
/**
 * DEVX interrupt handler.
 *
 * Fetches asynchronous DEVX command completions one by one and passes each
 * of them to the flow counter pool query handler, until no more completions
 * can be retrieved. Without HAVE_IBV_DEVX_ASYNC the handler does nothing.
 * This function is probably called from the DPDK host thread.
 *
 * @param cb_arg
 *   Callback argument, used as a pointer to the shared device context.
 */
void
mlx5_dev_interrupt_handler_devx(void *cb_arg)
{
#ifdef HAVE_IBV_DEVX_ASYNC
	struct mlx5_dev_ctx_shared *sh = cb_arg;
	union {
		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
			    MLX5_ST_SZ_BYTES(traffic_counter) +
			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
	} rsp;
	/* The command output follows the async response header. */
	uint8_t *cmd_out = rsp.buf + sizeof(rsp.cmd_resp);

	for (;;) {
		int rc = mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
							    &rsp.cmd_resp,
							    sizeof(rsp.buf));

		/* A non-zero code ends the draining loop. */
		if (rc)
			break;
		mlx5_flow_async_pool_query_handle
			(sh, (uint64_t)rsp.cmd_resp.wr_id,
			 mlx5_devx_get_out_command_status(cmd_out));
	}
#else
	(void)cb_arg;
#endif /* HAVE_IBV_DEVX_ASYNC */
}
924
925 /**
926  * DPDK callback to bring the link DOWN.
927  *
928  * @param dev
929  *   Pointer to Ethernet device structure.
930  *
931  * @return
932  *   0 on success, a negative errno value otherwise and rte_errno is set.
933  */
934 int
935 mlx5_set_link_down(struct rte_eth_dev *dev)
936 {
937         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
938 }
939
940 /**
941  * DPDK callback to bring the link UP.
942  *
943  * @param dev
944  *   Pointer to Ethernet device structure.
945  *
946  * @return
947  *   0 on success, a negative errno value otherwise and rte_errno is set.
948  */
949 int
950 mlx5_set_link_up(struct rte_eth_dev *dev)
951 {
952         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
953 }
954
955 /**
956  * Check if mlx5 device was removed.
957  *
958  * @param dev
959  *   Pointer to Ethernet device structure.
960  *
961  * @return
962  *   1 when device is removed, otherwise 0.
963  */
964 int
965 mlx5_is_removed(struct rte_eth_dev *dev)
966 {
967         struct ibv_device_attr device_attr;
968         struct mlx5_priv *priv = dev->data->dev_private;
969
970         if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
971                 return 1;
972         return 0;
973 }
974
975 /**
976  * Analyze gathered port parameters via sysfs to recognize master
977  * and representor devices for E-Switch configuration.
978  *
979  * @param[in] device_dir
980  *   flag of presence of "device" directory under port device key.
981  * @param[inout] switch_info
982  *   Port information, including port name as a number and port name
983  *   type if recognized
984  *
985  * @return
986  *   master and representor flags are set in switch_info according to
987  *   recognized parameters (if any).
988  */
989 static void
990 mlx5_sysfs_check_switch_info(bool device_dir,
991                              struct mlx5_switch_info *switch_info)
992 {
993         switch (switch_info->name_type) {
994         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
995                 /*
996                  * Name is not recognized, assume the master,
997                  * check the device directory presence.
998                  */
999                 switch_info->master = device_dir;
1000                 break;
1001         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1002                 /*
1003                  * Name is not set, this assumes the legacy naming
1004                  * schema for master, just check if there is
1005                  * a device directory.
1006                  */
1007                 switch_info->master = device_dir;
1008                 break;
1009         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1010                 /* New uplink naming schema recognized. */
1011                 switch_info->master = 1;
1012                 break;
1013         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1014                 /* Legacy representors naming schema. */
1015                 switch_info->representor = !device_dir;
1016                 break;
1017         case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1018                 /* Fallthrough */
1019         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1020                 /* New representors naming schema. */
1021                 switch_info->representor = 1;
1022                 break;
1023         }
1024 }
1025
1026 /**
1027  * Get switch information associated with network interface.
1028  *
1029  * @param ifindex
1030  *   Network interface index.
1031  * @param[out] info
1032  *   Switch information object, populated in case of success.
1033  *
1034  * @return
1035  *   0 on success, a negative errno value otherwise and rte_errno is set.
1036  */
1037 int
1038 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1039 {
1040         char ifname[IF_NAMESIZE];
1041         char port_name[IF_NAMESIZE];
1042         FILE *file;
1043         struct mlx5_switch_info data = {
1044                 .master = 0,
1045                 .representor = 0,
1046                 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1047                 .port_name = 0,
1048                 .switch_id = 0,
1049         };
1050         DIR *dir;
1051         bool port_switch_id_set = false;
1052         bool device_dir = false;
1053         char c;
1054         int ret;
1055
1056         if (!if_indextoname(ifindex, ifname)) {
1057                 rte_errno = errno;
1058                 return -rte_errno;
1059         }
1060
1061         MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1062               ifname);
1063         MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1064               ifname);
1065         MKSTR(pci_device, "/sys/class/net/%s/device",
1066               ifname);
1067
1068         file = fopen(phys_port_name, "rb");
1069         if (file != NULL) {
1070                 ret = fscanf(file, "%s", port_name);
1071                 fclose(file);
1072                 if (ret == 1)
1073                         mlx5_translate_port_name(port_name, &data);
1074         }
1075         file = fopen(phys_switch_id, "rb");
1076         if (file == NULL) {
1077                 rte_errno = errno;
1078                 return -rte_errno;
1079         }
1080         port_switch_id_set =
1081                 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1082                 c == '\n';
1083         fclose(file);
1084         dir = opendir(pci_device);
1085         if (dir != NULL) {
1086                 closedir(dir);
1087                 device_dir = true;
1088         }
1089         if (port_switch_id_set) {
1090                 /* We have some E-Switch configuration. */
1091                 mlx5_sysfs_check_switch_info(device_dir, &data);
1092         }
1093         *info = data;
1094         MLX5_ASSERT(!(data.master && data.representor));
1095         if (data.master && data.representor) {
1096                 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1097                              " and as representor", ifindex);
1098                 rte_errno = ENODEV;
1099                 return -rte_errno;
1100         }
1101         return 0;
1102 }
1103
1104 /**
1105  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1106  *
1107  * @param dev
1108  *   Pointer to Ethernet device structure.
1109  * @param[out] modinfo
1110  *   Storage for plug-in module EEPROM information.
1111  *
1112  * @return
1113  *   0 on success, a negative errno value otherwise and rte_errno is set.
1114  */
1115 int
1116 mlx5_get_module_info(struct rte_eth_dev *dev,
1117                      struct rte_eth_dev_module_info *modinfo)
1118 {
1119         struct ethtool_modinfo info = {
1120                 .cmd = ETHTOOL_GMODULEINFO,
1121         };
1122         struct ifreq ifr = (struct ifreq) {
1123                 .ifr_data = (void *)&info,
1124         };
1125         int ret = 0;
1126
1127         if (!dev || !modinfo) {
1128                 DRV_LOG(WARNING, "missing argument, cannot get module info");
1129                 rte_errno = EINVAL;
1130                 return -rte_errno;
1131         }
1132         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1133         if (ret) {
1134                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1135                         dev->data->port_id, strerror(rte_errno));
1136                 return ret;
1137         }
1138         modinfo->type = info.type;
1139         modinfo->eeprom_len = info.eeprom_len;
1140         return ret;
1141 }
1142
1143 /**
1144  * DPDK callback to retrieve plug-in module EEPROM data.
1145  *
1146  * @param dev
1147  *   Pointer to Ethernet device structure.
1148  * @param[out] info
1149  *   Storage for plug-in module EEPROM data.
1150  *
1151  * @return
1152  *   0 on success, a negative errno value otherwise and rte_errno is set.
1153  */
1154 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1155                            struct rte_dev_eeprom_info *info)
1156 {
1157         struct ethtool_eeprom *eeprom;
1158         struct ifreq ifr;
1159         int ret = 0;
1160
1161         if (!dev || !info) {
1162                 DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1163                 rte_errno = EINVAL;
1164                 return -rte_errno;
1165         }
1166         eeprom = mlx5_malloc(MLX5_MEM_ZERO,
1167                              (sizeof(struct ethtool_eeprom) + info->length), 0,
1168                              SOCKET_ID_ANY);
1169         if (!eeprom) {
1170                 DRV_LOG(WARNING, "port %u cannot allocate memory for "
1171                         "eeprom data", dev->data->port_id);
1172                 rte_errno = ENOMEM;
1173                 return -rte_errno;
1174         }
1175         eeprom->cmd = ETHTOOL_GMODULEEEPROM;
1176         eeprom->offset = info->offset;
1177         eeprom->len = info->length;
1178         ifr = (struct ifreq) {
1179                 .ifr_data = (void *)eeprom,
1180         };
1181         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1182         if (ret)
1183                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1184                         dev->data->port_id, strerror(rte_errno));
1185         else
1186                 rte_memcpy(info->data, eeprom->data, info->length);
1187         mlx5_free(eeprom);
1188         return ret;
1189 }
1190
1191 /**
1192  * Read device counters table.
1193  *
1194  * @param dev
1195  *   Pointer to Ethernet device.
1196  * @param[out] stats
1197  *   Counters table output buffer.
1198  *
1199  * @return
1200  *   0 on success and stats is filled, negative errno value otherwise and
1201  *   rte_errno is set.
1202  */
1203 int
1204 mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
1205 {
1206         struct mlx5_priv *priv = dev->data->dev_private;
1207         struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1208         unsigned int i;
1209         struct ifreq ifr;
1210         unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
1211         unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
1212         struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
1213         int ret;
1214
1215         et_stats->cmd = ETHTOOL_GSTATS;
1216         et_stats->n_stats = xstats_ctrl->stats_n;
1217         ifr.ifr_data = (caddr_t)et_stats;
1218         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1219         if (ret) {
1220                 DRV_LOG(WARNING,
1221                         "port %u unable to read statistic values from device",
1222                         dev->data->port_id);
1223                 return ret;
1224         }
1225         for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
1226                 if (xstats_ctrl->info[i].dev) {
1227                         ret = mlx5_os_read_dev_stat(priv,
1228                                             xstats_ctrl->info[i].ctr_name,
1229                                             &stats[i]);
1230                         /* return last xstats counter if fail to read. */
1231                         if (ret == 0)
1232                                 xstats_ctrl->xstats[i] = stats[i];
1233                         else
1234                                 stats[i] = xstats_ctrl->xstats[i];
1235                 } else {
1236                         stats[i] = (uint64_t)
1237                                 et_stats->data[xstats_ctrl->dev_table_idx[i]];
1238                 }
1239         }
1240         return 0;
1241 }
1242
1243 /**
1244  * Query the number of statistics provided by ETHTOOL.
1245  *
1246  * @param dev
1247  *   Pointer to Ethernet device.
1248  *
1249  * @return
1250  *   Number of statistics on success, negative errno value otherwise and
1251  *   rte_errno is set.
1252  */
1253 int
1254 mlx5_os_get_stats_n(struct rte_eth_dev *dev)
1255 {
1256         struct ethtool_drvinfo drvinfo;
1257         struct ifreq ifr;
1258         int ret;
1259
1260         drvinfo.cmd = ETHTOOL_GDRVINFO;
1261         ifr.ifr_data = (caddr_t)&drvinfo;
1262         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1263         if (ret) {
1264                 DRV_LOG(WARNING, "port %u unable to query number of statistics",
1265                         dev->data->port_id);
1266                 return ret;
1267         }
1268         return drvinfo.n_stats;
1269 }
1270
1271 static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
1272         {
1273                 .dpdk_name = "rx_port_unicast_bytes",
1274                 .ctr_name = "rx_vport_unicast_bytes",
1275         },
1276         {
1277                 .dpdk_name = "rx_port_multicast_bytes",
1278                 .ctr_name = "rx_vport_multicast_bytes",
1279         },
1280         {
1281                 .dpdk_name = "rx_port_broadcast_bytes",
1282                 .ctr_name = "rx_vport_broadcast_bytes",
1283         },
1284         {
1285                 .dpdk_name = "rx_port_unicast_packets",
1286                 .ctr_name = "rx_vport_unicast_packets",
1287         },
1288         {
1289                 .dpdk_name = "rx_port_multicast_packets",
1290                 .ctr_name = "rx_vport_multicast_packets",
1291         },
1292         {
1293                 .dpdk_name = "rx_port_broadcast_packets",
1294                 .ctr_name = "rx_vport_broadcast_packets",
1295         },
1296         {
1297                 .dpdk_name = "tx_port_unicast_bytes",
1298                 .ctr_name = "tx_vport_unicast_bytes",
1299         },
1300         {
1301                 .dpdk_name = "tx_port_multicast_bytes",
1302                 .ctr_name = "tx_vport_multicast_bytes",
1303         },
1304         {
1305                 .dpdk_name = "tx_port_broadcast_bytes",
1306                 .ctr_name = "tx_vport_broadcast_bytes",
1307         },
1308         {
1309                 .dpdk_name = "tx_port_unicast_packets",
1310                 .ctr_name = "tx_vport_unicast_packets",
1311         },
1312         {
1313                 .dpdk_name = "tx_port_multicast_packets",
1314                 .ctr_name = "tx_vport_multicast_packets",
1315         },
1316         {
1317                 .dpdk_name = "tx_port_broadcast_packets",
1318                 .ctr_name = "tx_vport_broadcast_packets",
1319         },
1320         {
1321                 .dpdk_name = "rx_wqe_err",
1322                 .ctr_name = "rx_wqe_err",
1323         },
1324         {
1325                 .dpdk_name = "rx_crc_errors_phy",
1326                 .ctr_name = "rx_crc_errors_phy",
1327         },
1328         {
1329                 .dpdk_name = "rx_in_range_len_errors_phy",
1330                 .ctr_name = "rx_in_range_len_errors_phy",
1331         },
1332         {
1333                 .dpdk_name = "rx_symbol_err_phy",
1334                 .ctr_name = "rx_symbol_err_phy",
1335         },
1336         {
1337                 .dpdk_name = "tx_errors_phy",
1338                 .ctr_name = "tx_errors_phy",
1339         },
1340         {
1341                 .dpdk_name = "rx_out_of_buffer",
1342                 .ctr_name = "out_of_buffer",
1343                 .dev = 1,
1344         },
1345         {
1346                 .dpdk_name = "tx_packets_phy",
1347                 .ctr_name = "tx_packets_phy",
1348         },
1349         {
1350                 .dpdk_name = "rx_packets_phy",
1351                 .ctr_name = "rx_packets_phy",
1352         },
1353         {
1354                 .dpdk_name = "tx_discards_phy",
1355                 .ctr_name = "tx_discards_phy",
1356         },
1357         {
1358                 .dpdk_name = "rx_discards_phy",
1359                 .ctr_name = "rx_discards_phy",
1360         },
1361         {
1362                 .dpdk_name = "tx_bytes_phy",
1363                 .ctr_name = "tx_bytes_phy",
1364         },
1365         {
1366                 .dpdk_name = "rx_bytes_phy",
1367                 .ctr_name = "rx_bytes_phy",
1368         },
1369         /* Representor only */
1370         {
1371                 .dpdk_name = "rx_packets",
1372                 .ctr_name = "vport_rx_packets",
1373         },
1374         {
1375                 .dpdk_name = "rx_bytes",
1376                 .ctr_name = "vport_rx_bytes",
1377         },
1378         {
1379                 .dpdk_name = "tx_packets",
1380                 .ctr_name = "vport_tx_packets",
1381         },
1382         {
1383                 .dpdk_name = "tx_bytes",
1384                 .ctr_name = "vport_tx_bytes",
1385         },
1386 };
1387
1388 static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
1389
1390 /**
1391  * Init the structures to read device counters.
1392  *
1393  * @param dev
1394  *   Pointer to Ethernet device.
1395  */
1396 void
1397 mlx5_os_stats_init(struct rte_eth_dev *dev)
1398 {
1399         struct mlx5_priv *priv = dev->data->dev_private;
1400         struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1401         struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
1402         unsigned int i;
1403         unsigned int j;
1404         struct ifreq ifr;
1405         struct ethtool_gstrings *strings = NULL;
1406         unsigned int dev_stats_n;
1407         unsigned int str_sz;
1408         int ret;
1409
1410         /* So that it won't aggregate for each init. */
1411         xstats_ctrl->mlx5_stats_n = 0;
1412         ret = mlx5_os_get_stats_n(dev);
1413         if (ret < 0) {
1414                 DRV_LOG(WARNING, "port %u no extended statistics available",
1415                         dev->data->port_id);
1416                 return;
1417         }
1418         dev_stats_n = ret;
1419         /* Allocate memory to grab stat names and values. */
1420         str_sz = dev_stats_n * ETH_GSTRING_LEN;
1421         strings = (struct ethtool_gstrings *)
1422                   mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1423                               SOCKET_ID_ANY);
1424         if (!strings) {
1425                 DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
1426                      dev->data->port_id);
1427                 return;
1428         }
1429         strings->cmd = ETHTOOL_GSTRINGS;
1430         strings->string_set = ETH_SS_STATS;
1431         strings->len = dev_stats_n;
1432         ifr.ifr_data = (caddr_t)strings;
1433         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1434         if (ret) {
1435                 DRV_LOG(WARNING, "port %u unable to get statistic names",
1436                         dev->data->port_id);
1437                 goto free;
1438         }
1439         for (i = 0; i != dev_stats_n; ++i) {
1440                 const char *curr_string = (const char *)
1441                         &strings->data[i * ETH_GSTRING_LEN];
1442
1443                 for (j = 0; j != xstats_n; ++j) {
1444                         if (!strcmp(mlx5_counters_init[j].ctr_name,
1445                                     curr_string)) {
1446                                 unsigned int idx = xstats_ctrl->mlx5_stats_n++;
1447
1448                                 xstats_ctrl->dev_table_idx[idx] = i;
1449                                 xstats_ctrl->info[idx] = mlx5_counters_init[j];
1450                                 break;
1451                         }
1452                 }
1453         }
1454         /* Add dev counters. */
1455         for (i = 0; i != xstats_n; ++i) {
1456                 if (mlx5_counters_init[i].dev) {
1457                         unsigned int idx = xstats_ctrl->mlx5_stats_n++;
1458
1459                         xstats_ctrl->info[idx] = mlx5_counters_init[i];
1460                         xstats_ctrl->hw_stats[idx] = 0;
1461                 }
1462         }
1463         MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
1464         xstats_ctrl->stats_n = dev_stats_n;
1465         /* Copy to base at first time. */
1466         ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
1467         if (ret)
1468                 DRV_LOG(ERR, "port %u cannot read device counters: %s",
1469                         dev->data->port_id, strerror(rte_errno));
1470         mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
1471         stats_ctrl->imissed = 0;
1472 free:
1473         mlx5_free(strings);
1474 }
1475
1476 /**
1477  * Get MAC address by querying netdevice.
1478  *
1479  * @param[in] dev
1480  *   Pointer to Ethernet device.
1481  * @param[out] mac
1482  *   MAC address output buffer.
1483  *
1484  * @return
1485  *   0 on success, a negative errno value otherwise and rte_errno is set.
1486  */
1487 int
1488 mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
1489 {
1490         struct ifreq request;
1491         int ret;
1492
1493         ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
1494         if (ret)
1495                 return ret;
1496         memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1497         return 0;
1498 }
1499