net/mlx5: report max number of mbuf segments
[dpdk.git] drivers/net/mlx5/mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <assert.h>
8 #include <inttypes.h>
9 #include <unistd.h>
10 #include <stdbool.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <stdlib.h>
15 #include <errno.h>
16 #include <dirent.h>
17 #include <net/if.h>
18 #include <sys/ioctl.h>
19 #include <sys/socket.h>
20 #include <netinet/in.h>
21 #include <linux/ethtool.h>
22 #include <linux/sockios.h>
23 #include <fcntl.h>
24 #include <stdalign.h>
25 #include <sys/un.h>
26 #include <time.h>
27
28 #include <rte_atomic.h>
29 #include <rte_ethdev_driver.h>
30 #include <rte_bus_pci.h>
31 #include <rte_mbuf.h>
32 #include <rte_common.h>
33 #include <rte_interrupts.h>
34 #include <rte_malloc.h>
35 #include <rte_string_fns.h>
36 #include <rte_rwlock.h>
37 #include <rte_cycles.h>
38
39 #include "mlx5.h"
40 #include "mlx5_glue.h"
41 #include "mlx5_rxtx.h"
42 #include "mlx5_utils.h"
43
44 /* Supported speed values found in /usr/include/linux/ethtool.h */
45 #ifndef HAVE_SUPPORTED_40000baseKR4_Full
46 #define SUPPORTED_40000baseKR4_Full (1 << 23)
47 #endif
48 #ifndef HAVE_SUPPORTED_40000baseCR4_Full
49 #define SUPPORTED_40000baseCR4_Full (1 << 24)
50 #endif
51 #ifndef HAVE_SUPPORTED_40000baseSR4_Full
52 #define SUPPORTED_40000baseSR4_Full (1 << 25)
53 #endif
54 #ifndef HAVE_SUPPORTED_40000baseLR4_Full
55 #define SUPPORTED_40000baseLR4_Full (1 << 26)
56 #endif
57 #ifndef HAVE_SUPPORTED_56000baseKR4_Full
58 #define SUPPORTED_56000baseKR4_Full (1 << 27)
59 #endif
60 #ifndef HAVE_SUPPORTED_56000baseCR4_Full
61 #define SUPPORTED_56000baseCR4_Full (1 << 28)
62 #endif
63 #ifndef HAVE_SUPPORTED_56000baseSR4_Full
64 #define SUPPORTED_56000baseSR4_Full (1 << 29)
65 #endif
66 #ifndef HAVE_SUPPORTED_56000baseLR4_Full
67 #define SUPPORTED_56000baseLR4_Full (1 << 30)
68 #endif
69
70 /* Add defines in case the running kernel is not the same as user headers. */
71 #ifndef ETHTOOL_GLINKSETTINGS
72 struct ethtool_link_settings {
73         uint32_t cmd;
74         uint32_t speed;
75         uint8_t duplex;
76         uint8_t port;
77         uint8_t phy_address;
78         uint8_t autoneg;
79         uint8_t mdio_support;
80         uint8_t eth_tp_mdix;
81         uint8_t eth_tp_mdix_ctrl;
82         int8_t link_mode_masks_nwords;
83         uint32_t reserved[8];
84         uint32_t link_mode_masks[];
85 };
86
87 #define ETHTOOL_GLINKSETTINGS 0x0000004c
88 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
89 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
90 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
91 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
92 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
93 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
94 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
95 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
96 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
97 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
98 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
99 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
100 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
101 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
102 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
103 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
104 #endif
105 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
106 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
107 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
108 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
109 #endif
110 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
111 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
112 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
113 #endif
114 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
115 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
116 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
117 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
118 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
119 #endif
120
121 /**
122  * Get master interface name from Infiniband device path.
123  *
124  * @param[in] ibdev_path
125  *   Pointer to Infiniband device path.
126  * @param[out] ifname
127  *   Interface name output buffer.
128  *
129  * @return
130  *   0 on success, a negative errno value otherwise and rte_errno is set.
131  */
132 int
133 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
134 {
135         DIR *dir;
136         struct dirent *dent;
137         unsigned int dev_type = 0;
138         unsigned int dev_port_prev = ~0u;
139         char match[IF_NAMESIZE] = "";
140
141         assert(ibdev_path);
142         {
143                 MKSTR(path, "%s/device/net", ibdev_path);
144
145                 dir = opendir(path);
146                 if (dir == NULL) {
147                         rte_errno = errno;
148                         return -rte_errno;
149                 }
150         }
151         while ((dent = readdir(dir)) != NULL) {
152                 char *name = dent->d_name;
153                 FILE *file;
154                 unsigned int dev_port;
155                 int r;
156
157                 if ((name[0] == '.') &&
158                     ((name[1] == '\0') ||
159                      ((name[1] == '.') && (name[2] == '\0'))))
160                         continue;
161
162                 MKSTR(path, "%s/device/net/%s/%s",
163                       ibdev_path, name,
164                       (dev_type ? "dev_id" : "dev_port"));
165
166                 file = fopen(path, "rb");
167                 if (file == NULL) {
168                         if (errno != ENOENT)
169                                 continue;
170                         /*
171                          * Switch to dev_id when dev_port does not exist as
172                          * is the case with Linux kernel versions < 3.15.
173                          */
174 try_dev_id:
175                         match[0] = '\0';
176                         if (dev_type)
177                                 break;
178                         dev_type = 1;
179                         dev_port_prev = ~0u;
180                         rewinddir(dir);
181                         continue;
182                 }
183                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
184                 fclose(file);
185                 if (r != 1)
186                         continue;
187                 /*
188                  * Switch to dev_id when dev_port returns the same value for
189                  * all ports. May happen when using a MOFED release older than
190                  * 3.0 with a Linux kernel >= 3.15.
191                  */
192                 if (dev_port == dev_port_prev)
193                         goto try_dev_id;
194                 dev_port_prev = dev_port;
195                 if (dev_port == 0)
196                         strlcpy(match, name, sizeof(match));
197         }
198         closedir(dir);
199         if (match[0] == '\0') {
200                 rte_errno = ENOENT;
201                 return -rte_errno;
202         }
203         strncpy(*ifname, match, sizeof(*ifname));
204         return 0;
205 }
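/*
 * Illustrative use of mlx5_get_master_ifname() (not part of the driver;
 * the sysfs path below is only a made-up example):
 *
 *     char ifname[IF_NAMESIZE];
 *
 *     if (mlx5_get_master_ifname("/sys/class/infiniband/mlx5_0",
 *                                &ifname) == 0)
 *         printf("master netdev: %s\n", ifname);
 */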
206
207 /**
208  * Get interface name from private structure.
209  *
210  * This is a port representor-aware version of mlx5_get_master_ifname().
211  *
212  * @param[in] dev
213  *   Pointer to Ethernet device.
214  * @param[out] ifname
215  *   Interface name output buffer.
216  *
217  * @return
218  *   0 on success, a negative errno value otherwise and rte_errno is set.
219  */
220 int
221 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
222 {
223         struct mlx5_priv *priv = dev->data->dev_private;
224         unsigned int ifindex;
225
226         assert(priv);
227         assert(priv->sh);
228         ifindex = priv->nl_socket_rdma >= 0 ?
229                   mlx5_nl_ifindex(priv->nl_socket_rdma,
230                                   priv->sh->ibdev_name,
231                                   priv->ibv_port) : 0;
232         if (!ifindex) {
233                 if (!priv->representor)
234                         return mlx5_get_master_ifname(priv->sh->ibdev_path,
235                                                       ifname);
236                 rte_errno = ENXIO;
237                 return -rte_errno;
238         }
239         if (if_indextoname(ifindex, &(*ifname)[0]))
240                 return 0;
241         rte_errno = errno;
242         return -rte_errno;
243 }
244
245 /**
246  * Get interface name for the specified device, using the extra base
247  * device resources to perform Netlink requests.
248  *
249  * This is a port representor-aware version of mlx5_get_master_ifname().
250  *
251  * @param[in] base
252  *   Pointer to Ethernet device to use Netlink socket from
253  *   to perform requests.
254  * @param[in] dev
255  *   Pointer to Ethernet device.
256  * @param[out] ifname
257  *   Interface name output buffer.
258  *
259  * @return
260  *   0 on success, a negative errno value otherwise and rte_errno is set.
261  */
262 int
263 mlx5_get_ifname_base(const struct rte_eth_dev *base,
264                      const struct rte_eth_dev *dev,
265                      char (*ifname)[IF_NAMESIZE])
266 {
267         struct mlx5_priv *priv = dev->data->dev_private;
268         struct mlx5_priv *priv_base = base->data->dev_private;
269         unsigned int ifindex;
270
271         assert(priv);
272         assert(priv->sh);
273         assert(priv_base);
274         ifindex = priv_base->nl_socket_rdma >= 0 ?
275                   mlx5_nl_ifindex(priv_base->nl_socket_rdma,
276                                   priv->sh->ibdev_name,
277                                   priv->ibv_port) : 0;
278         if (!ifindex) {
279                 if (!priv->representor)
280                         return mlx5_get_master_ifname(priv->sh->ibdev_path,
281                                                       ifname);
282                 rte_errno = ENXIO;
283                 return -rte_errno;
284         }
285         if (if_indextoname(ifindex, &(*ifname)[0]))
286                 return 0;
287         rte_errno = errno;
288         return -rte_errno;
289 }
290 /**
291  * Get the interface index from device name.
292  *
293  * @param[in] dev
294  *   Pointer to Ethernet device.
295  *
296  * @return
297  *   Nonzero interface index on success, zero otherwise and rte_errno is set.
298  */
299 unsigned int
300 mlx5_ifindex(const struct rte_eth_dev *dev)
301 {
302         char ifname[IF_NAMESIZE];
303         unsigned int ifindex;
304
305         if (mlx5_get_ifname(dev, &ifname))
306                 return 0;
307         ifindex = if_nametoindex(ifname);
308         if (!ifindex)
309                 rte_errno = errno;
310         return ifindex;
311 }
312
313 /**
314  * Perform ifreq ioctl() on associated Ethernet device.
315  *
316  * @param[in] dev
317  *   Pointer to Ethernet device.
318  * @param req
319  *   Request number to pass to ioctl().
320  * @param[out] ifr
321  *   Interface request structure output buffer.
322  *
323  * @return
324  *   0 on success, a negative errno value otherwise and rte_errno is set.
325  */
326 int
327 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
328 {
329         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
330         int ret = 0;
331
332         if (sock == -1) {
333                 rte_errno = errno;
334                 return -rte_errno;
335         }
336         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
337         if (ret)
338                 goto error;
339         ret = ioctl(sock, req, ifr);
340         if (ret == -1) {
341                 rte_errno = errno;
342                 goto error;
343         }
344         close(sock);
345         return 0;
346 error:
347         close(sock);
348         return -rte_errno;
349 }
350
351 /**
352  * Perform ifreq ioctl() on the specified Ethernet device.
353  * The ifindex, name and other attributes are requested
354  * via the base device to avoid sharing the specified
355  * device's Netlink socket (which is not thread-safe).
356  *
357  * @param[in] base
358  *   Pointer to Ethernet device to get dev attributes.
359  * @param[in] dev
360  *   Pointer to Ethernet device to perform ioctl.
361  * @param req
362  *   Request number to pass to ioctl().
363  * @param[out] ifr
364  *   Interface request structure output buffer.
365  *
366  * @return
367  *   0 on success, a negative errno value otherwise and rte_errno is set.
368  */
369 int
370 mlx5_ifreq_base(const struct rte_eth_dev *base,
371                 const struct rte_eth_dev *dev,
372                 int req, struct ifreq *ifr)
373 {
374         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
375         int ret = 0;
376
377         if (sock == -1) {
378                 rte_errno = errno;
379                 return -rte_errno;
380         }
381         ret = mlx5_get_ifname_base(base, dev, &ifr->ifr_name);
382         if (ret)
383                 goto error;
384         ret = ioctl(sock, req, ifr);
385         if (ret == -1) {
386                 rte_errno = errno;
387                 goto error;
388         }
389         close(sock);
390         return 0;
391 error:
392         close(sock);
393         return -rte_errno;
394 }
395
396 /**
397  * Get device MTU.
398  *
399  * @param dev
400  *   Pointer to Ethernet device.
401  * @param[out] mtu
402  *   MTU value output buffer.
403  *
404  * @return
405  *   0 on success, a negative errno value otherwise and rte_errno is set.
406  */
407 int
408 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
409 {
410         struct ifreq request;
411         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
412
413         if (ret)
414                 return ret;
415         *mtu = request.ifr_mtu;
416         return 0;
417 }
418
419 /**
420  * Set device MTU.
421  *
422  * @param dev
423  *   Pointer to Ethernet device.
424  * @param mtu
425  *   MTU value to set.
426  *
427  * @return
428  *   0 on success, a negative errno value otherwise and rte_errno is set.
429  */
430 static int
431 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
432 {
433         struct ifreq request = { .ifr_mtu = mtu, };
434
435         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
436 }
437
438 /**
439  * Set device flags.
440  *
441  * @param dev
442  *   Pointer to Ethernet device.
443  * @param keep
444  *   Bitmask for flags that must remain untouched.
445  * @param flags
446  *   Bitmask for flags to modify.
447  *
448  * @return
449  *   0 on success, a negative errno value otherwise and rte_errno is set.
450  */
451 int
452 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
453 {
454         struct ifreq request;
455         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
456
457         if (ret)
458                 return ret;
459         request.ifr_flags &= keep;
460         request.ifr_flags |= flags & ~keep;
461         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
462 }
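/*
 * Illustrative call (shown here only as an example of the keep/flags
 * semantics): bring the interface up while leaving all other flags
 * untouched:
 *
 *     mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
 */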
463
464 /**
465  * DPDK callback for Ethernet device configuration.
466  *
467  * @param dev
468  *   Pointer to Ethernet device structure.
469  *
470  * @return
471  *   0 on success, a negative errno value otherwise and rte_errno is set.
472  */
473 int
474 mlx5_dev_configure(struct rte_eth_dev *dev)
475 {
476         struct mlx5_priv *priv = dev->data->dev_private;
477         unsigned int rxqs_n = dev->data->nb_rx_queues;
478         unsigned int txqs_n = dev->data->nb_tx_queues;
479         unsigned int i;
480         unsigned int j;
481         unsigned int reta_idx_n;
482         const uint8_t use_app_rss_key =
483                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
484         int ret = 0;
485
486         if (use_app_rss_key &&
487             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
488              MLX5_RSS_HASH_KEY_LEN)) {
489                 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
490                         dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
491                 rte_errno = EINVAL;
492                 return -rte_errno;
493         }
494         priv->rss_conf.rss_key =
495                 rte_realloc(priv->rss_conf.rss_key,
496                             MLX5_RSS_HASH_KEY_LEN, 0);
497         if (!priv->rss_conf.rss_key) {
498                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
499                         dev->data->port_id, rxqs_n);
500                 rte_errno = ENOMEM;
501                 return -rte_errno;
502         }
503         memcpy(priv->rss_conf.rss_key,
504                use_app_rss_key ?
505                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
506                rss_hash_default_key,
507                MLX5_RSS_HASH_KEY_LEN);
508         priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
509         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
510         priv->rxqs = (void *)dev->data->rx_queues;
511         priv->txqs = (void *)dev->data->tx_queues;
512         if (txqs_n != priv->txqs_n) {
513                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
514                         dev->data->port_id, priv->txqs_n, txqs_n);
515                 priv->txqs_n = txqs_n;
516         }
517         if (rxqs_n > priv->config.ind_table_max_size) {
518                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
519                         dev->data->port_id, rxqs_n);
520                 rte_errno = EINVAL;
521                 return -rte_errno;
522         }
523         if (rxqs_n != priv->rxqs_n) {
524                 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
525                         dev->data->port_id, priv->rxqs_n, rxqs_n);
526                 priv->rxqs_n = rxqs_n;
527                 /*
528                  * If the requested number of RX queues is not a power of two,
529                  * use the maximum indirection table size for better balancing.
530                  * The result is always rounded to the next power of two.
531                  */
532                 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
533                                              priv->config.ind_table_max_size :
534                                              rxqs_n));
535                 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
536                 if (ret)
537                         return ret;
538                 /*
539                  * When the number of RX queues is not a power of two,
540                  * the remaining table entries are padded with reused WQs
541                  * and hashes are not spread uniformly.
542                  */
543                 for (i = 0, j = 0; (i != reta_idx_n); ++i) {
544                         (*priv->reta_idx)[i] = j;
545                         if (++j == rxqs_n)
546                                 j = 0;
547                 }
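                /*
                 * Worked example with illustrative values: for
                 * ind_table_max_size = 512 and rxqs_n = 6 (not a power
                 * of two), reta_idx_n is 512 and the table is filled
                 * as 0,1,2,3,4,5,0,1,... so queues 0 and 1 get 86
                 * entries each and the remaining queues 85.
                 */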
548         }
549         ret = mlx5_proc_priv_init(dev);
550         if (ret)
551                 return ret;
552         return 0;
553 }
554
555 /**
556  * Sets default tuning parameters.
557  *
558  * @param dev
559  *   Pointer to Ethernet device.
560  * @param[out] info
561  *   Info structure output buffer.
562  */
563 static void
564 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
565 {
566         struct mlx5_priv *priv = dev->data->dev_private;
567
568         /* Minimum CPU utilization. */
569         info->default_rxportconf.ring_size = 256;
570         info->default_txportconf.ring_size = 256;
571         info->default_rxportconf.burst_size = 64;
572         info->default_txportconf.burst_size = 64;
573         if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
574                 info->default_rxportconf.nb_queues = 16;
575                 info->default_txportconf.nb_queues = 16;
576                 if (dev->data->nb_rx_queues > 2 ||
577                     dev->data->nb_tx_queues > 2) {
578                         /* Max Throughput. */
579                         info->default_rxportconf.ring_size = 2048;
580                         info->default_txportconf.ring_size = 2048;
581                 }
582         } else {
583                 info->default_rxportconf.nb_queues = 8;
584                 info->default_txportconf.nb_queues = 8;
585                 if (dev->data->nb_rx_queues > 2 ||
586                     dev->data->nb_tx_queues > 2) {
587                         /* Max Throughput. */
588                         info->default_rxportconf.ring_size = 4096;
589                         info->default_txportconf.ring_size = 4096;
590                 }
591         }
592 }
593
594 /**
595  * Sets Tx mbuf limiting parameters.
596  *
597  * @param dev
598  *   Pointer to Ethernet device.
599  * @param[out] info
600  *   Info structure output buffer.
601  */
602 static void
603 mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
604 {
605         struct mlx5_priv *priv = dev->data->dev_private;
606         struct mlx5_dev_config *config = &priv->config;
607         unsigned int inlen;
608         uint16_t nb_max;
609
610         inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
611                 MLX5_SEND_DEF_INLINE_LEN :
612                 (unsigned int)config->txq_inline_max;
613         assert(config->txq_inline_min >= 0);
614         inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
615         inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
616                                MLX5_ESEG_MIN_INLINE_SIZE -
617                                MLX5_WQE_CSEG_SIZE -
618                                MLX5_WQE_ESEG_SIZE -
619                                MLX5_WQE_DSEG_SIZE * 2);
620         nb_max = (MLX5_WQE_SIZE_MAX +
621                   MLX5_ESEG_MIN_INLINE_SIZE -
622                   MLX5_WQE_CSEG_SIZE -
623                   MLX5_WQE_ESEG_SIZE -
624                   MLX5_WQE_DSEG_SIZE -
625                   inlen) / MLX5_WSEG_SIZE;
626         info->tx_desc_lim.nb_seg_max = nb_max;
627         info->tx_desc_lim.nb_mtu_seg_max = nb_max;
628 }
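/*
 * Worked example of the limit above (numbers are illustrative
 * placeholders, not the real macro values): assuming
 * MLX5_WQE_SIZE_MAX = 1024, MLX5_ESEG_MIN_INLINE_SIZE = 18,
 * MLX5_WQE_CSEG_SIZE = MLX5_WQE_ESEG_SIZE = MLX5_WQE_DSEG_SIZE = 16,
 * MLX5_WSEG_SIZE = 16 and an inline length of 18 bytes, the reported
 * limit would be nb_max = (1024 + 18 - 16 - 16 - 16 - 18) / 16 = 61
 * mbuf segments.
 */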
629
630 /**
631  * DPDK callback to get information about the device.
632  *
633  * @param dev
634  *   Pointer to Ethernet device structure.
635  * @param[out] info
636  *   Info structure output buffer.
637  */
638 void
639 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
640 {
641         struct mlx5_priv *priv = dev->data->dev_private;
642         struct mlx5_dev_config *config = &priv->config;
643         unsigned int max;
644         char ifname[IF_NAMESIZE];
645
646         /* FIXME: we should ask the device for these values. */
647         info->min_rx_bufsize = 32;
648         info->max_rx_pktlen = 65536;
649         /*
650          * Since we need one CQ per QP, the limit is the minimum number
651          * between the two values.
652          */
653         max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
654                       priv->sh->device_attr.orig_attr.max_qp);
655         /* max_rx_queues is uint16_t, cap the value to avoid truncation. */
656         if (max >= 65535)
657                 max = 65535;
658         info->max_rx_queues = max;
659         info->max_tx_queues = max;
660         info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
661         info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
662         info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
663                                  info->rx_queue_offload_capa);
664         info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
665         if (mlx5_get_ifname(dev, &ifname) == 0)
666                 info->if_index = if_nametoindex(ifname);
667         info->reta_size = priv->reta_idx_n ?
668                 priv->reta_idx_n : config->ind_table_max_size;
669         info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
670         info->speed_capa = priv->link_speed_capa;
671         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
672         mlx5_set_default_params(dev, info);
673         mlx5_set_txlimit_params(dev, info);
674         info->switch_info.name = dev->data->name;
675         info->switch_info.domain_id = priv->domain_id;
676         info->switch_info.port_id = priv->representor_id;
677         if (priv->representor) {
678                 unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
679                 uint16_t port_id[i];
680
681                 i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
682                 while (i--) {
683                         struct mlx5_priv *opriv =
684                                 rte_eth_devices[port_id[i]].data->dev_private;
685
686                         if (!opriv ||
687                             opriv->representor ||
688                             opriv->domain_id != priv->domain_id)
689                                 continue;
690                         /*
691                          * Override switch name with that of the master
692                          * device.
693                          */
694                         info->switch_info.name = opriv->dev_data->name;
695                         break;
696                 }
697         }
698 }
699
700 /**
701  * Get device current raw clock counter
702  *
703  * @param dev
704  *   Pointer to Ethernet device structure.
705  * @param[out] clock
706  *   Current raw clock counter of the device.
707  *
708  * @return
709  *   0 if the clock has been read correctly,
710  *   the value of errno in case of error.
711  */
712 int
713 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
714 {
715         struct mlx5_priv *priv = dev->data->dev_private;
716         struct ibv_context *ctx = priv->sh->ctx;
717         struct ibv_values_ex values;
718         int err = 0;
719
720         values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
721         err = mlx5_glue->query_rt_values_ex(ctx, &values);
722         if (err != 0) {
723                 DRV_LOG(WARNING, "Could not query the clock!");
724                 return err;
725         }
726         *clock = values.raw_clock.tv_nsec;
727         return 0;
728 }
729
730 /**
731  * Get firmware version of a device.
732  *
733  * @param dev
734  *   Ethernet device port.
735  * @param fw_ver
736  *   String output allocated by caller.
737  * @param fw_size
738  *   Size of the output string, including terminating null byte.
739  *
740  * @return
741  *   0 on success, or the size of the non-truncated string if fw_size is too small.
742  */
743 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
744 {
745         struct mlx5_priv *priv = dev->data->dev_private;
746         struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
747         size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
748
749         if (fw_size < size)
750                 return size;
751         if (fw_ver != NULL)
752                 strlcpy(fw_ver, attr->fw_ver, fw_size);
753         return 0;
754 }
755
756 /**
757  * Get supported packet types.
758  *
759  * @param dev
760  *   Pointer to Ethernet device structure.
761  *
762  * @return
763  *   A pointer to the supported Packet types array.
764  */
765 const uint32_t *
766 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
767 {
768         static const uint32_t ptypes[] = {
769                 /* refers to rxq_cq_to_pkt_type() */
770                 RTE_PTYPE_L2_ETHER,
771                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
772                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
773                 RTE_PTYPE_L4_NONFRAG,
774                 RTE_PTYPE_L4_FRAG,
775                 RTE_PTYPE_L4_TCP,
776                 RTE_PTYPE_L4_UDP,
777                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
778                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
779                 RTE_PTYPE_INNER_L4_NONFRAG,
780                 RTE_PTYPE_INNER_L4_FRAG,
781                 RTE_PTYPE_INNER_L4_TCP,
782                 RTE_PTYPE_INNER_L4_UDP,
783                 RTE_PTYPE_UNKNOWN
784         };
785
786         if (dev->rx_pkt_burst == mlx5_rx_burst ||
787             dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
788             dev->rx_pkt_burst == mlx5_rx_burst_vec)
789                 return ptypes;
790         return NULL;
791 }
792
793 /**
794  * Retrieve the master device for representor in the same switch domain.
795  *
796  * @param dev
797  *   Pointer to representor Ethernet device structure.
798  *
799  * @return
800  *   Master device structure on success, NULL otherwise.
801  */
802
803 static struct rte_eth_dev *
804 mlx5_find_master_dev(struct rte_eth_dev *dev)
805 {
806         struct mlx5_priv *priv;
807         uint16_t port_id;
808         uint16_t domain_id;
809
810         priv = dev->data->dev_private;
811         domain_id = priv->domain_id;
812         assert(priv->representor);
813         RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) {
814                 priv = rte_eth_devices[port_id].data->dev_private;
815                 if (priv &&
816                     priv->master &&
817                     priv->domain_id == domain_id)
818                         return &rte_eth_devices[port_id];
819         }
820         return NULL;
821 }
822
823 /**
824  * DPDK callback to retrieve physical link information.
825  *
826  * @param dev
827  *   Pointer to Ethernet device structure.
828  * @param[out] link
829  *   Storage for current link status.
830  *
831  * @return
832  *   0 on success, a negative errno value otherwise and rte_errno is set.
833  */
834 static int
835 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
836                                struct rte_eth_link *link)
837 {
838         struct mlx5_priv *priv = dev->data->dev_private;
839         struct ethtool_cmd edata = {
840                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
841         };
842         struct ifreq ifr;
843         struct rte_eth_link dev_link;
844         int link_speed = 0;
845         int ret;
846
847         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
848         if (ret) {
849                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
850                         dev->data->port_id, strerror(rte_errno));
851                 return ret;
852         }
853         dev_link = (struct rte_eth_link) {
854                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
855                                 (ifr.ifr_flags & IFF_RUNNING)),
856         };
857         ifr = (struct ifreq) {
858                 .ifr_data = (void *)&edata,
859         };
860         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
861         if (ret) {
862                 if (ret == -ENOTSUP && priv->representor) {
863                         struct rte_eth_dev *master;
864
865                         /*
866                          * For representors we can try to inherit link
867                          * settings from the master device. Actually
868                          * link settings do not make a lot of sense
869                          * for representors due to missing physical
870                          * link. The old kernel drivers supported
871                          * emulated settings query for representors,
872                          * the new ones do not, so we keep this
873                          * code for compatibility.
874                          */
875                         master = mlx5_find_master_dev(dev);
876                         if (master) {
877                                 ifr = (struct ifreq) {
878                                         .ifr_data = (void *)&edata,
879                                 };
880                                 /*
881                                  * Use special version of mlx5_ifreq()
882                                  * to get master device name with local
883                                  * device Netlink socket. Using master
884                                  * device Netlink socket is not thread
885                                  * safe.
886                                  */
887                                 ret = mlx5_ifreq_base(dev, master,
888                                                       SIOCETHTOOL, &ifr);
889                         }
890                 }
891                 if (ret) {
892                         DRV_LOG(WARNING,
893                                 "port %u ioctl(SIOCETHTOOL,"
894                                 " ETHTOOL_GSET) failed: %s",
895                                 dev->data->port_id, strerror(rte_errno));
896                         return ret;
897                 }
898         }
899         link_speed = ethtool_cmd_speed(&edata);
900         if (link_speed == -1)
901                 dev_link.link_speed = ETH_SPEED_NUM_NONE;
902         else
903                 dev_link.link_speed = link_speed;
904         priv->link_speed_capa = 0;
905         if (edata.supported & SUPPORTED_Autoneg)
906                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
907         if (edata.supported & (SUPPORTED_1000baseT_Full |
908                                SUPPORTED_1000baseKX_Full))
909                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
910         if (edata.supported & SUPPORTED_10000baseKR_Full)
911                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
912         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
913                                SUPPORTED_40000baseCR4_Full |
914                                SUPPORTED_40000baseSR4_Full |
915                                SUPPORTED_40000baseLR4_Full))
916                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
917         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
918                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
919         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
920                         ETH_LINK_SPEED_FIXED);
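        /*
         * A nonzero speed with the link reported down (or zero speed
         * with the link up) means the query raced with a link state
         * change; return EAGAIN so the caller can retry.
         */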
921         if (((dev_link.link_speed && !dev_link.link_status) ||
922              (!dev_link.link_speed && dev_link.link_status))) {
923                 rte_errno = EAGAIN;
924                 return -rte_errno;
925         }
926         *link = dev_link;
927         return 0;
928 }
929
930 /**
931  * Retrieve physical link information (unlocked version using new ioctl).
932  *
933  * @param dev
934  *   Pointer to Ethernet device structure.
935  * @param[out] link
936  *   Storage for current link status.
937  *
938  * @return
939  *   0 on success, a negative errno value otherwise and rte_errno is set.
940  */
941 static int
942 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
943                              struct rte_eth_link *link)
944
945 {
946         struct mlx5_priv *priv = dev->data->dev_private;
947         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
948         struct ifreq ifr;
949         struct rte_eth_link dev_link;
950         struct rte_eth_dev *master = NULL;
951         uint64_t sc;
952         int ret;
953
954         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
955         if (ret) {
956                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
957                         dev->data->port_id, strerror(rte_errno));
958                 return ret;
959         }
960         dev_link = (struct rte_eth_link) {
961                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
962                                 (ifr.ifr_flags & IFF_RUNNING)),
963         };
964         ifr = (struct ifreq) {
965                 .ifr_data = (void *)&gcmd,
966         };
967         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
968         if (ret) {
969                 if (ret == -ENOTSUP && priv->representor) {
970                         /*
971                          * For representors we can try to inherit link
972                          * settings from the master device. Actually
973                          * link settings do not make a lot of sense
974                          * for representors due to missing physical
975                          * link. The old kernel drivers supported
976                          * emulated settings query for representors,
977                          * the new ones do not, so we keep this
978                          * code for compatibility.
979                          */
980                         master = mlx5_find_master_dev(dev);
981                         if (master) {
982                                 ifr = (struct ifreq) {
983                                         .ifr_data = (void *)&gcmd,
984                                 };
985                                 /*
986                                  * Avoid using master Netlink socket.
987                                  * This is not thread-safe.
988                                  */
989                                 ret = mlx5_ifreq_base(dev, master,
990                                                       SIOCETHTOOL, &ifr);
991                         }
992                 }
993                 if (ret) {
994                         DRV_LOG(DEBUG,
995                                 "port %u ioctl(SIOCETHTOOL,"
996                                 " ETHTOOL_GLINKSETTINGS) failed: %s",
997                                 dev->data->port_id, strerror(rte_errno));
998                         return ret;
999                 }
1000
1001         }
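        /*
         * The first ETHTOOL_GLINKSETTINGS call reports the required
         * number of 32-bit mask words as a negative value; negate it
         * and repeat the request with room for the three link mode
         * masks (supported, advertising, lp_advertising).
         */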
1002         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
1003
1004         alignas(struct ethtool_link_settings)
1005         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
1006                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
1007         struct ethtool_link_settings *ecmd = (void *)data;
1008
1009         *ecmd = gcmd;
1010         ifr.ifr_data = (void *)ecmd;
1011         ret = mlx5_ifreq_base(dev, master ? master : dev, SIOCETHTOOL, &ifr);
1012         if (ret) {
1013                 DRV_LOG(DEBUG,
1014                         "port %u ioctl(SIOCETHTOOL,"
1015                         "ETHTOOL_GLINKSETTINGS) failed: %s",
1016                         dev->data->port_id, strerror(rte_errno));
1017                 return ret;
1018         }
1019         dev_link.link_speed = ecmd->speed;
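        /* Fold the first 64 bits of the supported link modes mask. */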
1020         sc = ecmd->link_mode_masks[0] |
1021                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
1022         priv->link_speed_capa = 0;
1023         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
1024                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
1025         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
1026                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
1027                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
1028         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
1029                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
1030                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
1031                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
1032         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
1033                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
1034                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
1035         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
1036                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
1037                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
1038                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
1039                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
1040         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
1041                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
1042                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
1043                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
1044                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
1045         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
1046                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
1047                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
1048                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
1049         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
1050                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
1051                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
1052         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
1053                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
1054                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
1055                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
1056                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
1057         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
1058                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
1059         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
1060                                   ETH_LINK_SPEED_FIXED);
1061         if (((dev_link.link_speed && !dev_link.link_status) ||
1062              (!dev_link.link_speed && dev_link.link_status))) {
1063                 rte_errno = EAGAIN;
1064                 return -rte_errno;
1065         }
1066         *link = dev_link;
1067         return 0;
1068 }
1069
1070 /**
1071  * DPDK callback to retrieve physical link information.
1072  *
1073  * @param dev
1074  *   Pointer to Ethernet device structure.
1075  * @param wait_to_complete
1076  *   Wait for request completion.
1077  *
1078  * @return
1079  *   0 if link status was not updated, positive if it was, a negative errno
1080  *   value otherwise and rte_errno is set.
1081  */
1082 int
1083 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
1084 {
1085         int ret;
1086         struct rte_eth_link dev_link;
1087         time_t start_time = time(NULL);
1088
1089         do {
1090                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
1091                 if (ret == -ENOTSUP)
1092                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
1093                 if (ret == 0)
1094                         break;
1095                 /* Handle wait to complete situation. */
1096                 if (wait_to_complete && ret == -EAGAIN) {
1097                         if (abs((int)difftime(time(NULL), start_time)) <
1098                             MLX5_LINK_STATUS_TIMEOUT) {
1099                                 usleep(0);
1100                                 continue;
1101                         } else {
1102                                 rte_errno = EBUSY;
1103                                 return -rte_errno;
1104                         }
1105                 } else if (ret < 0) {
1106                         return ret;
1107                 }
1108         } while (wait_to_complete);
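        /* Report a nonzero value only if the link information changed. */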
1109         ret = !!memcmp(&dev->data->dev_link, &dev_link,
1110                        sizeof(struct rte_eth_link));
1111         dev->data->dev_link = dev_link;
1112         return ret;
1113 }
1114
1115 /**
1116  * DPDK callback to change the MTU.
1117  *
1118  * @param dev
1119  *   Pointer to Ethernet device structure.
1120  * @param mtu
1121  *   New MTU.
1122  *
1123  * @return
1124  *   0 on success, a negative errno value otherwise and rte_errno is set.
1125  */
1126 int
1127 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1128 {
1129         struct mlx5_priv *priv = dev->data->dev_private;
1130         uint16_t kern_mtu = 0;
1131         int ret;
1132
1133         ret = mlx5_get_mtu(dev, &kern_mtu);
1134         if (ret)
1135                 return ret;
1136         /* Set kernel interface MTU first. */
1137         ret = mlx5_set_mtu(dev, mtu);
1138         if (ret)
1139                 return ret;
1140         ret = mlx5_get_mtu(dev, &kern_mtu);
1141         if (ret)
1142                 return ret;
1143         if (kern_mtu == mtu) {
1144                 priv->mtu = mtu;
1145                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
1146                         dev->data->port_id, mtu);
1147                 return 0;
1148         }
1149         rte_errno = EAGAIN;
1150         return -rte_errno;
1151 }
1152
1153 /**
1154  * DPDK callback to get flow control status.
1155  *
1156  * @param dev
1157  *   Pointer to Ethernet device structure.
1158  * @param[out] fc_conf
1159  *   Flow control output buffer.
1160  *
1161  * @return
1162  *   0 on success, a negative errno value otherwise and rte_errno is set.
1163  */
1164 int
1165 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1166 {
1167         struct ifreq ifr;
1168         struct ethtool_pauseparam ethpause = {
1169                 .cmd = ETHTOOL_GPAUSEPARAM
1170         };
1171         int ret;
1172
1173         ifr.ifr_data = (void *)&ethpause;
1174         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1175         if (ret) {
1176                 DRV_LOG(WARNING,
1177                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
1178                         " %s",
1179                         dev->data->port_id, strerror(rte_errno));
1180                 return ret;
1181         }
1182         fc_conf->autoneg = ethpause.autoneg;
1183         if (ethpause.rx_pause && ethpause.tx_pause)
1184                 fc_conf->mode = RTE_FC_FULL;
1185         else if (ethpause.rx_pause)
1186                 fc_conf->mode = RTE_FC_RX_PAUSE;
1187         else if (ethpause.tx_pause)
1188                 fc_conf->mode = RTE_FC_TX_PAUSE;
1189         else
1190                 fc_conf->mode = RTE_FC_NONE;
1191         return 0;
1192 }
1193
1194 /**
1195  * DPDK callback to modify flow control parameters.
1196  *
1197  * @param dev
1198  *   Pointer to Ethernet device structure.
1199  * @param[in] fc_conf
1200  *   Flow control parameters.
1201  *
1202  * @return
1203  *   0 on success, a negative errno value otherwise and rte_errno is set.
1204  */
1205 int
1206 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1207 {
1208         struct ifreq ifr;
1209         struct ethtool_pauseparam ethpause = {
1210                 .cmd = ETHTOOL_SPAUSEPARAM
1211         };
1212         int ret;
1213
1214         ifr.ifr_data = (void *)&ethpause;
1215         ethpause.autoneg = fc_conf->autoneg;
1216         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1217             (fc_conf->mode & RTE_FC_RX_PAUSE))
1218                 ethpause.rx_pause = 1;
1219         else
1220                 ethpause.rx_pause = 0;
1221
1222         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1223             (fc_conf->mode & RTE_FC_TX_PAUSE))
1224                 ethpause.tx_pause = 1;
1225         else
1226                 ethpause.tx_pause = 0;
1227         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1228         if (ret) {
1229                 DRV_LOG(WARNING,
1230                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1231                         " failed: %s",
1232                         dev->data->port_id, strerror(rte_errno));
1233                 return ret;
1234         }
1235         return 0;
1236 }
1237
1238 /**
1239  * Get PCI information from struct ibv_device.
1240  *
1241  * @param device
1242  *   Pointer to Infiniband device structure.
1243  * @param[out] pci_addr
1244  *   PCI bus address output buffer.
1245  *
1246  * @return
1247  *   0 on success, a negative errno value otherwise and rte_errno is set.
1248  */
1249 int
1250 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
1251                             struct rte_pci_addr *pci_addr)
1252 {
1253         FILE *file;
1254         char line[32];
1255         MKSTR(path, "%s/device/uevent", device->ibdev_path);
1256
1257         file = fopen(path, "rb");
1258         if (file == NULL) {
1259                 rte_errno = errno;
1260                 return -rte_errno;
1261         }
1262         while (fgets(line, sizeof(line), file) == line) {
1263                 size_t len = strlen(line);
1264                 int ret;
1265
1266                 /* Truncate long lines. */
1267                 if (len == (sizeof(line) - 1))
1268                         while (line[(len - 1)] != '\n') {
1269                                 ret = fgetc(file);
1270                                 if (ret == EOF)
1271                                         break;
1272                                 line[(len - 1)] = ret;
1273                         }
1274                 /* Extract information. */
1275                 if (sscanf(line,
1276                            "PCI_SLOT_NAME="
1277                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1278                            &pci_addr->domain,
1279                            &pci_addr->bus,
1280                            &pci_addr->devid,
1281                            &pci_addr->function) == 4) {
1282                         ret = 0;
1283                         break;
1284                 }
1285         }
1286         fclose(file);
1287         return 0;
1288 }
1289
1290 /**
1291  * Handle asynchronous removal event for entire multiport device.
1292  *
1293  * @param sh
1294  *   Infiniband device shared context.
1295  */
1296 static void
1297 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
1298 {
1299         uint32_t i;
1300
1301         for (i = 0; i < sh->max_port; ++i) {
1302                 struct rte_eth_dev *dev;
1303
1304                 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
1305                         /*
1306                          * Either the port does not exist or no
1307                          * handler is installed for it.
1308                          */
1309                         continue;
1310                 }
1311                 dev = &rte_eth_devices[sh->port[i].ih_port_id];
1312                 assert(dev);
1313                 if (dev->data->dev_conf.intr_conf.rmv)
1314                         _rte_eth_dev_callback_process
1315                                 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1316         }
1317 }
1318
1319 /**
1320  * Handle shared asynchronous events from the NIC (removal event
1321  * and link status change). Supports multiport IB device.
1322  *
1323  * @param cb_arg
1324  *   Callback argument.
1325  */
1326 void
1327 mlx5_dev_interrupt_handler(void *cb_arg)
1328 {
1329         struct mlx5_ibv_shared *sh = cb_arg;
1330         struct ibv_async_event event;
1331
1332         /* Read all messages from the IB device and acknowledge them. */
1333         for (;;) {
1334                 struct rte_eth_dev *dev;
1335                 uint32_t tmp;
1336
1337                 if (mlx5_glue->get_async_event(sh->ctx, &event))
1338                         break;
1339                 /* Retrieve and check IB port index. */
1340                 tmp = (uint32_t)event.element.port_num;
1341                 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
1342                         /*
1343                          * The DEVICE_FATAL event is reported once for the
1344                          * entire device without specifying a port.
1345                          * We should notify all existing ports.
1346                          */
1347                         mlx5_glue->ack_async_event(&event);
1348                         mlx5_dev_interrupt_device_fatal(sh);
1349                         continue;
1350                 }
1351                 assert(tmp && (tmp <= sh->max_port));
1352                 if (!tmp) {
1353                         /* Unsupported device-level event. */
1354                         mlx5_glue->ack_async_event(&event);
1355                         DRV_LOG(DEBUG,
1356                                 "unsupported common event (type %d)",
1357                                 event.event_type);
1358                         continue;
1359                 }
1360                 if (tmp > sh->max_port) {
1361                         /* Invalid IB port index. */
1362                         mlx5_glue->ack_async_event(&event);
1363                         DRV_LOG(DEBUG,
1364                                 "cannot handle an event (type %d) "
1365                                 "due to invalid IB port index (%u)",
1366                                 event.event_type, tmp);
1367                         continue;
1368                 }
1369                 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
1370                         /* No handler installed. */
1371                         mlx5_glue->ack_async_event(&event);
1372                         DRV_LOG(DEBUG,
1373                                 "cannot handle an event (type %d) "
1374                                 "due to no handler installed for port %u",
1375                                 event.event_type, tmp);
1376                         continue;
1377                 }
1378                 /* Retrieve ethernet device descriptor. */
1379                 tmp = sh->port[tmp - 1].ih_port_id;
1380                 dev = &rte_eth_devices[tmp];
1381                 assert(dev);
1382                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1383                      event.event_type == IBV_EVENT_PORT_ERR) &&
1384                         dev->data->dev_conf.intr_conf.lsc) {
1385                         mlx5_glue->ack_async_event(&event);
1386                         if (mlx5_link_update(dev, 0) == -EAGAIN) {
1387                                 usleep(0);
1388                                 continue;
1389                         }
1390                         _rte_eth_dev_callback_process
1391                                 (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1392                         continue;
1393                 }
1394                 DRV_LOG(DEBUG,
1395                         "port %u cannot handle an unknown event (type %d)",
1396                         dev->data->port_id, event.event_type);
1397                 mlx5_glue->ack_async_event(&event);
1398         }
1399 }
1400
1401 /*
1402  * Unregister callback handler safely. The handler may be active
1403  * while we are trying to unregister it, in this case code -EAGAIN
1404  * is returned by rte_intr_callback_unregister(). This routine checks
1405  * the return code and tries to unregister handler again.
1406  *
1407  * @param handle
1408  *   interrupt handle
1409  * @param cb_fn
1410  *   pointer to callback routine
1411  * @param cb_arg
1412  *   opaque callback parameter
1413  */
1414 void
1415 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1416                               rte_intr_callback_fn cb_fn, void *cb_arg)
1417 {
1418         /*
1419          * Try to reduce timeout management overhead by not calling
1420          * the timer related routines on the first iteration. If the
1421          * unregistering succeeds on first call there will be no
1422          * timer calls at all.
1423          */
1424         uint64_t twait = 0;
1425         uint64_t start = 0;
1426
1427         do {
1428                 int ret;
1429
1430                 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1431                 if (ret >= 0)
1432                         return;
1433                 if (ret != -EAGAIN) {
1434                         DRV_LOG(INFO, "failed to unregister interrupt"
1435                                       " handler (error: %d)", ret);
1436                         assert(false);
1437                         return;
1438                 }
1439                 if (twait) {
1440                         struct timespec onems;
1441
1442                         /* Wait one millisecond and try again. */
1443                         onems.tv_sec = 0;
1444                         onems.tv_nsec = NS_PER_S / MS_PER_S;
1445                         nanosleep(&onems, 0);
1446                         /* Check whether one second elapsed. */
1447                         if ((rte_get_timer_cycles() - start) <= twait)
1448                                 continue;
1449                 } else {
1450                         /*
1451                          * Get the number of timer ticks in one second;
1452                          * once that many ticks have elapsed we have been
1453                          * waiting for a full second. This branch runs only
1454                          * on the first iteration.
1455                          */
1456                         twait = rte_get_timer_hz();
1457                         assert(twait);
1458                 }
1459                 /*
1460                  * Timeout elapsed, show a message (once per second) and
1461                  * retry. There is no other acceptable option here: if the
1462                  * return code were ignored, the handler would remain
1463                  * registered while its fd gets closed, and we could
1464                  * crash. Looping here with a periodic message is the
1465                  * least bad choice.
1466                  */
1467                 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1468                 start = rte_get_timer_cycles();
1469         } while (true);
1470 }
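
/*
 * Editorial usage sketch (not part of the driver): how the retrying
 * helper above would typically be used when tearing down a handler.
 * "example_teardown_handler" and its parameters are hypothetical names
 * that only illustrate the calling convention.
 */
__rte_unused static void
example_teardown_handler(struct rte_intr_handle *handle,
                         rte_intr_callback_fn cb_fn, void *cb_arg)
{
        /*
         * Unlike a bare rte_intr_callback_unregister(), which may fail
         * with -EAGAIN while the callback is executing, this call keeps
         * retrying (sleeping one millisecond between attempts) until the
         * handler is really gone, so the caller may safely release the
         * resources referenced by cb_arg afterwards.
         */
        mlx5_intr_callback_unregister(handle, cb_fn, cb_arg);
}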
1471
1472 /**
1473  * Handle DEVX interrupts from the NIC.
1474  * This function is called from the DPDK host thread.
1475  *
1476  * @param cb_arg
1477  *   Callback argument.
1478  */
1479 void
1480 mlx5_dev_interrupt_handler_devx(void *cb_arg)
1481 {
1482 #ifndef HAVE_IBV_DEVX_ASYNC
1483         (void)cb_arg;
1484         return;
1485 #else
1486         struct mlx5_ibv_shared *sh = cb_arg;
1487         union {
1488                 struct mlx5dv_devx_async_cmd_hdr cmd_resp;
1489                 uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
1490                             MLX5_ST_SZ_BYTES(traffic_counter) +
1491                             sizeof(struct mlx5dv_devx_async_cmd_hdr)];
1492         } out;
1493         uint8_t *buf = out.buf + sizeof(out.cmd_resp);
1494
1495         while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
1496                                                    &out.cmd_resp,
1497                                                    sizeof(out.buf)))
1498                 mlx5_flow_async_pool_query_handle
1499                         (sh, (uint64_t)out.cmd_resp.wr_id,
1500                          mlx5_devx_get_out_command_status(buf));
1501 #endif /* HAVE_IBV_DEVX_ASYNC */
1502 }
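
/*
 * Editorial note: the completions drained by the handler above are
 * produced by the asynchronous flow counter queries that the PMD issues
 * through DEVX; each wr_id identifies the query whose result is passed
 * on to mlx5_flow_async_pool_query_handle().
 */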
1503
1504 /**
1505  * Uninstall shared asynchronous device events handler.
1506  * This function is implemented to support event sharing
1507  * between multiple ports of a single IB device.
1508  *
1509  * @param dev
1510  *   Pointer to Ethernet device.
1511  */
1512 static void
1513 mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
1514 {
1515         struct mlx5_priv *priv = dev->data->dev_private;
1516         struct mlx5_ibv_shared *sh = priv->sh;
1517
1518         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1519                 return;
1520         pthread_mutex_lock(&sh->intr_mutex);
1521         assert(priv->ibv_port);
1522         assert(priv->ibv_port <= sh->max_port);
1523         assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1524         if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
1525                 goto exit;
1526         assert(sh->port[priv->ibv_port - 1].ih_port_id ==
1527                                         (uint32_t)dev->data->port_id);
1528         assert(sh->intr_cnt);
1529         sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1530         if (!sh->intr_cnt || --sh->intr_cnt)
1531                 goto exit;
1532         mlx5_intr_callback_unregister(&sh->intr_handle,
1533                                      mlx5_dev_interrupt_handler, sh);
1534         sh->intr_handle.fd = 0;
1535         sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1536         if (sh->intr_handle_devx.fd) {
1537                 rte_intr_callback_unregister(&sh->intr_handle_devx,
1538                                              mlx5_dev_interrupt_handler_devx,
1539                                              sh);
1540                 sh->intr_handle_devx.fd = 0;
1541                 sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
1542         }
1543         if (sh->devx_comp) {
1544                 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
1545                 sh->devx_comp = NULL;
1546         }
1547 exit:
1548         pthread_mutex_unlock(&sh->intr_mutex);
1549 }
1550
1551 /**
1552  * Install shared asynchronous device events handler.
1553  * This function is implemented to support event sharing
1554  * between multiple ports of a single IB device.
1555  *
1556  * @param dev
1557  *   Pointer to Ethernet device.
1558  */
1559 static void
1560 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
1561 {
1562         struct mlx5_priv *priv = dev->data->dev_private;
1563         struct mlx5_ibv_shared *sh = priv->sh;
1564         int ret;
1565         int flags;
1566
1567         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1568                 return;
1569         pthread_mutex_lock(&sh->intr_mutex);
1570         assert(priv->ibv_port);
1571         assert(priv->ibv_port <= sh->max_port);
1572         assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1573         if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
1574                 /* The handler is already installed for this port. */
1575                 assert(sh->intr_cnt);
1576                 goto exit;
1577         }
1578         sh->port[priv->ibv_port - 1].ih_port_id = (uint32_t)dev->data->port_id;
1579         if (sh->intr_cnt) {
1580                 sh->intr_cnt++;
1581                 goto exit;
1582         }
1583         /* No shared handler installed. */
1584         assert(sh->ctx->async_fd > 0);
1585         flags = fcntl(sh->ctx->async_fd, F_GETFL);
1586         ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1587         if (ret) {
1588                 DRV_LOG(INFO, "failed to make the async event queue file"
1589                               " descriptor non-blocking");
1590                 goto error;
1591         }
1592         sh->intr_handle.fd = sh->ctx->async_fd;
1593         sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
1594         rte_intr_callback_register(&sh->intr_handle,
1595                                    mlx5_dev_interrupt_handler, sh);
1596         if (priv->config.devx) {
1597 #ifndef HAVE_IBV_DEVX_ASYNC
1598                 goto error_unregister;
1599 #else
1600                 sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
1601                 if (sh->devx_comp) {
1602                         flags = fcntl(sh->devx_comp->fd, F_GETFL);
1603                         ret = fcntl(sh->devx_comp->fd, F_SETFL,
1604                                     flags | O_NONBLOCK);
1605                         if (ret) {
1606                                 DRV_LOG(INFO, "failed to make the DEVX async event"
1607                                               " queue file descriptor non-blocking");
1608                                 goto error_unregister;
1609                         }
1610                         sh->intr_handle_devx.fd = sh->devx_comp->fd;
1611                         sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
1612                         rte_intr_callback_register
1613                                 (&sh->intr_handle_devx,
1614                                  mlx5_dev_interrupt_handler_devx, sh);
1615                 } else {
1616                         DRV_LOG(INFO, "failed to create DEVX async command"
1617                                 " completion channel");
1618                         goto error_unregister;
1619                 }
1620 #endif /* HAVE_IBV_DEVX_ASYNC */
1621         }
1622         sh->intr_cnt++;
1623         goto exit;
1624 error_unregister:
1625         rte_intr_callback_unregister(&sh->intr_handle,
1626                                      mlx5_dev_interrupt_handler, sh);
1627 error:
1628         /* Indicate there will be no interrupts. */
1629         dev->data->dev_conf.intr_conf.lsc = 0;
1630         dev->data->dev_conf.intr_conf.rmv = 0;
1631         sh->intr_handle.fd = 0;
1632         sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1633         sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1634 exit:
1635         pthread_mutex_unlock(&sh->intr_mutex);
1636 }
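
/*
 * Editorial sketch (not part of the driver): expected reference-count
 * behaviour of the shared handler for two ports of one IB device.
 * "port0" and "port1" are hypothetical rte_eth_dev pointers sharing the
 * same mlx5_ibv_shared context.
 */
__rte_unused static void
example_shared_handler_refcount(struct rte_eth_dev *port0,
                                struct rte_eth_dev *port1)
{
        /* First install registers the callback and sets intr_cnt to 1. */
        mlx5_dev_shared_handler_install(port0);
        /* Second install only increments intr_cnt, nothing is registered. */
        mlx5_dev_shared_handler_install(port1);
        /* First uninstall just decrements intr_cnt back to 1. */
        mlx5_dev_shared_handler_uninstall(port0);
        /* Last uninstall reaches 0 and actually unregisters the callback. */
        mlx5_dev_shared_handler_uninstall(port1);
}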
1637
1638 /**
1639  * Uninstall interrupt handler.
1640  *
1641  * @param dev
1642  *   Pointer to Ethernet device.
1643  */
1644 void
1645 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
1646 {
1647         mlx5_dev_shared_handler_uninstall(dev);
1648 }
1649
1650 /**
1651  * Install interrupt handler.
1652  *
1653  * @param dev
1654  *   Pointer to Ethernet device.
1655  */
1656 void
1657 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1658 {
1659         mlx5_dev_shared_handler_install(dev);
1660 }
1661
1662 /**
1663  * DPDK callback to bring the link DOWN.
1664  *
1665  * @param dev
1666  *   Pointer to Ethernet device structure.
1667  *
1668  * @return
1669  *   0 on success, a negative errno value otherwise and rte_errno is set.
1670  */
1671 int
1672 mlx5_set_link_down(struct rte_eth_dev *dev)
1673 {
1674         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1675 }
1676
1677 /**
1678  * DPDK callback to bring the link UP.
1679  *
1680  * @param dev
1681  *   Pointer to Ethernet device structure.
1682  *
1683  * @return
1684  *   0 on success, a negative errno value otherwise and rte_errno is set.
1685  */
1686 int
1687 mlx5_set_link_up(struct rte_eth_dev *dev)
1688 {
1689         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1690 }
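
/*
 * Editorial note on the two callbacks above, assuming the
 * mlx5_set_flags(dev, keep, flags) helper defined earlier in this file
 * preserves the interface flag bits selected by "keep" and rewrites the
 * remaining bits from "flags": with keep == ~IFF_UP only the IFF_UP bit
 * is modified, so passing ~IFF_UP as the new value clears it (link down)
 * while passing IFF_UP sets it (link up).
 */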
1691
1692 /**
1693  * Select the Rx burst function to use.
1694  *
1695  * @param dev
1696  *   Pointer to Ethernet device structure.
1697  *
1698  * @return
1699  *   Pointer to selected Rx burst function.
1700  */
1701 eth_rx_burst_t
1702 mlx5_select_rx_function(struct rte_eth_dev *dev)
1703 {
1704         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1705
1706         assert(dev != NULL);
1707         if (mlx5_check_vec_rx_support(dev) > 0) {
1708                 rx_pkt_burst = mlx5_rx_burst_vec;
1709                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1710                         dev->data->port_id);
1711         } else if (mlx5_mprq_enabled(dev)) {
1712                 rx_pkt_burst = mlx5_rx_burst_mprq;
1713         }
1714         return rx_pkt_burst;
1715 }
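
/*
 * Editorial note: this selector is intended to be called once at device
 * start, with its result stored in the device data path pointer, e.g.:
 *
 *         dev->rx_pkt_burst = mlx5_select_rx_function(dev);
 *
 * after which rte_eth_rx_burst() on this port dispatches to the chosen
 * routine (scalar, vectorized or multi-packet RQ).
 */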
1716
1717 /**
1718  * Check if mlx5 device was removed.
1719  *
1720  * @param dev
1721  *   Pointer to Ethernet device structure.
1722  *
1723  * @return
1724  *   1 when device is removed, otherwise 0.
1725  */
1726 int
1727 mlx5_is_removed(struct rte_eth_dev *dev)
1728 {
1729         struct ibv_device_attr device_attr;
1730         struct mlx5_priv *priv = dev->data->dev_private;
1731
1732         if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
1733                 return 1;
1734         return 0;
1735 }
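
/*
 * Editorial usage sketch (not part of the driver): distinguishing a
 * transient failure from a surprise device removal in an error path.
 * "example_filter_error" is a hypothetical helper.
 */
__rte_unused static int
example_filter_error(struct rte_eth_dev *dev, int err)
{
        /* A removed device will never recover, report it as such. */
        if (mlx5_is_removed(dev))
                return -ENODEV;
        /* Otherwise keep the original error so the caller may retry. */
        return err;
}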
1736
1737 /**
1738  * Get port ID list of mlx5 instances sharing a common device.
1739  *
1740  * @param[in] dev
1741  *   Device to look for.
1742  * @param[out] port_list
1743  *   Result buffer for collected port IDs.
1744  * @param port_list_n
1745  *   Maximum number of entries in result buffer. If 0, @p port_list can be
1746  *   NULL.
1747  *
1748  * @return
1749  *   Number of matching instances regardless of the @p port_list_n
1750  *   parameter, 0 if none were found.
1751  */
1752 unsigned int
1753 mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
1754                     unsigned int port_list_n)
1755 {
1756         uint16_t id;
1757         unsigned int n = 0;
1758
1759         RTE_ETH_FOREACH_DEV_OF(id, dev) {
1760                 if (n < port_list_n)
1761                         port_list[n] = id;
1762                 n++;
1763         }
1764         return n;
1765 }
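
/*
 * Editorial usage sketch (not part of the driver): because the function
 * always returns the total number of matching ports, a caller can probe
 * the count first and then fill a buffer. Names below are hypothetical.
 */
__rte_unused static void
example_log_sibling_ports(const struct rte_device *rdev)
{
        uint16_t ports[RTE_MAX_ETHPORTS];
        unsigned int n;
        unsigned int i;

        /* With port_list_n == 0 the buffer may be NULL; only count. */
        n = mlx5_dev_to_port_id(rdev, NULL, 0);
        if (n > RTE_DIM(ports))
                n = RTE_DIM(ports);
        /* Second pass fills up to n entries (return value is the total). */
        (void)mlx5_dev_to_port_id(rdev, ports, n);
        for (i = 0; i != n; ++i)
                DRV_LOG(DEBUG, "sibling port ID %u", ports[i]);
}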
1766
1767 /**
1768  * Get the E-Switch domain id this port belongs to.
1769  *
1770  * @param[in] port
1771  *   Device port id.
1772  * @param[out] es_domain_id
1773  *   E-Switch domain id.
1774  * @param[out] es_port_id
1775  *   The port id of the port in the E-Switch.
1776  *
1777  * @return
1778  *   0 on success, a negative errno value otherwise and rte_errno is set.
1779  */
1780 int
1781 mlx5_port_to_eswitch_info(uint16_t port,
1782                           uint16_t *es_domain_id, uint16_t *es_port_id)
1783 {
1784         struct rte_eth_dev *dev;
1785         struct mlx5_priv *priv;
1786
1787         if (port >= RTE_MAX_ETHPORTS) {
1788                 rte_errno = EINVAL;
1789                 return -rte_errno;
1790         }
1791         if (!rte_eth_dev_is_valid_port(port)) {
1792                 rte_errno = ENODEV;
1793                 return -rte_errno;
1794         }
1795         dev = &rte_eth_devices[port];
1796         priv = dev->data->dev_private;
1797         if (!(priv->representor || priv->master)) {
1798                 rte_errno = EINVAL;
1799                 return -rte_errno;
1800         }
1801         if (es_domain_id)
1802                 *es_domain_id = priv->domain_id;
1803         if (es_port_id)
1804                 *es_port_id = priv->vport_id;
1805         return 0;
1806 }
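
/*
 * Editorial usage sketch (not part of the driver): checking whether two
 * ports belong to the same E-Switch domain, e.g. before trying to steer
 * traffic between them. "example_same_eswitch" is a hypothetical helper.
 */
__rte_unused static int
example_same_eswitch(uint16_t port_a, uint16_t port_b)
{
        uint16_t domain_a;
        uint16_t domain_b;
        int ret;

        ret = mlx5_port_to_eswitch_info(port_a, &domain_a, NULL);
        if (ret < 0)
                return ret;
        ret = mlx5_port_to_eswitch_info(port_b, &domain_b, NULL);
        if (ret < 0)
                return ret;
        return domain_a == domain_b;
}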
1807
1808 /**
1809  * Get switch information associated with network interface.
1810  *
1811  * @param ifindex
1812  *   Network interface index.
1813  * @param[out] info
1814  *   Switch information object, populated in case of success.
1815  *
1816  * @return
1817  *   0 on success, a negative errno value otherwise and rte_errno is set.
1818  */
1819 int
1820 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1821 {
1822         char ifname[IF_NAMESIZE];
1823         char port_name[IF_NAMESIZE + 1];
1824         FILE *file;
1825         struct mlx5_switch_info data = {
1826                 .master = 0,
1827                 .representor = 0,
1828                 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1829                 .port_name = 0,
1830                 .switch_id = 0,
1831         };
1832         DIR *dir;
1833         bool port_switch_id_set = false;
1834         bool device_dir = false;
1835         char c;
1836         int ret;
1837
1838         if (!if_indextoname(ifindex, ifname)) {
1839                 rte_errno = errno;
1840                 return -rte_errno;
1841         }
1842
1843         MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1844               ifname);
1845         MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1846               ifname);
1847         MKSTR(pci_device, "/sys/class/net/%s/device",
1848               ifname);
1849
1850         file = fopen(phys_port_name, "rb");
1851         if (file != NULL) {
1852                 ret = fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", port_name);
1853                 fclose(file);
1854                 if (ret == 1)
1855                         mlx5_translate_port_name(port_name, &data);
1856         }
1857         file = fopen(phys_switch_id, "rb");
1858         if (file == NULL) {
1859                 rte_errno = errno;
1860                 return -rte_errno;
1861         }
1862         port_switch_id_set =
1863                 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1864                 c == '\n';
1865         fclose(file);
1866         dir = opendir(pci_device);
1867         if (dir != NULL) {
1868                 closedir(dir);
1869                 device_dir = true;
1870         }
1871         if (port_switch_id_set) {
1872                 /* We have some E-Switch configuration. */
1873                 mlx5_sysfs_check_switch_info(device_dir, &data);
1874         }
1875         *info = data;
1876         assert(!(data.master && data.representor));
1877         if (data.master && data.representor) {
1878                 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1879                              " and as representor", ifindex);
1880                 rte_errno = ENODEV;
1881                 return -rte_errno;
1882         }
1883         return 0;
1884 }
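
/*
 * Editorial example (hypothetical values): for a VF representor netdev
 * the sysfs files read above would typically contain
 *         phys_port_name:  "pf0vf2"
 *         phys_switch_id:  "248a070300a1b2c4"
 * which yields name_type MLX5_PHYS_PORT_NAME_TYPE_PFVF, port_name 2 and
 * a non-zero switch_id, so the port is recognized as a representor.
 */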
1885
1886 /**
1887  * Analyze gathered port parameters via Netlink to recognize master
1888  * and representor devices for E-Switch configuration.
1889  *
1890  * @param[in] num_vf_set
1891  *   Flag indicating presence of the "number of VFs" port attribute.
1892  * @param[inout] switch_info
1893  *   Port information, including the port name as a number and the port
1894  *   name type if recognized.
1895  *
1896  * @return
1897  *   The master and representor flags are set in switch_info according
1898  *   to the recognized parameters (if any).
1899  */
1900 void
1901 mlx5_nl_check_switch_info(bool num_vf_set,
1902                           struct mlx5_switch_info *switch_info)
1903 {
1904         switch (switch_info->name_type) {
1905         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1906                 /*
1907                  * Name is not recognized, assume the master,
1908                  * check the number of VFs key presence.
1909                  */
1910                 switch_info->master = num_vf_set;
1911                 break;
1912         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1913                 /*
1914                  * Name is not set, this assumes the legacy naming
1915                  * schema for master, just check if there is a
1916                  * number of VFs key.
1917                  */
1918                 switch_info->master = num_vf_set;
1919                 break;
1920         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1921                 /* New uplink naming schema recognized. */
1922                 switch_info->master = 1;
1923                 break;
1924         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1925                 /* Legacy representors naming schema. */
1926                 switch_info->representor = !num_vf_set;
1927                 break;
1928         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1929                 /* New representors naming schema. */
1930                 switch_info->representor = 1;
1931                 break;
1932         }
1933 }
1934
1935 /**
1936  * Analyze gathered port parameters via sysfs to recognize master
1937  * and representor devices for E-Switch configuration.
1938  *
1939  * @param[in] device_dir
1940  *   Flag indicating presence of the "device" directory under the port key.
1941  * @param[inout] switch_info
1942  *   Port information, including the port name as a number and the port
1943  *   name type if recognized.
1944  *
1945  * @return
1946  *   The master and representor flags are set in switch_info according
1947  *   to the recognized parameters (if any).
1948  */
1949 void
1950 mlx5_sysfs_check_switch_info(bool device_dir,
1951                              struct mlx5_switch_info *switch_info)
1952 {
1953         switch (switch_info->name_type) {
1954         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1955                 /*
1956                  * Name is not recognized, assume the master,
1957                  * check the device directory presence.
1958                  */
1959                 switch_info->master = device_dir;
1960                 break;
1961         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1962                 /*
1963                  * Name is not set, this assumes the legacy naming
1964                  * schema for master, just check if there is
1965                  * a device directory.
1966                  */
1967                 switch_info->master = device_dir;
1968                 break;
1969         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1970                 /* New uplink naming schema recognized. */
1971                 switch_info->master = 1;
1972                 break;
1973         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1974                 /* Legacy representors naming schema. */
1975                 switch_info->representor = !device_dir;
1976                 break;
1977         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1978                 /* New representors naming schema. */
1979                 switch_info->representor = 1;
1980                 break;
1981         }
1982 }
1983
1984 /**
1985  * Extract port name, as a number, from sysfs or netlink information.
1986  *
1987  * @param[in] port_name_in
1988  *   String representing the port name.
1989  * @param[out] port_info_out
1990  *   Port information, including the port name as a number and the port
1991  *   name type if recognized.
1992  *
1993  * @return
1994  *   The port_name field is set according to the recognized name format.
1995  */
1996 void
1997 mlx5_translate_port_name(const char *port_name_in,
1998                          struct mlx5_switch_info *port_info_out)
1999 {
2000         char pf_c1, pf_c2, vf_c1, vf_c2;
2001         char *end;
2002         int sc_items;
2003
2004         /*
2005          * Check for port-name as a string of the form pf0vf0
2006          * (support kernel ver >= 5.0 or OFED ver >= 4.6).
2007          */
2008         sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
2009                           &pf_c1, &pf_c2, &port_info_out->pf_num,
2010                           &vf_c1, &vf_c2, &port_info_out->port_name);
2011         if (sc_items == 6 &&
2012             pf_c1 == 'p' && pf_c2 == 'f' &&
2013             vf_c1 == 'v' && vf_c2 == 'f') {
2014                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
2015                 return;
2016         }
2017         /*
2018          * Check for port-name as a string of the form p0
2019          * (support kernel ver >= 5.0, or OFED ver >= 4.6).
2020          */
2021         sc_items = sscanf(port_name_in, "%c%d",
2022                           &pf_c1, &port_info_out->port_name);
2023         if (sc_items == 2 && pf_c1 == 'p') {
2024                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
2025                 return;
2026         }
2027         /* Check for port-name as a number (kernel ver < 5.0). */
2028         errno = 0;
2029         port_info_out->port_name = strtol(port_name_in, &end, 0);
2030         if (!errno &&
2031             (size_t)(end - port_name_in) == strlen(port_name_in)) {
2032                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
2033                 return;
2034         }
2035         port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
2036         return;
2037 }
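
/*
 * Editorial sketch (not part of the driver): expected results of the
 * parser above for the three supported formats; the inputs are examples
 * only.
 */
__rte_unused static void
example_translate_port_name(void)
{
        struct mlx5_switch_info info;

        /* New "pfXvfY" schema: name_type PFVF, pf_num 0, port_name 3. */
        mlx5_translate_port_name("pf0vf3", &info);
        /* New uplink "pX" schema: name_type UPLINK, port_name 1. */
        mlx5_translate_port_name("p1", &info);
        /* Plain number (kernels < 5.0): name_type LEGACY, port_name 2. */
        mlx5_translate_port_name("2", &info);
        /* Anything else: name_type UNKNOWN. */
        mlx5_translate_port_name("eth0", &info);
}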