2628e6481e2b14549096338dd28bec474e6468df
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <assert.h>
8 #include <inttypes.h>
9 #include <unistd.h>
10 #include <stdbool.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <stdlib.h>
15 #include <errno.h>
16 #include <dirent.h>
17 #include <net/if.h>
18 #include <sys/ioctl.h>
19 #include <sys/socket.h>
20 #include <netinet/in.h>
21 #include <linux/ethtool.h>
22 #include <linux/sockios.h>
23 #include <fcntl.h>
24 #include <stdalign.h>
25 #include <sys/un.h>
26 #include <time.h>
27
28 #include <rte_atomic.h>
29 #include <rte_ethdev_driver.h>
30 #include <rte_bus_pci.h>
31 #include <rte_mbuf.h>
32 #include <rte_common.h>
33 #include <rte_interrupts.h>
34 #include <rte_malloc.h>
35 #include <rte_string_fns.h>
36 #include <rte_rwlock.h>
37 #include <rte_cycles.h>
38
39 #include <mlx5_glue.h>
40 #include <mlx5_devx_cmds.h>
41 #include <mlx5_common.h>
42
43 #include "mlx5.h"
44 #include "mlx5_rxtx.h"
45 #include "mlx5_utils.h"
46
47 /* Supported speed values found in /usr/include/linux/ethtool.h */
48 #ifndef HAVE_SUPPORTED_40000baseKR4_Full
49 #define SUPPORTED_40000baseKR4_Full (1 << 23)
50 #endif
51 #ifndef HAVE_SUPPORTED_40000baseCR4_Full
52 #define SUPPORTED_40000baseCR4_Full (1 << 24)
53 #endif
54 #ifndef HAVE_SUPPORTED_40000baseSR4_Full
55 #define SUPPORTED_40000baseSR4_Full (1 << 25)
56 #endif
57 #ifndef HAVE_SUPPORTED_40000baseLR4_Full
58 #define SUPPORTED_40000baseLR4_Full (1 << 26)
59 #endif
60 #ifndef HAVE_SUPPORTED_56000baseKR4_Full
61 #define SUPPORTED_56000baseKR4_Full (1 << 27)
62 #endif
63 #ifndef HAVE_SUPPORTED_56000baseCR4_Full
64 #define SUPPORTED_56000baseCR4_Full (1 << 28)
65 #endif
66 #ifndef HAVE_SUPPORTED_56000baseSR4_Full
67 #define SUPPORTED_56000baseSR4_Full (1 << 29)
68 #endif
69 #ifndef HAVE_SUPPORTED_56000baseLR4_Full
70 #define SUPPORTED_56000baseLR4_Full (1 << 30)
71 #endif
72
73 /* Add defines in case the running kernel is not the same as user headers. */
74 #ifndef ETHTOOL_GLINKSETTINGS
75 struct ethtool_link_settings {
76         uint32_t cmd;
77         uint32_t speed;
78         uint8_t duplex;
79         uint8_t port;
80         uint8_t phy_address;
81         uint8_t autoneg;
82         uint8_t mdio_support;
83         uint8_t eth_to_mdix;
84         uint8_t eth_tp_mdix_ctrl;
85         int8_t link_mode_masks_nwords;
86         uint32_t reserved[8];
87         uint32_t link_mode_masks[];
88 };
89
90 #define ETHTOOL_GLINKSETTINGS 0x0000004c
91 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
92 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
93 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
94 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
95 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
96 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
97 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
98 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
99 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
100 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
101 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
102 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
103 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
104 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
105 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
106 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
107 #endif
108 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
109 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
110 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
111 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
112 #endif
113 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
114 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
115 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
116 #endif
117 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
118 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
119 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
120 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
121 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
122 #endif
123
124 /**
125  * Get master interface name from private structure.
126  *
127  * @param[in] dev
128  *   Pointer to Ethernet device.
129  * @param[out] ifname
130  *   Interface name output buffer.
131  *
132  * @return
133  *   0 on success, a negative errno value otherwise and rte_errno is set.
134  */
135 int
136 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
137 {
138         DIR *dir;
139         struct dirent *dent;
140         unsigned int dev_type = 0;
141         unsigned int dev_port_prev = ~0u;
142         char match[IF_NAMESIZE] = "";
143
144         assert(ibdev_path);
145         {
146                 MKSTR(path, "%s/device/net", ibdev_path);
147
148                 dir = opendir(path);
149                 if (dir == NULL) {
150                         rte_errno = errno;
151                         return -rte_errno;
152                 }
153         }
154         while ((dent = readdir(dir)) != NULL) {
155                 char *name = dent->d_name;
156                 FILE *file;
157                 unsigned int dev_port;
158                 int r;
159
160                 if ((name[0] == '.') &&
161                     ((name[1] == '\0') ||
162                      ((name[1] == '.') && (name[2] == '\0'))))
163                         continue;
164
165                 MKSTR(path, "%s/device/net/%s/%s",
166                       ibdev_path, name,
167                       (dev_type ? "dev_id" : "dev_port"));
168
169                 file = fopen(path, "rb");
170                 if (file == NULL) {
171                         if (errno != ENOENT)
172                                 continue;
173                         /*
174                          * Switch to dev_id when dev_port does not exist as
175                          * is the case with Linux kernel versions < 3.15.
176                          */
177 try_dev_id:
178                         match[0] = '\0';
179                         if (dev_type)
180                                 break;
181                         dev_type = 1;
182                         dev_port_prev = ~0u;
183                         rewinddir(dir);
184                         continue;
185                 }
186                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
187                 fclose(file);
188                 if (r != 1)
189                         continue;
190                 /*
191                  * Switch to dev_id when dev_port returns the same value for
192                  * all ports. May happen when using a MOFED release older than
193                  * 3.0 with a Linux kernel >= 3.15.
194                  */
195                 if (dev_port == dev_port_prev)
196                         goto try_dev_id;
197                 dev_port_prev = dev_port;
198                 if (dev_port == 0)
199                         strlcpy(match, name, sizeof(match));
200         }
201         closedir(dir);
202         if (match[0] == '\0') {
203                 rte_errno = ENOENT;
204                 return -rte_errno;
205         }
206         strncpy(*ifname, match, sizeof(*ifname));
207         return 0;
208 }
209
210 /**
211  * Get interface name from private structure.
212  *
213  * This is a port representor-aware version of mlx5_get_master_ifname().
214  *
215  * @param[in] dev
216  *   Pointer to Ethernet device.
217  * @param[out] ifname
218  *   Interface name output buffer.
219  *
220  * @return
221  *   0 on success, a negative errno value otherwise and rte_errno is set.
222  */
223 int
224 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
225 {
226         struct mlx5_priv *priv = dev->data->dev_private;
227         unsigned int ifindex;
228
229         assert(priv);
230         assert(priv->sh);
231         ifindex = mlx5_ifindex(dev);
232         if (!ifindex) {
233                 if (!priv->representor)
234                         return mlx5_get_master_ifname(priv->sh->ibdev_path,
235                                                       ifname);
236                 rte_errno = ENXIO;
237                 return -rte_errno;
238         }
239         if (if_indextoname(ifindex, &(*ifname)[0]))
240                 return 0;
241         rte_errno = errno;
242         return -rte_errno;
243 }
244
245 /**
246  * Get the interface index from device name.
247  *
248  * @param[in] dev
249  *   Pointer to Ethernet device.
250  *
251  * @return
252  *   Nonzero interface index on success, zero otherwise and rte_errno is set.
253  */
254 unsigned int
255 mlx5_ifindex(const struct rte_eth_dev *dev)
256 {
257         struct mlx5_priv *priv = dev->data->dev_private;
258         unsigned int ifindex;
259
260         assert(priv);
261         assert(priv->if_index);
262         ifindex = priv->if_index;
263         if (!ifindex)
264                 rte_errno = ENXIO;
265         return ifindex;
266 }
267
268 /**
269  * Perform ifreq ioctl() on associated Ethernet device.
270  *
271  * @param[in] dev
272  *   Pointer to Ethernet device.
273  * @param req
274  *   Request number to pass to ioctl().
275  * @param[out] ifr
276  *   Interface request structure output buffer.
277  *
278  * @return
279  *   0 on success, a negative errno value otherwise and rte_errno is set.
280  */
281 int
282 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
283 {
284         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
285         int ret = 0;
286
287         if (sock == -1) {
288                 rte_errno = errno;
289                 return -rte_errno;
290         }
291         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
292         if (ret)
293                 goto error;
294         ret = ioctl(sock, req, ifr);
295         if (ret == -1) {
296                 rte_errno = errno;
297                 goto error;
298         }
299         close(sock);
300         return 0;
301 error:
302         close(sock);
303         return -rte_errno;
304 }
305
306 /**
307  * Get device MTU.
308  *
309  * @param dev
310  *   Pointer to Ethernet device.
311  * @param[out] mtu
312  *   MTU value output buffer.
313  *
314  * @return
315  *   0 on success, a negative errno value otherwise and rte_errno is set.
316  */
317 int
318 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
319 {
320         struct ifreq request;
321         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
322
323         if (ret)
324                 return ret;
325         *mtu = request.ifr_mtu;
326         return 0;
327 }
328
329 /**
330  * Set device MTU.
331  *
332  * @param dev
333  *   Pointer to Ethernet device.
334  * @param mtu
335  *   MTU value to set.
336  *
337  * @return
338  *   0 on success, a negative errno value otherwise and rte_errno is set.
339  */
340 static int
341 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
342 {
343         struct ifreq request = { .ifr_mtu = mtu, };
344
345         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
346 }
347
348 /**
349  * Set device flags.
350  *
351  * @param dev
352  *   Pointer to Ethernet device.
353  * @param keep
354  *   Bitmask for flags that must remain untouched.
355  * @param flags
356  *   Bitmask for flags to modify.
357  *
358  * @return
359  *   0 on success, a negative errno value otherwise and rte_errno is set.
360  */
361 int
362 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
363 {
364         struct ifreq request;
365         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
366
367         if (ret)
368                 return ret;
369         request.ifr_flags &= keep;
370         request.ifr_flags |= flags & ~keep;
371         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
372 }
373
/**
 * DPDK callback for Ethernet device configuration.
 *
 * Validates/installs the RSS hash key, records queue array pointers and
 * counts in the private structure, and initializes per-process private data.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_dev_configure(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	const uint8_t use_app_rss_key =
		!!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
	int ret = 0;

	/* An application-supplied RSS key must have the exact supported length. */
	if (use_app_rss_key &&
	    (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
	     MLX5_RSS_HASH_KEY_LEN)) {
		DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
			dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* (Re)allocate the private copy of the RSS key. */
	priv->rss_conf.rss_key =
		rte_realloc(priv->rss_conf.rss_key,
			    MLX5_RSS_HASH_KEY_LEN, 0);
	if (!priv->rss_conf.rss_key) {
		DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
			dev->data->port_id, rxqs_n);
		rte_errno = ENOMEM;
		return -rte_errno;
	}

	/* RSS mode implies delivering the computed hash in the mbuf. */
	if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)
		dev->data->dev_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;

	/* Use the application key when provided, the driver default otherwise. */
	memcpy(priv->rss_conf.rss_key,
	       use_app_rss_key ?
	       dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
	       rss_hash_default_key,
	       MLX5_RSS_HASH_KEY_LEN);
	priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
	priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
			dev->data->port_id, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	/* The indirection table cannot address more Rx queues than its size. */
	if (rxqs_n > priv->config.ind_table_max_size) {
		DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
			dev->data->port_id, rxqs_n);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	if (rxqs_n != priv->rxqs_n) {
		DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
			dev->data->port_id, priv->rxqs_n, rxqs_n);
		priv->rxqs_n = rxqs_n;
	}
	/* Default RETA will be configured later (see skip_default_rss_reta). */
	priv->skip_default_rss_reta = 0;
	ret = mlx5_proc_priv_init(dev);
	if (ret)
		return ret;
	return 0;
}
445
446 /**
447  * Configure default RSS reta.
448  *
449  * @param dev
450  *   Pointer to Ethernet device structure.
451  *
452  * @return
453  *   0 on success, a negative errno value otherwise and rte_errno is set.
454  */
455 int
456 mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev)
457 {
458         struct mlx5_priv *priv = dev->data->dev_private;
459         unsigned int rxqs_n = dev->data->nb_rx_queues;
460         unsigned int i;
461         unsigned int j;
462         unsigned int reta_idx_n;
463         int ret = 0;
464         unsigned int *rss_queue_arr = NULL;
465         unsigned int rss_queue_n = 0;
466
467         if (priv->skip_default_rss_reta)
468                 return ret;
469         rss_queue_arr = rte_malloc("", rxqs_n * sizeof(unsigned int), 0);
470         if (!rss_queue_arr) {
471                 DRV_LOG(ERR, "port %u cannot allocate RSS queue list (%u)",
472                         dev->data->port_id, rxqs_n);
473                 rte_errno = ENOMEM;
474                 return -rte_errno;
475         }
476         for (i = 0, j = 0; i < rxqs_n; i++) {
477                 struct mlx5_rxq_data *rxq_data;
478                 struct mlx5_rxq_ctrl *rxq_ctrl;
479
480                 rxq_data = (*priv->rxqs)[i];
481                 rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
482                 if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD)
483                         rss_queue_arr[j++] = i;
484         }
485         rss_queue_n = j;
486         if (rss_queue_n > priv->config.ind_table_max_size) {
487                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
488                         dev->data->port_id, rss_queue_n);
489                 rte_errno = EINVAL;
490                 rte_free(rss_queue_arr);
491                 return -rte_errno;
492         }
493         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
494                 dev->data->port_id, priv->rxqs_n, rxqs_n);
495         priv->rxqs_n = rxqs_n;
496         /*
497          * If the requested number of RX queues is not a power of two,
498          * use the maximum indirection table size for better balancing.
499          * The result is always rounded to the next power of two.
500          */
501         reta_idx_n = (1 << log2above((rss_queue_n & (rss_queue_n - 1)) ?
502                                 priv->config.ind_table_max_size :
503                                 rss_queue_n));
504         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
505         if (ret) {
506                 rte_free(rss_queue_arr);
507                 return ret;
508         }
509         /*
510          * When the number of RX queues is not a power of two,
511          * the remaining table entries are padded with reused WQs
512          * and hashes are not spread uniformly.
513          */
514         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
515                 (*priv->reta_idx)[i] = rss_queue_arr[j];
516                 if (++j == rss_queue_n)
517                         j = 0;
518         }
519         rte_free(rss_queue_arr);
520         return ret;
521 }
522
523 /**
524  * Sets default tuning parameters.
525  *
526  * @param dev
527  *   Pointer to Ethernet device.
528  * @param[out] info
529  *   Info structure output buffer.
530  */
531 static void
532 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
533 {
534         struct mlx5_priv *priv = dev->data->dev_private;
535
536         /* Minimum CPU utilization. */
537         info->default_rxportconf.ring_size = 256;
538         info->default_txportconf.ring_size = 256;
539         info->default_rxportconf.burst_size = MLX5_RX_DEFAULT_BURST;
540         info->default_txportconf.burst_size = MLX5_TX_DEFAULT_BURST;
541         if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
542                 info->default_rxportconf.nb_queues = 16;
543                 info->default_txportconf.nb_queues = 16;
544                 if (dev->data->nb_rx_queues > 2 ||
545                     dev->data->nb_tx_queues > 2) {
546                         /* Max Throughput. */
547                         info->default_rxportconf.ring_size = 2048;
548                         info->default_txportconf.ring_size = 2048;
549                 }
550         } else {
551                 info->default_rxportconf.nb_queues = 8;
552                 info->default_txportconf.nb_queues = 8;
553                 if (dev->data->nb_rx_queues > 2 ||
554                     dev->data->nb_tx_queues > 2) {
555                         /* Max Throughput. */
556                         info->default_rxportconf.ring_size = 4096;
557                         info->default_txportconf.ring_size = 4096;
558                 }
559         }
560 }
561
562 /**
563  * Sets tx mbuf limiting parameters.
564  *
565  * @param dev
566  *   Pointer to Ethernet device.
567  * @param[out] info
568  *   Info structure output buffer.
569  */
570 static void
571 mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
572 {
573         struct mlx5_priv *priv = dev->data->dev_private;
574         struct mlx5_dev_config *config = &priv->config;
575         unsigned int inlen;
576         uint16_t nb_max;
577
578         inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
579                 MLX5_SEND_DEF_INLINE_LEN :
580                 (unsigned int)config->txq_inline_max;
581         assert(config->txq_inline_min >= 0);
582         inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
583         inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
584                                MLX5_ESEG_MIN_INLINE_SIZE -
585                                MLX5_WQE_CSEG_SIZE -
586                                MLX5_WQE_ESEG_SIZE -
587                                MLX5_WQE_DSEG_SIZE * 2);
588         nb_max = (MLX5_WQE_SIZE_MAX +
589                   MLX5_ESEG_MIN_INLINE_SIZE -
590                   MLX5_WQE_CSEG_SIZE -
591                   MLX5_WQE_ESEG_SIZE -
592                   MLX5_WQE_DSEG_SIZE -
593                   inlen) / MLX5_WSEG_SIZE;
594         info->tx_desc_lim.nb_seg_max = nb_max;
595         info->tx_desc_lim.nb_mtu_seg_max = nb_max;
596 }
597
/**
 * DPDK callback to get information about the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] info
 *   Info structure output buffer.
 *
 * @return
 *   0 on success, -ENODEV if the bonding switch port ID cannot be encoded.
 */
int
mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_dev_config *config = &priv->config;
	unsigned int max;

	/* FIXME: we should ask the device for these values. */
	info->min_rx_bufsize = 32;
	info->max_rx_pktlen = 65536;
	info->max_lro_pkt_size = MLX5_MAX_LRO_SIZE;
	/*
	 * Since we need one CQ per QP, the limit is the minimum number
	 * between the two values.
	 */
	max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
		      priv->sh->device_attr.orig_attr.max_qp);
	/* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
	if (max >= 65535)
		max = 65535;
	info->max_rx_queues = max;
	info->max_tx_queues = max;
	info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
	info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
	info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
				 info->rx_queue_offload_capa);
	info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
	info->if_index = mlx5_ifindex(dev);
	/* Report the configured RETA size, or the maximum before configure. */
	info->reta_size = priv->reta_idx_n ?
		priv->reta_idx_n : config->ind_table_max_size;
	info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
	info->speed_capa = priv->link_speed_capa;
	info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
	mlx5_set_default_params(dev, info);
	mlx5_set_txlimit_params(dev, info);
	info->switch_info.name = dev->data->name;
	info->switch_info.domain_id = priv->domain_id;
	info->switch_info.port_id = priv->representor_id;
	if (priv->representor) {
		uint16_t port_id;

		if (priv->pf_bond >= 0) {
			/*
			 * Switch port ID is opaque value with driver defined
			 * format. Push the PF index in bonding configurations
			 * in upper four bits of port ID. If we get too many
			 * representors (more than 4K) or PFs (more than 15)
			 * this approach must be reconsidered.
			 */
			if ((info->switch_info.port_id >>
				MLX5_PORT_ID_BONDING_PF_SHIFT) ||
			    priv->pf_bond > MLX5_PORT_ID_BONDING_PF_MASK) {
				DRV_LOG(ERR, "can't update switch port ID"
					     " for bonding device");
				assert(false);
				return -ENODEV;
			}
			info->switch_info.port_id |=
				priv->pf_bond << MLX5_PORT_ID_BONDING_PF_SHIFT;
		}
		/* Find the master port sharing this switch domain. */
		MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
			struct mlx5_priv *opriv =
				rte_eth_devices[port_id].data->dev_private;

			if (!opriv ||
			    opriv->representor ||
			    opriv->sh != priv->sh ||
			    opriv->domain_id != priv->domain_id)
				continue;
			/*
			 * Override switch name with that of the master
			 * device.
			 */
			info->switch_info.name = opriv->dev_data->name;
			break;
		}
	}
	return 0;
}
685
/**
 * Get device current raw clock counter.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] clock
 *   Current raw clock counter of the device.
 *
 * @return
 *   0 if the clock has correctly been read,
 *   the value of errno in case of error.
 */
int
mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct ibv_context *ctx = priv->sh->ctx;
	struct ibv_values_ex values;
	int err = 0;

	/* Request only the raw (free-running) clock value. */
	values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
	err = mlx5_glue->query_rt_values_ex(ctx, &values);
	if (err != 0) {
		DRV_LOG(WARNING, "Could not query the clock !");
		return err;
	}
	/* The raw counter is delivered in the tv_nsec field. */
	*clock = values.raw_clock.tv_nsec;
	return 0;
}
715
716 /**
717  * Get firmware version of a device.
718  *
719  * @param dev
720  *   Ethernet device port.
721  * @param fw_ver
722  *   String output allocated by caller.
723  * @param fw_size
724  *   Size of the output string, including terminating null byte.
725  *
726  * @return
727  *   0 on success, or the size of the non truncated string if too big.
728  */
729 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
730 {
731         struct mlx5_priv *priv = dev->data->dev_private;
732         struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
733         size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
734
735         if (fw_size < size)
736                 return size;
737         if (fw_ver != NULL)
738                 strlcpy(fw_ver, attr->fw_ver, fw_size);
739         return 0;
740 }
741
742 /**
743  * Get supported packet types.
744  *
745  * @param dev
746  *   Pointer to Ethernet device structure.
747  *
748  * @return
749  *   A pointer to the supported Packet types array.
750  */
751 const uint32_t *
752 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
753 {
754         static const uint32_t ptypes[] = {
755                 /* refers to rxq_cq_to_pkt_type() */
756                 RTE_PTYPE_L2_ETHER,
757                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
758                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
759                 RTE_PTYPE_L4_NONFRAG,
760                 RTE_PTYPE_L4_FRAG,
761                 RTE_PTYPE_L4_TCP,
762                 RTE_PTYPE_L4_UDP,
763                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
764                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
765                 RTE_PTYPE_INNER_L4_NONFRAG,
766                 RTE_PTYPE_INNER_L4_FRAG,
767                 RTE_PTYPE_INNER_L4_TCP,
768                 RTE_PTYPE_INNER_L4_UDP,
769                 RTE_PTYPE_UNKNOWN
770         };
771
772         if (dev->rx_pkt_burst == mlx5_rx_burst ||
773             dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
774             dev->rx_pkt_burst == mlx5_rx_burst_vec)
775                 return ptypes;
776         return NULL;
777 }
778
779 /**
780  * Retrieve the master device for representor in the same switch domain.
781  *
782  * @param dev
783  *   Pointer to representor Ethernet device structure.
784  *
785  * @return
786  *   Master device structure  on success, NULL otherwise.
787  */
788
789 static struct rte_eth_dev *
790 mlx5_find_master_dev(struct rte_eth_dev *dev)
791 {
792         struct mlx5_priv *priv;
793         uint16_t port_id;
794         uint16_t domain_id;
795
796         priv = dev->data->dev_private;
797         domain_id = priv->domain_id;
798         assert(priv->representor);
799         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
800                 struct mlx5_priv *opriv =
801                         rte_eth_devices[port_id].data->dev_private;
802                 if (opriv &&
803                     opriv->master &&
804                     opriv->domain_id == domain_id &&
805                     opriv->sh == priv->sh)
806                         return &rte_eth_devices[port_id];
807         }
808         return NULL;
809 }
810
/**
 * DPDK callback to retrieve physical link information.
 *
 * Uses the legacy ETHTOOL_GSET ioctl; representors may inherit link
 * settings from the master device when the kernel rejects the query.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] link
 *   Storage for current link status.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
			       struct rte_eth_link *link)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;
	int ret;

	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	/* Link is up only when the interface is both UP and RUNNING. */
	dev_link = (struct rte_eth_link) {
		.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING)),
	};
	ifr = (struct ifreq) {
		.ifr_data = (void *)&edata,
	};
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		if (ret == -ENOTSUP && priv->representor) {
			struct rte_eth_dev *master;

			/*
			 * For representors we can try to inherit link
			 * settings from the master device. Actually
			 * link settings do not make a lot of sense
			 * for representors due to missing physical
			 * link. The old kernel drivers supported
			 * emulated settings query for representors,
			 * the new ones do not, so we have to add
			 * this code for compatibility issues.
			 */
			master = mlx5_find_master_dev(dev);
			if (master) {
				ifr = (struct ifreq) {
					.ifr_data = (void *)&edata,
				};
				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
			}
		}
		if (ret) {
			DRV_LOG(WARNING,
				"port %u ioctl(SIOCETHTOOL,"
				" ETHTOOL_GSET) failed: %s",
				dev->data->port_id, strerror(rte_errno));
			return ret;
		}
	}
	link_speed = ethtool_cmd_speed(&edata);
	if (link_speed == -1)
		dev_link.link_speed = ETH_SPEED_NUM_NONE;
	else
		dev_link.link_speed = link_speed;
	/* Translate ethtool "supported" bits into DPDK speed capabilities. */
	priv->link_speed_capa = 0;
	if (edata.supported & SUPPORTED_Autoneg)
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (edata.supported & (SUPPORTED_1000baseT_Full |
			       SUPPORTED_1000baseKX_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (edata.supported & SUPPORTED_10000baseKR_Full)
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
			       SUPPORTED_40000baseCR4_Full |
			       SUPPORTED_40000baseSR4_Full |
			       SUPPORTED_40000baseLR4_Full))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
			ETH_LINK_SPEED_FIXED);
	/*
	 * Speed and status must agree; an inconsistent snapshot means the
	 * link is transitioning, so ask the caller to retry (EAGAIN).
	 */
	if (((dev_link.link_speed && !dev_link.link_status) ||
	     (!dev_link.link_speed && dev_link.link_status))) {
		rte_errno = EAGAIN;
		return -rte_errno;
	}
	*link = dev_link;
	return 0;
}
909
/**
 * Retrieve physical link information (unlocked version using new ioctl).
 *
 * Uses the ETHTOOL_GLINKSETTINGS two-step handshake: a first call with
 * zero mask words makes the kernel reply with the negated number of
 * 32-bit words required, a second call with the proper size retrieves
 * the actual settings.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] link
 *   Storage for current link status.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
			     struct rte_eth_link *link)

{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	struct rte_eth_dev *master = NULL;
	uint64_t sc;
	int ret;

	/* Query administrative/operational state of the kernel netdev. */
	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
	if (ret) {
		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	/* Link is reported up only if the interface is both UP and RUNNING. */
	dev_link = (struct rte_eth_link) {
		.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING)),
	};
	/* Handshake call: gcmd.link_mode_masks_nwords is zero here. */
	ifr = (struct ifreq) {
		.ifr_data = (void *)&gcmd,
	};
	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
	if (ret) {
		if (ret == -ENOTSUP && priv->representor) {
			/*
			 * For representors we can try to inherit link
			 * settings from the master device. Actually
			 * link settings do not make a lot of sense
			 * for representors due to missing physical
			 * link. The old kernel drivers supported
			 * emulated settings query for representors,
			 * the new ones do not, so we have to add
			 * this code for compatibility issues.
			 */
			master = mlx5_find_master_dev(dev);
			if (master) {
				ifr = (struct ifreq) {
					.ifr_data = (void *)&gcmd,
				};
				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
			}
		}
		if (ret) {
			DRV_LOG(DEBUG,
				"port %u ioctl(SIOCETHTOOL,"
				" ETHTOOL_GLINKSETTINGS) failed: %s",
				dev->data->port_id, strerror(rte_errno));
			return ret;
		}

	}
	/* The kernel negates the word count on the handshake reply. */
	gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;

	/*
	 * Room for the fixed header plus three bitmaps (supported,
	 * advertising, lp_advertising) of link_mode_masks_nwords each.
	 */
	alignas(struct ethtool_link_settings)
	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
		     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
	struct ethtool_link_settings *ecmd = (void *)data;

	*ecmd = gcmd;
	ifr.ifr_data = (void *)ecmd;
	/* Query the same device that answered the handshake (master if any). */
	ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
	if (ret) {
		DRV_LOG(DEBUG,
			"port %u ioctl(SIOCETHTOOL,"
			"ETHTOOL_GLINKSETTINGS) failed: %s",
			dev->data->port_id, strerror(rte_errno));
		return ret;
	}
	dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
							    ecmd->speed;
	/* Fold the first 64 supported-mode bits into a single bitmap. */
	sc = ecmd->link_mode_masks[0] |
		((uint64_t)ecmd->link_mode_masks[1] << 32);
	priv->link_speed_capa = 0;
	if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
		priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_20G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_56G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_25G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_50G;
	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
				  ETH_LINK_SPEED_FIXED);
	/*
	 * Speed and status must agree; otherwise report EAGAIN so the
	 * caller retries until the link state settles.
	 */
	if (((dev_link.link_speed && !dev_link.link_status) ||
	     (!dev_link.link_speed && dev_link.link_status))) {
		rte_errno = EAGAIN;
		return -rte_errno;
	}
	*link = dev_link;
	return 0;
}
1045
/**
 * DPDK callback to retrieve physical link information.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion.
 *
 * @return
 *   0 if link status was not updated, positive if it was, a negative errno
 *   value otherwise and rte_errno is set.
 */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	int ret;
	struct rte_eth_link dev_link;
	time_t start_time = time(NULL);
	int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;

	do {
		/* Prefer the new GLINKSETTINGS ioctl, fall back to GSET. */
		ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
		if (ret == -ENOTSUP)
			ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
		if (ret == 0)
			break;
		/* Handle wait to complete situation. */
		if ((wait_to_complete || retry) && ret == -EAGAIN) {
			/*
			 * EAGAIN means the link state is inconsistent;
			 * keep retrying until MLX5_LINK_STATUS_TIMEOUT
			 * seconds have elapsed, then report EBUSY.
			 */
			if (abs((int)difftime(time(NULL), start_time)) <
			    MLX5_LINK_STATUS_TIMEOUT) {
				usleep(0);
				/* NB: continue re-evaluates the while
				 * condition, decrementing retry. */
				continue;
			} else {
				rte_errno = EBUSY;
				return -rte_errno;
			}
		} else if (ret < 0) {
			return ret;
		}
	} while (wait_to_complete || retry-- > 0);
	/* Positive return indicates the stored link status changed. */
	ret = !!memcmp(&dev->data->dev_link, &dev_link,
		       sizeof(struct rte_eth_link));
	dev->data->dev_link = dev_link;
	return ret;
}
1091
1092 /**
1093  * DPDK callback to change the MTU.
1094  *
1095  * @param dev
1096  *   Pointer to Ethernet device structure.
1097  * @param in_mtu
1098  *   New MTU.
1099  *
1100  * @return
1101  *   0 on success, a negative errno value otherwise and rte_errno is set.
1102  */
1103 int
1104 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1105 {
1106         struct mlx5_priv *priv = dev->data->dev_private;
1107         uint16_t kern_mtu = 0;
1108         int ret;
1109
1110         ret = mlx5_get_mtu(dev, &kern_mtu);
1111         if (ret)
1112                 return ret;
1113         /* Set kernel interface MTU first. */
1114         ret = mlx5_set_mtu(dev, mtu);
1115         if (ret)
1116                 return ret;
1117         ret = mlx5_get_mtu(dev, &kern_mtu);
1118         if (ret)
1119                 return ret;
1120         if (kern_mtu == mtu) {
1121                 priv->mtu = mtu;
1122                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
1123                         dev->data->port_id, mtu);
1124                 return 0;
1125         }
1126         rte_errno = EAGAIN;
1127         return -rte_errno;
1128 }
1129
1130 /**
1131  * DPDK callback to get flow control status.
1132  *
1133  * @param dev
1134  *   Pointer to Ethernet device structure.
1135  * @param[out] fc_conf
1136  *   Flow control output buffer.
1137  *
1138  * @return
1139  *   0 on success, a negative errno value otherwise and rte_errno is set.
1140  */
1141 int
1142 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1143 {
1144         struct ifreq ifr;
1145         struct ethtool_pauseparam ethpause = {
1146                 .cmd = ETHTOOL_GPAUSEPARAM
1147         };
1148         int ret;
1149
1150         ifr.ifr_data = (void *)&ethpause;
1151         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1152         if (ret) {
1153                 DRV_LOG(WARNING,
1154                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
1155                         " %s",
1156                         dev->data->port_id, strerror(rte_errno));
1157                 return ret;
1158         }
1159         fc_conf->autoneg = ethpause.autoneg;
1160         if (ethpause.rx_pause && ethpause.tx_pause)
1161                 fc_conf->mode = RTE_FC_FULL;
1162         else if (ethpause.rx_pause)
1163                 fc_conf->mode = RTE_FC_RX_PAUSE;
1164         else if (ethpause.tx_pause)
1165                 fc_conf->mode = RTE_FC_TX_PAUSE;
1166         else
1167                 fc_conf->mode = RTE_FC_NONE;
1168         return 0;
1169 }
1170
1171 /**
1172  * DPDK callback to modify flow control parameters.
1173  *
1174  * @param dev
1175  *   Pointer to Ethernet device structure.
1176  * @param[in] fc_conf
1177  *   Flow control parameters.
1178  *
1179  * @return
1180  *   0 on success, a negative errno value otherwise and rte_errno is set.
1181  */
1182 int
1183 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1184 {
1185         struct ifreq ifr;
1186         struct ethtool_pauseparam ethpause = {
1187                 .cmd = ETHTOOL_SPAUSEPARAM
1188         };
1189         int ret;
1190
1191         ifr.ifr_data = (void *)&ethpause;
1192         ethpause.autoneg = fc_conf->autoneg;
1193         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1194             (fc_conf->mode & RTE_FC_RX_PAUSE))
1195                 ethpause.rx_pause = 1;
1196         else
1197                 ethpause.rx_pause = 0;
1198
1199         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1200             (fc_conf->mode & RTE_FC_TX_PAUSE))
1201                 ethpause.tx_pause = 1;
1202         else
1203                 ethpause.tx_pause = 0;
1204         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1205         if (ret) {
1206                 DRV_LOG(WARNING,
1207                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1208                         " failed: %s",
1209                         dev->data->port_id, strerror(rte_errno));
1210                 return ret;
1211         }
1212         return 0;
1213 }
1214
1215 /**
1216  * Handle asynchronous removal event for entire multiport device.
1217  *
1218  * @param sh
1219  *   Infiniband device shared context.
1220  */
1221 static void
1222 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
1223 {
1224         uint32_t i;
1225
1226         for (i = 0; i < sh->max_port; ++i) {
1227                 struct rte_eth_dev *dev;
1228
1229                 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
1230                         /*
1231                          * Or not existing port either no
1232                          * handler installed for this port.
1233                          */
1234                         continue;
1235                 }
1236                 dev = &rte_eth_devices[sh->port[i].ih_port_id];
1237                 assert(dev);
1238                 if (dev->data->dev_conf.intr_conf.rmv)
1239                         _rte_eth_dev_callback_process
1240                                 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1241         }
1242 }
1243
/**
 * Handle shared asynchronous events the NIC (removal event
 * and link status change). Supports multiport IB device.
 *
 * Every retrieved event is acknowledged exactly once on each path
 * before the loop continues.
 *
 * @param cb_arg
 *   Callback argument (struct mlx5_ibv_shared *).
 */
void
mlx5_dev_interrupt_handler(void *cb_arg)
{
	struct mlx5_ibv_shared *sh = cb_arg;
	struct ibv_async_event event;

	/* Read all message from the IB device and acknowledge them. */
	for (;;) {
		struct rte_eth_dev *dev;
		uint32_t tmp;

		if (mlx5_glue->get_async_event(sh->ctx, &event))
			break;
		/* Retrieve and check IB port index. */
		tmp = (uint32_t)event.element.port_num;
		if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
			/*
			 * The DEVICE_FATAL event is called once for
			 * entire device without port specifying.
			 * We should notify all existing ports.
			 */
			mlx5_glue->ack_async_event(&event);
			mlx5_dev_interrupt_device_fatal(sh);
			continue;
		}
		assert(tmp && (tmp <= sh->max_port));
		if (!tmp) {
			/* Unsupported devive level event. */
			/* Reached only in non-debug builds (assert off). */
			mlx5_glue->ack_async_event(&event);
			DRV_LOG(DEBUG,
				"unsupported common event (type %d)",
				event.event_type);
			continue;
		}
		if (tmp > sh->max_port) {
			/* Invalid IB port index. */
			mlx5_glue->ack_async_event(&event);
			DRV_LOG(DEBUG,
				"cannot handle an event (type %d)"
				"due to invalid IB port index (%u)",
				event.event_type, tmp);
			continue;
		}
		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
			/* No handler installed. */
			mlx5_glue->ack_async_event(&event);
			DRV_LOG(DEBUG,
				"cannot handle an event (type %d)"
				"due to no handler installed for port %u",
				event.event_type, tmp);
			continue;
		}
		/* Retrieve ethernet device descriptor. */
		tmp = sh->port[tmp - 1].ih_port_id;
		dev = &rte_eth_devices[tmp];
		assert(dev);
		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
		     event.event_type == IBV_EVENT_PORT_ERR) &&
			dev->data->dev_conf.intr_conf.lsc) {
			/* Ack before the potentially lengthy link update. */
			mlx5_glue->ack_async_event(&event);
			/* EAGAIN: link state not settled yet, poll again. */
			if (mlx5_link_update(dev, 0) == -EAGAIN) {
				usleep(0);
				continue;
			}
			_rte_eth_dev_callback_process
				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
			continue;
		}
		DRV_LOG(DEBUG,
			"port %u cannot handle an unknown event (type %d)",
			dev->data->port_id, event.event_type);
		mlx5_glue->ack_async_event(&event);
	}
}
1325
/*
 * Unregister callback handler safely. The handler may be active
 * while we are trying to unregister it, in this case code -EAGAIN
 * is returned by rte_intr_callback_unregister(). This routine checks
 * the return code and tries to unregister handler again.
 *
 * @param handle
 *   interrupt handle
 * @param cb_fn
 *   pointer to callback routine
 * @param cb_arg
 *   opaque callback parameter
 */
void
mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
			      rte_intr_callback_fn cb_fn, void *cb_arg)
{
	/*
	 * Try to reduce timeout management overhead by not calling
	 * the timer related routines on the first iteration. If the
	 * unregistering succeeds on first call there will be no
	 * timer calls at all.
	 */
	uint64_t twait = 0;	/* Timer ticks per second, 0 until first retry. */
	uint64_t start = 0;	/* Timestamp of the last warning message. */

	do {
		int ret;

		ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
		if (ret >= 0)
			return;
		if (ret != -EAGAIN) {
			/* Unexpected error - nothing more can be done here. */
			DRV_LOG(INFO, "failed to unregister interrupt"
				      " handler (error: %d)", ret);
			assert(false);
			return;
		}
		if (twait) {
			struct timespec onems;

			/* Wait one millisecond and try again. */
			onems.tv_sec = 0;
			onems.tv_nsec = NS_PER_S / MS_PER_S;
			nanosleep(&onems, 0);
			/* Check whether one second elapsed. */
			if ((rte_get_timer_cycles() - start) <= twait)
				continue;
		} else {
			/*
			 * We get the amount of timer ticks for one second.
			 * If this amount elapsed it means we spent one
			 * second in waiting. This branch is executed once
			 * on first iteration.
			 */
			twait = rte_get_timer_hz();
			assert(twait);
		}
		/*
		 * Timeout elapsed, show message (once a second) and retry.
		 * We have no other acceptable option here, if we ignore
		 * the unregistering return code the handler will not
		 * be unregistered, fd will be closed and we may get the
		 * crush. Hanging and messaging in the loop seems not to be
		 * the worst choice.
		 */
		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
		start = rte_get_timer_cycles();
	} while (true);
}
1396
/**
 * Handle DEVX interrupts from the NIC.
 * This function is probably called from the DPDK host thread.
 *
 * Drains all pending asynchronous DEVX command completions and
 * dispatches each one to the flow counter pool query handler.
 *
 * @param cb_arg
 *   Callback argument (struct mlx5_ibv_shared *).
 */
void
mlx5_dev_interrupt_handler_devx(void *cb_arg)
{
#ifndef HAVE_IBV_DEVX_ASYNC
	/* No DEVX async support compiled in - nothing to do. */
	(void)cb_arg;
	return;
#else
	struct mlx5_ibv_shared *sh = cb_arg;
	/* Buffer large enough for the completion header plus the
	 * flow counter query output it may carry. */
	union {
		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
			    MLX5_ST_SZ_BYTES(traffic_counter) +
			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
	} out;
	/* Command output payload starts right after the header. */
	uint8_t *buf = out.buf + sizeof(out.cmd_resp);

	while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
						   &out.cmd_resp,
						   sizeof(out.buf)))
		mlx5_flow_async_pool_query_handle
			(sh, (uint64_t)out.cmd_resp.wr_id,
			 mlx5_devx_get_out_command_status(buf));
#endif /* HAVE_IBV_DEVX_ASYNC */
}
1428
/**
 * Uninstall shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of single IB device.
 *
 * The actual rte_intr callback is unregistered only when the last
 * port referencing it is removed (reference counted via intr_cnt).
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
static void
mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_ibv_shared *sh = priv->sh;

	/* Interrupt handling is owned by the primary process. */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return;
	pthread_mutex_lock(&sh->intr_mutex);
	assert(priv->ibv_port);
	assert(priv->ibv_port <= sh->max_port);
	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
	/* Nothing to do if no handler was installed for this port. */
	if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
		goto exit;
	assert(sh->port[priv->ibv_port - 1].ih_port_id ==
					(uint32_t)dev->data->port_id);
	assert(sh->intr_cnt);
	/* Mark this port slot as having no handler. */
	sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
	/*
	 * Decrement the reference counter (the !intr_cnt test is purely
	 * defensive); keep the callback while other ports still use it.
	 */
	if (!sh->intr_cnt || --sh->intr_cnt)
		goto exit;
	mlx5_intr_callback_unregister(&sh->intr_handle,
				     mlx5_dev_interrupt_handler, sh);
	sh->intr_handle.fd = 0;
	sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
exit:
	pthread_mutex_unlock(&sh->intr_mutex);
}
1464
/**
 * Uninstall devx shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of single IB device.
 *
 * The DEVX completion channel and its callback are released only
 * when the last referencing port is removed (via devx_intr_cnt).
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
static void
mlx5_dev_shared_handler_devx_uninstall(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_ibv_shared *sh = priv->sh;

	/* Interrupt handling is owned by the primary process. */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return;
	pthread_mutex_lock(&sh->intr_mutex);
	assert(priv->ibv_port);
	assert(priv->ibv_port <= sh->max_port);
	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
	/* Nothing to do if no DEVX handler was installed for this port. */
	if (sh->port[priv->ibv_port - 1].devx_ih_port_id >= RTE_MAX_ETHPORTS)
		goto exit;
	assert(sh->port[priv->ibv_port - 1].devx_ih_port_id ==
					(uint32_t)dev->data->port_id);
	sh->port[priv->ibv_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
	/* Keep shared resources while other ports still reference them. */
	if (!sh->devx_intr_cnt || --sh->devx_intr_cnt)
		goto exit;
	if (sh->intr_handle_devx.fd) {
		rte_intr_callback_unregister(&sh->intr_handle_devx,
					     mlx5_dev_interrupt_handler_devx,
					     sh);
		sh->intr_handle_devx.fd = 0;
		sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
	}
	if (sh->devx_comp) {
		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
		sh->devx_comp = NULL;
	}
exit:
	pthread_mutex_unlock(&sh->intr_mutex);
}
1506
1507 /**
1508  * Install shared asynchronous device events handler.
1509  * This function is implemented to support event sharing
1510  * between multiple ports of single IB device.
1511  *
1512  * @param dev
1513  *   Pointer to Ethernet device.
1514  */
1515 static void
1516 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
1517 {
1518         struct mlx5_priv *priv = dev->data->dev_private;
1519         struct mlx5_ibv_shared *sh = priv->sh;
1520         int ret;
1521         int flags;
1522
1523         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1524                 return;
1525         pthread_mutex_lock(&sh->intr_mutex);
1526         assert(priv->ibv_port);
1527         assert(priv->ibv_port <= sh->max_port);
1528         assert(dev->data->port_id < RTE_MAX_ETHPORTS);
1529         if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
1530                 /* The handler is already installed for this port. */
1531                 assert(sh->intr_cnt);
1532                 goto exit;
1533         }
1534         if (sh->intr_cnt) {
1535                 sh->port[priv->ibv_port - 1].ih_port_id =
1536                                                 (uint32_t)dev->data->port_id;
1537                 sh->intr_cnt++;
1538                 goto exit;
1539         }
1540         /* No shared handler installed. */
1541         assert(sh->ctx->async_fd > 0);
1542         flags = fcntl(sh->ctx->async_fd, F_GETFL);
1543         ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1544         if (ret) {
1545                 DRV_LOG(INFO, "failed to change file descriptor async event"
1546                         " queue");
1547                 /* Indicate there will be no interrupts. */
1548                 dev->data->dev_conf.intr_conf.lsc = 0;
1549                 dev->data->dev_conf.intr_conf.rmv = 0;
1550         } else {
1551                 sh->intr_handle.fd = sh->ctx->async_fd;
1552                 sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
1553                 rte_intr_callback_register(&sh->intr_handle,
1554                                            mlx5_dev_interrupt_handler, sh);
1555                 sh->intr_cnt++;
1556                 sh->port[priv->ibv_port - 1].ih_port_id =
1557                                                 (uint32_t)dev->data->port_id;
1558         }
1559 exit:
1560         pthread_mutex_unlock(&sh->intr_mutex);
1561 }
1562
/**
 * Install devx shared asynchronous device events handler.
 * This function is implemented to support event sharing
 * between multiple ports of single IB device.
 *
 * The handler (and the underlying devx command completion channel) is
 * created once per IB device and reference-counted in sh->devx_intr_cnt;
 * subsequent ports of the same device only register their port id.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
static void
mlx5_dev_shared_handler_devx_install(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_ibv_shared *sh = priv->sh;

	/* Interrupt handlers are owned by the primary process only. */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return;
	pthread_mutex_lock(&sh->intr_mutex);
	assert(priv->ibv_port);
	assert(priv->ibv_port <= sh->max_port);
	assert(dev->data->port_id < RTE_MAX_ETHPORTS);
	if (sh->port[priv->ibv_port - 1].devx_ih_port_id < RTE_MAX_ETHPORTS) {
		/* The handler is already installed for this port. */
		assert(sh->devx_intr_cnt);
		goto exit;
	}
	if (sh->devx_intr_cnt) {
		/*
		 * Handler already installed by another port of this IB
		 * device: just record this port and bump the refcount.
		 */
		sh->devx_intr_cnt++;
		sh->port[priv->ibv_port - 1].devx_ih_port_id =
					(uint32_t)dev->data->port_id;
		goto exit;
	}
	if (priv->config.devx) {
#ifndef HAVE_IBV_DEVX_ASYNC
		/* No async devx support compiled in - nothing to install. */
		goto exit;
#else
		sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
		if (sh->devx_comp) {
			/*
			 * The completion channel fd must be non-blocking so
			 * the interrupt thread never stalls on it.
			 */
			int flags = fcntl(sh->devx_comp->fd, F_GETFL);
			int ret = fcntl(sh->devx_comp->fd, F_SETFL,
				    flags | O_NONBLOCK);

			if (ret) {
				DRV_LOG(INFO, "failed to change file descriptor"
					" devx async event queue");
			} else {
				sh->intr_handle_devx.fd = sh->devx_comp->fd;
				sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
				rte_intr_callback_register
					(&sh->intr_handle_devx,
					 mlx5_dev_interrupt_handler_devx, sh);
				sh->devx_intr_cnt++;
				sh->port[priv->ibv_port - 1].devx_ih_port_id =
						(uint32_t)dev->data->port_id;
			}
		}
#endif /* HAVE_IBV_DEVX_ASYNC */
	}
exit:
	pthread_mutex_unlock(&sh->intr_mutex);
}
1623
/**
 * Uninstall interrupt handler.
 *
 * Thin public wrapper over the shared (per IB device) uninstall routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
{
	mlx5_dev_shared_handler_uninstall(dev);
}
1635
/**
 * Install interrupt handler.
 *
 * Thin public wrapper over the shared (per IB device) install routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
{
	mlx5_dev_shared_handler_install(dev);
}
1647
/**
 * Devx uninstall interrupt handler.
 *
 * Thin public wrapper over the shared devx uninstall routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev)
{
	mlx5_dev_shared_handler_devx_uninstall(dev);
}
1659
/**
 * Devx install interrupt handler.
 *
 * Thin public wrapper over the shared devx install routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev)
{
	mlx5_dev_shared_handler_devx_install(dev);
}
1671
1672 /**
1673  * DPDK callback to bring the link DOWN.
1674  *
1675  * @param dev
1676  *   Pointer to Ethernet device structure.
1677  *
1678  * @return
1679  *   0 on success, a negative errno value otherwise and rte_errno is set.
1680  */
1681 int
1682 mlx5_set_link_down(struct rte_eth_dev *dev)
1683 {
1684         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1685 }
1686
1687 /**
1688  * DPDK callback to bring the link UP.
1689  *
1690  * @param dev
1691  *   Pointer to Ethernet device structure.
1692  *
1693  * @return
1694  *   0 on success, a negative errno value otherwise and rte_errno is set.
1695  */
1696 int
1697 mlx5_set_link_up(struct rte_eth_dev *dev)
1698 {
1699         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1700 }
1701
1702 /**
1703  * Configure the RX function to use.
1704  *
1705  * @param dev
1706  *   Pointer to private data structure.
1707  *
1708  * @return
1709  *   Pointer to selected Rx burst function.
1710  */
1711 eth_rx_burst_t
1712 mlx5_select_rx_function(struct rte_eth_dev *dev)
1713 {
1714         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1715
1716         assert(dev != NULL);
1717         if (mlx5_check_vec_rx_support(dev) > 0) {
1718                 rx_pkt_burst = mlx5_rx_burst_vec;
1719                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1720                         dev->data->port_id);
1721         } else if (mlx5_mprq_enabled(dev)) {
1722                 rx_pkt_burst = mlx5_rx_burst_mprq;
1723         }
1724         return rx_pkt_burst;
1725 }
1726
1727 /**
1728  * Check if mlx5 device was removed.
1729  *
1730  * @param dev
1731  *   Pointer to Ethernet device structure.
1732  *
1733  * @return
1734  *   1 when device is removed, otherwise 0.
1735  */
1736 int
1737 mlx5_is_removed(struct rte_eth_dev *dev)
1738 {
1739         struct ibv_device_attr device_attr;
1740         struct mlx5_priv *priv = dev->data->dev_private;
1741
1742         if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
1743                 return 1;
1744         return 0;
1745 }
1746
1747 /**
1748  * Get the E-Switch parameters by port id.
1749  *
1750  * @param[in] port
1751  *   Device port id.
1752  * @param[in] valid
1753  *   Device port id is valid, skip check. This flag is useful
1754  *   when trials are performed from probing and device is not
1755  *   flagged as valid yet (in attaching process).
1756  * @param[out] es_domain_id
1757  *   E-Switch domain id.
1758  * @param[out] es_port_id
1759  *   The port id of the port in the E-Switch.
1760  *
1761  * @return
1762  *   pointer to device private data structure containing data needed
1763  *   on success, NULL otherwise and rte_errno is set.
1764  */
1765 struct mlx5_priv *
1766 mlx5_port_to_eswitch_info(uint16_t port, bool valid)
1767 {
1768         struct rte_eth_dev *dev;
1769         struct mlx5_priv *priv;
1770
1771         if (port >= RTE_MAX_ETHPORTS) {
1772                 rte_errno = EINVAL;
1773                 return NULL;
1774         }
1775         if (!valid && !rte_eth_dev_is_valid_port(port)) {
1776                 rte_errno = ENODEV;
1777                 return NULL;
1778         }
1779         dev = &rte_eth_devices[port];
1780         priv = dev->data->dev_private;
1781         if (!(priv->representor || priv->master)) {
1782                 rte_errno = EINVAL;
1783                 return NULL;
1784         }
1785         return priv;
1786 }
1787
1788 /**
1789  * Get the E-Switch parameters by device instance.
1790  *
1791  * @param[in] port
1792  *   Device port id.
1793  * @param[out] es_domain_id
1794  *   E-Switch domain id.
1795  * @param[out] es_port_id
1796  *   The port id of the port in the E-Switch.
1797  *
1798  * @return
1799  *   pointer to device private data structure containing data needed
1800  *   on success, NULL otherwise and rte_errno is set.
1801  */
1802 struct mlx5_priv *
1803 mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev)
1804 {
1805         struct mlx5_priv *priv;
1806
1807         priv = dev->data->dev_private;
1808         if (!(priv->representor || priv->master)) {
1809                 rte_errno = EINVAL;
1810                 return NULL;
1811         }
1812         return priv;
1813 }
1814
1815 /**
1816  * Get switch information associated with network interface.
1817  *
1818  * @param ifindex
1819  *   Network interface index.
1820  * @param[out] info
1821  *   Switch information object, populated in case of success.
1822  *
1823  * @return
1824  *   0 on success, a negative errno value otherwise and rte_errno is set.
1825  */
1826 int
1827 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1828 {
1829         char ifname[IF_NAMESIZE];
1830         char port_name[IF_NAMESIZE];
1831         FILE *file;
1832         struct mlx5_switch_info data = {
1833                 .master = 0,
1834                 .representor = 0,
1835                 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1836                 .port_name = 0,
1837                 .switch_id = 0,
1838         };
1839         DIR *dir;
1840         bool port_switch_id_set = false;
1841         bool device_dir = false;
1842         char c;
1843         int ret;
1844
1845         if (!if_indextoname(ifindex, ifname)) {
1846                 rte_errno = errno;
1847                 return -rte_errno;
1848         }
1849
1850         MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1851               ifname);
1852         MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1853               ifname);
1854         MKSTR(pci_device, "/sys/class/net/%s/device",
1855               ifname);
1856
1857         file = fopen(phys_port_name, "rb");
1858         if (file != NULL) {
1859                 ret = fscanf(file, "%s", port_name);
1860                 fclose(file);
1861                 if (ret == 1)
1862                         mlx5_translate_port_name(port_name, &data);
1863         }
1864         file = fopen(phys_switch_id, "rb");
1865         if (file == NULL) {
1866                 rte_errno = errno;
1867                 return -rte_errno;
1868         }
1869         port_switch_id_set =
1870                 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1871                 c == '\n';
1872         fclose(file);
1873         dir = opendir(pci_device);
1874         if (dir != NULL) {
1875                 closedir(dir);
1876                 device_dir = true;
1877         }
1878         if (port_switch_id_set) {
1879                 /* We have some E-Switch configuration. */
1880                 mlx5_sysfs_check_switch_info(device_dir, &data);
1881         }
1882         *info = data;
1883         assert(!(data.master && data.representor));
1884         if (data.master && data.representor) {
1885                 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1886                              " and as representor", ifindex);
1887                 rte_errno = ENODEV;
1888                 return -rte_errno;
1889         }
1890         return 0;
1891 }
1892
1893 /**
1894  * Analyze gathered port parameters via Netlink to recognize master
1895  * and representor devices for E-Switch configuration.
1896  *
1897  * @param[in] num_vf_set
1898  *   flag of presence of number of VFs port attribute.
1899  * @param[inout] switch_info
1900  *   Port information, including port name as a number and port name
1901  *   type if recognized
1902  *
1903  * @return
1904  *   master and representor flags are set in switch_info according to
1905  *   recognized parameters (if any).
1906  */
1907 void
1908 mlx5_nl_check_switch_info(bool num_vf_set,
1909                           struct mlx5_switch_info *switch_info)
1910 {
1911         switch (switch_info->name_type) {
1912         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1913                 /*
1914                  * Name is not recognized, assume the master,
1915                  * check the number of VFs key presence.
1916                  */
1917                 switch_info->master = num_vf_set;
1918                 break;
1919         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1920                 /*
1921                  * Name is not set, this assumes the legacy naming
1922                  * schema for master, just check if there is a
1923                  * number of VFs key.
1924                  */
1925                 switch_info->master = num_vf_set;
1926                 break;
1927         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1928                 /* New uplink naming schema recognized. */
1929                 switch_info->master = 1;
1930                 break;
1931         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1932                 /* Legacy representors naming schema. */
1933                 switch_info->representor = !num_vf_set;
1934                 break;
1935         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1936                 /* New representors naming schema. */
1937                 switch_info->representor = 1;
1938                 break;
1939         }
1940 }
1941
1942 /**
1943  * Analyze gathered port parameters via sysfs to recognize master
1944  * and representor devices for E-Switch configuration.
1945  *
1946  * @param[in] device_dir
1947  *   flag of presence of "device" directory under port device key.
1948  * @param[inout] switch_info
1949  *   Port information, including port name as a number and port name
1950  *   type if recognized
1951  *
1952  * @return
1953  *   master and representor flags are set in switch_info according to
1954  *   recognized parameters (if any).
1955  */
1956 void
1957 mlx5_sysfs_check_switch_info(bool device_dir,
1958                              struct mlx5_switch_info *switch_info)
1959 {
1960         switch (switch_info->name_type) {
1961         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1962                 /*
1963                  * Name is not recognized, assume the master,
1964                  * check the device directory presence.
1965                  */
1966                 switch_info->master = device_dir;
1967                 break;
1968         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1969                 /*
1970                  * Name is not set, this assumes the legacy naming
1971                  * schema for master, just check if there is
1972                  * a device directory.
1973                  */
1974                 switch_info->master = device_dir;
1975                 break;
1976         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1977                 /* New uplink naming schema recognized. */
1978                 switch_info->master = 1;
1979                 break;
1980         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1981                 /* Legacy representors naming schema. */
1982                 switch_info->representor = !device_dir;
1983                 break;
1984         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1985                 /* New representors naming schema. */
1986                 switch_info->representor = 1;
1987                 break;
1988         }
1989 }
1990
1991 /**
1992  * Extract port name, as a number, from sysfs or netlink information.
1993  *
1994  * @param[in] port_name_in
1995  *   String representing the port name.
1996  * @param[out] port_info_out
1997  *   Port information, including port name as a number and port name
1998  *   type if recognized
1999  *
2000  * @return
2001  *   port_name field set according to recognized name format.
2002  */
2003 void
2004 mlx5_translate_port_name(const char *port_name_in,
2005                          struct mlx5_switch_info *port_info_out)
2006 {
2007         char pf_c1, pf_c2, vf_c1, vf_c2;
2008         char *end;
2009         int sc_items;
2010
2011         /*
2012          * Check for port-name as a string of the form pf0vf0
2013          * (support kernel ver >= 5.0 or OFED ver >= 4.6).
2014          */
2015         sc_items = sscanf(port_name_in, "%c%c%d%c%c%d",
2016                           &pf_c1, &pf_c2, &port_info_out->pf_num,
2017                           &vf_c1, &vf_c2, &port_info_out->port_name);
2018         if (sc_items == 6 &&
2019             pf_c1 == 'p' && pf_c2 == 'f' &&
2020             vf_c1 == 'v' && vf_c2 == 'f') {
2021                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_PFVF;
2022                 return;
2023         }
2024         /*
2025          * Check for port-name as a string of the form p0
2026          * (support kernel ver >= 5.0, or OFED ver >= 4.6).
2027          */
2028         sc_items = sscanf(port_name_in, "%c%d",
2029                           &pf_c1, &port_info_out->port_name);
2030         if (sc_items == 2 && pf_c1 == 'p') {
2031                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UPLINK;
2032                 return;
2033         }
2034         /* Check for port-name as a number (support kernel ver < 5.0 */
2035         errno = 0;
2036         port_info_out->port_name = strtol(port_name_in, &end, 0);
2037         if (!errno &&
2038             (size_t)(end - port_name_in) == strlen(port_name_in)) {
2039                 port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_LEGACY;
2040                 return;
2041         }
2042         port_info_out->name_type = MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
2043         return;
2044 }
2045
2046 /**
2047  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
2048  *
2049  * @param dev
2050  *   Pointer to Ethernet device structure.
2051  * @param[out] modinfo
2052  *   Storage for plug-in module EEPROM information.
2053  *
2054  * @return
2055  *   0 on success, a negative errno value otherwise and rte_errno is set.
2056  */
2057 int
2058 mlx5_get_module_info(struct rte_eth_dev *dev,
2059                      struct rte_eth_dev_module_info *modinfo)
2060 {
2061         struct ethtool_modinfo info = {
2062                 .cmd = ETHTOOL_GMODULEINFO,
2063         };
2064         struct ifreq ifr = (struct ifreq) {
2065                 .ifr_data = (void *)&info,
2066         };
2067         int ret = 0;
2068
2069         if (!dev || !modinfo) {
2070                 DRV_LOG(WARNING, "missing argument, cannot get module info");
2071                 rte_errno = EINVAL;
2072                 return -rte_errno;
2073         }
2074         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
2075         if (ret) {
2076                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
2077                         dev->data->port_id, strerror(rte_errno));
2078                 return ret;
2079         }
2080         modinfo->type = info.type;
2081         modinfo->eeprom_len = info.eeprom_len;
2082         return ret;
2083 }
2084
2085 /**
2086  * DPDK callback to retrieve plug-in module EEPROM data.
2087  *
2088  * @param dev
2089  *   Pointer to Ethernet device structure.
2090  * @param[out] info
2091  *   Storage for plug-in module EEPROM data.
2092  *
2093  * @return
2094  *   0 on success, a negative errno value otherwise and rte_errno is set.
2095  */
2096 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
2097                            struct rte_dev_eeprom_info *info)
2098 {
2099         struct ethtool_eeprom *eeprom;
2100         struct ifreq ifr;
2101         int ret = 0;
2102
2103         if (!dev || !info) {
2104                 DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
2105                 rte_errno = EINVAL;
2106                 return -rte_errno;
2107         }
2108         eeprom = rte_calloc(__func__, 1,
2109                             (sizeof(struct ethtool_eeprom) + info->length), 0);
2110         if (!eeprom) {
2111                 DRV_LOG(WARNING, "port %u cannot allocate memory for "
2112                         "eeprom data", dev->data->port_id);
2113                 rte_errno = ENOMEM;
2114                 return -rte_errno;
2115         }
2116         eeprom->cmd = ETHTOOL_GMODULEEEPROM;
2117         eeprom->offset = info->offset;
2118         eeprom->len = info->length;
2119         ifr = (struct ifreq) {
2120                 .ifr_data = (void *)eeprom,
2121         };
2122         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
2123         if (ret)
2124                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
2125                         dev->data->port_id, strerror(rte_errno));
2126         else
2127                 rte_memcpy(info->data, eeprom->data, info->length);
2128         rte_free(eeprom);
2129         return ret;
2130 }
2131
2132 /**
2133  * DPDK callback to retrieve hairpin capabilities.
2134  *
2135  * @param dev
2136  *   Pointer to Ethernet device structure.
2137  * @param[out] cap
2138  *   Storage for hairpin capability data.
2139  *
2140  * @return
2141  *   0 on success, a negative errno value otherwise and rte_errno is set.
2142  */
2143 int mlx5_hairpin_cap_get(struct rte_eth_dev *dev,
2144                          struct rte_eth_hairpin_cap *cap)
2145 {
2146         struct mlx5_priv *priv = dev->data->dev_private;
2147
2148         if (priv->sh->devx == 0) {
2149                 rte_errno = ENOTSUP;
2150                 return -rte_errno;
2151         }
2152         cap->max_nb_queues = UINT16_MAX;
2153         cap->max_rx_2_tx = 1;
2154         cap->max_tx_2_rx = 1;
2155         cap->max_nb_desc = 8192;
2156         return 0;
2157 }