d7d3bc73cec8431e1b33f1f4538c8b39f845b98c
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <stddef.h>
7 #include <inttypes.h>
8 #include <unistd.h>
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <errno.h>
15 #include <dirent.h>
16 #include <net/if.h>
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
22 #include <fcntl.h>
23 #include <stdalign.h>
24 #include <sys/un.h>
25 #include <time.h>
26
27 #include <rte_atomic.h>
28 #include <rte_ethdev_driver.h>
29 #include <rte_bus_pci.h>
30 #include <rte_mbuf.h>
31 #include <rte_common.h>
32 #include <rte_interrupts.h>
33 #include <rte_malloc.h>
34 #include <rte_string_fns.h>
35 #include <rte_rwlock.h>
36 #include <rte_cycles.h>
37
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
41
42 #include "mlx5.h"
43 #include "mlx5_rxtx.h"
44 #include "mlx5_utils.h"
45
46 /* Supported speed values found in /usr/include/linux/ethtool.h */
47 #ifndef HAVE_SUPPORTED_40000baseKR4_Full
48 #define SUPPORTED_40000baseKR4_Full (1 << 23)
49 #endif
50 #ifndef HAVE_SUPPORTED_40000baseCR4_Full
51 #define SUPPORTED_40000baseCR4_Full (1 << 24)
52 #endif
53 #ifndef HAVE_SUPPORTED_40000baseSR4_Full
54 #define SUPPORTED_40000baseSR4_Full (1 << 25)
55 #endif
56 #ifndef HAVE_SUPPORTED_40000baseLR4_Full
57 #define SUPPORTED_40000baseLR4_Full (1 << 26)
58 #endif
59 #ifndef HAVE_SUPPORTED_56000baseKR4_Full
60 #define SUPPORTED_56000baseKR4_Full (1 << 27)
61 #endif
62 #ifndef HAVE_SUPPORTED_56000baseCR4_Full
63 #define SUPPORTED_56000baseCR4_Full (1 << 28)
64 #endif
65 #ifndef HAVE_SUPPORTED_56000baseSR4_Full
66 #define SUPPORTED_56000baseSR4_Full (1 << 29)
67 #endif
68 #ifndef HAVE_SUPPORTED_56000baseLR4_Full
69 #define SUPPORTED_56000baseLR4_Full (1 << 30)
70 #endif
71
72 /* Add defines in case the running kernel is not the same as user headers. */
73 #ifndef ETHTOOL_GLINKSETTINGS
74 struct ethtool_link_settings {
75         uint32_t cmd;
76         uint32_t speed;
77         uint8_t duplex;
78         uint8_t port;
79         uint8_t phy_address;
80         uint8_t autoneg;
81         uint8_t mdio_support;
82         uint8_t eth_to_mdix;
83         uint8_t eth_tp_mdix_ctrl;
84         int8_t link_mode_masks_nwords;
85         uint32_t reserved[8];
86         uint32_t link_mode_masks[];
87 };
88
89 #define ETHTOOL_GLINKSETTINGS 0x0000004c
90 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
91 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
92 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
93 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
94 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
95 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
96 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
97 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
98 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
99 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
100 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
101 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
102 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
103 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
104 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
105 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
106 #endif
107 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
108 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
109 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
110 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
111 #endif
112 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
113 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
114 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
115 #endif
116 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
117 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
118 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
119 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
120 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
121 #endif
122
123 /**
124  * Get master interface name from private structure.
125  *
126  * @param[in] dev
127  *   Pointer to Ethernet device.
128  * @param[out] ifname
129  *   Interface name output buffer.
130  *
131  * @return
132  *   0 on success, a negative errno value otherwise and rte_errno is set.
133  */
134 int
135 mlx5_get_master_ifname(const char *ibdev_path, char (*ifname)[IF_NAMESIZE])
136 {
137         DIR *dir;
138         struct dirent *dent;
139         unsigned int dev_type = 0;
140         unsigned int dev_port_prev = ~0u;
141         char match[IF_NAMESIZE] = "";
142
143         MLX5_ASSERT(ibdev_path);
144         {
145                 MKSTR(path, "%s/device/net", ibdev_path);
146
147                 dir = opendir(path);
148                 if (dir == NULL) {
149                         rte_errno = errno;
150                         return -rte_errno;
151                 }
152         }
153         while ((dent = readdir(dir)) != NULL) {
154                 char *name = dent->d_name;
155                 FILE *file;
156                 unsigned int dev_port;
157                 int r;
158
159                 if ((name[0] == '.') &&
160                     ((name[1] == '\0') ||
161                      ((name[1] == '.') && (name[2] == '\0'))))
162                         continue;
163
164                 MKSTR(path, "%s/device/net/%s/%s",
165                       ibdev_path, name,
166                       (dev_type ? "dev_id" : "dev_port"));
167
168                 file = fopen(path, "rb");
169                 if (file == NULL) {
170                         if (errno != ENOENT)
171                                 continue;
172                         /*
173                          * Switch to dev_id when dev_port does not exist as
174                          * is the case with Linux kernel versions < 3.15.
175                          */
176 try_dev_id:
177                         match[0] = '\0';
178                         if (dev_type)
179                                 break;
180                         dev_type = 1;
181                         dev_port_prev = ~0u;
182                         rewinddir(dir);
183                         continue;
184                 }
185                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
186                 fclose(file);
187                 if (r != 1)
188                         continue;
189                 /*
190                  * Switch to dev_id when dev_port returns the same value for
191                  * all ports. May happen when using a MOFED release older than
192                  * 3.0 with a Linux kernel >= 3.15.
193                  */
194                 if (dev_port == dev_port_prev)
195                         goto try_dev_id;
196                 dev_port_prev = dev_port;
197                 if (dev_port == 0)
198                         strlcpy(match, name, sizeof(match));
199         }
200         closedir(dir);
201         if (match[0] == '\0') {
202                 rte_errno = ENOENT;
203                 return -rte_errno;
204         }
205         strncpy(*ifname, match, sizeof(*ifname));
206         return 0;
207 }
208
209 /**
210  * Get interface name from private structure.
211  *
212  * This is a port representor-aware version of mlx5_get_master_ifname().
213  *
214  * @param[in] dev
215  *   Pointer to Ethernet device.
216  * @param[out] ifname
217  *   Interface name output buffer.
218  *
219  * @return
220  *   0 on success, a negative errno value otherwise and rte_errno is set.
221  */
222 int
223 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
224 {
225         struct mlx5_priv *priv = dev->data->dev_private;
226         unsigned int ifindex;
227
228         MLX5_ASSERT(priv);
229         MLX5_ASSERT(priv->sh);
230         ifindex = mlx5_ifindex(dev);
231         if (!ifindex) {
232                 if (!priv->representor)
233                         return mlx5_get_master_ifname(priv->sh->ibdev_path,
234                                                       ifname);
235                 rte_errno = ENXIO;
236                 return -rte_errno;
237         }
238         if (if_indextoname(ifindex, &(*ifname)[0]))
239                 return 0;
240         rte_errno = errno;
241         return -rte_errno;
242 }
243
244 /**
245  * Get the interface index from device name.
246  *
247  * @param[in] dev
248  *   Pointer to Ethernet device.
249  *
250  * @return
251  *   Nonzero interface index on success, zero otherwise and rte_errno is set.
252  */
253 unsigned int
254 mlx5_ifindex(const struct rte_eth_dev *dev)
255 {
256         struct mlx5_priv *priv = dev->data->dev_private;
257         unsigned int ifindex;
258
259         MLX5_ASSERT(priv);
260         MLX5_ASSERT(priv->if_index);
261         ifindex = priv->if_index;
262         if (!ifindex)
263                 rte_errno = ENXIO;
264         return ifindex;
265 }
266
267 /**
268  * Perform ifreq ioctl() on associated Ethernet device.
269  *
270  * @param[in] dev
271  *   Pointer to Ethernet device.
272  * @param req
273  *   Request number to pass to ioctl().
274  * @param[out] ifr
275  *   Interface request structure output buffer.
276  *
277  * @return
278  *   0 on success, a negative errno value otherwise and rte_errno is set.
279  */
280 int
281 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
282 {
283         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
284         int ret = 0;
285
286         if (sock == -1) {
287                 rte_errno = errno;
288                 return -rte_errno;
289         }
290         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
291         if (ret)
292                 goto error;
293         ret = ioctl(sock, req, ifr);
294         if (ret == -1) {
295                 rte_errno = errno;
296                 goto error;
297         }
298         close(sock);
299         return 0;
300 error:
301         close(sock);
302         return -rte_errno;
303 }
304
305 /**
306  * Get device MTU.
307  *
308  * @param dev
309  *   Pointer to Ethernet device.
310  * @param[out] mtu
311  *   MTU value output buffer.
312  *
313  * @return
314  *   0 on success, a negative errno value otherwise and rte_errno is set.
315  */
316 int
317 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
318 {
319         struct ifreq request;
320         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
321
322         if (ret)
323                 return ret;
324         *mtu = request.ifr_mtu;
325         return 0;
326 }
327
328 /**
329  * Set device MTU.
330  *
331  * @param dev
332  *   Pointer to Ethernet device.
333  * @param mtu
334  *   MTU value to set.
335  *
336  * @return
337  *   0 on success, a negative errno value otherwise and rte_errno is set.
338  */
339 static int
340 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
341 {
342         struct ifreq request = { .ifr_mtu = mtu, };
343
344         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
345 }
346
347 /**
348  * Set device flags.
349  *
350  * @param dev
351  *   Pointer to Ethernet device.
352  * @param keep
353  *   Bitmask for flags that must remain untouched.
354  * @param flags
355  *   Bitmask for flags to modify.
356  *
357  * @return
358  *   0 on success, a negative errno value otherwise and rte_errno is set.
359  */
360 int
361 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
362 {
363         struct ifreq request;
364         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
365
366         if (ret)
367                 return ret;
368         request.ifr_flags &= keep;
369         request.ifr_flags |= flags & ~keep;
370         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
371 }
372
373 /**
374  * DPDK callback for Ethernet device configuration.
375  *
376  * @param dev
377  *   Pointer to Ethernet device structure.
378  *
379  * @return
380  *   0 on success, a negative errno value otherwise and rte_errno is set.
381  */
382 int
383 mlx5_dev_configure(struct rte_eth_dev *dev)
384 {
385         struct mlx5_priv *priv = dev->data->dev_private;
386         unsigned int rxqs_n = dev->data->nb_rx_queues;
387         unsigned int txqs_n = dev->data->nb_tx_queues;
388         const uint8_t use_app_rss_key =
389                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
390         int ret = 0;
391
392         if (use_app_rss_key &&
393             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
394              MLX5_RSS_HASH_KEY_LEN)) {
395                 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
396                         dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
397                 rte_errno = EINVAL;
398                 return -rte_errno;
399         }
400         priv->rss_conf.rss_key =
401                 rte_realloc(priv->rss_conf.rss_key,
402                             MLX5_RSS_HASH_KEY_LEN, 0);
403         if (!priv->rss_conf.rss_key) {
404                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
405                         dev->data->port_id, rxqs_n);
406                 rte_errno = ENOMEM;
407                 return -rte_errno;
408         }
409
410         if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)
411                 dev->data->dev_conf.rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
412
413         memcpy(priv->rss_conf.rss_key,
414                use_app_rss_key ?
415                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
416                rss_hash_default_key,
417                MLX5_RSS_HASH_KEY_LEN);
418         priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
419         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
420         priv->rxqs = (void *)dev->data->rx_queues;
421         priv->txqs = (void *)dev->data->tx_queues;
422         if (txqs_n != priv->txqs_n) {
423                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
424                         dev->data->port_id, priv->txqs_n, txqs_n);
425                 priv->txqs_n = txqs_n;
426         }
427         if (rxqs_n > priv->config.ind_table_max_size) {
428                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
429                         dev->data->port_id, rxqs_n);
430                 rte_errno = EINVAL;
431                 return -rte_errno;
432         }
433         if (rxqs_n != priv->rxqs_n) {
434                 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
435                         dev->data->port_id, priv->rxqs_n, rxqs_n);
436                 priv->rxqs_n = rxqs_n;
437         }
438         priv->skip_default_rss_reta = 0;
439         ret = mlx5_proc_priv_init(dev);
440         if (ret)
441                 return ret;
442         return 0;
443 }
444
445 /**
446  * Configure default RSS reta.
447  *
448  * @param dev
449  *   Pointer to Ethernet device structure.
450  *
451  * @return
452  *   0 on success, a negative errno value otherwise and rte_errno is set.
453  */
454 int
455 mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev)
456 {
457         struct mlx5_priv *priv = dev->data->dev_private;
458         unsigned int rxqs_n = dev->data->nb_rx_queues;
459         unsigned int i;
460         unsigned int j;
461         unsigned int reta_idx_n;
462         int ret = 0;
463         unsigned int *rss_queue_arr = NULL;
464         unsigned int rss_queue_n = 0;
465
466         if (priv->skip_default_rss_reta)
467                 return ret;
468         rss_queue_arr = rte_malloc("", rxqs_n * sizeof(unsigned int), 0);
469         if (!rss_queue_arr) {
470                 DRV_LOG(ERR, "port %u cannot allocate RSS queue list (%u)",
471                         dev->data->port_id, rxqs_n);
472                 rte_errno = ENOMEM;
473                 return -rte_errno;
474         }
475         for (i = 0, j = 0; i < rxqs_n; i++) {
476                 struct mlx5_rxq_data *rxq_data;
477                 struct mlx5_rxq_ctrl *rxq_ctrl;
478
479                 rxq_data = (*priv->rxqs)[i];
480                 rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
481                 if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD)
482                         rss_queue_arr[j++] = i;
483         }
484         rss_queue_n = j;
485         if (rss_queue_n > priv->config.ind_table_max_size) {
486                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
487                         dev->data->port_id, rss_queue_n);
488                 rte_errno = EINVAL;
489                 rte_free(rss_queue_arr);
490                 return -rte_errno;
491         }
492         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
493                 dev->data->port_id, priv->rxqs_n, rxqs_n);
494         priv->rxqs_n = rxqs_n;
495         /*
496          * If the requested number of RX queues is not a power of two,
497          * use the maximum indirection table size for better balancing.
498          * The result is always rounded to the next power of two.
499          */
500         reta_idx_n = (1 << log2above((rss_queue_n & (rss_queue_n - 1)) ?
501                                 priv->config.ind_table_max_size :
502                                 rss_queue_n));
503         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
504         if (ret) {
505                 rte_free(rss_queue_arr);
506                 return ret;
507         }
508         /*
509          * When the number of RX queues is not a power of two,
510          * the remaining table entries are padded with reused WQs
511          * and hashes are not spread uniformly.
512          */
513         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
514                 (*priv->reta_idx)[i] = rss_queue_arr[j];
515                 if (++j == rss_queue_n)
516                         j = 0;
517         }
518         rte_free(rss_queue_arr);
519         return ret;
520 }
521
522 /**
523  * Sets default tuning parameters.
524  *
525  * @param dev
526  *   Pointer to Ethernet device.
527  * @param[out] info
528  *   Info structure output buffer.
529  */
530 static void
531 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
532 {
533         struct mlx5_priv *priv = dev->data->dev_private;
534
535         /* Minimum CPU utilization. */
536         info->default_rxportconf.ring_size = 256;
537         info->default_txportconf.ring_size = 256;
538         info->default_rxportconf.burst_size = MLX5_RX_DEFAULT_BURST;
539         info->default_txportconf.burst_size = MLX5_TX_DEFAULT_BURST;
540         if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
541                 info->default_rxportconf.nb_queues = 16;
542                 info->default_txportconf.nb_queues = 16;
543                 if (dev->data->nb_rx_queues > 2 ||
544                     dev->data->nb_tx_queues > 2) {
545                         /* Max Throughput. */
546                         info->default_rxportconf.ring_size = 2048;
547                         info->default_txportconf.ring_size = 2048;
548                 }
549         } else {
550                 info->default_rxportconf.nb_queues = 8;
551                 info->default_txportconf.nb_queues = 8;
552                 if (dev->data->nb_rx_queues > 2 ||
553                     dev->data->nb_tx_queues > 2) {
554                         /* Max Throughput. */
555                         info->default_rxportconf.ring_size = 4096;
556                         info->default_txportconf.ring_size = 4096;
557                 }
558         }
559 }
560
561 /**
562  * Sets tx mbuf limiting parameters.
563  *
564  * @param dev
565  *   Pointer to Ethernet device.
566  * @param[out] info
567  *   Info structure output buffer.
568  */
569 static void
570 mlx5_set_txlimit_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
571 {
572         struct mlx5_priv *priv = dev->data->dev_private;
573         struct mlx5_dev_config *config = &priv->config;
574         unsigned int inlen;
575         uint16_t nb_max;
576
577         inlen = (config->txq_inline_max == MLX5_ARG_UNSET) ?
578                 MLX5_SEND_DEF_INLINE_LEN :
579                 (unsigned int)config->txq_inline_max;
580         MLX5_ASSERT(config->txq_inline_min >= 0);
581         inlen = RTE_MAX(inlen, (unsigned int)config->txq_inline_min);
582         inlen = RTE_MIN(inlen, MLX5_WQE_SIZE_MAX +
583                                MLX5_ESEG_MIN_INLINE_SIZE -
584                                MLX5_WQE_CSEG_SIZE -
585                                MLX5_WQE_ESEG_SIZE -
586                                MLX5_WQE_DSEG_SIZE * 2);
587         nb_max = (MLX5_WQE_SIZE_MAX +
588                   MLX5_ESEG_MIN_INLINE_SIZE -
589                   MLX5_WQE_CSEG_SIZE -
590                   MLX5_WQE_ESEG_SIZE -
591                   MLX5_WQE_DSEG_SIZE -
592                   inlen) / MLX5_WSEG_SIZE;
593         info->tx_desc_lim.nb_seg_max = nb_max;
594         info->tx_desc_lim.nb_mtu_seg_max = nb_max;
595 }
596
597 /**
598  * DPDK callback to get information about the device.
599  *
600  * @param dev
601  *   Pointer to Ethernet device structure.
602  * @param[out] info
603  *   Info structure output buffer.
604  */
605 int
606 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
607 {
608         struct mlx5_priv *priv = dev->data->dev_private;
609         struct mlx5_dev_config *config = &priv->config;
610         unsigned int max;
611
612         /* FIXME: we should ask the device for these values. */
613         info->min_rx_bufsize = 32;
614         info->max_rx_pktlen = 65536;
615         info->max_lro_pkt_size = MLX5_MAX_LRO_SIZE;
616         /*
617          * Since we need one CQ per QP, the limit is the minimum number
618          * between the two values.
619          */
620         max = RTE_MIN(priv->sh->device_attr.orig_attr.max_cq,
621                       priv->sh->device_attr.orig_attr.max_qp);
622         /* max_rx_queues is uint16_t. */
623         max = RTE_MIN(max, (unsigned int)UINT16_MAX);
624         info->max_rx_queues = max;
625         info->max_tx_queues = max;
626         info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
627         info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
628         info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
629                                  info->rx_queue_offload_capa);
630         info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
631         info->if_index = mlx5_ifindex(dev);
632         info->reta_size = priv->reta_idx_n ?
633                 priv->reta_idx_n : config->ind_table_max_size;
634         info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
635         info->speed_capa = priv->link_speed_capa;
636         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
637         mlx5_set_default_params(dev, info);
638         mlx5_set_txlimit_params(dev, info);
639         info->switch_info.name = dev->data->name;
640         info->switch_info.domain_id = priv->domain_id;
641         info->switch_info.port_id = priv->representor_id;
642         if (priv->representor) {
643                 uint16_t port_id;
644
645                 if (priv->pf_bond >= 0) {
646                         /*
647                          * Switch port ID is opaque value with driver defined
648                          * format. Push the PF index in bonding configurations
649                          * in upper four bits of port ID. If we get too many
650                          * representors (more than 4K) or PFs (more than 15)
651                          * this approach must be reconsidered.
652                          */
653                         if ((info->switch_info.port_id >>
654                                 MLX5_PORT_ID_BONDING_PF_SHIFT) ||
655                             priv->pf_bond > MLX5_PORT_ID_BONDING_PF_MASK) {
656                                 DRV_LOG(ERR, "can't update switch port ID"
657                                              " for bonding device");
658                                 MLX5_ASSERT(false);
659                                 return -ENODEV;
660                         }
661                         info->switch_info.port_id |=
662                                 priv->pf_bond << MLX5_PORT_ID_BONDING_PF_SHIFT;
663                 }
664                 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
665                         struct mlx5_priv *opriv =
666                                 rte_eth_devices[port_id].data->dev_private;
667
668                         if (!opriv ||
669                             opriv->representor ||
670                             opriv->sh != priv->sh ||
671                             opriv->domain_id != priv->domain_id)
672                                 continue;
673                         /*
674                          * Override switch name with that of the master
675                          * device.
676                          */
677                         info->switch_info.name = opriv->dev_data->name;
678                         break;
679                 }
680         }
681         return 0;
682 }
683
684 /**
685  * Get device current raw clock counter
686  *
687  * @param dev
688  *   Pointer to Ethernet device structure.
689  * @param[out] time
690  *   Current raw clock counter of the device.
691  *
692  * @return
693  *   0 if the clock has correctly been read
694  *   The value of errno in case of error
695  */
696 int
697 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
698 {
699         struct mlx5_priv *priv = dev->data->dev_private;
700         struct ibv_context *ctx = priv->sh->ctx;
701         struct ibv_values_ex values;
702         int err = 0;
703
704         values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
705         err = mlx5_glue->query_rt_values_ex(ctx, &values);
706         if (err != 0) {
707                 DRV_LOG(WARNING, "Could not query the clock !");
708                 return err;
709         }
710         *clock = values.raw_clock.tv_nsec;
711         return 0;
712 }
713
714 /**
715  * Get firmware version of a device.
716  *
717  * @param dev
718  *   Ethernet device port.
719  * @param fw_ver
720  *   String output allocated by caller.
721  * @param fw_size
722  *   Size of the output string, including terminating null byte.
723  *
724  * @return
725  *   0 on success, or the size of the non truncated string if too big.
726  */
727 int mlx5_fw_version_get(struct rte_eth_dev *dev, char *fw_ver, size_t fw_size)
728 {
729         struct mlx5_priv *priv = dev->data->dev_private;
730         struct ibv_device_attr *attr = &priv->sh->device_attr.orig_attr;
731         size_t size = strnlen(attr->fw_ver, sizeof(attr->fw_ver)) + 1;
732
733         if (fw_size < size)
734                 return size;
735         if (fw_ver != NULL)
736                 strlcpy(fw_ver, attr->fw_ver, fw_size);
737         return 0;
738 }
739
740 /**
741  * Get supported packet types.
742  *
743  * @param dev
744  *   Pointer to Ethernet device structure.
745  *
746  * @return
747  *   A pointer to the supported Packet types array.
748  */
749 const uint32_t *
750 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
751 {
752         static const uint32_t ptypes[] = {
753                 /* refers to rxq_cq_to_pkt_type() */
754                 RTE_PTYPE_L2_ETHER,
755                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
756                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
757                 RTE_PTYPE_L4_NONFRAG,
758                 RTE_PTYPE_L4_FRAG,
759                 RTE_PTYPE_L4_TCP,
760                 RTE_PTYPE_L4_UDP,
761                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
762                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
763                 RTE_PTYPE_INNER_L4_NONFRAG,
764                 RTE_PTYPE_INNER_L4_FRAG,
765                 RTE_PTYPE_INNER_L4_TCP,
766                 RTE_PTYPE_INNER_L4_UDP,
767                 RTE_PTYPE_UNKNOWN
768         };
769
770         if (dev->rx_pkt_burst == mlx5_rx_burst ||
771             dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
772             dev->rx_pkt_burst == mlx5_rx_burst_vec)
773                 return ptypes;
774         return NULL;
775 }
776
777 /**
778  * Retrieve the master device for representor in the same switch domain.
779  *
780  * @param dev
781  *   Pointer to representor Ethernet device structure.
782  *
783  * @return
784  *   Master device structure  on success, NULL otherwise.
785  */
786
787 static struct rte_eth_dev *
788 mlx5_find_master_dev(struct rte_eth_dev *dev)
789 {
790         struct mlx5_priv *priv;
791         uint16_t port_id;
792         uint16_t domain_id;
793
794         priv = dev->data->dev_private;
795         domain_id = priv->domain_id;
796         MLX5_ASSERT(priv->representor);
797         MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
798                 struct mlx5_priv *opriv =
799                         rte_eth_devices[port_id].data->dev_private;
800                 if (opriv &&
801                     opriv->master &&
802                     opriv->domain_id == domain_id &&
803                     opriv->sh == priv->sh)
804                         return &rte_eth_devices[port_id];
805         }
806         return NULL;
807 }
808
809 /**
810  * DPDK callback to retrieve physical link information.
811  *
812  * @param dev
813  *   Pointer to Ethernet device structure.
814  * @param[out] link
815  *   Storage for current link status.
816  *
817  * @return
818  *   0 on success, a negative errno value otherwise and rte_errno is set.
819  */
820 static int
821 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
822                                struct rte_eth_link *link)
823 {
824         struct mlx5_priv *priv = dev->data->dev_private;
825         struct ethtool_cmd edata = {
826                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
827         };
828         struct ifreq ifr;
829         struct rte_eth_link dev_link;
830         int link_speed = 0;
831         int ret;
832
833         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
834         if (ret) {
835                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
836                         dev->data->port_id, strerror(rte_errno));
837                 return ret;
838         }
839         dev_link = (struct rte_eth_link) {
840                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
841                                 (ifr.ifr_flags & IFF_RUNNING)),
842         };
843         ifr = (struct ifreq) {
844                 .ifr_data = (void *)&edata,
845         };
846         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
847         if (ret) {
848                 if (ret == -ENOTSUP && priv->representor) {
849                         struct rte_eth_dev *master;
850
851                         /*
852                          * For representors we can try to inherit link
853                          * settings from the master device. Actually
854                          * link settings do not make a lot of sense
855                          * for representors due to missing physical
856                          * link. The old kernel drivers supported
857                          * emulated settings query for representors,
858                          * the new ones do not, so we have to add
859                          * this code for compatibility issues.
860                          */
861                         master = mlx5_find_master_dev(dev);
862                         if (master) {
863                                 ifr = (struct ifreq) {
864                                         .ifr_data = (void *)&edata,
865                                 };
866                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
867                         }
868                 }
869                 if (ret) {
870                         DRV_LOG(WARNING,
871                                 "port %u ioctl(SIOCETHTOOL,"
872                                 " ETHTOOL_GSET) failed: %s",
873                                 dev->data->port_id, strerror(rte_errno));
874                         return ret;
875                 }
876         }
877         link_speed = ethtool_cmd_speed(&edata);
878         if (link_speed == -1)
879                 dev_link.link_speed = ETH_SPEED_NUM_NONE;
880         else
881                 dev_link.link_speed = link_speed;
882         priv->link_speed_capa = 0;
883         if (edata.supported & SUPPORTED_Autoneg)
884                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
885         if (edata.supported & (SUPPORTED_1000baseT_Full |
886                                SUPPORTED_1000baseKX_Full))
887                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
888         if (edata.supported & SUPPORTED_10000baseKR_Full)
889                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
890         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
891                                SUPPORTED_40000baseCR4_Full |
892                                SUPPORTED_40000baseSR4_Full |
893                                SUPPORTED_40000baseLR4_Full))
894                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
895         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
896                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
897         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
898                         ETH_LINK_SPEED_FIXED);
899         if (((dev_link.link_speed && !dev_link.link_status) ||
900              (!dev_link.link_speed && dev_link.link_status))) {
901                 rte_errno = EAGAIN;
902                 return -rte_errno;
903         }
904         *link = dev_link;
905         return 0;
906 }
907
908 /**
909  * Retrieve physical link information (unlocked version using new ioctl).
910  *
911  * @param dev
912  *   Pointer to Ethernet device structure.
913  * @param[out] link
914  *   Storage for current link status.
915  *
916  * @return
917  *   0 on success, a negative errno value otherwise and rte_errno is set.
918  */
919 static int
920 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
921                              struct rte_eth_link *link)
922
923 {
924         struct mlx5_priv *priv = dev->data->dev_private;
925         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
926         struct ifreq ifr;
927         struct rte_eth_link dev_link;
928         struct rte_eth_dev *master = NULL;
929         uint64_t sc;
930         int ret;
931
932         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
933         if (ret) {
934                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
935                         dev->data->port_id, strerror(rte_errno));
936                 return ret;
937         }
938         dev_link = (struct rte_eth_link) {
939                 .link_status = ((ifr.ifr_flags & IFF_UP) &&
940                                 (ifr.ifr_flags & IFF_RUNNING)),
941         };
942         ifr = (struct ifreq) {
943                 .ifr_data = (void *)&gcmd,
944         };
945         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
946         if (ret) {
947                 if (ret == -ENOTSUP && priv->representor) {
948                         /*
949                          * For representors we can try to inherit link
950                          * settings from the master device. Actually
951                          * link settings do not make a lot of sense
952                          * for representors due to missing physical
953                          * link. The old kernel drivers supported
954                          * emulated settings query for representors,
955                          * the new ones do not, so we have to add
956                          * this code for compatibility issues.
957                          */
958                         master = mlx5_find_master_dev(dev);
959                         if (master) {
960                                 ifr = (struct ifreq) {
961                                         .ifr_data = (void *)&gcmd,
962                                 };
963                                 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
964                         }
965                 }
966                 if (ret) {
967                         DRV_LOG(DEBUG,
968                                 "port %u ioctl(SIOCETHTOOL,"
969                                 " ETHTOOL_GLINKSETTINGS) failed: %s",
970                                 dev->data->port_id, strerror(rte_errno));
971                         return ret;
972                 }
973
974         }
975         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
976
977         alignas(struct ethtool_link_settings)
978         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
979                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
980         struct ethtool_link_settings *ecmd = (void *)data;
981
982         *ecmd = gcmd;
983         ifr.ifr_data = (void *)ecmd;
984         ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
985         if (ret) {
986                 DRV_LOG(DEBUG,
987                         "port %u ioctl(SIOCETHTOOL,"
988                         "ETHTOOL_GLINKSETTINGS) failed: %s",
989                         dev->data->port_id, strerror(rte_errno));
990                 return ret;
991         }
992         dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
993                                                             ecmd->speed;
994         sc = ecmd->link_mode_masks[0] |
995                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
996         priv->link_speed_capa = 0;
997         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
998                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
999         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
1000                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
1001                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
1002         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
1003                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
1004                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
1005                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
1006         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
1007                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
1008                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
1009         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
1010                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
1011                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
1012                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
1013                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
1014         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
1015                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
1016                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
1017                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
1018                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
1019         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
1020                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
1021                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
1022                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
1023         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
1024                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
1025                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
1026         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
1027                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
1028                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
1029                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
1030                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
1031         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
1032                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
1033         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
1034                                   ETH_LINK_SPEED_FIXED);
1035         if (((dev_link.link_speed && !dev_link.link_status) ||
1036              (!dev_link.link_speed && dev_link.link_status))) {
1037                 rte_errno = EAGAIN;
1038                 return -rte_errno;
1039         }
1040         *link = dev_link;
1041         return 0;
1042 }
1043
1044 /**
1045  * DPDK callback to retrieve physical link information.
1046  *
1047  * @param dev
1048  *   Pointer to Ethernet device structure.
1049  * @param wait_to_complete
1050  *   Wait for request completion.
1051  *
1052  * @return
1053  *   0 if link status was not updated, positive if it was, a negative errno
1054  *   value otherwise and rte_errno is set.
1055  */
1056 int
1057 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
1058 {
1059         int ret;
1060         struct rte_eth_link dev_link;
1061         time_t start_time = time(NULL);
1062         int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
1063
1064         do {
1065                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
1066                 if (ret == -ENOTSUP)
1067                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
1068                 if (ret == 0)
1069                         break;
1070                 /* Handle wait to complete situation. */
1071                 if ((wait_to_complete || retry) && ret == -EAGAIN) {
1072                         if (abs((int)difftime(time(NULL), start_time)) <
1073                             MLX5_LINK_STATUS_TIMEOUT) {
1074                                 usleep(0);
1075                                 continue;
1076                         } else {
1077                                 rte_errno = EBUSY;
1078                                 return -rte_errno;
1079                         }
1080                 } else if (ret < 0) {
1081                         return ret;
1082                 }
1083         } while (wait_to_complete || retry-- > 0);
1084         ret = !!memcmp(&dev->data->dev_link, &dev_link,
1085                        sizeof(struct rte_eth_link));
1086         dev->data->dev_link = dev_link;
1087         return ret;
1088 }
1089
1090 /**
1091  * DPDK callback to change the MTU.
1092  *
1093  * @param dev
1094  *   Pointer to Ethernet device structure.
1095  * @param in_mtu
1096  *   New MTU.
1097  *
1098  * @return
1099  *   0 on success, a negative errno value otherwise and rte_errno is set.
1100  */
1101 int
1102 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
1103 {
1104         struct mlx5_priv *priv = dev->data->dev_private;
1105         uint16_t kern_mtu = 0;
1106         int ret;
1107
1108         ret = mlx5_get_mtu(dev, &kern_mtu);
1109         if (ret)
1110                 return ret;
1111         /* Set kernel interface MTU first. */
1112         ret = mlx5_set_mtu(dev, mtu);
1113         if (ret)
1114                 return ret;
1115         ret = mlx5_get_mtu(dev, &kern_mtu);
1116         if (ret)
1117                 return ret;
1118         if (kern_mtu == mtu) {
1119                 priv->mtu = mtu;
1120                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
1121                         dev->data->port_id, mtu);
1122                 return 0;
1123         }
1124         rte_errno = EAGAIN;
1125         return -rte_errno;
1126 }
1127
1128 /**
1129  * DPDK callback to get flow control status.
1130  *
1131  * @param dev
1132  *   Pointer to Ethernet device structure.
1133  * @param[out] fc_conf
1134  *   Flow control output buffer.
1135  *
1136  * @return
1137  *   0 on success, a negative errno value otherwise and rte_errno is set.
1138  */
1139 int
1140 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1141 {
1142         struct ifreq ifr;
1143         struct ethtool_pauseparam ethpause = {
1144                 .cmd = ETHTOOL_GPAUSEPARAM
1145         };
1146         int ret;
1147
1148         ifr.ifr_data = (void *)&ethpause;
1149         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1150         if (ret) {
1151                 DRV_LOG(WARNING,
1152                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
1153                         " %s",
1154                         dev->data->port_id, strerror(rte_errno));
1155                 return ret;
1156         }
1157         fc_conf->autoneg = ethpause.autoneg;
1158         if (ethpause.rx_pause && ethpause.tx_pause)
1159                 fc_conf->mode = RTE_FC_FULL;
1160         else if (ethpause.rx_pause)
1161                 fc_conf->mode = RTE_FC_RX_PAUSE;
1162         else if (ethpause.tx_pause)
1163                 fc_conf->mode = RTE_FC_TX_PAUSE;
1164         else
1165                 fc_conf->mode = RTE_FC_NONE;
1166         return 0;
1167 }
1168
1169 /**
1170  * DPDK callback to modify flow control parameters.
1171  *
1172  * @param dev
1173  *   Pointer to Ethernet device structure.
1174  * @param[in] fc_conf
1175  *   Flow control parameters.
1176  *
1177  * @return
1178  *   0 on success, a negative errno value otherwise and rte_errno is set.
1179  */
1180 int
1181 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1182 {
1183         struct ifreq ifr;
1184         struct ethtool_pauseparam ethpause = {
1185                 .cmd = ETHTOOL_SPAUSEPARAM
1186         };
1187         int ret;
1188
1189         ifr.ifr_data = (void *)&ethpause;
1190         ethpause.autoneg = fc_conf->autoneg;
1191         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1192             (fc_conf->mode & RTE_FC_RX_PAUSE))
1193                 ethpause.rx_pause = 1;
1194         else
1195                 ethpause.rx_pause = 0;
1196
1197         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1198             (fc_conf->mode & RTE_FC_TX_PAUSE))
1199                 ethpause.tx_pause = 1;
1200         else
1201                 ethpause.tx_pause = 0;
1202         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1203         if (ret) {
1204                 DRV_LOG(WARNING,
1205                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1206                         " failed: %s",
1207                         dev->data->port_id, strerror(rte_errno));
1208                 return ret;
1209         }
1210         return 0;
1211 }
1212
1213 /**
1214  * Handle asynchronous removal event for entire multiport device.
1215  *
1216  * @param sh
1217  *   Infiniband device shared context.
1218  */
1219 static void
1220 mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
1221 {
1222         uint32_t i;
1223
1224         for (i = 0; i < sh->max_port; ++i) {
1225                 struct rte_eth_dev *dev;
1226
1227                 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
1228                         /*
1229                          * Or not existing port either no
1230                          * handler installed for this port.
1231                          */
1232                         continue;
1233                 }
1234                 dev = &rte_eth_devices[sh->port[i].ih_port_id];
1235                 MLX5_ASSERT(dev);
1236                 if (dev->data->dev_conf.intr_conf.rmv)
1237                         _rte_eth_dev_callback_process
1238                                 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1239         }
1240 }
1241
1242 /**
1243  * Handle shared asynchronous events the NIC (removal event
1244  * and link status change). Supports multiport IB device.
1245  *
1246  * @param cb_arg
1247  *   Callback argument.
1248  */
1249 void
1250 mlx5_dev_interrupt_handler(void *cb_arg)
1251 {
1252         struct mlx5_ibv_shared *sh = cb_arg;
1253         struct ibv_async_event event;
1254
1255         /* Read all message from the IB device and acknowledge them. */
1256         for (;;) {
1257                 struct rte_eth_dev *dev;
1258                 uint32_t tmp;
1259
1260                 if (mlx5_glue->get_async_event(sh->ctx, &event))
1261                         break;
1262                 /* Retrieve and check IB port index. */
1263                 tmp = (uint32_t)event.element.port_num;
1264                 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
1265                         /*
1266                          * The DEVICE_FATAL event is called once for
1267                          * entire device without port specifying.
1268                          * We should notify all existing ports.
1269                          */
1270                         mlx5_glue->ack_async_event(&event);
1271                         mlx5_dev_interrupt_device_fatal(sh);
1272                         continue;
1273                 }
1274                 MLX5_ASSERT(tmp && (tmp <= sh->max_port));
1275                 if (!tmp) {
1276                         /* Unsupported devive level event. */
1277                         mlx5_glue->ack_async_event(&event);
1278                         DRV_LOG(DEBUG,
1279                                 "unsupported common event (type %d)",
1280                                 event.event_type);
1281                         continue;
1282                 }
1283                 if (tmp > sh->max_port) {
1284                         /* Invalid IB port index. */
1285                         mlx5_glue->ack_async_event(&event);
1286                         DRV_LOG(DEBUG,
1287                                 "cannot handle an event (type %d)"
1288                                 "due to invalid IB port index (%u)",
1289                                 event.event_type, tmp);
1290                         continue;
1291                 }
1292                 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
1293                         /* No handler installed. */
1294                         mlx5_glue->ack_async_event(&event);
1295                         DRV_LOG(DEBUG,
1296                                 "cannot handle an event (type %d)"
1297                                 "due to no handler installed for port %u",
1298                                 event.event_type, tmp);
1299                         continue;
1300                 }
1301                 /* Retrieve ethernet device descriptor. */
1302                 tmp = sh->port[tmp - 1].ih_port_id;
1303                 dev = &rte_eth_devices[tmp];
1304                 MLX5_ASSERT(dev);
1305                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1306                      event.event_type == IBV_EVENT_PORT_ERR) &&
1307                         dev->data->dev_conf.intr_conf.lsc) {
1308                         mlx5_glue->ack_async_event(&event);
1309                         if (mlx5_link_update(dev, 0) == -EAGAIN) {
1310                                 usleep(0);
1311                                 continue;
1312                         }
1313                         _rte_eth_dev_callback_process
1314                                 (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1315                         continue;
1316                 }
1317                 DRV_LOG(DEBUG,
1318                         "port %u cannot handle an unknown event (type %d)",
1319                         dev->data->port_id, event.event_type);
1320                 mlx5_glue->ack_async_event(&event);
1321         }
1322 }
1323
1324 /*
1325  * Unregister callback handler safely. The handler may be active
1326  * while we are trying to unregister it, in this case code -EAGAIN
1327  * is returned by rte_intr_callback_unregister(). This routine checks
1328  * the return code and tries to unregister handler again.
1329  *
1330  * @param handle
1331  *   interrupt handle
1332  * @param cb_fn
1333  *   pointer to callback routine
1334  * @cb_arg
1335  *   opaque callback parameter
1336  */
1337 void
1338 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
1339                               rte_intr_callback_fn cb_fn, void *cb_arg)
1340 {
1341         /*
1342          * Try to reduce timeout management overhead by not calling
1343          * the timer related routines on the first iteration. If the
1344          * unregistering succeeds on first call there will be no
1345          * timer calls at all.
1346          */
1347         uint64_t twait = 0;
1348         uint64_t start = 0;
1349
1350         do {
1351                 int ret;
1352
1353                 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
1354                 if (ret >= 0)
1355                         return;
1356                 if (ret != -EAGAIN) {
1357                         DRV_LOG(INFO, "failed to unregister interrupt"
1358                                       " handler (error: %d)", ret);
1359                         MLX5_ASSERT(false);
1360                         return;
1361                 }
1362                 if (twait) {
1363                         struct timespec onems;
1364
1365                         /* Wait one millisecond and try again. */
1366                         onems.tv_sec = 0;
1367                         onems.tv_nsec = NS_PER_S / MS_PER_S;
1368                         nanosleep(&onems, 0);
1369                         /* Check whether one second elapsed. */
1370                         if ((rte_get_timer_cycles() - start) <= twait)
1371                                 continue;
1372                 } else {
1373                         /*
1374                          * We get the amount of timer ticks for one second.
1375                          * If this amount elapsed it means we spent one
1376                          * second in waiting. This branch is executed once
1377                          * on first iteration.
1378                          */
1379                         twait = rte_get_timer_hz();
1380                         MLX5_ASSERT(twait);
1381                 }
1382                 /*
1383                  * Timeout elapsed, show message (once a second) and retry.
1384                  * We have no other acceptable option here, if we ignore
1385                  * the unregistering return code the handler will not
1386                  * be unregistered, fd will be closed and we may get the
1387                  * crush. Hanging and messaging in the loop seems not to be
1388                  * the worst choice.
1389                  */
1390                 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
1391                 start = rte_get_timer_cycles();
1392         } while (true);
1393 }
1394
/**
 * Handle DEVX interrupts from the NIC.
 * This function is probably called from the DPDK host thread.
 *
 * All pending asynchronous command completions are read and each one is
 * passed to the flow counter pool query handler. Without
 * HAVE_IBV_DEVX_ASYNC the function does nothing.
 *
 * @param cb_arg
 *   Callback argument, the shared IB device context.
 */
void
mlx5_dev_interrupt_handler_devx(void *cb_arg)
{
#ifdef HAVE_IBV_DEVX_ASYNC
	struct mlx5_ibv_shared *sh = cb_arg;
	/* Completion header immediately followed by the command output. */
	union {
		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
			    MLX5_ST_SZ_BYTES(traffic_counter) +
			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
	} resp;
	uint8_t *payload = resp.buf + sizeof(resp.cmd_resp);

	for (;;) {
		if (mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
						       &resp.cmd_resp,
						       sizeof(resp.buf)))
			break;
		mlx5_flow_async_pool_query_handle
			(sh, (uint64_t)resp.cmd_resp.wr_id,
			 mlx5_devx_get_out_command_status(payload));
	}
#else
	(void)cb_arg;
#endif /* HAVE_IBV_DEVX_ASYNC */
}
1426
1427 /**
1428  * Uninstall shared asynchronous device events handler.
1429  * This function is implemented to support event sharing
1430  * between multiple ports of single IB device.
1431  *
1432  * @param dev
1433  *   Pointer to Ethernet device.
1434  */
1435 static void
1436 mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
1437 {
1438         struct mlx5_priv *priv = dev->data->dev_private;
1439         struct mlx5_ibv_shared *sh = priv->sh;
1440
1441         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1442                 return;
1443         pthread_mutex_lock(&sh->intr_mutex);
1444         MLX5_ASSERT(priv->ibv_port);
1445         MLX5_ASSERT(priv->ibv_port <= sh->max_port);
1446         MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
1447         if (sh->port[priv->ibv_port - 1].ih_port_id >= RTE_MAX_ETHPORTS)
1448                 goto exit;
1449         MLX5_ASSERT(sh->port[priv->ibv_port - 1].ih_port_id ==
1450                                         (uint32_t)dev->data->port_id);
1451         MLX5_ASSERT(sh->intr_cnt);
1452         sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
1453         if (!sh->intr_cnt || --sh->intr_cnt)
1454                 goto exit;
1455         mlx5_intr_callback_unregister(&sh->intr_handle,
1456                                      mlx5_dev_interrupt_handler, sh);
1457         sh->intr_handle.fd = 0;
1458         sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1459 exit:
1460         pthread_mutex_unlock(&sh->intr_mutex);
1461 }
1462
1463 /**
1464  * Uninstall devx shared asynchronous device events handler.
1465  * This function is implemeted to support event sharing
1466  * between multiple ports of single IB device.
1467  *
1468  * @param dev
1469  *   Pointer to Ethernet device.
1470  */
1471 static void
1472 mlx5_dev_shared_handler_devx_uninstall(struct rte_eth_dev *dev)
1473 {
1474         struct mlx5_priv *priv = dev->data->dev_private;
1475         struct mlx5_ibv_shared *sh = priv->sh;
1476
1477         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1478                 return;
1479         pthread_mutex_lock(&sh->intr_mutex);
1480         MLX5_ASSERT(priv->ibv_port);
1481         MLX5_ASSERT(priv->ibv_port <= sh->max_port);
1482         MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
1483         if (sh->port[priv->ibv_port - 1].devx_ih_port_id >= RTE_MAX_ETHPORTS)
1484                 goto exit;
1485         MLX5_ASSERT(sh->port[priv->ibv_port - 1].devx_ih_port_id ==
1486                     (uint32_t)dev->data->port_id);
1487         sh->port[priv->ibv_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
1488         if (!sh->devx_intr_cnt || --sh->devx_intr_cnt)
1489                 goto exit;
1490         if (sh->intr_handle_devx.fd) {
1491                 rte_intr_callback_unregister(&sh->intr_handle_devx,
1492                                              mlx5_dev_interrupt_handler_devx,
1493                                              sh);
1494                 sh->intr_handle_devx.fd = 0;
1495                 sh->intr_handle_devx.type = RTE_INTR_HANDLE_UNKNOWN;
1496         }
1497         if (sh->devx_comp) {
1498                 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
1499                 sh->devx_comp = NULL;
1500         }
1501 exit:
1502         pthread_mutex_unlock(&sh->intr_mutex);
1503 }
1504
1505 /**
1506  * Install shared asynchronous device events handler.
1507  * This function is implemented to support event sharing
1508  * between multiple ports of single IB device.
1509  *
1510  * @param dev
1511  *   Pointer to Ethernet device.
1512  */
1513 static void
1514 mlx5_dev_shared_handler_install(struct rte_eth_dev *dev)
1515 {
1516         struct mlx5_priv *priv = dev->data->dev_private;
1517         struct mlx5_ibv_shared *sh = priv->sh;
1518         int ret;
1519         int flags;
1520
1521         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1522                 return;
1523         pthread_mutex_lock(&sh->intr_mutex);
1524         MLX5_ASSERT(priv->ibv_port);
1525         MLX5_ASSERT(priv->ibv_port <= sh->max_port);
1526         MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
1527         if (sh->port[priv->ibv_port - 1].ih_port_id < RTE_MAX_ETHPORTS) {
1528                 /* The handler is already installed for this port. */
1529                 MLX5_ASSERT(sh->intr_cnt);
1530                 goto exit;
1531         }
1532         if (sh->intr_cnt) {
1533                 sh->port[priv->ibv_port - 1].ih_port_id =
1534                                                 (uint32_t)dev->data->port_id;
1535                 sh->intr_cnt++;
1536                 goto exit;
1537         }
1538         /* No shared handler installed. */
1539         MLX5_ASSERT(sh->ctx->async_fd > 0);
1540         flags = fcntl(sh->ctx->async_fd, F_GETFL);
1541         ret = fcntl(sh->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1542         if (ret) {
1543                 DRV_LOG(INFO, "failed to change file descriptor async event"
1544                         " queue");
1545                 /* Indicate there will be no interrupts. */
1546                 dev->data->dev_conf.intr_conf.lsc = 0;
1547                 dev->data->dev_conf.intr_conf.rmv = 0;
1548         } else {
1549                 sh->intr_handle.fd = sh->ctx->async_fd;
1550                 sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
1551                 rte_intr_callback_register(&sh->intr_handle,
1552                                            mlx5_dev_interrupt_handler, sh);
1553                 sh->intr_cnt++;
1554                 sh->port[priv->ibv_port - 1].ih_port_id =
1555                                                 (uint32_t)dev->data->port_id;
1556         }
1557 exit:
1558         pthread_mutex_unlock(&sh->intr_mutex);
1559 }
1560
1561 /**
1562  * Install devx shared asyncronous device events handler.
1563  * This function is implemeted to support event sharing
1564  * between multiple ports of single IB device.
1565  *
1566  * @param dev
1567  *   Pointer to Ethernet device.
1568  */
1569 static void
1570 mlx5_dev_shared_handler_devx_install(struct rte_eth_dev *dev)
1571 {
1572         struct mlx5_priv *priv = dev->data->dev_private;
1573         struct mlx5_ibv_shared *sh = priv->sh;
1574
1575         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1576                 return;
1577         pthread_mutex_lock(&sh->intr_mutex);
1578         MLX5_ASSERT(priv->ibv_port);
1579         MLX5_ASSERT(priv->ibv_port <= sh->max_port);
1580         MLX5_ASSERT(dev->data->port_id < RTE_MAX_ETHPORTS);
1581         if (sh->port[priv->ibv_port - 1].devx_ih_port_id < RTE_MAX_ETHPORTS) {
1582                 /* The handler is already installed for this port. */
1583                 MLX5_ASSERT(sh->devx_intr_cnt);
1584                 goto exit;
1585         }
1586         if (sh->devx_intr_cnt) {
1587                 sh->devx_intr_cnt++;
1588                 sh->port[priv->ibv_port - 1].devx_ih_port_id =
1589                                         (uint32_t)dev->data->port_id;
1590                 goto exit;
1591         }
1592         if (priv->config.devx) {
1593 #ifndef HAVE_IBV_DEVX_ASYNC
1594                 goto exit;
1595 #else
1596                 sh->devx_comp = mlx5_glue->devx_create_cmd_comp(sh->ctx);
1597                 if (sh->devx_comp) {
1598                         int flags = fcntl(sh->devx_comp->fd, F_GETFL);
1599                         int ret = fcntl(sh->devx_comp->fd, F_SETFL,
1600                                     flags | O_NONBLOCK);
1601
1602                         if (ret) {
1603                                 DRV_LOG(INFO, "failed to change file descriptor"
1604                                         " devx async event queue");
1605                         } else {
1606                                 sh->intr_handle_devx.fd = sh->devx_comp->fd;
1607                                 sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
1608                                 rte_intr_callback_register
1609                                         (&sh->intr_handle_devx,
1610                                          mlx5_dev_interrupt_handler_devx, sh);
1611                                 sh->devx_intr_cnt++;
1612                                 sh->port[priv->ibv_port - 1].devx_ih_port_id =
1613                                                 (uint32_t)dev->data->port_id;
1614                         }
1615                 }
1616 #endif /* HAVE_IBV_DEVX_ASYNC */
1617         }
1618 exit:
1619         pthread_mutex_unlock(&sh->intr_mutex);
1620 }
1621
/**
 * Uninstall the device interrupt handler of a port.
 *
 * Public entry point, the work is delegated to the shared
 * handler uninstall routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
{
        /* Forward to the shared (per IB device) implementation. */
        mlx5_dev_shared_handler_uninstall(dev);
}
1633
/**
 * Install the device interrupt handler of a port.
 *
 * Public entry point, the work is delegated to the shared
 * handler install routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
{
        /* Forward to the shared (per IB device) implementation. */
        mlx5_dev_shared_handler_install(dev);
}
1645
/**
 * Uninstall the devx interrupt handler of a port.
 *
 * Public entry point, the work is delegated to the shared
 * devx handler uninstall routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_devx_uninstall(struct rte_eth_dev *dev)
{
        /* Forward to the shared (per IB device) implementation. */
        mlx5_dev_shared_handler_devx_uninstall(dev);
}
1657
/**
 * Install the devx interrupt handler of a port.
 *
 * Public entry point, the work is delegated to the shared
 * devx handler install routine.
 *
 * @param dev
 *   Pointer to Ethernet device.
 */
void
mlx5_dev_interrupt_handler_devx_install(struct rte_eth_dev *dev)
{
        /* Forward to the shared (per IB device) implementation. */
        mlx5_dev_shared_handler_devx_install(dev);
}
1669
1670 /**
1671  * DPDK callback to bring the link DOWN.
1672  *
1673  * @param dev
1674  *   Pointer to Ethernet device structure.
1675  *
1676  * @return
1677  *   0 on success, a negative errno value otherwise and rte_errno is set.
1678  */
1679 int
1680 mlx5_set_link_down(struct rte_eth_dev *dev)
1681 {
1682         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1683 }
1684
1685 /**
1686  * DPDK callback to bring the link UP.
1687  *
1688  * @param dev
1689  *   Pointer to Ethernet device structure.
1690  *
1691  * @return
1692  *   0 on success, a negative errno value otherwise and rte_errno is set.
1693  */
1694 int
1695 mlx5_set_link_up(struct rte_eth_dev *dev)
1696 {
1697         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1698 }
1699
1700 /**
1701  * Configure the RX function to use.
1702  *
1703  * @param dev
1704  *   Pointer to private data structure.
1705  *
1706  * @return
1707  *   Pointer to selected Rx burst function.
1708  */
1709 eth_rx_burst_t
1710 mlx5_select_rx_function(struct rte_eth_dev *dev)
1711 {
1712         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1713
1714         MLX5_ASSERT(dev != NULL);
1715         if (mlx5_check_vec_rx_support(dev) > 0) {
1716                 rx_pkt_burst = mlx5_rx_burst_vec;
1717                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1718                         dev->data->port_id);
1719         } else if (mlx5_mprq_enabled(dev)) {
1720                 rx_pkt_burst = mlx5_rx_burst_mprq;
1721         }
1722         return rx_pkt_burst;
1723 }
1724
1725 /**
1726  * Check if mlx5 device was removed.
1727  *
1728  * @param dev
1729  *   Pointer to Ethernet device structure.
1730  *
1731  * @return
1732  *   1 when device is removed, otherwise 0.
1733  */
1734 int
1735 mlx5_is_removed(struct rte_eth_dev *dev)
1736 {
1737         struct ibv_device_attr device_attr;
1738         struct mlx5_priv *priv = dev->data->dev_private;
1739
1740         if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
1741                 return 1;
1742         return 0;
1743 }
1744
1745 /**
1746  * Get the E-Switch parameters by port id.
1747  *
1748  * @param[in] port
1749  *   Device port id.
1750  * @param[in] valid
1751  *   Device port id is valid, skip check. This flag is useful
1752  *   when trials are performed from probing and device is not
1753  *   flagged as valid yet (in attaching process).
1754  * @param[out] es_domain_id
1755  *   E-Switch domain id.
1756  * @param[out] es_port_id
1757  *   The port id of the port in the E-Switch.
1758  *
1759  * @return
1760  *   pointer to device private data structure containing data needed
1761  *   on success, NULL otherwise and rte_errno is set.
1762  */
1763 struct mlx5_priv *
1764 mlx5_port_to_eswitch_info(uint16_t port, bool valid)
1765 {
1766         struct rte_eth_dev *dev;
1767         struct mlx5_priv *priv;
1768
1769         if (port >= RTE_MAX_ETHPORTS) {
1770                 rte_errno = EINVAL;
1771                 return NULL;
1772         }
1773         if (!valid && !rte_eth_dev_is_valid_port(port)) {
1774                 rte_errno = ENODEV;
1775                 return NULL;
1776         }
1777         dev = &rte_eth_devices[port];
1778         priv = dev->data->dev_private;
1779         if (!(priv->representor || priv->master)) {
1780                 rte_errno = EINVAL;
1781                 return NULL;
1782         }
1783         return priv;
1784 }
1785
1786 /**
1787  * Get the E-Switch parameters by device instance.
1788  *
1789  * @param[in] port
1790  *   Device port id.
1791  * @param[out] es_domain_id
1792  *   E-Switch domain id.
1793  * @param[out] es_port_id
1794  *   The port id of the port in the E-Switch.
1795  *
1796  * @return
1797  *   pointer to device private data structure containing data needed
1798  *   on success, NULL otherwise and rte_errno is set.
1799  */
1800 struct mlx5_priv *
1801 mlx5_dev_to_eswitch_info(struct rte_eth_dev *dev)
1802 {
1803         struct mlx5_priv *priv;
1804
1805         priv = dev->data->dev_private;
1806         if (!(priv->representor || priv->master)) {
1807                 rte_errno = EINVAL;
1808                 return NULL;
1809         }
1810         return priv;
1811 }
1812
1813 /**
1814  * Get switch information associated with network interface.
1815  *
1816  * @param ifindex
1817  *   Network interface index.
1818  * @param[out] info
1819  *   Switch information object, populated in case of success.
1820  *
1821  * @return
1822  *   0 on success, a negative errno value otherwise and rte_errno is set.
1823  */
1824 int
1825 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1826 {
1827         char ifname[IF_NAMESIZE];
1828         char port_name[IF_NAMESIZE];
1829         FILE *file;
1830         struct mlx5_switch_info data = {
1831                 .master = 0,
1832                 .representor = 0,
1833                 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1834                 .port_name = 0,
1835                 .switch_id = 0,
1836         };
1837         DIR *dir;
1838         bool port_switch_id_set = false;
1839         bool device_dir = false;
1840         char c;
1841         int ret;
1842
1843         if (!if_indextoname(ifindex, ifname)) {
1844                 rte_errno = errno;
1845                 return -rte_errno;
1846         }
1847
1848         MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1849               ifname);
1850         MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1851               ifname);
1852         MKSTR(pci_device, "/sys/class/net/%s/device",
1853               ifname);
1854
1855         file = fopen(phys_port_name, "rb");
1856         if (file != NULL) {
1857                 ret = fscanf(file, "%s", port_name);
1858                 fclose(file);
1859                 if (ret == 1)
1860                         mlx5_translate_port_name(port_name, &data);
1861         }
1862         file = fopen(phys_switch_id, "rb");
1863         if (file == NULL) {
1864                 rte_errno = errno;
1865                 return -rte_errno;
1866         }
1867         port_switch_id_set =
1868                 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1869                 c == '\n';
1870         fclose(file);
1871         dir = opendir(pci_device);
1872         if (dir != NULL) {
1873                 closedir(dir);
1874                 device_dir = true;
1875         }
1876         if (port_switch_id_set) {
1877                 /* We have some E-Switch configuration. */
1878                 mlx5_sysfs_check_switch_info(device_dir, &data);
1879         }
1880         *info = data;
1881         MLX5_ASSERT(!(data.master && data.representor));
1882         if (data.master && data.representor) {
1883                 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1884                              " and as representor", ifindex);
1885                 rte_errno = ENODEV;
1886                 return -rte_errno;
1887         }
1888         return 0;
1889 }
1890
1891 /**
1892  * Analyze gathered port parameters via sysfs to recognize master
1893  * and representor devices for E-Switch configuration.
1894  *
1895  * @param[in] device_dir
1896  *   flag of presence of "device" directory under port device key.
1897  * @param[inout] switch_info
1898  *   Port information, including port name as a number and port name
1899  *   type if recognized
1900  *
1901  * @return
1902  *   master and representor flags are set in switch_info according to
1903  *   recognized parameters (if any).
1904  */
1905 void
1906 mlx5_sysfs_check_switch_info(bool device_dir,
1907                              struct mlx5_switch_info *switch_info)
1908 {
1909         switch (switch_info->name_type) {
1910         case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1911                 /*
1912                  * Name is not recognized, assume the master,
1913                  * check the device directory presence.
1914                  */
1915                 switch_info->master = device_dir;
1916                 break;
1917         case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1918                 /*
1919                  * Name is not set, this assumes the legacy naming
1920                  * schema for master, just check if there is
1921                  * a device directory.
1922                  */
1923                 switch_info->master = device_dir;
1924                 break;
1925         case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1926                 /* New uplink naming schema recognized. */
1927                 switch_info->master = 1;
1928                 break;
1929         case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1930                 /* Legacy representors naming schema. */
1931                 switch_info->representor = !device_dir;
1932                 break;
1933         case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1934                 /* New representors naming schema. */
1935                 switch_info->representor = 1;
1936                 break;
1937         }
1938 }
1939
1940 /**
1941  * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1942  *
1943  * @param dev
1944  *   Pointer to Ethernet device structure.
1945  * @param[out] modinfo
1946  *   Storage for plug-in module EEPROM information.
1947  *
1948  * @return
1949  *   0 on success, a negative errno value otherwise and rte_errno is set.
1950  */
1951 int
1952 mlx5_get_module_info(struct rte_eth_dev *dev,
1953                      struct rte_eth_dev_module_info *modinfo)
1954 {
1955         struct ethtool_modinfo info = {
1956                 .cmd = ETHTOOL_GMODULEINFO,
1957         };
1958         struct ifreq ifr = (struct ifreq) {
1959                 .ifr_data = (void *)&info,
1960         };
1961         int ret = 0;
1962
1963         if (!dev || !modinfo) {
1964                 DRV_LOG(WARNING, "missing argument, cannot get module info");
1965                 rte_errno = EINVAL;
1966                 return -rte_errno;
1967         }
1968         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1969         if (ret) {
1970                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1971                         dev->data->port_id, strerror(rte_errno));
1972                 return ret;
1973         }
1974         modinfo->type = info.type;
1975         modinfo->eeprom_len = info.eeprom_len;
1976         return ret;
1977 }
1978
1979 /**
1980  * DPDK callback to retrieve plug-in module EEPROM data.
1981  *
1982  * @param dev
1983  *   Pointer to Ethernet device structure.
1984  * @param[out] info
1985  *   Storage for plug-in module EEPROM data.
1986  *
1987  * @return
1988  *   0 on success, a negative errno value otherwise and rte_errno is set.
1989  */
1990 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1991                            struct rte_dev_eeprom_info *info)
1992 {
1993         struct ethtool_eeprom *eeprom;
1994         struct ifreq ifr;
1995         int ret = 0;
1996
1997         if (!dev || !info) {
1998                 DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1999                 rte_errno = EINVAL;
2000                 return -rte_errno;
2001         }
2002         eeprom = rte_calloc(__func__, 1,
2003                             (sizeof(struct ethtool_eeprom) + info->length), 0);
2004         if (!eeprom) {
2005                 DRV_LOG(WARNING, "port %u cannot allocate memory for "
2006                         "eeprom data", dev->data->port_id);
2007                 rte_errno = ENOMEM;
2008                 return -rte_errno;
2009         }
2010         eeprom->cmd = ETHTOOL_GMODULEEEPROM;
2011         eeprom->offset = info->offset;
2012         eeprom->len = info->length;
2013         ifr = (struct ifreq) {
2014                 .ifr_data = (void *)eeprom,
2015         };
2016         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
2017         if (ret)
2018                 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
2019                         dev->data->port_id, strerror(rte_errno));
2020         else
2021                 rte_memcpy(info->data, eeprom->data, info->length);
2022         rte_free(eeprom);
2023         return ret;
2024 }
2025
2026 /**
2027  * DPDK callback to retrieve hairpin capabilities.
2028  *
2029  * @param dev
2030  *   Pointer to Ethernet device structure.
2031  * @param[out] cap
2032  *   Storage for hairpin capability data.
2033  *
2034  * @return
2035  *   0 on success, a negative errno value otherwise and rte_errno is set.
2036  */
2037 int mlx5_hairpin_cap_get(struct rte_eth_dev *dev,
2038                          struct rte_eth_hairpin_cap *cap)
2039 {
2040         struct mlx5_priv *priv = dev->data->dev_private;
2041
2042         if (priv->sh->devx == 0) {
2043                 rte_errno = ENOTSUP;
2044                 return -rte_errno;
2045         }
2046         cap->max_nb_queues = UINT16_MAX;
2047         cap->max_rx_2_tx = 1;
2048         cap->max_tx_2_rx = 1;
2049         cap->max_nb_desc = 8192;
2050         return 0;
2051 }