align SPDX Mellanox copyrights
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #define _GNU_SOURCE
7
8 #include <stddef.h>
9 #include <assert.h>
10 #include <inttypes.h>
11 #include <unistd.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <stdlib.h>
16 #include <errno.h>
17 #include <dirent.h>
18 #include <net/if.h>
19 #include <sys/ioctl.h>
20 #include <sys/socket.h>
21 #include <netinet/in.h>
22 #include <linux/ethtool.h>
23 #include <linux/sockios.h>
24 #include <fcntl.h>
25 #include <stdalign.h>
26 #include <sys/un.h>
27 #include <time.h>
28
29 #include <rte_atomic.h>
30 #include <rte_ethdev_driver.h>
31 #include <rte_bus_pci.h>
32 #include <rte_mbuf.h>
33 #include <rte_common.h>
34 #include <rte_interrupts.h>
35 #include <rte_malloc.h>
36 #include <rte_string_fns.h>
37
38 #include "mlx5.h"
39 #include "mlx5_glue.h"
40 #include "mlx5_rxtx.h"
41 #include "mlx5_utils.h"
42
/*
 * Add defines in case the running kernel is not the same as user headers.
 *
 * Everything below is only compiled when the ethtool headers used for the
 * build do not already provide the corresponding definitions.
 */
#ifndef ETHTOOL_GLINKSETTINGS
/*
 * Local declaration of the request/reply structure exchanged with the
 * kernel through ioctl(SIOCETHTOOL) for the ETHTOOL_GLINKSETTINGS command.
 */
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	/* Signed: the query code in this file negates the returned value. */
	int8_t link_mode_masks_nwords;
	uint32_t reserved[8];
	/*
	 * Trailing mask words; the query code in this file reserves room for
	 * 3 * link_mode_masks_nwords 32-bit words.
	 */
	uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
/* Bit positions in the link mode mask, consumed through MLX5_BITSHIFT(). */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/*
 * 25G link mode bits.
 * NOTE(review): HAVE_ETHTOOL_LINK_MODE_25G is presumably defined by the build
 * system when the headers already provide these bits - confirm.
 */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
/* 50G link mode bits (same guard convention as above). */
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
/* 100G link mode bits (same guard convention as above). */
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
93
94 /**
95  * Get interface name from private structure.
96  *
97  * @param[in] dev
98  *   Pointer to Ethernet device.
99  * @param[out] ifname
100  *   Interface name output buffer.
101  *
102  * @return
103  *   0 on success, a negative errno value otherwise and rte_errno is set.
104  */
105 int
106 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
107 {
108         struct priv *priv = dev->data->dev_private;
109         DIR *dir;
110         struct dirent *dent;
111         unsigned int dev_type = 0;
112         unsigned int dev_port_prev = ~0u;
113         char match[IF_NAMESIZE] = "";
114
115         {
116                 MKSTR(path, "%s/device/net", priv->ibdev_path);
117
118                 dir = opendir(path);
119                 if (dir == NULL) {
120                         rte_errno = errno;
121                         return -rte_errno;
122                 }
123         }
124         while ((dent = readdir(dir)) != NULL) {
125                 char *name = dent->d_name;
126                 FILE *file;
127                 unsigned int dev_port;
128                 int r;
129
130                 if ((name[0] == '.') &&
131                     ((name[1] == '\0') ||
132                      ((name[1] == '.') && (name[2] == '\0'))))
133                         continue;
134
135                 MKSTR(path, "%s/device/net/%s/%s",
136                       priv->ibdev_path, name,
137                       (dev_type ? "dev_id" : "dev_port"));
138
139                 file = fopen(path, "rb");
140                 if (file == NULL) {
141                         if (errno != ENOENT)
142                                 continue;
143                         /*
144                          * Switch to dev_id when dev_port does not exist as
145                          * is the case with Linux kernel versions < 3.15.
146                          */
147 try_dev_id:
148                         match[0] = '\0';
149                         if (dev_type)
150                                 break;
151                         dev_type = 1;
152                         dev_port_prev = ~0u;
153                         rewinddir(dir);
154                         continue;
155                 }
156                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
157                 fclose(file);
158                 if (r != 1)
159                         continue;
160                 /*
161                  * Switch to dev_id when dev_port returns the same value for
162                  * all ports. May happen when using a MOFED release older than
163                  * 3.0 with a Linux kernel >= 3.15.
164                  */
165                 if (dev_port == dev_port_prev)
166                         goto try_dev_id;
167                 dev_port_prev = dev_port;
168                 if (dev_port == (priv->port - 1u))
169                         strlcpy(match, name, sizeof(match));
170         }
171         closedir(dir);
172         if (match[0] == '\0') {
173                 rte_errno = ENOENT;
174                 return -rte_errno;
175         }
176         strncpy(*ifname, match, sizeof(*ifname));
177         return 0;
178 }
179
180 /**
181  * Perform ifreq ioctl() on associated Ethernet device.
182  *
183  * @param[in] dev
184  *   Pointer to Ethernet device.
185  * @param req
186  *   Request number to pass to ioctl().
187  * @param[out] ifr
188  *   Interface request structure output buffer.
189  *
190  * @return
191  *   0 on success, a negative errno value otherwise and rte_errno is set.
192  */
193 int
194 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
195 {
196         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
197         int ret = 0;
198
199         if (sock == -1) {
200                 rte_errno = errno;
201                 return -rte_errno;
202         }
203         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
204         if (ret)
205                 goto error;
206         ret = ioctl(sock, req, ifr);
207         if (ret == -1) {
208                 rte_errno = errno;
209                 goto error;
210         }
211         close(sock);
212         return 0;
213 error:
214         close(sock);
215         return -rte_errno;
216 }
217
218 /**
219  * Get device MTU.
220  *
221  * @param dev
222  *   Pointer to Ethernet device.
223  * @param[out] mtu
224  *   MTU value output buffer.
225  *
226  * @return
227  *   0 on success, a negative errno value otherwise and rte_errno is set.
228  */
229 int
230 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
231 {
232         struct ifreq request;
233         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
234
235         if (ret)
236                 return ret;
237         *mtu = request.ifr_mtu;
238         return 0;
239 }
240
241 /**
242  * Set device MTU.
243  *
244  * @param dev
245  *   Pointer to Ethernet device.
246  * @param mtu
247  *   MTU value to set.
248  *
249  * @return
250  *   0 on success, a negative errno value otherwise and rte_errno is set.
251  */
252 static int
253 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
254 {
255         struct ifreq request = { .ifr_mtu = mtu, };
256
257         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
258 }
259
260 /**
261  * Set device flags.
262  *
263  * @param dev
264  *   Pointer to Ethernet device.
265  * @param keep
266  *   Bitmask for flags that must remain untouched.
267  * @param flags
268  *   Bitmask for flags to modify.
269  *
270  * @return
271  *   0 on success, a negative errno value otherwise and rte_errno is set.
272  */
273 int
274 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
275 {
276         struct ifreq request;
277         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
278
279         if (ret)
280                 return ret;
281         request.ifr_flags &= keep;
282         request.ifr_flags |= flags & ~keep;
283         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
284 }
285
286 /**
287  * DPDK callback for Ethernet device configuration.
288  *
289  * @param dev
290  *   Pointer to Ethernet device structure.
291  *
292  * @return
293  *   0 on success, a negative errno value otherwise and rte_errno is set.
294  */
295 int
296 mlx5_dev_configure(struct rte_eth_dev *dev)
297 {
298         struct priv *priv = dev->data->dev_private;
299         unsigned int rxqs_n = dev->data->nb_rx_queues;
300         unsigned int txqs_n = dev->data->nb_tx_queues;
301         unsigned int i;
302         unsigned int j;
303         unsigned int reta_idx_n;
304         const uint8_t use_app_rss_key =
305                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
306         uint64_t supp_tx_offloads = mlx5_get_tx_port_offloads(dev);
307         uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
308         uint64_t supp_rx_offloads =
309                 (mlx5_get_rx_port_offloads() |
310                  mlx5_get_rx_queue_offloads(dev));
311         uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads;
312         int ret = 0;
313
314         if ((tx_offloads & supp_tx_offloads) != tx_offloads) {
315                 DRV_LOG(ERR,
316                         "port %u some Tx offloads are not supported requested"
317                         " 0x%" PRIx64 " supported 0x%" PRIx64,
318                         dev->data->port_id, tx_offloads, supp_tx_offloads);
319                 rte_errno = ENOTSUP;
320                 return -rte_errno;
321         }
322         if ((rx_offloads & supp_rx_offloads) != rx_offloads) {
323                 DRV_LOG(ERR,
324                         "port %u some Rx offloads are not supported requested"
325                         " 0x%" PRIx64 " supported 0x%" PRIx64,
326                         dev->data->port_id, rx_offloads, supp_rx_offloads);
327                 rte_errno = ENOTSUP;
328                 return -rte_errno;
329         }
330         if (use_app_rss_key &&
331             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
332              rss_hash_default_key_len)) {
333                 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
334                         dev->data->port_id, rss_hash_default_key_len);
335                 rte_errno = EINVAL;
336                 return -rte_errno;
337         }
338         priv->rss_conf.rss_key =
339                 rte_realloc(priv->rss_conf.rss_key,
340                             rss_hash_default_key_len, 0);
341         if (!priv->rss_conf.rss_key) {
342                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
343                         dev->data->port_id, rxqs_n);
344                 rte_errno = ENOMEM;
345                 return -rte_errno;
346         }
347         memcpy(priv->rss_conf.rss_key,
348                use_app_rss_key ?
349                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
350                rss_hash_default_key,
351                rss_hash_default_key_len);
352         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
353         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
354         priv->rxqs = (void *)dev->data->rx_queues;
355         priv->txqs = (void *)dev->data->tx_queues;
356         if (txqs_n != priv->txqs_n) {
357                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
358                         dev->data->port_id, priv->txqs_n, txqs_n);
359                 priv->txqs_n = txqs_n;
360         }
361         if (rxqs_n > priv->config.ind_table_max_size) {
362                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
363                         dev->data->port_id, rxqs_n);
364                 rte_errno = EINVAL;
365                 return -rte_errno;
366         }
367         if (rxqs_n == priv->rxqs_n)
368                 return 0;
369         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
370                 dev->data->port_id, priv->rxqs_n, rxqs_n);
371         priv->rxqs_n = rxqs_n;
372         /* If the requested number of RX queues is not a power of two, use the
373          * maximum indirection table size for better balancing.
374          * The result is always rounded to the next power of two. */
375         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
376                                      priv->config.ind_table_max_size :
377                                      rxqs_n));
378         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
379         if (ret)
380                 return ret;
381         /* When the number of RX queues is not a power of two, the remaining
382          * table entries are padded with reused WQs and hashes are not spread
383          * uniformly. */
384         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
385                 (*priv->reta_idx)[i] = j;
386                 if (++j == rxqs_n)
387                         j = 0;
388         }
389         return 0;
390 }
391
392 /**
393  * DPDK callback to get information about the device.
394  *
395  * @param dev
396  *   Pointer to Ethernet device structure.
397  * @param[out] info
398  *   Info structure output buffer.
399  */
400 void
401 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
402 {
403         struct priv *priv = dev->data->dev_private;
404         struct mlx5_dev_config *config = &priv->config;
405         unsigned int max;
406         char ifname[IF_NAMESIZE];
407
408         info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
409         /* FIXME: we should ask the device for these values. */
410         info->min_rx_bufsize = 32;
411         info->max_rx_pktlen = 65536;
412         /*
413          * Since we need one CQ per QP, the limit is the minimum number
414          * between the two values.
415          */
416         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
417                       priv->device_attr.orig_attr.max_qp);
418         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
419         if (max >= 65535)
420                 max = 65535;
421         info->max_rx_queues = max;
422         info->max_tx_queues = max;
423         info->max_mac_addrs = RTE_DIM(priv->mac);
424         info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
425         info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
426                                  info->rx_queue_offload_capa);
427         info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
428         if (mlx5_get_ifname(dev, &ifname) == 0)
429                 info->if_index = if_nametoindex(ifname);
430         info->reta_size = priv->reta_idx_n ?
431                 priv->reta_idx_n : config->ind_table_max_size;
432         info->hash_key_size = rss_hash_default_key_len;
433         info->speed_capa = priv->link_speed_capa;
434         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
435 }
436
437 /**
438  * Get supported packet types.
439  *
440  * @param dev
441  *   Pointer to Ethernet device structure.
442  *
443  * @return
444  *   A pointer to the supported Packet types array.
445  */
446 const uint32_t *
447 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
448 {
449         static const uint32_t ptypes[] = {
450                 /* refers to rxq_cq_to_pkt_type() */
451                 RTE_PTYPE_L2_ETHER,
452                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
453                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
454                 RTE_PTYPE_L4_NONFRAG,
455                 RTE_PTYPE_L4_FRAG,
456                 RTE_PTYPE_L4_TCP,
457                 RTE_PTYPE_L4_UDP,
458                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
459                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
460                 RTE_PTYPE_INNER_L4_NONFRAG,
461                 RTE_PTYPE_INNER_L4_FRAG,
462                 RTE_PTYPE_INNER_L4_TCP,
463                 RTE_PTYPE_INNER_L4_UDP,
464                 RTE_PTYPE_UNKNOWN
465         };
466
467         if (dev->rx_pkt_burst == mlx5_rx_burst ||
468             dev->rx_pkt_burst == mlx5_rx_burst_vec)
469                 return ptypes;
470         return NULL;
471 }
472
473 /**
474  * DPDK callback to retrieve physical link information.
475  *
476  * @param dev
477  *   Pointer to Ethernet device structure.
478  * @param[out] link
479  *   Storage for current link status.
480  *
481  * @return
482  *   0 on success, a negative errno value otherwise and rte_errno is set.
483  */
484 static int
485 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
486                                struct rte_eth_link *link)
487 {
488         struct priv *priv = dev->data->dev_private;
489         struct ethtool_cmd edata = {
490                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
491         };
492         struct ifreq ifr;
493         struct rte_eth_link dev_link;
494         int link_speed = 0;
495         int ret;
496
497         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
498         if (ret) {
499                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
500                         dev->data->port_id, strerror(rte_errno));
501                 return ret;
502         }
503         memset(&dev_link, 0, sizeof(dev_link));
504         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
505                                 (ifr.ifr_flags & IFF_RUNNING));
506         ifr.ifr_data = (void *)&edata;
507         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
508         if (ret) {
509                 DRV_LOG(WARNING,
510                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
511                         dev->data->port_id, strerror(rte_errno));
512                 return ret;
513         }
514         link_speed = ethtool_cmd_speed(&edata);
515         if (link_speed == -1)
516                 dev_link.link_speed = 0;
517         else
518                 dev_link.link_speed = link_speed;
519         priv->link_speed_capa = 0;
520         if (edata.supported & SUPPORTED_Autoneg)
521                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
522         if (edata.supported & (SUPPORTED_1000baseT_Full |
523                                SUPPORTED_1000baseKX_Full))
524                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
525         if (edata.supported & SUPPORTED_10000baseKR_Full)
526                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
527         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
528                                SUPPORTED_40000baseCR4_Full |
529                                SUPPORTED_40000baseSR4_Full |
530                                SUPPORTED_40000baseLR4_Full))
531                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
532         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
533                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
534         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
535                         ETH_LINK_SPEED_FIXED);
536         if ((dev_link.link_speed && !dev_link.link_status) ||
537             (!dev_link.link_speed && dev_link.link_status)) {
538                 rte_errno = EAGAIN;
539                 return -rte_errno;
540         }
541         *link = dev_link;
542         return 0;
543 }
544
545 /**
546  * Retrieve physical link information (unlocked version using new ioctl).
547  *
548  * @param dev
549  *   Pointer to Ethernet device structure.
550  * @param[out] link
551  *   Storage for current link status.
552  *
553  * @return
554  *   0 on success, a negative errno value otherwise and rte_errno is set.
555  */
556 static int
557 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
558                              struct rte_eth_link *link)
559
560 {
561         struct priv *priv = dev->data->dev_private;
562         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
563         struct ifreq ifr;
564         struct rte_eth_link dev_link;
565         uint64_t sc;
566         int ret;
567
568         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
569         if (ret) {
570                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
571                         dev->data->port_id, strerror(rte_errno));
572                 return ret;
573         }
574         memset(&dev_link, 0, sizeof(dev_link));
575         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
576                                 (ifr.ifr_flags & IFF_RUNNING));
577         ifr.ifr_data = (void *)&gcmd;
578         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
579         if (ret) {
580                 DRV_LOG(DEBUG,
581                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
582                         " failed: %s",
583                         dev->data->port_id, strerror(rte_errno));
584                 return ret;
585         }
586         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
587
588         alignas(struct ethtool_link_settings)
589         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
590                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
591         struct ethtool_link_settings *ecmd = (void *)data;
592
593         *ecmd = gcmd;
594         ifr.ifr_data = (void *)ecmd;
595         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
596         if (ret) {
597                 DRV_LOG(DEBUG,
598                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
599                         " failed: %s",
600                         dev->data->port_id, strerror(rte_errno));
601                 return ret;
602         }
603         dev_link.link_speed = ecmd->speed;
604         sc = ecmd->link_mode_masks[0] |
605                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
606         priv->link_speed_capa = 0;
607         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
608                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
609         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
610                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
611                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
612         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
613                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
614                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
615                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
616         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
617                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
618                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
619         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
620                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
621                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
622                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
623                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
624         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
625                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
626                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
627                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
628                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
629         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
630                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
631                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
632                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
633         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
634                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
635                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
636         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
637                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
638                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
639                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
640                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
641         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
642                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
643         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
644                                   ETH_LINK_SPEED_FIXED);
645         if ((dev_link.link_speed && !dev_link.link_status) ||
646             (!dev_link.link_speed && dev_link.link_status)) {
647                 rte_errno = EAGAIN;
648                 return -rte_errno;
649         }
650         *link = dev_link;
651         return 0;
652 }
653
654 /**
655  * DPDK callback to retrieve physical link information.
656  *
657  * @param dev
658  *   Pointer to Ethernet device structure.
659  * @param wait_to_complete
660  *   Wait for request completion.
661  *
662  * @return
663  *   0 if link status was not updated, positive if it was, a negative errno
664  *   value otherwise and rte_errno is set.
665  */
666 int
667 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
668 {
669         int ret;
670         struct rte_eth_link dev_link;
671         time_t start_time = time(NULL);
672
673         do {
674                 ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
675                 if (ret)
676                         ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
677                 if (ret == 0)
678                         break;
679                 /* Handle wait to complete situation. */
680                 if (wait_to_complete && ret == -EAGAIN) {
681                         if (abs((int)difftime(time(NULL), start_time)) <
682                             MLX5_LINK_STATUS_TIMEOUT) {
683                                 usleep(0);
684                                 continue;
685                         } else {
686                                 rte_errno = EBUSY;
687                                 return -rte_errno;
688                         }
689                 } else if (ret < 0) {
690                         return ret;
691                 }
692         } while (wait_to_complete);
693         ret = !!memcmp(&dev->data->dev_link, &dev_link,
694                        sizeof(struct rte_eth_link));
695         dev->data->dev_link = dev_link;
696         return ret;
697 }
698
699 /**
700  * DPDK callback to change the MTU.
701  *
702  * @param dev
703  *   Pointer to Ethernet device structure.
704  * @param in_mtu
705  *   New MTU.
706  *
707  * @return
708  *   0 on success, a negative errno value otherwise and rte_errno is set.
709  */
710 int
711 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
712 {
713         struct priv *priv = dev->data->dev_private;
714         uint16_t kern_mtu = 0;
715         int ret;
716
717         ret = mlx5_get_mtu(dev, &kern_mtu);
718         if (ret)
719                 return ret;
720         /* Set kernel interface MTU first. */
721         ret = mlx5_set_mtu(dev, mtu);
722         if (ret)
723                 return ret;
724         ret = mlx5_get_mtu(dev, &kern_mtu);
725         if (ret)
726                 return ret;
727         if (kern_mtu == mtu) {
728                 priv->mtu = mtu;
729                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
730                         dev->data->port_id, mtu);
731                 return 0;
732         }
733         rte_errno = EAGAIN;
734         return -rte_errno;
735 }
736
737 /**
738  * DPDK callback to get flow control status.
739  *
740  * @param dev
741  *   Pointer to Ethernet device structure.
742  * @param[out] fc_conf
743  *   Flow control output buffer.
744  *
745  * @return
746  *   0 on success, a negative errno value otherwise and rte_errno is set.
747  */
748 int
749 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
750 {
751         struct ifreq ifr;
752         struct ethtool_pauseparam ethpause = {
753                 .cmd = ETHTOOL_GPAUSEPARAM
754         };
755         int ret;
756
757         ifr.ifr_data = (void *)&ethpause;
758         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
759         if (ret) {
760                 DRV_LOG(WARNING,
761                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
762                         " %s",
763                         dev->data->port_id, strerror(rte_errno));
764                 return ret;
765         }
766         fc_conf->autoneg = ethpause.autoneg;
767         if (ethpause.rx_pause && ethpause.tx_pause)
768                 fc_conf->mode = RTE_FC_FULL;
769         else if (ethpause.rx_pause)
770                 fc_conf->mode = RTE_FC_RX_PAUSE;
771         else if (ethpause.tx_pause)
772                 fc_conf->mode = RTE_FC_TX_PAUSE;
773         else
774                 fc_conf->mode = RTE_FC_NONE;
775         return 0;
776 }
777
778 /**
779  * DPDK callback to modify flow control parameters.
780  *
781  * @param dev
782  *   Pointer to Ethernet device structure.
783  * @param[in] fc_conf
784  *   Flow control parameters.
785  *
786  * @return
787  *   0 on success, a negative errno value otherwise and rte_errno is set.
788  */
789 int
790 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
791 {
792         struct ifreq ifr;
793         struct ethtool_pauseparam ethpause = {
794                 .cmd = ETHTOOL_SPAUSEPARAM
795         };
796         int ret;
797
798         ifr.ifr_data = (void *)&ethpause;
799         ethpause.autoneg = fc_conf->autoneg;
800         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
801             (fc_conf->mode & RTE_FC_RX_PAUSE))
802                 ethpause.rx_pause = 1;
803         else
804                 ethpause.rx_pause = 0;
805
806         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
807             (fc_conf->mode & RTE_FC_TX_PAUSE))
808                 ethpause.tx_pause = 1;
809         else
810                 ethpause.tx_pause = 0;
811         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
812         if (ret) {
813                 DRV_LOG(WARNING,
814                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
815                         " failed: %s",
816                         dev->data->port_id, strerror(rte_errno));
817                 return ret;
818         }
819         return 0;
820 }
821
822 /**
823  * Get PCI information from struct ibv_device.
824  *
825  * @param device
826  *   Pointer to Ethernet device structure.
827  * @param[out] pci_addr
828  *   PCI bus address output buffer.
829  *
830  * @return
831  *   0 on success, a negative errno value otherwise and rte_errno is set.
832  */
833 int
834 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
835                             struct rte_pci_addr *pci_addr)
836 {
837         FILE *file;
838         char line[32];
839         MKSTR(path, "%s/device/uevent", device->ibdev_path);
840
841         file = fopen(path, "rb");
842         if (file == NULL) {
843                 rte_errno = errno;
844                 return -rte_errno;
845         }
846         while (fgets(line, sizeof(line), file) == line) {
847                 size_t len = strlen(line);
848                 int ret;
849
850                 /* Truncate long lines. */
851                 if (len == (sizeof(line) - 1))
852                         while (line[(len - 1)] != '\n') {
853                                 ret = fgetc(file);
854                                 if (ret == EOF)
855                                         break;
856                                 line[(len - 1)] = ret;
857                         }
858                 /* Extract information. */
859                 if (sscanf(line,
860                            "PCI_SLOT_NAME="
861                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
862                            &pci_addr->domain,
863                            &pci_addr->bus,
864                            &pci_addr->devid,
865                            &pci_addr->function) == 4) {
866                         ret = 0;
867                         break;
868                 }
869         }
870         fclose(file);
871         return 0;
872 }
873
874 /**
875  * Device status handler.
876  *
877  * @param dev
878  *   Pointer to Ethernet device.
879  * @param events
880  *   Pointer to event flags holder.
881  *
882  * @return
883  *   Events bitmap of callback process which can be called immediately.
884  */
885 static uint32_t
886 mlx5_dev_status_handler(struct rte_eth_dev *dev)
887 {
888         struct priv *priv = dev->data->dev_private;
889         struct ibv_async_event event;
890         uint32_t ret = 0;
891
892         if (mlx5_link_update(dev, 0) == -EAGAIN) {
893                 usleep(0);
894                 return 0;
895         }
896         /* Read all message and acknowledge them. */
897         for (;;) {
898                 if (mlx5_glue->get_async_event(priv->ctx, &event))
899                         break;
900                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
901                         event.event_type == IBV_EVENT_PORT_ERR) &&
902                         (dev->data->dev_conf.intr_conf.lsc == 1))
903                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
904                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
905                         dev->data->dev_conf.intr_conf.rmv == 1)
906                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
907                 else
908                         DRV_LOG(DEBUG,
909                                 "port %u event type %d on not handled",
910                                 dev->data->port_id, event.event_type);
911                 mlx5_glue->ack_async_event(&event);
912         }
913         return ret;
914 }
915
916 /**
917  * Handle interrupts from the NIC.
918  *
919  * @param[in] intr_handle
920  *   Interrupt handler.
921  * @param cb_arg
922  *   Callback argument.
923  */
924 void
925 mlx5_dev_interrupt_handler(void *cb_arg)
926 {
927         struct rte_eth_dev *dev = cb_arg;
928         uint32_t events;
929
930         events = mlx5_dev_status_handler(dev);
931         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
932                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
933         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
934                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
935 }
936
/**
 * Handle interrupts from the socket.
 *
 * Thin adapter that forwards the callback argument to
 * mlx5_socket_handle().
 *
 * @param cb_arg
 *   Callback argument, used as the Ethernet device pointer.
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	mlx5_socket_handle((struct rte_eth_dev *)cb_arg);
}
950
951 /**
952  * Uninstall interrupt handler.
953  *
954  * @param dev
955  *   Pointer to Ethernet device.
956  */
957 void
958 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
959 {
960         struct priv *priv = dev->data->dev_private;
961
962         if (dev->data->dev_conf.intr_conf.lsc ||
963             dev->data->dev_conf.intr_conf.rmv)
964                 rte_intr_callback_unregister(&priv->intr_handle,
965                                              mlx5_dev_interrupt_handler, dev);
966         if (priv->primary_socket)
967                 rte_intr_callback_unregister(&priv->intr_handle_socket,
968                                              mlx5_dev_handler_socket, dev);
969         priv->intr_handle.fd = 0;
970         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
971         priv->intr_handle_socket.fd = 0;
972         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
973 }
974
975 /**
976  * Install interrupt handler.
977  *
978  * @param dev
979  *   Pointer to Ethernet device.
980  */
981 void
982 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
983 {
984         struct priv *priv = dev->data->dev_private;
985         int ret;
986         int flags;
987
988         assert(priv->ctx->async_fd > 0);
989         flags = fcntl(priv->ctx->async_fd, F_GETFL);
990         ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
991         if (ret) {
992                 DRV_LOG(INFO,
993                         "port %u failed to change file descriptor async event"
994                         " queue",
995                         dev->data->port_id);
996                 dev->data->dev_conf.intr_conf.lsc = 0;
997                 dev->data->dev_conf.intr_conf.rmv = 0;
998         }
999         if (dev->data->dev_conf.intr_conf.lsc ||
1000             dev->data->dev_conf.intr_conf.rmv) {
1001                 priv->intr_handle.fd = priv->ctx->async_fd;
1002                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1003                 rte_intr_callback_register(&priv->intr_handle,
1004                                            mlx5_dev_interrupt_handler, dev);
1005         }
1006         ret = mlx5_socket_init(dev);
1007         if (ret)
1008                 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1009                         dev->data->port_id, strerror(rte_errno));
1010         else if (priv->primary_socket) {
1011                 priv->intr_handle_socket.fd = priv->primary_socket;
1012                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1013                 rte_intr_callback_register(&priv->intr_handle_socket,
1014                                            mlx5_dev_handler_socket, dev);
1015         }
1016 }
1017
1018 /**
1019  * DPDK callback to bring the link DOWN.
1020  *
1021  * @param dev
1022  *   Pointer to Ethernet device structure.
1023  *
1024  * @return
1025  *   0 on success, a negative errno value otherwise and rte_errno is set.
1026  */
1027 int
1028 mlx5_set_link_down(struct rte_eth_dev *dev)
1029 {
1030         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1031 }
1032
1033 /**
1034  * DPDK callback to bring the link UP.
1035  *
1036  * @param dev
1037  *   Pointer to Ethernet device structure.
1038  *
1039  * @return
1040  *   0 on success, a negative errno value otherwise and rte_errno is set.
1041  */
1042 int
1043 mlx5_set_link_up(struct rte_eth_dev *dev)
1044 {
1045         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1046 }
1047
1048 /**
1049  * Configure the TX function to use.
1050  *
1051  * @param dev
1052  *   Pointer to private data structure.
1053  *
1054  * @return
1055  *   Pointer to selected Tx burst function.
1056  */
1057 eth_tx_burst_t
1058 mlx5_select_tx_function(struct rte_eth_dev *dev)
1059 {
1060         struct priv *priv = dev->data->dev_private;
1061         eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1062         struct mlx5_dev_config *config = &priv->config;
1063         uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
1064         int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
1065                                     DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1066                                     DEV_TX_OFFLOAD_GRE_TNL_TSO));
1067         int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
1068
1069         assert(priv != NULL);
1070         /* Select appropriate TX function. */
1071         if (vlan_insert || tso)
1072                 return tx_pkt_burst;
1073         if (config->mps == MLX5_MPW_ENHANCED) {
1074                 if (mlx5_check_vec_tx_support(dev) > 0) {
1075                         if (mlx5_check_raw_vec_tx_support(dev) > 0)
1076                                 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1077                         else
1078                                 tx_pkt_burst = mlx5_tx_burst_vec;
1079                         DRV_LOG(DEBUG,
1080                                 "port %u selected enhanced MPW Tx vectorized"
1081                                 " function",
1082                                 dev->data->port_id);
1083                 } else {
1084                         tx_pkt_burst = mlx5_tx_burst_empw;
1085                         DRV_LOG(DEBUG,
1086                                 "port %u selected enhanced MPW Tx function",
1087                                 dev->data->port_id);
1088                 }
1089         } else if (config->mps && (config->txq_inline > 0)) {
1090                 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1091                 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1092                         dev->data->port_id);
1093         } else if (config->mps) {
1094                 tx_pkt_burst = mlx5_tx_burst_mpw;
1095                 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1096                         dev->data->port_id);
1097         }
1098         return tx_pkt_burst;
1099 }
1100
1101 /**
1102  * Configure the RX function to use.
1103  *
1104  * @param dev
1105  *   Pointer to private data structure.
1106  *
1107  * @return
1108  *   Pointer to selected Rx burst function.
1109  */
1110 eth_rx_burst_t
1111 mlx5_select_rx_function(struct rte_eth_dev *dev)
1112 {
1113         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1114
1115         assert(dev != NULL);
1116         if (mlx5_check_vec_rx_support(dev) > 0) {
1117                 rx_pkt_burst = mlx5_rx_burst_vec;
1118                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1119                         dev->data->port_id);
1120         }
1121         return rx_pkt_burst;
1122 }
1123
1124 /**
1125  * Check if mlx5 device was removed.
1126  *
1127  * @param dev
1128  *   Pointer to Ethernet device structure.
1129  *
1130  * @return
1131  *   1 when device is removed, otherwise 0.
1132  */
1133 int
1134 mlx5_is_removed(struct rte_eth_dev *dev)
1135 {
1136         struct ibv_device_attr device_attr;
1137         struct priv *priv = dev->data->dev_private;
1138
1139         if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)
1140                 return 1;
1141         return 0;
1142 }