net/mlx5: use Netlink to add/remove MAC addresses
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #define _GNU_SOURCE
7
8 #include <stddef.h>
9 #include <assert.h>
10 #include <inttypes.h>
11 #include <unistd.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <stdlib.h>
16 #include <errno.h>
17 #include <dirent.h>
18 #include <net/if.h>
19 #include <sys/ioctl.h>
20 #include <sys/socket.h>
21 #include <netinet/in.h>
22 #include <linux/ethtool.h>
23 #include <linux/sockios.h>
24 #include <fcntl.h>
25 #include <stdalign.h>
26 #include <sys/un.h>
27 #include <time.h>
28
29 #include <rte_atomic.h>
30 #include <rte_ethdev_driver.h>
31 #include <rte_bus_pci.h>
32 #include <rte_mbuf.h>
33 #include <rte_common.h>
34 #include <rte_interrupts.h>
35 #include <rte_malloc.h>
36 #include <rte_string_fns.h>
37
38 #include "mlx5.h"
39 #include "mlx5_glue.h"
40 #include "mlx5_rxtx.h"
41 #include "mlx5_utils.h"
42
/*
 * Add defines in case the running kernel is not the same as user headers.
 *
 * Everything below is only compiled when the build-time headers do not
 * already provide the corresponding definitions, so that this file can still
 * issue the ETHTOOL_GLINKSETTINGS request and decode its link mode bits.
 */
#ifndef ETHTOOL_GLINKSETTINGS
/*
 * Local declaration of the ETHTOOL_GLINKSETTINGS request structure.
 * NOTE(review): the layout is presumably meant to mirror the one in
 * <linux/ethtool.h> -- confirm against the targeted kernel headers.
 */
struct ethtool_link_settings {
        uint32_t cmd;
        uint32_t speed;
        uint8_t duplex;
        uint8_t port;
        uint8_t phy_address;
        uint8_t autoneg;
        uint8_t mdio_support;
        uint8_t eth_to_mdix;
        uint8_t eth_tp_mdix_ctrl;
        /* Signed: this file negates the value obtained from a first query. */
        int8_t link_mode_masks_nwords;
        uint32_t reserved[8];
        /* Flexible array, storage must be provided by the caller. */
        uint32_t link_mode_masks[];
};

/* Command number followed by link mode bit indexes (not masks). */
#define ETHTOOL_GLINKSETTINGS 0x0000004c
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/*
 * NOTE(review): the HAVE_ETHTOOL_LINK_MODE_* macros below are not defined in
 * this file; presumably the build system sets them when the user headers
 * already provide the matching bit indexes -- confirm.
 */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
93
94 /**
95  * Get interface name from private structure.
96  *
97  * @param[in] dev
98  *   Pointer to Ethernet device.
99  * @param[out] ifname
100  *   Interface name output buffer.
101  *
102  * @return
103  *   0 on success, a negative errno value otherwise and rte_errno is set.
104  */
105 int
106 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
107 {
108         struct priv *priv = dev->data->dev_private;
109         DIR *dir;
110         struct dirent *dent;
111         unsigned int dev_type = 0;
112         unsigned int dev_port_prev = ~0u;
113         char match[IF_NAMESIZE] = "";
114
115         {
116                 MKSTR(path, "%s/device/net", priv->ibdev_path);
117
118                 dir = opendir(path);
119                 if (dir == NULL) {
120                         rte_errno = errno;
121                         return -rte_errno;
122                 }
123         }
124         while ((dent = readdir(dir)) != NULL) {
125                 char *name = dent->d_name;
126                 FILE *file;
127                 unsigned int dev_port;
128                 int r;
129
130                 if ((name[0] == '.') &&
131                     ((name[1] == '\0') ||
132                      ((name[1] == '.') && (name[2] == '\0'))))
133                         continue;
134
135                 MKSTR(path, "%s/device/net/%s/%s",
136                       priv->ibdev_path, name,
137                       (dev_type ? "dev_id" : "dev_port"));
138
139                 file = fopen(path, "rb");
140                 if (file == NULL) {
141                         if (errno != ENOENT)
142                                 continue;
143                         /*
144                          * Switch to dev_id when dev_port does not exist as
145                          * is the case with Linux kernel versions < 3.15.
146                          */
147 try_dev_id:
148                         match[0] = '\0';
149                         if (dev_type)
150                                 break;
151                         dev_type = 1;
152                         dev_port_prev = ~0u;
153                         rewinddir(dir);
154                         continue;
155                 }
156                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
157                 fclose(file);
158                 if (r != 1)
159                         continue;
160                 /*
161                  * Switch to dev_id when dev_port returns the same value for
162                  * all ports. May happen when using a MOFED release older than
163                  * 3.0 with a Linux kernel >= 3.15.
164                  */
165                 if (dev_port == dev_port_prev)
166                         goto try_dev_id;
167                 dev_port_prev = dev_port;
168                 if (dev_port == (priv->port - 1u))
169                         strlcpy(match, name, sizeof(match));
170         }
171         closedir(dir);
172         if (match[0] == '\0') {
173                 rte_errno = ENOENT;
174                 return -rte_errno;
175         }
176         strncpy(*ifname, match, sizeof(*ifname));
177         return 0;
178 }
179
180 /**
181  * Get the interface index from device name.
182  *
183  * @param[in] dev
184  *   Pointer to Ethernet device.
185  *
186  * @return
187  *   Interface index on success, a negative errno value otherwise and
188  *   rte_errno is set.
189  */
190 int
191 mlx5_ifindex(const struct rte_eth_dev *dev)
192 {
193         char ifname[IF_NAMESIZE];
194         int ret;
195
196         ret = mlx5_get_ifname(dev, &ifname);
197         if (ret)
198                 return ret;
199         ret = if_nametoindex(ifname);
200         if (ret == -1) {
201                 rte_errno = errno;
202                 return -rte_errno;
203         }
204         return ret;
205 }
206
207 /**
208  * Perform ifreq ioctl() on associated Ethernet device.
209  *
210  * @param[in] dev
211  *   Pointer to Ethernet device.
212  * @param req
213  *   Request number to pass to ioctl().
214  * @param[out] ifr
215  *   Interface request structure output buffer.
216  *
217  * @return
218  *   0 on success, a negative errno value otherwise and rte_errno is set.
219  */
220 int
221 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
222 {
223         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
224         int ret = 0;
225
226         if (sock == -1) {
227                 rte_errno = errno;
228                 return -rte_errno;
229         }
230         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
231         if (ret)
232                 goto error;
233         ret = ioctl(sock, req, ifr);
234         if (ret == -1) {
235                 rte_errno = errno;
236                 goto error;
237         }
238         close(sock);
239         return 0;
240 error:
241         close(sock);
242         return -rte_errno;
243 }
244
245 /**
246  * Get device MTU.
247  *
248  * @param dev
249  *   Pointer to Ethernet device.
250  * @param[out] mtu
251  *   MTU value output buffer.
252  *
253  * @return
254  *   0 on success, a negative errno value otherwise and rte_errno is set.
255  */
256 int
257 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
258 {
259         struct ifreq request;
260         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
261
262         if (ret)
263                 return ret;
264         *mtu = request.ifr_mtu;
265         return 0;
266 }
267
268 /**
269  * Set device MTU.
270  *
271  * @param dev
272  *   Pointer to Ethernet device.
273  * @param mtu
274  *   MTU value to set.
275  *
276  * @return
277  *   0 on success, a negative errno value otherwise and rte_errno is set.
278  */
279 static int
280 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
281 {
282         struct ifreq request = { .ifr_mtu = mtu, };
283
284         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
285 }
286
287 /**
288  * Set device flags.
289  *
290  * @param dev
291  *   Pointer to Ethernet device.
292  * @param keep
293  *   Bitmask for flags that must remain untouched.
294  * @param flags
295  *   Bitmask for flags to modify.
296  *
297  * @return
298  *   0 on success, a negative errno value otherwise and rte_errno is set.
299  */
300 int
301 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
302 {
303         struct ifreq request;
304         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
305
306         if (ret)
307                 return ret;
308         request.ifr_flags &= keep;
309         request.ifr_flags |= flags & ~keep;
310         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
311 }
312
313 /**
314  * DPDK callback for Ethernet device configuration.
315  *
316  * @param dev
317  *   Pointer to Ethernet device structure.
318  *
319  * @return
320  *   0 on success, a negative errno value otherwise and rte_errno is set.
321  */
322 int
323 mlx5_dev_configure(struct rte_eth_dev *dev)
324 {
325         struct priv *priv = dev->data->dev_private;
326         unsigned int rxqs_n = dev->data->nb_rx_queues;
327         unsigned int txqs_n = dev->data->nb_tx_queues;
328         unsigned int i;
329         unsigned int j;
330         unsigned int reta_idx_n;
331         const uint8_t use_app_rss_key =
332                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
333         uint64_t supp_tx_offloads = mlx5_get_tx_port_offloads(dev);
334         uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
335         uint64_t supp_rx_offloads =
336                 (mlx5_get_rx_port_offloads() |
337                  mlx5_get_rx_queue_offloads(dev));
338         uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads;
339         int ret = 0;
340
341         if ((tx_offloads & supp_tx_offloads) != tx_offloads) {
342                 DRV_LOG(ERR,
343                         "port %u some Tx offloads are not supported requested"
344                         " 0x%" PRIx64 " supported 0x%" PRIx64,
345                         dev->data->port_id, tx_offloads, supp_tx_offloads);
346                 rte_errno = ENOTSUP;
347                 return -rte_errno;
348         }
349         if ((rx_offloads & supp_rx_offloads) != rx_offloads) {
350                 DRV_LOG(ERR,
351                         "port %u some Rx offloads are not supported requested"
352                         " 0x%" PRIx64 " supported 0x%" PRIx64,
353                         dev->data->port_id, rx_offloads, supp_rx_offloads);
354                 rte_errno = ENOTSUP;
355                 return -rte_errno;
356         }
357         if (use_app_rss_key &&
358             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
359              rss_hash_default_key_len)) {
360                 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
361                         dev->data->port_id, rss_hash_default_key_len);
362                 rte_errno = EINVAL;
363                 return -rte_errno;
364         }
365         priv->rss_conf.rss_key =
366                 rte_realloc(priv->rss_conf.rss_key,
367                             rss_hash_default_key_len, 0);
368         if (!priv->rss_conf.rss_key) {
369                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
370                         dev->data->port_id, rxqs_n);
371                 rte_errno = ENOMEM;
372                 return -rte_errno;
373         }
374         memcpy(priv->rss_conf.rss_key,
375                use_app_rss_key ?
376                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
377                rss_hash_default_key,
378                rss_hash_default_key_len);
379         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
380         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
381         priv->rxqs = (void *)dev->data->rx_queues;
382         priv->txqs = (void *)dev->data->tx_queues;
383         if (txqs_n != priv->txqs_n) {
384                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
385                         dev->data->port_id, priv->txqs_n, txqs_n);
386                 priv->txqs_n = txqs_n;
387         }
388         if (rxqs_n > priv->config.ind_table_max_size) {
389                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
390                         dev->data->port_id, rxqs_n);
391                 rte_errno = EINVAL;
392                 return -rte_errno;
393         }
394         if (rxqs_n == priv->rxqs_n)
395                 return 0;
396         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
397                 dev->data->port_id, priv->rxqs_n, rxqs_n);
398         priv->rxqs_n = rxqs_n;
399         /* If the requested number of RX queues is not a power of two, use the
400          * maximum indirection table size for better balancing.
401          * The result is always rounded to the next power of two. */
402         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
403                                      priv->config.ind_table_max_size :
404                                      rxqs_n));
405         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
406         if (ret)
407                 return ret;
408         /* When the number of RX queues is not a power of two, the remaining
409          * table entries are padded with reused WQs and hashes are not spread
410          * uniformly. */
411         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
412                 (*priv->reta_idx)[i] = j;
413                 if (++j == rxqs_n)
414                         j = 0;
415         }
416         return 0;
417 }
418
419 /**
420  * DPDK callback to get information about the device.
421  *
422  * @param dev
423  *   Pointer to Ethernet device structure.
424  * @param[out] info
425  *   Info structure output buffer.
426  */
427 void
428 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
429 {
430         struct priv *priv = dev->data->dev_private;
431         struct mlx5_dev_config *config = &priv->config;
432         unsigned int max;
433         char ifname[IF_NAMESIZE];
434
435         info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
436         /* FIXME: we should ask the device for these values. */
437         info->min_rx_bufsize = 32;
438         info->max_rx_pktlen = 65536;
439         /*
440          * Since we need one CQ per QP, the limit is the minimum number
441          * between the two values.
442          */
443         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
444                       priv->device_attr.orig_attr.max_qp);
445         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
446         if (max >= 65535)
447                 max = 65535;
448         info->max_rx_queues = max;
449         info->max_tx_queues = max;
450         info->max_mac_addrs = RTE_DIM(priv->mac);
451         info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
452         info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
453                                  info->rx_queue_offload_capa);
454         info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
455         if (mlx5_get_ifname(dev, &ifname) == 0)
456                 info->if_index = if_nametoindex(ifname);
457         info->reta_size = priv->reta_idx_n ?
458                 priv->reta_idx_n : config->ind_table_max_size;
459         info->hash_key_size = rss_hash_default_key_len;
460         info->speed_capa = priv->link_speed_capa;
461         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
462 }
463
464 /**
465  * Get supported packet types.
466  *
467  * @param dev
468  *   Pointer to Ethernet device structure.
469  *
470  * @return
471  *   A pointer to the supported Packet types array.
472  */
473 const uint32_t *
474 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
475 {
476         static const uint32_t ptypes[] = {
477                 /* refers to rxq_cq_to_pkt_type() */
478                 RTE_PTYPE_L2_ETHER,
479                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
480                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
481                 RTE_PTYPE_L4_NONFRAG,
482                 RTE_PTYPE_L4_FRAG,
483                 RTE_PTYPE_L4_TCP,
484                 RTE_PTYPE_L4_UDP,
485                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
486                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
487                 RTE_PTYPE_INNER_L4_NONFRAG,
488                 RTE_PTYPE_INNER_L4_FRAG,
489                 RTE_PTYPE_INNER_L4_TCP,
490                 RTE_PTYPE_INNER_L4_UDP,
491                 RTE_PTYPE_UNKNOWN
492         };
493
494         if (dev->rx_pkt_burst == mlx5_rx_burst ||
495             dev->rx_pkt_burst == mlx5_rx_burst_vec)
496                 return ptypes;
497         return NULL;
498 }
499
500 /**
501  * DPDK callback to retrieve physical link information.
502  *
503  * @param dev
504  *   Pointer to Ethernet device structure.
505  * @param[out] link
506  *   Storage for current link status.
507  *
508  * @return
509  *   0 on success, a negative errno value otherwise and rte_errno is set.
510  */
511 static int
512 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
513                                struct rte_eth_link *link)
514 {
515         struct priv *priv = dev->data->dev_private;
516         struct ethtool_cmd edata = {
517                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
518         };
519         struct ifreq ifr;
520         struct rte_eth_link dev_link;
521         int link_speed = 0;
522         int ret;
523
524         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
525         if (ret) {
526                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
527                         dev->data->port_id, strerror(rte_errno));
528                 return ret;
529         }
530         memset(&dev_link, 0, sizeof(dev_link));
531         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
532                                 (ifr.ifr_flags & IFF_RUNNING));
533         ifr.ifr_data = (void *)&edata;
534         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
535         if (ret) {
536                 DRV_LOG(WARNING,
537                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
538                         dev->data->port_id, strerror(rte_errno));
539                 return ret;
540         }
541         link_speed = ethtool_cmd_speed(&edata);
542         if (link_speed == -1)
543                 dev_link.link_speed = 0;
544         else
545                 dev_link.link_speed = link_speed;
546         priv->link_speed_capa = 0;
547         if (edata.supported & SUPPORTED_Autoneg)
548                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
549         if (edata.supported & (SUPPORTED_1000baseT_Full |
550                                SUPPORTED_1000baseKX_Full))
551                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
552         if (edata.supported & SUPPORTED_10000baseKR_Full)
553                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
554         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
555                                SUPPORTED_40000baseCR4_Full |
556                                SUPPORTED_40000baseSR4_Full |
557                                SUPPORTED_40000baseLR4_Full))
558                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
559         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
560                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
561         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
562                         ETH_LINK_SPEED_FIXED);
563         if ((dev_link.link_speed && !dev_link.link_status) ||
564             (!dev_link.link_speed && dev_link.link_status)) {
565                 rte_errno = EAGAIN;
566                 return -rte_errno;
567         }
568         *link = dev_link;
569         return 0;
570 }
571
572 /**
573  * Retrieve physical link information (unlocked version using new ioctl).
574  *
575  * @param dev
576  *   Pointer to Ethernet device structure.
577  * @param[out] link
578  *   Storage for current link status.
579  *
580  * @return
581  *   0 on success, a negative errno value otherwise and rte_errno is set.
582  */
583 static int
584 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
585                              struct rte_eth_link *link)
586
587 {
588         struct priv *priv = dev->data->dev_private;
589         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
590         struct ifreq ifr;
591         struct rte_eth_link dev_link;
592         uint64_t sc;
593         int ret;
594
595         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
596         if (ret) {
597                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
598                         dev->data->port_id, strerror(rte_errno));
599                 return ret;
600         }
601         memset(&dev_link, 0, sizeof(dev_link));
602         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
603                                 (ifr.ifr_flags & IFF_RUNNING));
604         ifr.ifr_data = (void *)&gcmd;
605         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
606         if (ret) {
607                 DRV_LOG(DEBUG,
608                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
609                         " failed: %s",
610                         dev->data->port_id, strerror(rte_errno));
611                 return ret;
612         }
613         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
614
615         alignas(struct ethtool_link_settings)
616         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
617                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
618         struct ethtool_link_settings *ecmd = (void *)data;
619
620         *ecmd = gcmd;
621         ifr.ifr_data = (void *)ecmd;
622         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
623         if (ret) {
624                 DRV_LOG(DEBUG,
625                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
626                         " failed: %s",
627                         dev->data->port_id, strerror(rte_errno));
628                 return ret;
629         }
630         dev_link.link_speed = ecmd->speed;
631         sc = ecmd->link_mode_masks[0] |
632                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
633         priv->link_speed_capa = 0;
634         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
635                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
636         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
637                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
638                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
639         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
640                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
641                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
642                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
643         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
644                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
645                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
646         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
647                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
648                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
649                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
650                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
651         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
652                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
653                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
654                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
655                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
656         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
657                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
658                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
659                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
660         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
661                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
662                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
663         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
664                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
665                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
666                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
667                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
668         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
669                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
670         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
671                                   ETH_LINK_SPEED_FIXED);
672         if ((dev_link.link_speed && !dev_link.link_status) ||
673             (!dev_link.link_speed && dev_link.link_status)) {
674                 rte_errno = EAGAIN;
675                 return -rte_errno;
676         }
677         *link = dev_link;
678         return 0;
679 }
680
681 /**
682  * DPDK callback to retrieve physical link information.
683  *
684  * @param dev
685  *   Pointer to Ethernet device structure.
686  * @param wait_to_complete
687  *   Wait for request completion.
688  *
689  * @return
690  *   0 if link status was not updated, positive if it was, a negative errno
691  *   value otherwise and rte_errno is set.
692  */
693 int
694 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
695 {
696         int ret;
697         struct rte_eth_link dev_link;
698         time_t start_time = time(NULL);
699
700         do {
701                 ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
702                 if (ret)
703                         ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
704                 if (ret == 0)
705                         break;
706                 /* Handle wait to complete situation. */
707                 if (wait_to_complete && ret == -EAGAIN) {
708                         if (abs((int)difftime(time(NULL), start_time)) <
709                             MLX5_LINK_STATUS_TIMEOUT) {
710                                 usleep(0);
711                                 continue;
712                         } else {
713                                 rte_errno = EBUSY;
714                                 return -rte_errno;
715                         }
716                 } else if (ret < 0) {
717                         return ret;
718                 }
719         } while (wait_to_complete);
720         ret = !!memcmp(&dev->data->dev_link, &dev_link,
721                        sizeof(struct rte_eth_link));
722         dev->data->dev_link = dev_link;
723         return ret;
724 }
725
726 /**
727  * DPDK callback to change the MTU.
728  *
729  * @param dev
730  *   Pointer to Ethernet device structure.
731  * @param in_mtu
732  *   New MTU.
733  *
734  * @return
735  *   0 on success, a negative errno value otherwise and rte_errno is set.
736  */
737 int
738 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
739 {
740         struct priv *priv = dev->data->dev_private;
741         uint16_t kern_mtu = 0;
742         int ret;
743
744         ret = mlx5_get_mtu(dev, &kern_mtu);
745         if (ret)
746                 return ret;
747         /* Set kernel interface MTU first. */
748         ret = mlx5_set_mtu(dev, mtu);
749         if (ret)
750                 return ret;
751         ret = mlx5_get_mtu(dev, &kern_mtu);
752         if (ret)
753                 return ret;
754         if (kern_mtu == mtu) {
755                 priv->mtu = mtu;
756                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
757                         dev->data->port_id, mtu);
758                 return 0;
759         }
760         rte_errno = EAGAIN;
761         return -rte_errno;
762 }
763
764 /**
765  * DPDK callback to get flow control status.
766  *
767  * @param dev
768  *   Pointer to Ethernet device structure.
769  * @param[out] fc_conf
770  *   Flow control output buffer.
771  *
772  * @return
773  *   0 on success, a negative errno value otherwise and rte_errno is set.
774  */
775 int
776 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
777 {
778         struct ifreq ifr;
779         struct ethtool_pauseparam ethpause = {
780                 .cmd = ETHTOOL_GPAUSEPARAM
781         };
782         int ret;
783
784         ifr.ifr_data = (void *)&ethpause;
785         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
786         if (ret) {
787                 DRV_LOG(WARNING,
788                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
789                         " %s",
790                         dev->data->port_id, strerror(rte_errno));
791                 return ret;
792         }
793         fc_conf->autoneg = ethpause.autoneg;
794         if (ethpause.rx_pause && ethpause.tx_pause)
795                 fc_conf->mode = RTE_FC_FULL;
796         else if (ethpause.rx_pause)
797                 fc_conf->mode = RTE_FC_RX_PAUSE;
798         else if (ethpause.tx_pause)
799                 fc_conf->mode = RTE_FC_TX_PAUSE;
800         else
801                 fc_conf->mode = RTE_FC_NONE;
802         return 0;
803 }
804
805 /**
806  * DPDK callback to modify flow control parameters.
807  *
808  * @param dev
809  *   Pointer to Ethernet device structure.
810  * @param[in] fc_conf
811  *   Flow control parameters.
812  *
813  * @return
814  *   0 on success, a negative errno value otherwise and rte_errno is set.
815  */
816 int
817 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
818 {
819         struct ifreq ifr;
820         struct ethtool_pauseparam ethpause = {
821                 .cmd = ETHTOOL_SPAUSEPARAM
822         };
823         int ret;
824
825         ifr.ifr_data = (void *)&ethpause;
826         ethpause.autoneg = fc_conf->autoneg;
827         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
828             (fc_conf->mode & RTE_FC_RX_PAUSE))
829                 ethpause.rx_pause = 1;
830         else
831                 ethpause.rx_pause = 0;
832
833         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
834             (fc_conf->mode & RTE_FC_TX_PAUSE))
835                 ethpause.tx_pause = 1;
836         else
837                 ethpause.tx_pause = 0;
838         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
839         if (ret) {
840                 DRV_LOG(WARNING,
841                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
842                         " failed: %s",
843                         dev->data->port_id, strerror(rte_errno));
844                 return ret;
845         }
846         return 0;
847 }
848
849 /**
850  * Get PCI information from struct ibv_device.
851  *
852  * @param device
853  *   Pointer to Ethernet device structure.
854  * @param[out] pci_addr
855  *   PCI bus address output buffer.
856  *
857  * @return
858  *   0 on success, a negative errno value otherwise and rte_errno is set.
859  */
860 int
861 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
862                             struct rte_pci_addr *pci_addr)
863 {
864         FILE *file;
865         char line[32];
866         MKSTR(path, "%s/device/uevent", device->ibdev_path);
867
868         file = fopen(path, "rb");
869         if (file == NULL) {
870                 rte_errno = errno;
871                 return -rte_errno;
872         }
873         while (fgets(line, sizeof(line), file) == line) {
874                 size_t len = strlen(line);
875                 int ret;
876
877                 /* Truncate long lines. */
878                 if (len == (sizeof(line) - 1))
879                         while (line[(len - 1)] != '\n') {
880                                 ret = fgetc(file);
881                                 if (ret == EOF)
882                                         break;
883                                 line[(len - 1)] = ret;
884                         }
885                 /* Extract information. */
886                 if (sscanf(line,
887                            "PCI_SLOT_NAME="
888                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
889                            &pci_addr->domain,
890                            &pci_addr->bus,
891                            &pci_addr->devid,
892                            &pci_addr->function) == 4) {
893                         ret = 0;
894                         break;
895                 }
896         }
897         fclose(file);
898         return 0;
899 }
900
901 /**
902  * Device status handler.
903  *
904  * @param dev
905  *   Pointer to Ethernet device.
906  * @param events
907  *   Pointer to event flags holder.
908  *
909  * @return
910  *   Events bitmap of callback process which can be called immediately.
911  */
912 static uint32_t
913 mlx5_dev_status_handler(struct rte_eth_dev *dev)
914 {
915         struct priv *priv = dev->data->dev_private;
916         struct ibv_async_event event;
917         uint32_t ret = 0;
918
919         if (mlx5_link_update(dev, 0) == -EAGAIN) {
920                 usleep(0);
921                 return 0;
922         }
923         /* Read all message and acknowledge them. */
924         for (;;) {
925                 if (mlx5_glue->get_async_event(priv->ctx, &event))
926                         break;
927                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
928                         event.event_type == IBV_EVENT_PORT_ERR) &&
929                         (dev->data->dev_conf.intr_conf.lsc == 1))
930                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
931                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
932                         dev->data->dev_conf.intr_conf.rmv == 1)
933                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
934                 else
935                         DRV_LOG(DEBUG,
936                                 "port %u event type %d on not handled",
937                                 dev->data->port_id, event.event_type);
938                 mlx5_glue->ack_async_event(&event);
939         }
940         return ret;
941 }
942
943 /**
944  * Handle interrupts from the NIC.
945  *
946  * @param[in] intr_handle
947  *   Interrupt handler.
948  * @param cb_arg
949  *   Callback argument.
950  */
951 void
952 mlx5_dev_interrupt_handler(void *cb_arg)
953 {
954         struct rte_eth_dev *dev = cb_arg;
955         uint32_t events;
956
957         events = mlx5_dev_status_handler(dev);
958         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
959                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
960         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
961                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
962 }
963
/**
 * Handle interrupts from the socket.
 *
 * Thin adapter matching the interrupt callback prototype; the work is
 * delegated to mlx5_socket_handle().
 *
 * @param cb_arg
 *   Callback argument, the Ethernet device the handler was registered for.
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	mlx5_socket_handle((struct rte_eth_dev *)cb_arg);
}
977
978 /**
979  * Uninstall interrupt handler.
980  *
981  * @param dev
982  *   Pointer to Ethernet device.
983  */
984 void
985 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
986 {
987         struct priv *priv = dev->data->dev_private;
988
989         if (dev->data->dev_conf.intr_conf.lsc ||
990             dev->data->dev_conf.intr_conf.rmv)
991                 rte_intr_callback_unregister(&priv->intr_handle,
992                                              mlx5_dev_interrupt_handler, dev);
993         if (priv->primary_socket)
994                 rte_intr_callback_unregister(&priv->intr_handle_socket,
995                                              mlx5_dev_handler_socket, dev);
996         priv->intr_handle.fd = 0;
997         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
998         priv->intr_handle_socket.fd = 0;
999         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1000 }
1001
1002 /**
1003  * Install interrupt handler.
1004  *
1005  * @param dev
1006  *   Pointer to Ethernet device.
1007  */
1008 void
1009 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1010 {
1011         struct priv *priv = dev->data->dev_private;
1012         int ret;
1013         int flags;
1014
1015         assert(priv->ctx->async_fd > 0);
1016         flags = fcntl(priv->ctx->async_fd, F_GETFL);
1017         ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1018         if (ret) {
1019                 DRV_LOG(INFO,
1020                         "port %u failed to change file descriptor async event"
1021                         " queue",
1022                         dev->data->port_id);
1023                 dev->data->dev_conf.intr_conf.lsc = 0;
1024                 dev->data->dev_conf.intr_conf.rmv = 0;
1025         }
1026         if (dev->data->dev_conf.intr_conf.lsc ||
1027             dev->data->dev_conf.intr_conf.rmv) {
1028                 priv->intr_handle.fd = priv->ctx->async_fd;
1029                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1030                 rte_intr_callback_register(&priv->intr_handle,
1031                                            mlx5_dev_interrupt_handler, dev);
1032         }
1033         ret = mlx5_socket_init(dev);
1034         if (ret)
1035                 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1036                         dev->data->port_id, strerror(rte_errno));
1037         else if (priv->primary_socket) {
1038                 priv->intr_handle_socket.fd = priv->primary_socket;
1039                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1040                 rte_intr_callback_register(&priv->intr_handle_socket,
1041                                            mlx5_dev_handler_socket, dev);
1042         }
1043 }
1044
1045 /**
1046  * DPDK callback to bring the link DOWN.
1047  *
1048  * @param dev
1049  *   Pointer to Ethernet device structure.
1050  *
1051  * @return
1052  *   0 on success, a negative errno value otherwise and rte_errno is set.
1053  */
1054 int
1055 mlx5_set_link_down(struct rte_eth_dev *dev)
1056 {
1057         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1058 }
1059
1060 /**
1061  * DPDK callback to bring the link UP.
1062  *
1063  * @param dev
1064  *   Pointer to Ethernet device structure.
1065  *
1066  * @return
1067  *   0 on success, a negative errno value otherwise and rte_errno is set.
1068  */
1069 int
1070 mlx5_set_link_up(struct rte_eth_dev *dev)
1071 {
1072         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1073 }
1074
1075 /**
1076  * Configure the TX function to use.
1077  *
1078  * @param dev
1079  *   Pointer to private data structure.
1080  *
1081  * @return
1082  *   Pointer to selected Tx burst function.
1083  */
1084 eth_tx_burst_t
1085 mlx5_select_tx_function(struct rte_eth_dev *dev)
1086 {
1087         struct priv *priv = dev->data->dev_private;
1088         eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1089         struct mlx5_dev_config *config = &priv->config;
1090         uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
1091         int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
1092                                     DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1093                                     DEV_TX_OFFLOAD_GRE_TNL_TSO));
1094         int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
1095
1096         assert(priv != NULL);
1097         /* Select appropriate TX function. */
1098         if (vlan_insert || tso)
1099                 return tx_pkt_burst;
1100         if (config->mps == MLX5_MPW_ENHANCED) {
1101                 if (mlx5_check_vec_tx_support(dev) > 0) {
1102                         if (mlx5_check_raw_vec_tx_support(dev) > 0)
1103                                 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1104                         else
1105                                 tx_pkt_burst = mlx5_tx_burst_vec;
1106                         DRV_LOG(DEBUG,
1107                                 "port %u selected enhanced MPW Tx vectorized"
1108                                 " function",
1109                                 dev->data->port_id);
1110                 } else {
1111                         tx_pkt_burst = mlx5_tx_burst_empw;
1112                         DRV_LOG(DEBUG,
1113                                 "port %u selected enhanced MPW Tx function",
1114                                 dev->data->port_id);
1115                 }
1116         } else if (config->mps && (config->txq_inline > 0)) {
1117                 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1118                 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1119                         dev->data->port_id);
1120         } else if (config->mps) {
1121                 tx_pkt_burst = mlx5_tx_burst_mpw;
1122                 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1123                         dev->data->port_id);
1124         }
1125         return tx_pkt_burst;
1126 }
1127
1128 /**
1129  * Configure the RX function to use.
1130  *
1131  * @param dev
1132  *   Pointer to private data structure.
1133  *
1134  * @return
1135  *   Pointer to selected Rx burst function.
1136  */
1137 eth_rx_burst_t
1138 mlx5_select_rx_function(struct rte_eth_dev *dev)
1139 {
1140         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1141
1142         assert(dev != NULL);
1143         if (mlx5_check_vec_rx_support(dev) > 0) {
1144                 rx_pkt_burst = mlx5_rx_burst_vec;
1145                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1146                         dev->data->port_id);
1147         }
1148         return rx_pkt_burst;
1149 }
1150
1151 /**
1152  * Check if mlx5 device was removed.
1153  *
1154  * @param dev
1155  *   Pointer to Ethernet device structure.
1156  *
1157  * @return
1158  *   1 when device is removed, otherwise 0.
1159  */
1160 int
1161 mlx5_is_removed(struct rte_eth_dev *dev)
1162 {
1163         struct ibv_device_attr device_attr;
1164         struct priv *priv = dev->data->dev_private;
1165
1166         if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)
1167                 return 1;
1168         return 0;
1169 }