ethdev: new Rx/Tx offloads API
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #define _GNU_SOURCE
7
8 #include <stddef.h>
9 #include <assert.h>
10 #include <inttypes.h>
11 #include <unistd.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <string.h>
15 #include <stdlib.h>
16 #include <errno.h>
17 #include <dirent.h>
18 #include <net/if.h>
19 #include <sys/ioctl.h>
20 #include <sys/socket.h>
21 #include <netinet/in.h>
22 #include <linux/ethtool.h>
23 #include <linux/sockios.h>
24 #include <fcntl.h>
25 #include <stdalign.h>
26 #include <sys/un.h>
27 #include <time.h>
28
29 #include <rte_atomic.h>
30 #include <rte_ethdev_driver.h>
31 #include <rte_bus_pci.h>
32 #include <rte_mbuf.h>
33 #include <rte_common.h>
34 #include <rte_interrupts.h>
35 #include <rte_malloc.h>
36 #include <rte_string_fns.h>
37
38 #include "mlx5.h"
39 #include "mlx5_glue.h"
40 #include "mlx5_rxtx.h"
41 #include "mlx5_utils.h"
42
/*
 * Compatibility definitions: the kernel headers used at build time may be
 * older than the running kernel, so anything they lack is provided here.
 */
#ifndef ETHTOOL_GLINKSETTINGS
/* Local layout of the ETHTOOL_GLINKSETTINGS request/reply. */
struct ethtool_link_settings {
	uint32_t cmd;
	uint32_t speed;
	uint8_t duplex;
	uint8_t port;
	uint8_t phy_address;
	uint8_t autoneg;
	uint8_t mdio_support;
	uint8_t eth_to_mdix;
	uint8_t eth_tp_mdix_ctrl;
	int8_t link_mode_masks_nwords; /* Signed: negated by the query code. */
	uint32_t reserved[8];
	uint32_t link_mode_masks[]; /* Trailing variable-length bitmaps. */
};

/* ethtool command number. */
#define ETHTOOL_GLINKSETTINGS 0x0000004c

/* Link mode bit positions (autoneg, 1G up to 56G). */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif

/* 25G link modes, when the build did not detect them in the headers. */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif

/* 50G link modes, when the build did not detect them in the headers. */
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif

/* 100G link modes, when the build did not detect them in the headers. */
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
93
94 /**
95  * Get interface name from private structure.
96  *
97  * @param[in] dev
98  *   Pointer to Ethernet device.
99  * @param[out] ifname
100  *   Interface name output buffer.
101  *
102  * @return
103  *   0 on success, a negative errno value otherwise and rte_errno is set.
104  */
105 int
106 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
107 {
108         struct priv *priv = dev->data->dev_private;
109         DIR *dir;
110         struct dirent *dent;
111         unsigned int dev_type = 0;
112         unsigned int dev_port_prev = ~0u;
113         char match[IF_NAMESIZE] = "";
114
115         {
116                 MKSTR(path, "%s/device/net", priv->ibdev_path);
117
118                 dir = opendir(path);
119                 if (dir == NULL) {
120                         rte_errno = errno;
121                         return -rte_errno;
122                 }
123         }
124         while ((dent = readdir(dir)) != NULL) {
125                 char *name = dent->d_name;
126                 FILE *file;
127                 unsigned int dev_port;
128                 int r;
129
130                 if ((name[0] == '.') &&
131                     ((name[1] == '\0') ||
132                      ((name[1] == '.') && (name[2] == '\0'))))
133                         continue;
134
135                 MKSTR(path, "%s/device/net/%s/%s",
136                       priv->ibdev_path, name,
137                       (dev_type ? "dev_id" : "dev_port"));
138
139                 file = fopen(path, "rb");
140                 if (file == NULL) {
141                         if (errno != ENOENT)
142                                 continue;
143                         /*
144                          * Switch to dev_id when dev_port does not exist as
145                          * is the case with Linux kernel versions < 3.15.
146                          */
147 try_dev_id:
148                         match[0] = '\0';
149                         if (dev_type)
150                                 break;
151                         dev_type = 1;
152                         dev_port_prev = ~0u;
153                         rewinddir(dir);
154                         continue;
155                 }
156                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
157                 fclose(file);
158                 if (r != 1)
159                         continue;
160                 /*
161                  * Switch to dev_id when dev_port returns the same value for
162                  * all ports. May happen when using a MOFED release older than
163                  * 3.0 with a Linux kernel >= 3.15.
164                  */
165                 if (dev_port == dev_port_prev)
166                         goto try_dev_id;
167                 dev_port_prev = dev_port;
168                 if (dev_port == (priv->port - 1u))
169                         strlcpy(match, name, sizeof(match));
170         }
171         closedir(dir);
172         if (match[0] == '\0') {
173                 rte_errno = ENOENT;
174                 return -rte_errno;
175         }
176         strncpy(*ifname, match, sizeof(*ifname));
177         return 0;
178 }
179
180 /**
181  * Get the interface index from device name.
182  *
183  * @param[in] dev
184  *   Pointer to Ethernet device.
185  *
186  * @return
187  *   Interface index on success, a negative errno value otherwise and
188  *   rte_errno is set.
189  */
190 int
191 mlx5_ifindex(const struct rte_eth_dev *dev)
192 {
193         char ifname[IF_NAMESIZE];
194         int ret;
195
196         ret = mlx5_get_ifname(dev, &ifname);
197         if (ret)
198                 return ret;
199         ret = if_nametoindex(ifname);
200         if (ret == -1) {
201                 rte_errno = errno;
202                 return -rte_errno;
203         }
204         return ret;
205 }
206
207 /**
208  * Perform ifreq ioctl() on associated Ethernet device.
209  *
210  * @param[in] dev
211  *   Pointer to Ethernet device.
212  * @param req
213  *   Request number to pass to ioctl().
214  * @param[out] ifr
215  *   Interface request structure output buffer.
216  *
217  * @return
218  *   0 on success, a negative errno value otherwise and rte_errno is set.
219  */
220 int
221 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
222 {
223         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
224         int ret = 0;
225
226         if (sock == -1) {
227                 rte_errno = errno;
228                 return -rte_errno;
229         }
230         ret = mlx5_get_ifname(dev, &ifr->ifr_name);
231         if (ret)
232                 goto error;
233         ret = ioctl(sock, req, ifr);
234         if (ret == -1) {
235                 rte_errno = errno;
236                 goto error;
237         }
238         close(sock);
239         return 0;
240 error:
241         close(sock);
242         return -rte_errno;
243 }
244
245 /**
246  * Get device MTU.
247  *
248  * @param dev
249  *   Pointer to Ethernet device.
250  * @param[out] mtu
251  *   MTU value output buffer.
252  *
253  * @return
254  *   0 on success, a negative errno value otherwise and rte_errno is set.
255  */
256 int
257 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
258 {
259         struct ifreq request;
260         int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
261
262         if (ret)
263                 return ret;
264         *mtu = request.ifr_mtu;
265         return 0;
266 }
267
268 /**
269  * Set device MTU.
270  *
271  * @param dev
272  *   Pointer to Ethernet device.
273  * @param mtu
274  *   MTU value to set.
275  *
276  * @return
277  *   0 on success, a negative errno value otherwise and rte_errno is set.
278  */
279 static int
280 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
281 {
282         struct ifreq request = { .ifr_mtu = mtu, };
283
284         return mlx5_ifreq(dev, SIOCSIFMTU, &request);
285 }
286
287 /**
288  * Set device flags.
289  *
290  * @param dev
291  *   Pointer to Ethernet device.
292  * @param keep
293  *   Bitmask for flags that must remain untouched.
294  * @param flags
295  *   Bitmask for flags to modify.
296  *
297  * @return
298  *   0 on success, a negative errno value otherwise and rte_errno is set.
299  */
300 int
301 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
302 {
303         struct ifreq request;
304         int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
305
306         if (ret)
307                 return ret;
308         request.ifr_flags &= keep;
309         request.ifr_flags |= flags & ~keep;
310         return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
311 }
312
313 /**
314  * DPDK callback for Ethernet device configuration.
315  *
316  * @param dev
317  *   Pointer to Ethernet device structure.
318  *
319  * @return
320  *   0 on success, a negative errno value otherwise and rte_errno is set.
321  */
322 int
323 mlx5_dev_configure(struct rte_eth_dev *dev)
324 {
325         struct priv *priv = dev->data->dev_private;
326         unsigned int rxqs_n = dev->data->nb_rx_queues;
327         unsigned int txqs_n = dev->data->nb_tx_queues;
328         unsigned int i;
329         unsigned int j;
330         unsigned int reta_idx_n;
331         const uint8_t use_app_rss_key =
332                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
333         int ret = 0;
334
335         if (use_app_rss_key &&
336             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
337              rss_hash_default_key_len)) {
338                 DRV_LOG(ERR, "port %u RSS key len must be %zu Bytes long",
339                         dev->data->port_id, rss_hash_default_key_len);
340                 rte_errno = EINVAL;
341                 return -rte_errno;
342         }
343         priv->rss_conf.rss_key =
344                 rte_realloc(priv->rss_conf.rss_key,
345                             rss_hash_default_key_len, 0);
346         if (!priv->rss_conf.rss_key) {
347                 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
348                         dev->data->port_id, rxqs_n);
349                 rte_errno = ENOMEM;
350                 return -rte_errno;
351         }
352         memcpy(priv->rss_conf.rss_key,
353                use_app_rss_key ?
354                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
355                rss_hash_default_key,
356                rss_hash_default_key_len);
357         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
358         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
359         priv->rxqs = (void *)dev->data->rx_queues;
360         priv->txqs = (void *)dev->data->tx_queues;
361         if (txqs_n != priv->txqs_n) {
362                 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
363                         dev->data->port_id, priv->txqs_n, txqs_n);
364                 priv->txqs_n = txqs_n;
365         }
366         if (rxqs_n > priv->config.ind_table_max_size) {
367                 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
368                         dev->data->port_id, rxqs_n);
369                 rte_errno = EINVAL;
370                 return -rte_errno;
371         }
372         if (rxqs_n == priv->rxqs_n)
373                 return 0;
374         DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
375                 dev->data->port_id, priv->rxqs_n, rxqs_n);
376         priv->rxqs_n = rxqs_n;
377         /* If the requested number of RX queues is not a power of two, use the
378          * maximum indirection table size for better balancing.
379          * The result is always rounded to the next power of two. */
380         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
381                                      priv->config.ind_table_max_size :
382                                      rxqs_n));
383         ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
384         if (ret)
385                 return ret;
386         /* When the number of RX queues is not a power of two, the remaining
387          * table entries are padded with reused WQs and hashes are not spread
388          * uniformly. */
389         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
390                 (*priv->reta_idx)[i] = j;
391                 if (++j == rxqs_n)
392                         j = 0;
393         }
394         return 0;
395 }
396
397 /**
398  * Sets default tuning parameters.
399  *
400  * @param dev
401  *   Pointer to Ethernet device.
402  * @param[out] info
403  *   Info structure output buffer.
404  */
405 static void
406 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
407 {
408         struct priv *priv = dev->data->dev_private;
409
410         /* Minimum CPU utilization. */
411         info->default_rxportconf.ring_size = 256;
412         info->default_txportconf.ring_size = 256;
413         info->default_rxportconf.burst_size = 64;
414         info->default_txportconf.burst_size = 64;
415         if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
416                 info->default_rxportconf.nb_queues = 16;
417                 info->default_txportconf.nb_queues = 16;
418                 if (dev->data->nb_rx_queues > 2 ||
419                     dev->data->nb_tx_queues > 2) {
420                         /* Max Throughput. */
421                         info->default_rxportconf.ring_size = 2048;
422                         info->default_txportconf.ring_size = 2048;
423                 }
424         } else {
425                 info->default_rxportconf.nb_queues = 8;
426                 info->default_txportconf.nb_queues = 8;
427                 if (dev->data->nb_rx_queues > 2 ||
428                     dev->data->nb_tx_queues > 2) {
429                         /* Max Throughput. */
430                         info->default_rxportconf.ring_size = 4096;
431                         info->default_txportconf.ring_size = 4096;
432                 }
433         }
434 }
435
436 /**
437  * DPDK callback to get information about the device.
438  *
439  * @param dev
440  *   Pointer to Ethernet device structure.
441  * @param[out] info
442  *   Info structure output buffer.
443  */
444 void
445 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
446 {
447         struct priv *priv = dev->data->dev_private;
448         struct mlx5_dev_config *config = &priv->config;
449         unsigned int max;
450         char ifname[IF_NAMESIZE];
451
452         /* FIXME: we should ask the device for these values. */
453         info->min_rx_bufsize = 32;
454         info->max_rx_pktlen = 65536;
455         /*
456          * Since we need one CQ per QP, the limit is the minimum number
457          * between the two values.
458          */
459         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
460                       priv->device_attr.orig_attr.max_qp);
461         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
462         if (max >= 65535)
463                 max = 65535;
464         info->max_rx_queues = max;
465         info->max_tx_queues = max;
466         info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
467         info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
468         info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
469                                  info->rx_queue_offload_capa);
470         info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
471         if (mlx5_get_ifname(dev, &ifname) == 0)
472                 info->if_index = if_nametoindex(ifname);
473         info->reta_size = priv->reta_idx_n ?
474                 priv->reta_idx_n : config->ind_table_max_size;
475         info->hash_key_size = rss_hash_default_key_len;
476         info->speed_capa = priv->link_speed_capa;
477         info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
478         mlx5_set_default_params(dev, info);
479 }
480
481 /**
482  * Get supported packet types.
483  *
484  * @param dev
485  *   Pointer to Ethernet device structure.
486  *
487  * @return
488  *   A pointer to the supported Packet types array.
489  */
490 const uint32_t *
491 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
492 {
493         static const uint32_t ptypes[] = {
494                 /* refers to rxq_cq_to_pkt_type() */
495                 RTE_PTYPE_L2_ETHER,
496                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
497                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
498                 RTE_PTYPE_L4_NONFRAG,
499                 RTE_PTYPE_L4_FRAG,
500                 RTE_PTYPE_L4_TCP,
501                 RTE_PTYPE_L4_UDP,
502                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
503                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
504                 RTE_PTYPE_INNER_L4_NONFRAG,
505                 RTE_PTYPE_INNER_L4_FRAG,
506                 RTE_PTYPE_INNER_L4_TCP,
507                 RTE_PTYPE_INNER_L4_UDP,
508                 RTE_PTYPE_UNKNOWN
509         };
510
511         if (dev->rx_pkt_burst == mlx5_rx_burst ||
512             dev->rx_pkt_burst == mlx5_rx_burst_vec)
513                 return ptypes;
514         return NULL;
515 }
516
517 /**
518  * DPDK callback to retrieve physical link information.
519  *
520  * @param dev
521  *   Pointer to Ethernet device structure.
522  * @param[out] link
523  *   Storage for current link status.
524  *
525  * @return
526  *   0 on success, a negative errno value otherwise and rte_errno is set.
527  */
528 static int
529 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
530                                struct rte_eth_link *link)
531 {
532         struct priv *priv = dev->data->dev_private;
533         struct ethtool_cmd edata = {
534                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
535         };
536         struct ifreq ifr;
537         struct rte_eth_link dev_link;
538         int link_speed = 0;
539         int ret;
540
541         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
542         if (ret) {
543                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
544                         dev->data->port_id, strerror(rte_errno));
545                 return ret;
546         }
547         memset(&dev_link, 0, sizeof(dev_link));
548         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
549                                 (ifr.ifr_flags & IFF_RUNNING));
550         ifr.ifr_data = (void *)&edata;
551         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
552         if (ret) {
553                 DRV_LOG(WARNING,
554                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
555                         dev->data->port_id, strerror(rte_errno));
556                 return ret;
557         }
558         link_speed = ethtool_cmd_speed(&edata);
559         if (link_speed == -1)
560                 dev_link.link_speed = ETH_SPEED_NUM_NONE;
561         else
562                 dev_link.link_speed = link_speed;
563         priv->link_speed_capa = 0;
564         if (edata.supported & SUPPORTED_Autoneg)
565                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
566         if (edata.supported & (SUPPORTED_1000baseT_Full |
567                                SUPPORTED_1000baseKX_Full))
568                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
569         if (edata.supported & SUPPORTED_10000baseKR_Full)
570                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
571         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
572                                SUPPORTED_40000baseCR4_Full |
573                                SUPPORTED_40000baseSR4_Full |
574                                SUPPORTED_40000baseLR4_Full))
575                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
576         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
577                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
578         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
579                         ETH_LINK_SPEED_FIXED);
580         if ((dev_link.link_speed && !dev_link.link_status) ||
581             (!dev_link.link_speed && dev_link.link_status)) {
582                 rte_errno = EAGAIN;
583                 return -rte_errno;
584         }
585         *link = dev_link;
586         return 0;
587 }
588
589 /**
590  * Retrieve physical link information (unlocked version using new ioctl).
591  *
592  * @param dev
593  *   Pointer to Ethernet device structure.
594  * @param[out] link
595  *   Storage for current link status.
596  *
597  * @return
598  *   0 on success, a negative errno value otherwise and rte_errno is set.
599  */
600 static int
601 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
602                              struct rte_eth_link *link)
603
604 {
605         struct priv *priv = dev->data->dev_private;
606         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
607         struct ifreq ifr;
608         struct rte_eth_link dev_link;
609         uint64_t sc;
610         int ret;
611
612         ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
613         if (ret) {
614                 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
615                         dev->data->port_id, strerror(rte_errno));
616                 return ret;
617         }
618         memset(&dev_link, 0, sizeof(dev_link));
619         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
620                                 (ifr.ifr_flags & IFF_RUNNING));
621         ifr.ifr_data = (void *)&gcmd;
622         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
623         if (ret) {
624                 DRV_LOG(DEBUG,
625                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
626                         " failed: %s",
627                         dev->data->port_id, strerror(rte_errno));
628                 return ret;
629         }
630         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
631
632         alignas(struct ethtool_link_settings)
633         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
634                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
635         struct ethtool_link_settings *ecmd = (void *)data;
636
637         *ecmd = gcmd;
638         ifr.ifr_data = (void *)ecmd;
639         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
640         if (ret) {
641                 DRV_LOG(DEBUG,
642                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
643                         " failed: %s",
644                         dev->data->port_id, strerror(rte_errno));
645                 return ret;
646         }
647         dev_link.link_speed = ecmd->speed;
648         sc = ecmd->link_mode_masks[0] |
649                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
650         priv->link_speed_capa = 0;
651         if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
652                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
653         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
654                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
655                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
656         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
657                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
658                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
659                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
660         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
661                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
662                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
663         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
664                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
665                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
666                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
667                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
668         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
669                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
670                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
671                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
672                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
673         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
674                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
675                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
676                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
677         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
678                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
679                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
680         if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
681                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
682                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
683                   MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
684                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
685         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
686                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
687         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
688                                   ETH_LINK_SPEED_FIXED);
689         if ((dev_link.link_speed && !dev_link.link_status) ||
690             (!dev_link.link_speed && dev_link.link_status)) {
691                 rte_errno = EAGAIN;
692                 return -rte_errno;
693         }
694         *link = dev_link;
695         return 0;
696 }
697
698 /**
699  * DPDK callback to retrieve physical link information.
700  *
701  * @param dev
702  *   Pointer to Ethernet device structure.
703  * @param wait_to_complete
704  *   Wait for request completion.
705  *
706  * @return
707  *   0 if link status was not updated, positive if it was, a negative errno
708  *   value otherwise and rte_errno is set.
709  */
710 int
711 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
712 {
713         int ret;
714         struct rte_eth_link dev_link;
715         time_t start_time = time(NULL);
716
717         do {
718                 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
719                 if (ret)
720                         ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
721                 if (ret == 0)
722                         break;
723                 /* Handle wait to complete situation. */
724                 if (wait_to_complete && ret == -EAGAIN) {
725                         if (abs((int)difftime(time(NULL), start_time)) <
726                             MLX5_LINK_STATUS_TIMEOUT) {
727                                 usleep(0);
728                                 continue;
729                         } else {
730                                 rte_errno = EBUSY;
731                                 return -rte_errno;
732                         }
733                 } else if (ret < 0) {
734                         return ret;
735                 }
736         } while (wait_to_complete);
737         ret = !!memcmp(&dev->data->dev_link, &dev_link,
738                        sizeof(struct rte_eth_link));
739         dev->data->dev_link = dev_link;
740         return ret;
741 }
742
743 /**
744  * DPDK callback to change the MTU.
745  *
746  * @param dev
747  *   Pointer to Ethernet device structure.
748  * @param in_mtu
749  *   New MTU.
750  *
751  * @return
752  *   0 on success, a negative errno value otherwise and rte_errno is set.
753  */
754 int
755 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
756 {
757         struct priv *priv = dev->data->dev_private;
758         uint16_t kern_mtu = 0;
759         int ret;
760
761         ret = mlx5_get_mtu(dev, &kern_mtu);
762         if (ret)
763                 return ret;
764         /* Set kernel interface MTU first. */
765         ret = mlx5_set_mtu(dev, mtu);
766         if (ret)
767                 return ret;
768         ret = mlx5_get_mtu(dev, &kern_mtu);
769         if (ret)
770                 return ret;
771         if (kern_mtu == mtu) {
772                 priv->mtu = mtu;
773                 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
774                         dev->data->port_id, mtu);
775                 return 0;
776         }
777         rte_errno = EAGAIN;
778         return -rte_errno;
779 }
780
781 /**
782  * DPDK callback to get flow control status.
783  *
784  * @param dev
785  *   Pointer to Ethernet device structure.
786  * @param[out] fc_conf
787  *   Flow control output buffer.
788  *
789  * @return
790  *   0 on success, a negative errno value otherwise and rte_errno is set.
791  */
792 int
793 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
794 {
795         struct ifreq ifr;
796         struct ethtool_pauseparam ethpause = {
797                 .cmd = ETHTOOL_GPAUSEPARAM
798         };
799         int ret;
800
801         ifr.ifr_data = (void *)&ethpause;
802         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
803         if (ret) {
804                 DRV_LOG(WARNING,
805                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
806                         " %s",
807                         dev->data->port_id, strerror(rte_errno));
808                 return ret;
809         }
810         fc_conf->autoneg = ethpause.autoneg;
811         if (ethpause.rx_pause && ethpause.tx_pause)
812                 fc_conf->mode = RTE_FC_FULL;
813         else if (ethpause.rx_pause)
814                 fc_conf->mode = RTE_FC_RX_PAUSE;
815         else if (ethpause.tx_pause)
816                 fc_conf->mode = RTE_FC_TX_PAUSE;
817         else
818                 fc_conf->mode = RTE_FC_NONE;
819         return 0;
820 }
821
822 /**
823  * DPDK callback to modify flow control parameters.
824  *
825  * @param dev
826  *   Pointer to Ethernet device structure.
827  * @param[in] fc_conf
828  *   Flow control parameters.
829  *
830  * @return
831  *   0 on success, a negative errno value otherwise and rte_errno is set.
832  */
833 int
834 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
835 {
836         struct ifreq ifr;
837         struct ethtool_pauseparam ethpause = {
838                 .cmd = ETHTOOL_SPAUSEPARAM
839         };
840         int ret;
841
842         ifr.ifr_data = (void *)&ethpause;
843         ethpause.autoneg = fc_conf->autoneg;
844         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
845             (fc_conf->mode & RTE_FC_RX_PAUSE))
846                 ethpause.rx_pause = 1;
847         else
848                 ethpause.rx_pause = 0;
849
850         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
851             (fc_conf->mode & RTE_FC_TX_PAUSE))
852                 ethpause.tx_pause = 1;
853         else
854                 ethpause.tx_pause = 0;
855         ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
856         if (ret) {
857                 DRV_LOG(WARNING,
858                         "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
859                         " failed: %s",
860                         dev->data->port_id, strerror(rte_errno));
861                 return ret;
862         }
863         return 0;
864 }
865
866 /**
867  * Get PCI information from struct ibv_device.
868  *
869  * @param device
870  *   Pointer to Ethernet device structure.
871  * @param[out] pci_addr
872  *   PCI bus address output buffer.
873  *
874  * @return
875  *   0 on success, a negative errno value otherwise and rte_errno is set.
876  */
877 int
878 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
879                             struct rte_pci_addr *pci_addr)
880 {
881         FILE *file;
882         char line[32];
883         MKSTR(path, "%s/device/uevent", device->ibdev_path);
884
885         file = fopen(path, "rb");
886         if (file == NULL) {
887                 rte_errno = errno;
888                 return -rte_errno;
889         }
890         while (fgets(line, sizeof(line), file) == line) {
891                 size_t len = strlen(line);
892                 int ret;
893
894                 /* Truncate long lines. */
895                 if (len == (sizeof(line) - 1))
896                         while (line[(len - 1)] != '\n') {
897                                 ret = fgetc(file);
898                                 if (ret == EOF)
899                                         break;
900                                 line[(len - 1)] = ret;
901                         }
902                 /* Extract information. */
903                 if (sscanf(line,
904                            "PCI_SLOT_NAME="
905                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
906                            &pci_addr->domain,
907                            &pci_addr->bus,
908                            &pci_addr->devid,
909                            &pci_addr->function) == 4) {
910                         ret = 0;
911                         break;
912                 }
913         }
914         fclose(file);
915         return 0;
916 }
917
918 /**
919  * Device status handler.
920  *
921  * @param dev
922  *   Pointer to Ethernet device.
923  * @param events
924  *   Pointer to event flags holder.
925  *
926  * @return
927  *   Events bitmap of callback process which can be called immediately.
928  */
929 static uint32_t
930 mlx5_dev_status_handler(struct rte_eth_dev *dev)
931 {
932         struct priv *priv = dev->data->dev_private;
933         struct ibv_async_event event;
934         uint32_t ret = 0;
935
936         if (mlx5_link_update(dev, 0) == -EAGAIN) {
937                 usleep(0);
938                 return 0;
939         }
940         /* Read all message and acknowledge them. */
941         for (;;) {
942                 if (mlx5_glue->get_async_event(priv->ctx, &event))
943                         break;
944                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
945                         event.event_type == IBV_EVENT_PORT_ERR) &&
946                         (dev->data->dev_conf.intr_conf.lsc == 1))
947                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
948                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
949                         dev->data->dev_conf.intr_conf.rmv == 1)
950                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
951                 else
952                         DRV_LOG(DEBUG,
953                                 "port %u event type %d on not handled",
954                                 dev->data->port_id, event.event_type);
955                 mlx5_glue->ack_async_event(&event);
956         }
957         return ret;
958 }
959
960 /**
961  * Handle interrupts from the NIC.
962  *
963  * @param[in] intr_handle
964  *   Interrupt handler.
965  * @param cb_arg
966  *   Callback argument.
967  */
968 void
969 mlx5_dev_interrupt_handler(void *cb_arg)
970 {
971         struct rte_eth_dev *dev = cb_arg;
972         uint32_t events;
973
974         events = mlx5_dev_status_handler(dev);
975         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
976                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
977         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
978                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
979 }
980
/**
 * Handle interrupts from the socket.
 *
 * Thin adapter giving mlx5_socket_handle() the generic callback signature.
 *
 * @param cb_arg
 *   Callback argument (pointer to the Ethernet device).
 */
static void
mlx5_dev_handler_socket(void *cb_arg)
{
	mlx5_socket_handle((struct rte_eth_dev *)cb_arg);
}
994
995 /**
996  * Uninstall interrupt handler.
997  *
998  * @param dev
999  *   Pointer to Ethernet device.
1000  */
1001 void
1002 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
1003 {
1004         struct priv *priv = dev->data->dev_private;
1005
1006         if (dev->data->dev_conf.intr_conf.lsc ||
1007             dev->data->dev_conf.intr_conf.rmv)
1008                 rte_intr_callback_unregister(&priv->intr_handle,
1009                                              mlx5_dev_interrupt_handler, dev);
1010         if (priv->primary_socket)
1011                 rte_intr_callback_unregister(&priv->intr_handle_socket,
1012                                              mlx5_dev_handler_socket, dev);
1013         priv->intr_handle.fd = 0;
1014         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1015         priv->intr_handle_socket.fd = 0;
1016         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1017 }
1018
1019 /**
1020  * Install interrupt handler.
1021  *
1022  * @param dev
1023  *   Pointer to Ethernet device.
1024  */
1025 void
1026 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1027 {
1028         struct priv *priv = dev->data->dev_private;
1029         int ret;
1030         int flags;
1031
1032         assert(priv->ctx->async_fd > 0);
1033         flags = fcntl(priv->ctx->async_fd, F_GETFL);
1034         ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1035         if (ret) {
1036                 DRV_LOG(INFO,
1037                         "port %u failed to change file descriptor async event"
1038                         " queue",
1039                         dev->data->port_id);
1040                 dev->data->dev_conf.intr_conf.lsc = 0;
1041                 dev->data->dev_conf.intr_conf.rmv = 0;
1042         }
1043         if (dev->data->dev_conf.intr_conf.lsc ||
1044             dev->data->dev_conf.intr_conf.rmv) {
1045                 priv->intr_handle.fd = priv->ctx->async_fd;
1046                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1047                 rte_intr_callback_register(&priv->intr_handle,
1048                                            mlx5_dev_interrupt_handler, dev);
1049         }
1050         ret = mlx5_socket_init(dev);
1051         if (ret)
1052                 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1053                         dev->data->port_id, strerror(rte_errno));
1054         else if (priv->primary_socket) {
1055                 priv->intr_handle_socket.fd = priv->primary_socket;
1056                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1057                 rte_intr_callback_register(&priv->intr_handle_socket,
1058                                            mlx5_dev_handler_socket, dev);
1059         }
1060 }
1061
1062 /**
1063  * DPDK callback to bring the link DOWN.
1064  *
1065  * @param dev
1066  *   Pointer to Ethernet device structure.
1067  *
1068  * @return
1069  *   0 on success, a negative errno value otherwise and rte_errno is set.
1070  */
1071 int
1072 mlx5_set_link_down(struct rte_eth_dev *dev)
1073 {
1074         return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1075 }
1076
1077 /**
1078  * DPDK callback to bring the link UP.
1079  *
1080  * @param dev
1081  *   Pointer to Ethernet device structure.
1082  *
1083  * @return
1084  *   0 on success, a negative errno value otherwise and rte_errno is set.
1085  */
1086 int
1087 mlx5_set_link_up(struct rte_eth_dev *dev)
1088 {
1089         return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1090 }
1091
1092 /**
1093  * Configure the TX function to use.
1094  *
1095  * @param dev
1096  *   Pointer to private data structure.
1097  *
1098  * @return
1099  *   Pointer to selected Tx burst function.
1100  */
1101 eth_tx_burst_t
1102 mlx5_select_tx_function(struct rte_eth_dev *dev)
1103 {
1104         struct priv *priv = dev->data->dev_private;
1105         eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1106         struct mlx5_dev_config *config = &priv->config;
1107         uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
1108         int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
1109                                     DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1110                                     DEV_TX_OFFLOAD_GRE_TNL_TSO |
1111                                     DEV_TX_OFFLOAD_IP_TNL_TSO |
1112                                     DEV_TX_OFFLOAD_UDP_TNL_TSO));
1113         int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
1114                                     DEV_TX_OFFLOAD_UDP_TNL_TSO |
1115                                     DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
1116         int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
1117
1118         assert(priv != NULL);
1119         /* Select appropriate TX function. */
1120         if (vlan_insert || tso || swp)
1121                 return tx_pkt_burst;
1122         if (config->mps == MLX5_MPW_ENHANCED) {
1123                 if (mlx5_check_vec_tx_support(dev) > 0) {
1124                         if (mlx5_check_raw_vec_tx_support(dev) > 0)
1125                                 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1126                         else
1127                                 tx_pkt_burst = mlx5_tx_burst_vec;
1128                         DRV_LOG(DEBUG,
1129                                 "port %u selected enhanced MPW Tx vectorized"
1130                                 " function",
1131                                 dev->data->port_id);
1132                 } else {
1133                         tx_pkt_burst = mlx5_tx_burst_empw;
1134                         DRV_LOG(DEBUG,
1135                                 "port %u selected enhanced MPW Tx function",
1136                                 dev->data->port_id);
1137                 }
1138         } else if (config->mps && (config->txq_inline > 0)) {
1139                 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1140                 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1141                         dev->data->port_id);
1142         } else if (config->mps) {
1143                 tx_pkt_burst = mlx5_tx_burst_mpw;
1144                 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1145                         dev->data->port_id);
1146         }
1147         return tx_pkt_burst;
1148 }
1149
1150 /**
1151  * Configure the RX function to use.
1152  *
1153  * @param dev
1154  *   Pointer to private data structure.
1155  *
1156  * @return
1157  *   Pointer to selected Rx burst function.
1158  */
1159 eth_rx_burst_t
1160 mlx5_select_rx_function(struct rte_eth_dev *dev)
1161 {
1162         eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1163
1164         assert(dev != NULL);
1165         if (mlx5_check_vec_rx_support(dev) > 0) {
1166                 rx_pkt_burst = mlx5_rx_burst_vec;
1167                 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1168                         dev->data->port_id);
1169         }
1170         return rx_pkt_burst;
1171 }
1172
1173 /**
1174  * Check if mlx5 device was removed.
1175  *
1176  * @param dev
1177  *   Pointer to Ethernet device structure.
1178  *
1179  * @return
1180  *   1 when device is removed, otherwise 0.
1181  */
1182 int
1183 mlx5_is_removed(struct rte_eth_dev *dev)
1184 {
1185         struct ibv_device_attr device_attr;
1186         struct priv *priv = dev->data->dev_private;
1187
1188         if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)
1189                 return 1;
1190         return 0;
1191 }