net/mlx5: add Rx HW timestamp
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #define _GNU_SOURCE
35
36 #include <stddef.h>
37 #include <assert.h>
38 #include <unistd.h>
39 #include <stdint.h>
40 #include <stdio.h>
41 #include <string.h>
42 #include <stdlib.h>
43 #include <errno.h>
44 #include <dirent.h>
45 #include <net/if.h>
46 #include <sys/ioctl.h>
47 #include <sys/socket.h>
48 #include <sys/utsname.h>
49 #include <netinet/in.h>
50 #include <linux/ethtool.h>
51 #include <linux/sockios.h>
52 #include <linux/version.h>
53 #include <fcntl.h>
54 #include <stdalign.h>
55 #include <sys/un.h>
56
57 #include <rte_atomic.h>
58 #include <rte_ethdev.h>
59 #include <rte_mbuf.h>
60 #include <rte_common.h>
61 #include <rte_interrupts.h>
62 #include <rte_alarm.h>
63 #include <rte_malloc.h>
64
65 #include "mlx5.h"
66 #include "mlx5_rxtx.h"
67 #include "mlx5_utils.h"
68
/*
 * Add defines in case the running kernel is not the same as user headers.
 *
 * Everything below is a fallback: each group is only defined when the
 * headers used at build time do not already provide it.
 */
#ifndef ETHTOOL_GLINKSETTINGS
/* Argument structure of the ETHTOOL_GLINKSETTINGS request. */
struct ethtool_link_settings {
        uint32_t cmd; /* Set to ETHTOOL_GLINKSETTINGS by this file. */
        uint32_t speed; /* Copied into the reported link speed. */
        uint8_t duplex; /* Compared against DUPLEX_HALF by this file. */
        uint8_t port;
        uint8_t phy_address;
        uint8_t autoneg;
        uint8_t mdio_support;
        /*
         * NOTE(review): the kernel header appears to spell this field
         * eth_tp_mdix -- confirm. It is not referenced in this part of the
         * file.
         */
        uint8_t eth_to_mdix;
        uint8_t eth_tp_mdix_ctrl;
        /* Signed: this file negates the value returned by the first query. */
        int8_t link_mode_masks_nwords;
        uint32_t reserved[8];
        /* This file reserves 3 * link_mode_masks_nwords words for it. */
        uint32_t link_mode_masks[];
};

#define ETHTOOL_GLINKSETTINGS 0x0000004c
/*
 * The *_BIT constants are bit numbers within the link mode masks, not
 * masks themselves (several of them exceed 31).
 */
#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
#endif
/*
 * 25G/50G/100G link mode bit numbers.
 * NOTE(review): the HAVE_ETHTOOL_LINK_MODE_* macros are presumably set by
 * the build system when the headers already define these -- confirm.
 */
#ifndef HAVE_ETHTOOL_LINK_MODE_25G
#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_50G
#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
#endif
#ifndef HAVE_ETHTOOL_LINK_MODE_100G
#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
#endif
119
120 /**
121  * Return private structure associated with an Ethernet device.
122  *
123  * @param dev
124  *   Pointer to Ethernet device structure.
125  *
126  * @return
127  *   Pointer to private structure.
128  */
129 struct priv *
130 mlx5_get_priv(struct rte_eth_dev *dev)
131 {
132         return dev->data->dev_private;
133 }
134
135 /**
136  * Check if running as a secondary process.
137  *
138  * @return
139  *   Nonzero if running as a secondary process.
140  */
141 inline int
142 mlx5_is_secondary(void)
143 {
144         return rte_eal_process_type() == RTE_PROC_SECONDARY;
145 }
146
147 /**
148  * Get interface name from private structure.
149  *
150  * @param[in] priv
151  *   Pointer to private structure.
152  * @param[out] ifname
153  *   Interface name output buffer.
154  *
155  * @return
156  *   0 on success, -1 on failure and errno is set.
157  */
158 int
159 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
160 {
161         DIR *dir;
162         struct dirent *dent;
163         unsigned int dev_type = 0;
164         unsigned int dev_port_prev = ~0u;
165         char match[IF_NAMESIZE] = "";
166
167         {
168                 MKSTR(path, "%s/device/net", priv->ibdev_path);
169
170                 dir = opendir(path);
171                 if (dir == NULL)
172                         return -1;
173         }
174         while ((dent = readdir(dir)) != NULL) {
175                 char *name = dent->d_name;
176                 FILE *file;
177                 unsigned int dev_port;
178                 int r;
179
180                 if ((name[0] == '.') &&
181                     ((name[1] == '\0') ||
182                      ((name[1] == '.') && (name[2] == '\0'))))
183                         continue;
184
185                 MKSTR(path, "%s/device/net/%s/%s",
186                       priv->ibdev_path, name,
187                       (dev_type ? "dev_id" : "dev_port"));
188
189                 file = fopen(path, "rb");
190                 if (file == NULL) {
191                         if (errno != ENOENT)
192                                 continue;
193                         /*
194                          * Switch to dev_id when dev_port does not exist as
195                          * is the case with Linux kernel versions < 3.15.
196                          */
197 try_dev_id:
198                         match[0] = '\0';
199                         if (dev_type)
200                                 break;
201                         dev_type = 1;
202                         dev_port_prev = ~0u;
203                         rewinddir(dir);
204                         continue;
205                 }
206                 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
207                 fclose(file);
208                 if (r != 1)
209                         continue;
210                 /*
211                  * Switch to dev_id when dev_port returns the same value for
212                  * all ports. May happen when using a MOFED release older than
213                  * 3.0 with a Linux kernel >= 3.15.
214                  */
215                 if (dev_port == dev_port_prev)
216                         goto try_dev_id;
217                 dev_port_prev = dev_port;
218                 if (dev_port == (priv->port - 1u))
219                         snprintf(match, sizeof(match), "%s", name);
220         }
221         closedir(dir);
222         if (match[0] == '\0')
223                 return -1;
224         strncpy(*ifname, match, sizeof(*ifname));
225         return 0;
226 }
227
/**
 * Tell whether a counter must be read from the ib counters file.
 *
 * @param[in] cntr
 *   Counter name.
 *
 * @return
 *   1 when the counter name is "out_of_buffer", 0 for any other name.
 */
int
priv_is_ib_cntr(const char *cntr)
{
        static const char ib_cntr_name[] = "out_of_buffer";

        return strcmp(cntr, ib_cntr_name) == 0;
}
244
245 /**
246  * Read from sysfs entry.
247  *
248  * @param[in] priv
249  *   Pointer to private structure.
250  * @param[in] entry
251  *   Entry name relative to sysfs path.
252  * @param[out] buf
253  *   Data output buffer.
254  * @param size
255  *   Buffer size.
256  *
257  * @return
258  *   0 on success, -1 on failure and errno is set.
259  */
260 static int
261 priv_sysfs_read(const struct priv *priv, const char *entry,
262                 char *buf, size_t size)
263 {
264         char ifname[IF_NAMESIZE];
265         FILE *file;
266         int ret;
267         int err;
268
269         if (priv_get_ifname(priv, &ifname))
270                 return -1;
271
272         if (priv_is_ib_cntr(entry)) {
273                 MKSTR(path, "%s/ports/1/hw_counters/%s",
274                       priv->ibdev_path, entry);
275                 file = fopen(path, "rb");
276         } else {
277                 MKSTR(path, "%s/device/net/%s/%s",
278                       priv->ibdev_path, ifname, entry);
279                 file = fopen(path, "rb");
280         }
281         if (file == NULL)
282                 return -1;
283         ret = fread(buf, 1, size, file);
284         err = errno;
285         if (((size_t)ret < size) && (ferror(file)))
286                 ret = -1;
287         else
288                 ret = size;
289         fclose(file);
290         errno = err;
291         return ret;
292 }
293
294 /**
295  * Write to sysfs entry.
296  *
297  * @param[in] priv
298  *   Pointer to private structure.
299  * @param[in] entry
300  *   Entry name relative to sysfs path.
301  * @param[in] buf
302  *   Data buffer.
303  * @param size
304  *   Buffer size.
305  *
306  * @return
307  *   0 on success, -1 on failure and errno is set.
308  */
309 static int
310 priv_sysfs_write(const struct priv *priv, const char *entry,
311                  char *buf, size_t size)
312 {
313         char ifname[IF_NAMESIZE];
314         FILE *file;
315         int ret;
316         int err;
317
318         if (priv_get_ifname(priv, &ifname))
319                 return -1;
320
321         MKSTR(path, "%s/device/net/%s/%s", priv->ibdev_path, ifname, entry);
322
323         file = fopen(path, "wb");
324         if (file == NULL)
325                 return -1;
326         ret = fwrite(buf, 1, size, file);
327         err = errno;
328         if (((size_t)ret < size) || (ferror(file)))
329                 ret = -1;
330         else
331                 ret = size;
332         fclose(file);
333         errno = err;
334         return ret;
335 }
336
/**
 * Get unsigned long sysfs property.
 *
 * The entry is read as text and converted with strtoul() using base 0, so
 * decimal, octal ("0" prefix) and hexadecimal ("0x" prefix) are accepted.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param[out] value
 *   Value output buffer, only written on success.
 *
 * @return
 *   0 on success, -1 on failure and errno is set (EINVAL when the entry does
 *   not start with a number).
 */
static int
priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
{
        int ret;
        unsigned long value_ret;
        /*
         * Zero-filled so that the text is terminated right after the bytes
         * actually read, whatever length the read helper reports.
         */
        char value_str[32] = "";
        char *end;

        ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
        if (ret == -1) {
                DEBUG("cannot read %s value from sysfs: %s",
                      name, strerror(errno));
                return -1;
        }
        value_str[ret] = '\0';
        errno = 0;
        value_ret = strtoul(value_str, &end, 0);
        /* strtoul() returns 0 without setting errno when no digit is found. */
        if (!errno && (end == value_str))
                errno = EINVAL;
        if (errno) {
                DEBUG("invalid %s value `%s': %s", name, value_str,
                      strerror(errno));
                return -1;
        }
        *value = value_ret;
        return 0;
}
374
375 /**
376  * Set unsigned long sysfs property.
377  *
378  * @param priv
379  *   Pointer to private structure.
380  * @param[in] name
381  *   Entry name relative to sysfs path.
382  * @param value
383  *   Value to set.
384  *
385  * @return
386  *   0 on success, -1 on failure and errno is set.
387  */
388 static int
389 priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
390 {
391         int ret;
392         MKSTR(value_str, "%lu", value);
393
394         ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
395         if (ret == -1) {
396                 DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
397                       name, value_str, value, strerror(errno));
398                 return -1;
399         }
400         return 0;
401 }
402
403 /**
404  * Perform ifreq ioctl() on associated Ethernet device.
405  *
406  * @param[in] priv
407  *   Pointer to private structure.
408  * @param req
409  *   Request number to pass to ioctl().
410  * @param[out] ifr
411  *   Interface request structure output buffer.
412  *
413  * @return
414  *   0 on success, -1 on failure and errno is set.
415  */
416 int
417 priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
418 {
419         int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
420         int ret = -1;
421
422         if (sock == -1)
423                 return ret;
424         if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
425                 ret = ioctl(sock, req, ifr);
426         close(sock);
427         return ret;
428 }
429
/**
 * Query how many VFs are currently active on the device.
 *
 * Candidate sysfs entries are tried in order until one can be read.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] num_vfs
 *   Number of active VFs, only written on success.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_num_vfs(struct priv *priv, uint16_t *num_vfs)
{
        /* The sysfs entry name depends on the operating system. */
        const char *candidates[] = {
                "device/sriov_numvfs",
                "device/mlx5_num_vfs",
                NULL,
        };
        const char **entry;
        int ret = -1;

        for (entry = candidates; *entry != NULL; ++entry) {
                unsigned long count;

                ret = priv_get_sysfs_ulong(priv, *entry, &count);
                if (ret)
                        continue;
                *num_vfs = count;
                break;
        }
        return ret;
}
461
/**
 * Read the device MTU from its "mtu" sysfs entry.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] mtu
 *   MTU value output buffer, only written on success.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_mtu(struct priv *priv, uint16_t *mtu)
{
        unsigned long raw;
        const int failed = (priv_get_sysfs_ulong(priv, "mtu", &raw) == -1);

        if (!failed)
                *mtu = raw;
        return failed ? -1 : 0;
}
483
/**
 * Fetch a device counter through sysfs.
 *
 * @param priv
 *   Pointer to private structure.
 * @param name
 *   Counter name.
 * @param[out] cntr
 *   Counter output buffer, only written on success.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_get_cntr_sysfs(struct priv *priv, const char *name, uint64_t *cntr)
{
        unsigned long raw;

        if (priv_get_sysfs_ulong(priv, name, &raw) != -1) {
                *cntr = raw;
                return 0;
        }
        return -1;
}
507
/**
 * Change the device MTU and verify that it was applied.
 *
 * The MTU is written through sysfs, then read back; a mismatch between the
 * requested and the reported value is treated as a failure.
 *
 * @param priv
 *   Pointer to private structure.
 * @param mtu
 *   MTU value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set (EINVAL when the value read
 *   back differs from the requested one).
 */
static int
priv_set_mtu(struct priv *priv, uint16_t mtu)
{
        uint16_t readback;

        if (priv_set_sysfs_ulong(priv, "mtu", mtu))
                return -1;
        if (priv_get_mtu(priv, &readback))
                return -1;
        if (readback != mtu) {
                errno = EINVAL;
                return -1;
        }
        return 0;
}
532
/**
 * Update device flags.
 *
 * The current value of the "flags" sysfs entry is read, bits selected by
 * keep are preserved, every other bit is taken from flags, and the result
 * is written back.
 *
 * @param priv
 *   Pointer to private structure.
 * @param keep
 *   Bitmask for flags that must remain untouched.
 * @param flags
 *   Bitmask for flags to modify.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
int
priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
        unsigned long cur_flags;
        unsigned long new_flags;

        if (priv_get_sysfs_ulong(priv, "flags", &cur_flags) == -1)
                return -1;
        new_flags = (cur_flags & keep) | (flags & (~keep));
        return priv_set_sysfs_ulong(priv, "flags", new_flags);
}
557
558 /**
559  * Ethernet device configuration.
560  *
561  * Prepare the driver for a given number of TX and RX queues.
562  *
563  * @param dev
564  *   Pointer to Ethernet device structure.
565  *
566  * @return
567  *   0 on success, errno value on failure.
568  */
569 static int
570 dev_configure(struct rte_eth_dev *dev)
571 {
572         struct priv *priv = dev->data->dev_private;
573         unsigned int rxqs_n = dev->data->nb_rx_queues;
574         unsigned int txqs_n = dev->data->nb_tx_queues;
575         unsigned int i;
576         unsigned int j;
577         unsigned int reta_idx_n;
578         const uint8_t use_app_rss_key =
579                 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
580
581         if (use_app_rss_key &&
582             (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
583              rss_hash_default_key_len)) {
584                 /* MLX5 RSS only support 40bytes key. */
585                 return EINVAL;
586         }
587         priv->rss_conf.rss_key =
588                 rte_realloc(priv->rss_conf.rss_key,
589                             rss_hash_default_key_len, 0);
590         if (!priv->rss_conf.rss_key) {
591                 ERROR("cannot allocate RSS hash key memory (%u)", rxqs_n);
592                 return ENOMEM;
593         }
594         memcpy(priv->rss_conf.rss_key,
595                use_app_rss_key ?
596                dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
597                rss_hash_default_key,
598                rss_hash_default_key_len);
599         priv->rss_conf.rss_key_len = rss_hash_default_key_len;
600         priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
601         priv->rxqs = (void *)dev->data->rx_queues;
602         priv->txqs = (void *)dev->data->tx_queues;
603         if (txqs_n != priv->txqs_n) {
604                 INFO("%p: TX queues number update: %u -> %u",
605                      (void *)dev, priv->txqs_n, txqs_n);
606                 priv->txqs_n = txqs_n;
607         }
608         if (rxqs_n > priv->ind_table_max_size) {
609                 ERROR("cannot handle this many RX queues (%u)", rxqs_n);
610                 return EINVAL;
611         }
612         if (rxqs_n == priv->rxqs_n)
613                 return 0;
614         INFO("%p: RX queues number update: %u -> %u",
615              (void *)dev, priv->rxqs_n, rxqs_n);
616         priv->rxqs_n = rxqs_n;
617         /* If the requested number of RX queues is not a power of two, use the
618          * maximum indirection table size for better balancing.
619          * The result is always rounded to the next power of two. */
620         reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
621                                      priv->ind_table_max_size :
622                                      rxqs_n));
623         if (priv_rss_reta_index_resize(priv, reta_idx_n))
624                 return ENOMEM;
625         /* When the number of RX queues is not a power of two, the remaining
626          * table entries are padded with reused WQs and hashes are not spread
627          * uniformly. */
628         for (i = 0, j = 0; (i != reta_idx_n); ++i) {
629                 (*priv->reta_idx)[i] = j;
630                 if (++j == rxqs_n)
631                         j = 0;
632         }
633         return 0;
634 }
635
636 /**
637  * DPDK callback for Ethernet device configuration.
638  *
639  * @param dev
640  *   Pointer to Ethernet device structure.
641  *
642  * @return
643  *   0 on success, negative errno value on failure.
644  */
645 int
646 mlx5_dev_configure(struct rte_eth_dev *dev)
647 {
648         struct priv *priv = dev->data->dev_private;
649         int ret;
650
651         if (mlx5_is_secondary())
652                 return -E_RTE_SECONDARY;
653
654         priv_lock(priv);
655         ret = dev_configure(dev);
656         assert(ret >= 0);
657         priv_unlock(priv);
658         return -ret;
659 }
660
661 /**
662  * DPDK callback to get information about the device.
663  *
664  * @param dev
665  *   Pointer to Ethernet device structure.
666  * @param[out] info
667  *   Info structure output buffer.
668  */
669 void
670 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
671 {
672         struct priv *priv = mlx5_get_priv(dev);
673         unsigned int max;
674         char ifname[IF_NAMESIZE];
675
676         info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
677
678         priv_lock(priv);
679         /* FIXME: we should ask the device for these values. */
680         info->min_rx_bufsize = 32;
681         info->max_rx_pktlen = 65536;
682         /*
683          * Since we need one CQ per QP, the limit is the minimum number
684          * between the two values.
685          */
686         max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
687                       priv->device_attr.orig_attr.max_qp);
688         /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
689         if (max >= 65535)
690                 max = 65535;
691         info->max_rx_queues = max;
692         info->max_tx_queues = max;
693         info->max_mac_addrs = RTE_DIM(priv->mac);
694         info->rx_offload_capa =
695                 (priv->hw_csum ?
696                  (DEV_RX_OFFLOAD_IPV4_CKSUM |
697                   DEV_RX_OFFLOAD_UDP_CKSUM |
698                   DEV_RX_OFFLOAD_TCP_CKSUM) :
699                  0) |
700                 (priv->hw_vlan_strip ? DEV_RX_OFFLOAD_VLAN_STRIP : 0) |
701                 DEV_RX_OFFLOAD_TIMESTAMP;
702
703         if (!priv->mps)
704                 info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT;
705         if (priv->hw_csum)
706                 info->tx_offload_capa |=
707                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
708                          DEV_TX_OFFLOAD_UDP_CKSUM |
709                          DEV_TX_OFFLOAD_TCP_CKSUM);
710         if (priv->tso)
711                 info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
712         if (priv->tunnel_en)
713                 info->tx_offload_capa |= (DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
714                                           DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
715                                           DEV_TX_OFFLOAD_GRE_TNL_TSO);
716         if (priv_get_ifname(priv, &ifname) == 0)
717                 info->if_index = if_nametoindex(ifname);
718         info->reta_size = priv->reta_idx_n ?
719                 priv->reta_idx_n : priv->ind_table_max_size;
720         info->hash_key_size = priv->rss_conf.rss_key_len;
721         info->speed_capa = priv->link_speed_capa;
722         priv_unlock(priv);
723 }
724
725 const uint32_t *
726 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
727 {
728         static const uint32_t ptypes[] = {
729                 /* refers to rxq_cq_to_pkt_type() */
730                 RTE_PTYPE_L2_ETHER,
731                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
732                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
733                 RTE_PTYPE_L4_NONFRAG,
734                 RTE_PTYPE_L4_FRAG,
735                 RTE_PTYPE_L4_TCP,
736                 RTE_PTYPE_L4_UDP,
737                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
738                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
739                 RTE_PTYPE_INNER_L4_NONFRAG,
740                 RTE_PTYPE_INNER_L4_FRAG,
741                 RTE_PTYPE_INNER_L4_TCP,
742                 RTE_PTYPE_INNER_L4_UDP,
743                 RTE_PTYPE_UNKNOWN
744         };
745
746         if (dev->rx_pkt_burst == mlx5_rx_burst ||
747             dev->rx_pkt_burst == mlx5_rx_burst_vec)
748                 return ptypes;
749         return NULL;
750 }
751
752 /**
753  * DPDK callback to retrieve physical link information.
754  *
755  * @param dev
756  *   Pointer to Ethernet device structure.
757  * @param wait_to_complete
758  *   Wait for request completion (ignored).
759  */
760 static int
761 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev, int wait_to_complete)
762 {
763         struct priv *priv = mlx5_get_priv(dev);
764         struct ethtool_cmd edata = {
765                 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
766         };
767         struct ifreq ifr;
768         struct rte_eth_link dev_link;
769         int link_speed = 0;
770
771         /* priv_lock() is not taken to allow concurrent calls. */
772
773         (void)wait_to_complete;
774         if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
775                 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
776                 return -1;
777         }
778         memset(&dev_link, 0, sizeof(dev_link));
779         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
780                                 (ifr.ifr_flags & IFF_RUNNING));
781         ifr.ifr_data = (void *)&edata;
782         if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
783                 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
784                      strerror(errno));
785                 return -1;
786         }
787         link_speed = ethtool_cmd_speed(&edata);
788         if (link_speed == -1)
789                 dev_link.link_speed = 0;
790         else
791                 dev_link.link_speed = link_speed;
792         priv->link_speed_capa = 0;
793         if (edata.supported & SUPPORTED_Autoneg)
794                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
795         if (edata.supported & (SUPPORTED_1000baseT_Full |
796                                SUPPORTED_1000baseKX_Full))
797                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
798         if (edata.supported & SUPPORTED_10000baseKR_Full)
799                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
800         if (edata.supported & (SUPPORTED_40000baseKR4_Full |
801                                SUPPORTED_40000baseCR4_Full |
802                                SUPPORTED_40000baseSR4_Full |
803                                SUPPORTED_40000baseLR4_Full))
804                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
805         dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
806                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
807         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
808                         ETH_LINK_SPEED_FIXED);
809         if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
810                 /* Link status changed. */
811                 dev->data->dev_link = dev_link;
812                 return 0;
813         }
814         /* Link status is still the same. */
815         return -1;
816 }
817
818 /**
819  * Retrieve physical link information (unlocked version using new ioctl).
820  *
821  * @param dev
822  *   Pointer to Ethernet device structure.
823  * @param wait_to_complete
824  *   Wait for request completion (ignored).
825  */
826 static int
827 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev, int wait_to_complete)
828 {
829         struct priv *priv = mlx5_get_priv(dev);
830         struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
831         struct ifreq ifr;
832         struct rte_eth_link dev_link;
833         uint64_t sc;
834
835         (void)wait_to_complete;
836         if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
837                 WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
838                 return -1;
839         }
840         memset(&dev_link, 0, sizeof(dev_link));
841         dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
842                                 (ifr.ifr_flags & IFF_RUNNING));
843         ifr.ifr_data = (void *)&gcmd;
844         if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
845                 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
846                       strerror(errno));
847                 return -1;
848         }
849         gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
850
851         alignas(struct ethtool_link_settings)
852         uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
853                      sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
854         struct ethtool_link_settings *ecmd = (void *)data;
855
856         *ecmd = gcmd;
857         ifr.ifr_data = (void *)ecmd;
858         if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
859                 DEBUG("ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS) failed: %s",
860                       strerror(errno));
861                 return -1;
862         }
863         dev_link.link_speed = ecmd->speed;
864         sc = ecmd->link_mode_masks[0] |
865                 ((uint64_t)ecmd->link_mode_masks[1] << 32);
866         priv->link_speed_capa = 0;
867         if (sc & ETHTOOL_LINK_MODE_Autoneg_BIT)
868                 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
869         if (sc & (ETHTOOL_LINK_MODE_1000baseT_Full_BIT |
870                   ETHTOOL_LINK_MODE_1000baseKX_Full_BIT))
871                 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
872         if (sc & (ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT |
873                   ETHTOOL_LINK_MODE_10000baseKR_Full_BIT |
874                   ETHTOOL_LINK_MODE_10000baseR_FEC_BIT))
875                 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
876         if (sc & (ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT |
877                   ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT))
878                 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
879         if (sc & (ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT |
880                   ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT |
881                   ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT |
882                   ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT))
883                 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
884         if (sc & (ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT |
885                   ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT |
886                   ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT |
887                   ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT))
888                 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
889         if (sc & (ETHTOOL_LINK_MODE_25000baseCR_Full_BIT |
890                   ETHTOOL_LINK_MODE_25000baseKR_Full_BIT |
891                   ETHTOOL_LINK_MODE_25000baseSR_Full_BIT))
892                 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
893         if (sc & (ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT |
894                   ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT))
895                 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
896         if (sc & (ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT |
897                   ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT |
898                   ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT |
899                   ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT))
900                 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
901         dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
902                                 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
903         dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
904                                   ETH_LINK_SPEED_FIXED);
905         if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
906                 /* Link status changed. */
907                 dev->data->dev_link = dev_link;
908                 return 0;
909         }
910         /* Link status is still the same. */
911         return -1;
912 }
913
/**
 * DPDK callback to retrieve physical link information.
 *
 * The running kernel release is queried on every call. When it cannot be
 * obtained or parsed as "major.minor.patch", or when it is older than 4.9.0,
 * the request is served by mlx5_link_update_unlocked_gset(); otherwise it is
 * served by mlx5_link_update_unlocked_gs().
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (forwarded unchanged to the selected helper).
 *
 * @return
 *   Value returned by the selected helper.
 */
int
mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct utsname sysinfo;
	int major;
	int minor;
	int patch;
	int use_gs = 0;

	/* Only a successfully parsed release >= 4.9.0 selects the "gs" path. */
	if (uname(&sysinfo) != -1 &&
	    sscanf(sysinfo.release, "%d.%d.%d", &major, &minor, &patch) == 3 &&
	    KERNEL_VERSION(major, minor, patch) >= KERNEL_VERSION(4, 9, 0))
		use_gs = 1;
	if (use_gs)
		return mlx5_link_update_unlocked_gs(dev, wait_to_complete);
	return mlx5_link_update_unlocked_gset(dev, wait_to_complete);
}
935
936 /**
937  * DPDK callback to change the MTU.
938  *
939  * @param dev
940  *   Pointer to Ethernet device structure.
941  * @param in_mtu
942  *   New MTU.
943  *
944  * @return
945  *   0 on success, negative errno value on failure.
946  */
947 int
948 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
949 {
950         struct priv *priv = dev->data->dev_private;
951         uint16_t kern_mtu;
952         int ret = 0;
953
954         if (mlx5_is_secondary())
955                 return -E_RTE_SECONDARY;
956
957         priv_lock(priv);
958         ret = priv_get_mtu(priv, &kern_mtu);
959         if (ret)
960                 goto out;
961         /* Set kernel interface MTU first. */
962         ret = priv_set_mtu(priv, mtu);
963         if (ret)
964                 goto out;
965         ret = priv_get_mtu(priv, &kern_mtu);
966         if (ret)
967                 goto out;
968         if (kern_mtu == mtu) {
969                 priv->mtu = mtu;
970                 DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
971         }
972         priv_unlock(priv);
973         return 0;
974 out:
975         ret = errno;
976         WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
977              strerror(ret));
978         priv_unlock(priv);
979         assert(ret >= 0);
980         return -ret;
981 }
982
983 /**
984  * DPDK callback to get flow control status.
985  *
986  * @param dev
987  *   Pointer to Ethernet device structure.
988  * @param[out] fc_conf
989  *   Flow control output buffer.
990  *
991  * @return
992  *   0 on success, negative errno value on failure.
993  */
994 int
995 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
996 {
997         struct priv *priv = dev->data->dev_private;
998         struct ifreq ifr;
999         struct ethtool_pauseparam ethpause = {
1000                 .cmd = ETHTOOL_GPAUSEPARAM
1001         };
1002         int ret;
1003
1004         if (mlx5_is_secondary())
1005                 return -E_RTE_SECONDARY;
1006
1007         ifr.ifr_data = (void *)&ethpause;
1008         priv_lock(priv);
1009         if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
1010                 ret = errno;
1011                 WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
1012                      " failed: %s",
1013                      strerror(ret));
1014                 goto out;
1015         }
1016
1017         fc_conf->autoneg = ethpause.autoneg;
1018         if (ethpause.rx_pause && ethpause.tx_pause)
1019                 fc_conf->mode = RTE_FC_FULL;
1020         else if (ethpause.rx_pause)
1021                 fc_conf->mode = RTE_FC_RX_PAUSE;
1022         else if (ethpause.tx_pause)
1023                 fc_conf->mode = RTE_FC_TX_PAUSE;
1024         else
1025                 fc_conf->mode = RTE_FC_NONE;
1026         ret = 0;
1027
1028 out:
1029         priv_unlock(priv);
1030         assert(ret >= 0);
1031         return -ret;
1032 }
1033
1034 /**
1035  * DPDK callback to modify flow control parameters.
1036  *
1037  * @param dev
1038  *   Pointer to Ethernet device structure.
1039  * @param[in] fc_conf
1040  *   Flow control parameters.
1041  *
1042  * @return
1043  *   0 on success, negative errno value on failure.
1044  */
1045 int
1046 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
1047 {
1048         struct priv *priv = dev->data->dev_private;
1049         struct ifreq ifr;
1050         struct ethtool_pauseparam ethpause = {
1051                 .cmd = ETHTOOL_SPAUSEPARAM
1052         };
1053         int ret;
1054
1055         if (mlx5_is_secondary())
1056                 return -E_RTE_SECONDARY;
1057
1058         ifr.ifr_data = (void *)&ethpause;
1059         ethpause.autoneg = fc_conf->autoneg;
1060         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1061             (fc_conf->mode & RTE_FC_RX_PAUSE))
1062                 ethpause.rx_pause = 1;
1063         else
1064                 ethpause.rx_pause = 0;
1065
1066         if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
1067             (fc_conf->mode & RTE_FC_TX_PAUSE))
1068                 ethpause.tx_pause = 1;
1069         else
1070                 ethpause.tx_pause = 0;
1071
1072         priv_lock(priv);
1073         if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
1074                 ret = errno;
1075                 WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
1076                      " failed: %s",
1077                      strerror(ret));
1078                 goto out;
1079         }
1080         ret = 0;
1081
1082 out:
1083         priv_unlock(priv);
1084         assert(ret >= 0);
1085         return -ret;
1086 }
1087
1088 /**
1089  * Get PCI information from struct ibv_device.
1090  *
1091  * @param device
1092  *   Pointer to Ethernet device structure.
1093  * @param[out] pci_addr
1094  *   PCI bus address output buffer.
1095  *
1096  * @return
1097  *   0 on success, -1 on failure and errno is set.
1098  */
1099 int
1100 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
1101                             struct rte_pci_addr *pci_addr)
1102 {
1103         FILE *file;
1104         char line[32];
1105         MKSTR(path, "%s/device/uevent", device->ibdev_path);
1106
1107         file = fopen(path, "rb");
1108         if (file == NULL)
1109                 return -1;
1110         while (fgets(line, sizeof(line), file) == line) {
1111                 size_t len = strlen(line);
1112                 int ret;
1113
1114                 /* Truncate long lines. */
1115                 if (len == (sizeof(line) - 1))
1116                         while (line[(len - 1)] != '\n') {
1117                                 ret = fgetc(file);
1118                                 if (ret == EOF)
1119                                         break;
1120                                 line[(len - 1)] = ret;
1121                         }
1122                 /* Extract information. */
1123                 if (sscanf(line,
1124                            "PCI_SLOT_NAME="
1125                            "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1126                            &pci_addr->domain,
1127                            &pci_addr->bus,
1128                            &pci_addr->devid,
1129                            &pci_addr->function) == 4) {
1130                         ret = 0;
1131                         break;
1132                 }
1133         }
1134         fclose(file);
1135         return 0;
1136 }
1137
1138 /**
1139  * Update the link status.
1140  *
1141  * @param priv
1142  *   Pointer to private structure.
1143  *
1144  * @return
1145  *   Zero if the callback process can be called immediately.
1146  */
1147 static int
1148 priv_link_status_update(struct priv *priv)
1149 {
1150         struct rte_eth_link *link = &priv->dev->data->dev_link;
1151
1152         mlx5_link_update(priv->dev, 0);
1153         if (((link->link_speed == 0) && link->link_status) ||
1154                 ((link->link_speed != 0) && !link->link_status)) {
1155                 /*
1156                  * Inconsistent status. Event likely occurred before the
1157                  * kernel netdevice exposes the new status.
1158                  */
1159                 if (!priv->pending_alarm) {
1160                         priv->pending_alarm = 1;
1161                         rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
1162                                           mlx5_dev_link_status_handler,
1163                                           priv->dev);
1164                 }
1165                 return 1;
1166         } else if (unlikely(priv->pending_alarm)) {
1167                 /* Link interrupt occurred while alarm is already scheduled. */
1168                 priv->pending_alarm = 0;
1169                 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
1170         }
1171         return 0;
1172 }
1173
1174 /**
1175  * Device status handler.
1176  *
1177  * @param priv
1178  *   Pointer to private structure.
1179  * @param events
1180  *   Pointer to event flags holder.
1181  *
1182  * @return
1183  *   Events bitmap of callback process which can be called immediately.
1184  */
1185 static uint32_t
1186 priv_dev_status_handler(struct priv *priv)
1187 {
1188         struct ibv_async_event event;
1189         uint32_t ret = 0;
1190
1191         /* Read all message and acknowledge them. */
1192         for (;;) {
1193                 if (ibv_get_async_event(priv->ctx, &event))
1194                         break;
1195                 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1196                         event.event_type == IBV_EVENT_PORT_ERR) &&
1197                         (priv->dev->data->dev_conf.intr_conf.lsc == 1))
1198                         ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
1199                 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
1200                         priv->dev->data->dev_conf.intr_conf.rmv == 1)
1201                         ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
1202                 else
1203                         DEBUG("event type %d on port %d not handled",
1204                               event.event_type, event.element.port_num);
1205                 ibv_ack_async_event(&event);
1206         }
1207         if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
1208                 if (priv_link_status_update(priv))
1209                         ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
1210         return ret;
1211 }
1212
1213 /**
1214  * Handle delayed link status event.
1215  *
1216  * @param arg
1217  *   Registered argument.
1218  */
1219 void
1220 mlx5_dev_link_status_handler(void *arg)
1221 {
1222         struct rte_eth_dev *dev = arg;
1223         struct priv *priv = dev->data->dev_private;
1224         int ret;
1225
1226         priv_lock(priv);
1227         assert(priv->pending_alarm == 1);
1228         priv->pending_alarm = 0;
1229         ret = priv_link_status_update(priv);
1230         priv_unlock(priv);
1231         if (!ret)
1232                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
1233                                               NULL);
1234 }
1235
1236 /**
1237  * Handle interrupts from the NIC.
1238  *
1239  * @param[in] intr_handle
1240  *   Interrupt handler.
1241  * @param cb_arg
1242  *   Callback argument.
1243  */
1244 void
1245 mlx5_dev_interrupt_handler(void *cb_arg)
1246 {
1247         struct rte_eth_dev *dev = cb_arg;
1248         struct priv *priv = dev->data->dev_private;
1249         uint32_t events;
1250
1251         priv_lock(priv);
1252         events = priv_dev_status_handler(priv);
1253         priv_unlock(priv);
1254         if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
1255                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
1256                                               NULL);
1257         if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
1258                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
1259                                               NULL);
1260 }
1261
1262 /**
1263  * Handle interrupts from the socket.
1264  *
1265  * @param cb_arg
1266  *   Callback argument.
1267  */
1268 static void
1269 mlx5_dev_handler_socket(void *cb_arg)
1270 {
1271         struct rte_eth_dev *dev = cb_arg;
1272         struct priv *priv = dev->data->dev_private;
1273
1274         priv_lock(priv);
1275         priv_socket_handle(priv);
1276         priv_unlock(priv);
1277 }
1278
1279 /**
1280  * Uninstall interrupt handler.
1281  *
1282  * @param priv
1283  *   Pointer to private structure.
1284  * @param dev
1285  *   Pointer to the rte_eth_dev structure.
1286  */
1287 void
1288 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
1289 {
1290         if (dev->data->dev_conf.intr_conf.lsc ||
1291             dev->data->dev_conf.intr_conf.rmv)
1292                 rte_intr_callback_unregister(&priv->intr_handle,
1293                                              mlx5_dev_interrupt_handler, dev);
1294         if (priv->primary_socket)
1295                 rte_intr_callback_unregister(&priv->intr_handle_socket,
1296                                              mlx5_dev_handler_socket, dev);
1297         if (priv->pending_alarm)
1298                 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
1299         priv->pending_alarm = 0;
1300         priv->intr_handle.fd = 0;
1301         priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1302         priv->intr_handle_socket.fd = 0;
1303         priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1304 }
1305
1306 /**
1307  * Install interrupt handler.
1308  *
1309  * @param priv
1310  *   Pointer to private structure.
1311  * @param dev
1312  *   Pointer to the rte_eth_dev structure.
1313  */
1314 void
1315 priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
1316 {
1317         int rc, flags;
1318
1319         assert(!mlx5_is_secondary());
1320         assert(priv->ctx->async_fd > 0);
1321         flags = fcntl(priv->ctx->async_fd, F_GETFL);
1322         rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
1323         if (rc < 0) {
1324                 INFO("failed to change file descriptor async event queue");
1325                 dev->data->dev_conf.intr_conf.lsc = 0;
1326                 dev->data->dev_conf.intr_conf.rmv = 0;
1327         }
1328         if (dev->data->dev_conf.intr_conf.lsc ||
1329             dev->data->dev_conf.intr_conf.rmv) {
1330                 priv->intr_handle.fd = priv->ctx->async_fd;
1331                 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1332                 rte_intr_callback_register(&priv->intr_handle,
1333                                            mlx5_dev_interrupt_handler, dev);
1334         }
1335
1336         rc = priv_socket_init(priv);
1337         if (!rc && priv->primary_socket) {
1338                 priv->intr_handle_socket.fd = priv->primary_socket;
1339                 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1340                 rte_intr_callback_register(&priv->intr_handle_socket,
1341                                            mlx5_dev_handler_socket, dev);
1342         }
1343 }
1344
1345 /**
1346  * Change the link state (UP / DOWN).
1347  *
1348  * @param priv
1349  *   Pointer to private data structure.
1350  * @param dev
1351  *   Pointer to rte_eth_dev structure.
1352  * @param up
1353  *   Nonzero for link up, otherwise link down.
1354  *
1355  * @return
1356  *   0 on success, errno value on failure.
1357  */
1358 static int
1359 priv_dev_set_link(struct priv *priv, struct rte_eth_dev *dev, int up)
1360 {
1361         int err;
1362
1363         if (up) {
1364                 err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
1365                 if (err)
1366                         return err;
1367                 priv_dev_select_tx_function(priv, dev);
1368                 priv_dev_select_rx_function(priv, dev);
1369         } else {
1370                 err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
1371                 if (err)
1372                         return err;
1373                 dev->rx_pkt_burst = removed_rx_burst;
1374                 dev->tx_pkt_burst = removed_tx_burst;
1375         }
1376         return 0;
1377 }
1378
1379 /**
1380  * DPDK callback to bring the link DOWN.
1381  *
1382  * @param dev
1383  *   Pointer to Ethernet device structure.
1384  *
1385  * @return
1386  *   0 on success, errno value on failure.
1387  */
1388 int
1389 mlx5_set_link_down(struct rte_eth_dev *dev)
1390 {
1391         struct priv *priv = dev->data->dev_private;
1392         int err;
1393
1394         priv_lock(priv);
1395         err = priv_dev_set_link(priv, dev, 0);
1396         priv_unlock(priv);
1397         return err;
1398 }
1399
1400 /**
1401  * DPDK callback to bring the link UP.
1402  *
1403  * @param dev
1404  *   Pointer to Ethernet device structure.
1405  *
1406  * @return
1407  *   0 on success, errno value on failure.
1408  */
1409 int
1410 mlx5_set_link_up(struct rte_eth_dev *dev)
1411 {
1412         struct priv *priv = dev->data->dev_private;
1413         int err;
1414
1415         priv_lock(priv);
1416         err = priv_dev_set_link(priv, dev, 1);
1417         priv_unlock(priv);
1418         return err;
1419 }
1420
1421 /**
1422  * Configure the TX function to use.
1423  *
1424  * @param priv
1425  *   Pointer to private data structure.
1426  * @param dev
1427  *   Pointer to rte_eth_dev structure.
1428  */
1429 void
1430 priv_dev_select_tx_function(struct priv *priv, struct rte_eth_dev *dev)
1431 {
1432         assert(priv != NULL);
1433         assert(dev != NULL);
1434         dev->tx_pkt_burst = mlx5_tx_burst;
1435         /* Select appropriate TX function. */
1436         if (priv->mps == MLX5_MPW_ENHANCED) {
1437                 if (priv_check_vec_tx_support(priv) > 0) {
1438                         if (priv_check_raw_vec_tx_support(priv) > 0)
1439                                 dev->tx_pkt_burst = mlx5_tx_burst_raw_vec;
1440                         else
1441                                 dev->tx_pkt_burst = mlx5_tx_burst_vec;
1442                         DEBUG("selected Enhanced MPW TX vectorized function");
1443                 } else {
1444                         dev->tx_pkt_burst = mlx5_tx_burst_empw;
1445                         DEBUG("selected Enhanced MPW TX function");
1446                 }
1447         } else if (priv->mps && priv->txq_inline) {
1448                 dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1449                 DEBUG("selected MPW inline TX function");
1450         } else if (priv->mps) {
1451                 dev->tx_pkt_burst = mlx5_tx_burst_mpw;
1452                 DEBUG("selected MPW TX function");
1453         }
1454 }
1455
1456 /**
1457  * Configure the RX function to use.
1458  *
1459  * @param priv
1460  *   Pointer to private data structure.
1461  * @param dev
1462  *   Pointer to rte_eth_dev structure.
1463  */
1464 void
1465 priv_dev_select_rx_function(struct priv *priv, struct rte_eth_dev *dev)
1466 {
1467         assert(priv != NULL);
1468         assert(dev != NULL);
1469         if (priv_check_vec_rx_support(priv) > 0) {
1470                 dev->rx_pkt_burst = mlx5_rx_burst_vec;
1471                 DEBUG("selected RX vectorized function");
1472         } else {
1473                 dev->rx_pkt_burst = mlx5_rx_burst;
1474         }
1475 }