1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2015 6WIND S.A.
3 * Copyright 2015 Mellanox.
19 #include <sys/ioctl.h>
20 #include <sys/socket.h>
21 #include <sys/utsname.h>
22 #include <netinet/in.h>
23 #include <linux/ethtool.h>
24 #include <linux/sockios.h>
25 #include <linux/version.h>
30 #include <rte_atomic.h>
31 #include <rte_ethdev_driver.h>
32 #include <rte_bus_pci.h>
34 #include <rte_common.h>
35 #include <rte_interrupts.h>
36 #include <rte_alarm.h>
37 #include <rte_malloc.h>
40 #include "mlx5_glue.h"
41 #include "mlx5_rxtx.h"
42 #include "mlx5_utils.h"
44 /* Add defines in case the running kernel is not the same as user headers. */
45 #ifndef ETHTOOL_GLINKSETTINGS
/* NOTE(review): this excerpt is line-sampled; the struct body below is missing
 * several fields and its closing brace -- confirm against <linux/ethtool.h>. */
46 struct ethtool_link_settings {
55 uint8_t eth_tp_mdix_ctrl;
/* Negative on the first GLINKSETTINGS handshake; see its use below. */
56 int8_t link_mode_masks_nwords;
58 uint32_t link_mode_masks[];
61 #define ETHTOOL_GLINKSETTINGS 0x0000004c
62 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
63 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
64 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
65 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
66 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
67 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
68 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
69 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
70 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
71 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
72 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
73 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
74 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
75 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
76 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
77 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
/* Fallbacks when the 25G link-mode bits are absent from user headers. */
79 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
80 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
81 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
82 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
/* Fallbacks when the 50G link-mode bits are absent from user headers. */
84 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
85 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
86 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
/* Fallbacks when the 100G link-mode bits are absent from user headers. */
88 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
89 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
90 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
91 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
92 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
96 * Get interface name from private structure.
99 * Pointer to Ethernet device.
101 * Interface name output buffer.
104 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Resolves the kernel netdev name backing this IB port by scanning the
 * sysfs "<ibdev_path>/device/net" directory and matching dev_port/dev_id.
 * NOTE(review): several lines (directory open, fopen/fscanf error handling,
 * closedir) are elided in this copy -- confirm against upstream. */
107 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
109 struct priv *priv = dev->data->dev_private;
/* dev_type == 0 reads "dev_port"; non-zero switches to "dev_id". */
112 unsigned int dev_type = 0;
113 unsigned int dev_port_prev = ~0u;
114 char match[IF_NAMESIZE] = "";
117 MKSTR(path, "%s/device/net", priv->ibdev_path);
125 while ((dent = readdir(dir)) != NULL) {
126 char *name = dent->d_name;
128 unsigned int dev_port;
/* Skip "." and ".." entries. */
131 if ((name[0] == '.') &&
132 ((name[1] == '\0') ||
133 ((name[1] == '.') && (name[2] == '\0'))))
136 MKSTR(path, "%s/device/net/%s/%s",
137 priv->ibdev_path, name,
138 (dev_type ? "dev_id" : "dev_port"));
140 file = fopen(path, "rb");
145 * Switch to dev_id when dev_port does not exist as
146 * is the case with Linux kernel versions < 3.15.
/* dev_id is hex, dev_port is decimal. */
157 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
162 * Switch to dev_id when dev_port returns the same value for
163 * all ports. May happen when using a MOFED release older than
164 * 3.0 with a Linux kernel >= 3.15.
166 if (dev_port == dev_port_prev)
168 dev_port_prev = dev_port;
/* IB ports are 1-based; netdev dev_port is 0-based. */
169 if (dev_port == (priv->port - 1u))
170 snprintf(match, sizeof(match), "%s", name);
173 if (match[0] == '\0') {
177 strncpy(*ifname, match, sizeof(*ifname));
182 * Perform ifreq ioctl() on associated Ethernet device.
185 * Pointer to Ethernet device.
187 * Request number to pass to ioctl().
189 * Interface request structure output buffer.
192 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* NOTE(review): socket-failure handling, close(sock) and the error paths are
 * elided in this excerpt. */
195 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
/* Throwaway datagram socket used only as an ioctl endpoint. */
197 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
/* Fill ifr_name with the kernel netdev backing this port. */
204 ret = mlx5_get_ifname(dev, &ifr->ifr_name);
207 ret = ioctl(sock, req, ifr);
/* Get device MTU via the SIOCGIFMTU ioctl on the backing netdev. */
223 * Pointer to Ethernet device.
225 * MTU value output buffer.
228 * 0 on success, a negative errno value otherwise and rte_errno is set.
231 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
233 struct ifreq request;
234 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
/* NOTE(review): the error-return branch is elided in this excerpt. */
238 *mtu = request.ifr_mtu;
/* Set kernel interface MTU via the SIOCSIFMTU ioctl. */
246 * Pointer to Ethernet device.
251 * 0 on success, a negative errno value otherwise and rte_errno is set.
254 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
256 struct ifreq request = { .ifr_mtu = mtu, };
258 return mlx5_ifreq(dev, SIOCSIFMTU, &request);
/* Read-modify-write of the netdevice flags (IFF_*). */
265 * Pointer to Ethernet device.
267 * Bitmask for flags that must remain untouched.
269 * Bitmask for flags to modify.
272 * 0 on success, a negative errno value otherwise and rte_errno is set.
275 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
277 struct ifreq request;
278 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
/* NOTE(review): the error check between these lines is elided. */
282 request.ifr_flags &= keep;
283 request.ifr_flags |= flags & ~keep;
284 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
288 * DPDK callback for Ethernet device configuration.
291 * Pointer to Ethernet device structure.
294 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Validates requested Rx/Tx offloads, installs the RSS key/hash fields,
 * records queue counts and sizes the RSS indirection (RETA) table.
 * NOTE(review): error returns and some statements are elided in this copy. */
297 mlx5_dev_configure(struct rte_eth_dev *dev)
299 struct priv *priv = dev->data->dev_private;
300 unsigned int rxqs_n = dev->data->nb_rx_queues;
301 unsigned int txqs_n = dev->data->nb_tx_queues;
304 unsigned int reta_idx_n;
/* Non-zero when the application supplied its own RSS key. */
305 const uint8_t use_app_rss_key =
306 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
307 uint64_t supp_tx_offloads = mlx5_get_tx_port_offloads(dev);
308 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
309 uint64_t supp_rx_offloads =
310 (mlx5_get_rx_port_offloads() |
311 mlx5_get_rx_queue_offloads(dev));
312 uint64_t rx_offloads = dev->data->dev_conf.rxmode.offloads;
/* Reject any requested offload the port cannot provide. */
315 if ((tx_offloads & supp_tx_offloads) != tx_offloads) {
317 "port %u some Tx offloads are not supported requested"
318 " 0x%" PRIx64 " supported 0x%" PRIx64,
319 dev->data->port_id, tx_offloads, supp_tx_offloads);
323 if ((rx_offloads & supp_rx_offloads) != rx_offloads) {
325 "port %u some Rx offloads are not supported requested"
326 " 0x%" PRIx64 " supported 0x%" PRIx64,
327 dev->data->port_id, rx_offloads, supp_rx_offloads);
331 if (use_app_rss_key &&
332 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
333 rss_hash_default_key_len)) {
334 /* MLX5 RSS only support 40bytes key. */
338 priv->rss_conf.rss_key =
339 rte_realloc(priv->rss_conf.rss_key,
340 rss_hash_default_key_len, 0);
341 if (!priv->rss_conf.rss_key) {
342 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
343 dev->data->port_id, rxqs_n);
/* Copy either the application key or the driver default key. */
347 memcpy(priv->rss_conf.rss_key,
349 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
350 rss_hash_default_key,
351 rss_hash_default_key_len);
352 priv->rss_conf.rss_key_len = rss_hash_default_key_len;
353 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
354 priv->rxqs = (void *)dev->data->rx_queues;
355 priv->txqs = (void *)dev->data->tx_queues;
356 if (txqs_n != priv->txqs_n) {
357 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
358 dev->data->port_id, priv->txqs_n, txqs_n);
359 priv->txqs_n = txqs_n;
361 if (rxqs_n > priv->config.ind_table_max_size) {
362 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
363 dev->data->port_id, rxqs_n);
/* Unchanged Rx queue count: nothing more to resize. */
367 if (rxqs_n == priv->rxqs_n)
369 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
370 dev->data->port_id, priv->rxqs_n, rxqs_n);
371 priv->rxqs_n = rxqs_n;
372 /* If the requested number of RX queues is not a power of two, use the
373 * maximum indirection table size for better balancing.
374 * The result is always rounded to the next power of two. */
375 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
376 priv->config.ind_table_max_size :
378 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
381 /* When the number of RX queues is not a power of two, the remaining
382 * table entries are padded with reused WQs and hashes are not spread
384 for (i = 0, j = 0; (i != reta_idx_n); ++i) {
385 (*priv->reta_idx)[i] = j;
393 * DPDK callback to get information about the device.
396 * Pointer to Ethernet device structure.
398 * Info structure output buffer.
/* Fills rte_eth_dev_info from device attributes and driver configuration. */
401 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
403 struct priv *priv = dev->data->dev_private;
404 struct mlx5_dev_config *config = &priv->config;
406 char ifname[IF_NAMESIZE];
408 info->pci_dev = RTE_ETH_DEV_TO_PCI(dev);
409 /* FIXME: we should ask the device for these values. */
410 info->min_rx_bufsize = 32;
411 info->max_rx_pktlen = 65536;
413 * Since we need one CQ per QP, the limit is the minimum number
414 * between the two values.
416 max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
417 priv->device_attr.orig_attr.max_qp);
418 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
421 info->max_rx_queues = max;
422 info->max_tx_queues = max;
423 info->max_mac_addrs = RTE_DIM(priv->mac);
424 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
/* Port-level Rx capa is per-port plus per-queue offloads. */
425 info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
426 info->rx_queue_offload_capa);
427 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
/* if_index is best-effort; left as-is when the ifname lookup fails. */
428 if (mlx5_get_ifname(dev, &ifname) == 0)
429 info->if_index = if_nametoindex(ifname);
430 info->reta_size = priv->reta_idx_n ?
431 priv->reta_idx_n : config->ind_table_max_size;
432 info->hash_key_size = priv->rss_conf.rss_key_len;
433 info->speed_capa = priv->link_speed_capa;
434 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
438 * Get supported packet types.
441 * Pointer to Ethernet device structure.
444 * A pointer to the supported Packet types array.
447 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
/* Static table; entries track what rxq_cq_to_pkt_type() can report. */
449 static const uint32_t ptypes[] = {
450 /* refers to rxq_cq_to_pkt_type() */
452 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
453 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
454 RTE_PTYPE_L4_NONFRAG,
458 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
459 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
460 RTE_PTYPE_INNER_L4_NONFRAG,
461 RTE_PTYPE_INNER_L4_FRAG,
462 RTE_PTYPE_INNER_L4_TCP,
463 RTE_PTYPE_INNER_L4_UDP,
/* Only advertise ptypes when an mlx5 Rx burst routine is active. */
467 if (dev->rx_pkt_burst == mlx5_rx_burst ||
468 dev->rx_pkt_burst == mlx5_rx_burst_vec)
474 * DPDK callback to retrieve physical link information.
477 * Pointer to Ethernet device structure.
480 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Legacy path using the deprecated ETHTOOL_GSET ioctl; also derives
 * link_speed_capa from the reported "supported" mask. */
483 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev)
485 struct priv *priv = dev->data->dev_private;
486 struct ethtool_cmd edata = {
487 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
490 struct rte_eth_link dev_link;
494 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
496 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
497 dev->data->port_id, strerror(rte_errno));
500 memset(&dev_link, 0, sizeof(dev_link));
/* Link is up only when the interface is both UP and RUNNING. */
501 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
502 (ifr.ifr_flags & IFF_RUNNING));
503 ifr.ifr_data = (void *)&edata;
504 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
507 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
508 dev->data->port_id, strerror(rte_errno));
511 link_speed = ethtool_cmd_speed(&edata);
/* -1 (SPEED_UNKNOWN) maps to 0, i.e. speed not available. */
512 if (link_speed == -1)
513 dev_link.link_speed = 0;
515 dev_link.link_speed = link_speed;
516 priv->link_speed_capa = 0;
517 if (edata.supported & SUPPORTED_Autoneg)
518 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
519 if (edata.supported & (SUPPORTED_1000baseT_Full |
520 SUPPORTED_1000baseKX_Full))
521 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
522 if (edata.supported & SUPPORTED_10000baseKR_Full)
523 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
524 if (edata.supported & (SUPPORTED_40000baseKR4_Full |
525 SUPPORTED_40000baseCR4_Full |
526 SUPPORTED_40000baseSR4_Full |
527 SUPPORTED_40000baseLR4_Full))
528 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
529 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
530 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
531 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
532 ETH_LINK_SPEED_FIXED);
533 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
534 /* Link status changed. */
535 dev->data->dev_link = dev_link;
538 /* Link status is still the same. */
544 * Retrieve physical link information (unlocked version using new ioctl).
547 * Pointer to Ethernet device structure.
550 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Modern path using ETHTOOL_GLINKSETTINGS: first a handshake request to
 * learn the link-mode mask word count, then a second, fully-sized request. */
553 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev)
555 struct priv *priv = dev->data->dev_private;
556 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
558 struct rte_eth_link dev_link;
562 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
564 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
565 dev->data->port_id, strerror(rte_errno));
568 memset(&dev_link, 0, sizeof(dev_link));
/* Link is up only when the interface is both UP and RUNNING. */
569 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
570 (ifr.ifr_flags & IFF_RUNNING));
571 ifr.ifr_data = (void *)&gcmd;
572 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
575 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
577 dev->data->port_id, strerror(rte_errno));
/* Kernel returns the word count negated on the handshake pass. */
580 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
/* Room for the three masks (supported/advertising/lp_advertising). */
582 alignas(struct ethtool_link_settings)
583 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
584 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
585 struct ethtool_link_settings *ecmd = (void *)data;
588 ifr.ifr_data = (void *)ecmd;
589 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
592 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
594 dev->data->port_id, strerror(rte_errno));
597 dev_link.link_speed = ecmd->speed;
/* Fold the first 64 supported-mode bits into one bitmask. */
598 sc = ecmd->link_mode_masks[0] |
599 ((uint64_t)ecmd->link_mode_masks[1] << 32);
600 priv->link_speed_capa = 0;
601 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
602 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
603 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
604 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
605 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
606 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
607 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
608 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
609 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
610 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
611 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
612 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
613 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
614 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
615 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
616 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
617 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
618 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
619 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
620 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
621 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
622 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
623 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
624 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
625 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
626 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
627 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
628 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
629 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
630 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
631 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
632 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
633 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
634 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
635 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
636 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
637 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
638 ETH_LINK_SPEED_FIXED);
639 if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
640 /* Link status changed. */
641 dev->data->dev_link = dev_link;
644 /* Link status is still the same. */
650 * Enable receiving and transmitting traffic.
653 * Pointer to Ethernet device.
/* Installs the real Rx/Tx burst routines, then enables control-flow rules
 * and user flows. NOTE(review): error-path statements are elided here. */
656 mlx5_link_start(struct rte_eth_dev *dev)
658 struct priv *priv = dev->data->dev_private;
661 dev->tx_pkt_burst = mlx5_select_tx_function(dev);
662 dev->rx_pkt_burst = mlx5_select_rx_function(dev);
663 ret = mlx5_traffic_enable(dev);
666 "port %u error occurred while configuring control"
668 dev->data->port_id, strerror(rte_errno));
671 ret = mlx5_flow_start(dev, &priv->flows);
674 "port %u error occurred while configuring flows: %s",
675 dev->data->port_id, strerror(rte_errno));
679 * Disable receiving and transmitting traffic.
682 * Pointer to Ethernet device.
/* Reverse of mlx5_link_start(): tear down flows/control traffic and swap in
 * the no-op "removed" burst routines so datapath calls become harmless. */
685 mlx5_link_stop(struct rte_eth_dev *dev)
687 struct priv *priv = dev->data->dev_private;
689 mlx5_flow_stop(dev, &priv->flows);
690 mlx5_traffic_disable(dev);
691 dev->rx_pkt_burst = removed_rx_burst;
692 dev->tx_pkt_burst = removed_tx_burst;
696 * Querying the link status till it changes to the desired state.
697 * Number of query attempts is bounded by MLX5_MAX_LINK_QUERY_ATTEMPTS.
700 * Pointer to Ethernet device.
702 * Link desired status.
705 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* NOTE(review): loop increment/sleep and final failure return are elided. */
708 mlx5_force_link_status_change(struct rte_eth_dev *dev, int status)
712 while (try < MLX5_MAX_LINK_QUERY_ATTEMPTS) {
713 mlx5_link_update(dev, 0);
714 if (dev->data->dev_link.link_status == status)
724 * DPDK callback to retrieve physical link information.
727 * Pointer to Ethernet device structure.
728 * @param wait_to_complete
729 * Wait for request completion (ignored).
732 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Picks the legacy GSET path or the GLINKSETTINGS path based on the running
 * kernel version, then re-arms burst callbacks on a status transition. */
735 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
737 struct utsname utsname;
/* Snapshot of the previous link state for change detection below. */
740 struct rte_eth_link dev_link = dev->data->dev_link;
/* Fall back to ETHTOOL_GSET when uname/parse fails or kernel < 4.9. */
742 if (uname(&utsname) == -1 ||
743 sscanf(utsname.release, "%d.%d.%d",
744 &ver[0], &ver[1], &ver[2]) != 3 ||
745 KERNEL_VERSION(ver[0], ver[1], ver[2]) < KERNEL_VERSION(4, 9, 0))
746 ret = mlx5_link_update_unlocked_gset(dev);
748 ret = mlx5_link_update_unlocked_gs(dev);
751 /* If lsc interrupt is disabled, should always be ready for traffic. */
752 if (!dev->data->dev_conf.intr_conf.lsc) {
753 mlx5_link_start(dev);
756 /* Re-select burst callbacks only if link status has been changed. */
757 if (!ret && dev_link.link_status != dev->data->dev_link.link_status) {
758 if (dev->data->dev_link.link_status == ETH_LINK_UP)
759 mlx5_link_start(dev);
767 * DPDK callback to change the MTU.
770 * Pointer to Ethernet device structure.
775 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Sets the kernel netdev MTU, then reads it back to confirm the change
 * took effect. NOTE(review): error returns are elided in this copy. */
778 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
780 struct priv *priv = dev->data->dev_private;
781 uint16_t kern_mtu = 0;
784 ret = mlx5_get_mtu(dev, &kern_mtu);
787 /* Set kernel interface MTU first. */
788 ret = mlx5_set_mtu(dev, mtu);
791 ret = mlx5_get_mtu(dev, &kern_mtu);
/* Only report success when the kernel confirms the new value. */
794 if (kern_mtu == mtu) {
796 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
797 dev->data->port_id, mtu);
805 * DPDK callback to get flow control status.
808 * Pointer to Ethernet device structure.
809 * @param[out] fc_conf
810 * Flow control output buffer.
813 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Queries pause-frame settings via ETHTOOL_GPAUSEPARAM and translates the
 * rx/tx pause pair into an RTE_FC_* mode. */
816 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
819 struct ethtool_pauseparam ethpause = {
820 .cmd = ETHTOOL_GPAUSEPARAM
824 ifr.ifr_data = (void *)&ethpause;
825 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
828 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
830 dev->data->port_id, strerror(rte_errno));
833 fc_conf->autoneg = ethpause.autoneg;
834 if (ethpause.rx_pause && ethpause.tx_pause)
835 fc_conf->mode = RTE_FC_FULL;
836 else if (ethpause.rx_pause)
837 fc_conf->mode = RTE_FC_RX_PAUSE;
838 else if (ethpause.tx_pause)
839 fc_conf->mode = RTE_FC_TX_PAUSE;
841 fc_conf->mode = RTE_FC_NONE;
846 * DPDK callback to modify flow control parameters.
849 * Pointer to Ethernet device structure.
851 * Flow control parameters.
854 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Translates the RTE_FC_* mode back into rx/tx pause flags and applies
 * them via ETHTOOL_SPAUSEPARAM. */
857 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
860 struct ethtool_pauseparam ethpause = {
861 .cmd = ETHTOOL_SPAUSEPARAM
865 ifr.ifr_data = (void *)&ethpause;
866 ethpause.autoneg = fc_conf->autoneg;
/* Rx pause is requested for RTE_FC_FULL and RTE_FC_RX_PAUSE. */
867 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
868 (fc_conf->mode & RTE_FC_RX_PAUSE))
869 ethpause.rx_pause = 1;
871 ethpause.rx_pause = 0;
/* Tx pause is requested for RTE_FC_FULL and RTE_FC_TX_PAUSE. */
873 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
874 (fc_conf->mode & RTE_FC_TX_PAUSE))
875 ethpause.tx_pause = 1;
877 ethpause.tx_pause = 0;
878 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
881 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
883 dev->data->port_id, strerror(rte_errno));
890 * Get PCI information from struct ibv_device.
893 * Pointer to Ethernet device structure.
894 * @param[out] pci_addr
895 * PCI bus address output buffer.
898 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Parses the PCI_SLOT_NAME line out of the sysfs "device/uevent" file of
 * the Verbs device. NOTE(review): fopen failure handling and fclose are
 * elided in this excerpt. */
901 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
902 struct rte_pci_addr *pci_addr)
906 MKSTR(path, "%s/device/uevent", device->ibdev_path);
908 file = fopen(path, "rb");
913 while (fgets(line, sizeof(line), file) == line) {
914 size_t len = strlen(line);
917 /* Truncate long lines. */
918 if (len == (sizeof(line) - 1))
/* Drain the remainder of an over-long line before re-terminating it. */
919 while (line[(len - 1)] != '\n') {
923 line[(len - 1)] = ret;
925 /* Extract information. */
928 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
932 &pci_addr->function) == 4) {
942 * Update the link status.
945 * Pointer to Ethernet device.
948 * Zero if the callback process can be called immediately, negative errno
949 * value otherwise and rte_errno is set.
/* Refreshes the link and defers callback delivery via an alarm when the
 * reported speed/status pair is inconsistent (event raced the kernel). */
952 mlx5_link_status_update(struct rte_eth_dev *dev)
954 struct priv *priv = dev->data->dev_private;
955 struct rte_eth_link *link = &dev->data->dev_link;
958 ret = mlx5_link_update(dev, 0);
/* Up with zero speed, or down with non-zero speed: not settled yet. */
961 if (((link->link_speed == 0) && link->link_status) ||
962 ((link->link_speed != 0) && !link->link_status)) {
964 * Inconsistent status. Event likely occurred before the
965 * kernel netdevice exposes the new status.
967 if (!priv->pending_alarm) {
968 priv->pending_alarm = 1;
969 rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
970 mlx5_dev_link_status_handler,
974 } else if (unlikely(priv->pending_alarm)) {
975 /* Link interrupt occurred while alarm is already scheduled. */
976 priv->pending_alarm = 0;
977 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
983 * Device status handler.
986 * Pointer to Ethernet device.
988 * Pointer to event flags holder.
991 * Events bitmap of callback process which can be called immediately.
/* Drains all pending Verbs async events, acknowledging each, and maps
 * them to RTE_ETH_EVENT_INTR_LSC / _RMV bits. */
994 mlx5_dev_status_handler(struct rte_eth_dev *dev)
996 struct priv *priv = dev->data->dev_private;
997 struct ibv_async_event event;
1000 /* Read all message and acknowledge them. */
1002 if (mlx5_glue->get_async_event(priv->ctx, &event))
1004 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1005 event.event_type == IBV_EVENT_PORT_ERR) &&
1006 (dev->data->dev_conf.intr_conf.lsc == 1))
1007 ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
1008 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
1009 dev->data->dev_conf.intr_conf.rmv == 1)
1010 ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
1013 "port %u event type %d on not handled",
1014 dev->data->port_id, event.event_type);
/* Every retrieved event must be acknowledged. */
1015 mlx5_glue->ack_async_event(&event);
/* Suppress the LSC bit if the status update deferred the callback. */
1017 if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
1018 if (mlx5_link_status_update(dev))
1019 ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
1024 * Handle delayed link status event.
1027 * Registered argument.
/* Alarm callback armed by mlx5_link_status_update(); re-checks the link
 * and notifies applications when the status has settled. */
1030 mlx5_dev_link_status_handler(void *arg)
1032 struct rte_eth_dev *dev = arg;
1033 struct priv *priv = dev->data->dev_private;
/* The alarm has fired; clear the pending flag before re-checking. */
1036 priv->pending_alarm = 0;
1037 ret = mlx5_link_status_update(dev);
1039 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1043 * Handle interrupts from the NIC.
1045 * @param[in] intr_handle
1046 * Interrupt handler.
1048 * Callback argument.
/* Main async-fd interrupt entry point: collects event bits and fires the
 * corresponding user callbacks. */
1051 mlx5_dev_interrupt_handler(void *cb_arg)
1053 struct rte_eth_dev *dev = cb_arg;
1056 events = mlx5_dev_status_handler(dev);
1057 if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
1058 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1059 if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
1060 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1064 * Handle interrupts from the socket.
1067 * Callback argument.
/* Thin trampoline: delegates secondary-process socket events. */
1070 mlx5_dev_handler_socket(void *cb_arg)
1072 struct rte_eth_dev *dev = cb_arg;
1074 mlx5_socket_handle(dev);
1078 * Uninstall interrupt handler.
1081 * Pointer to Ethernet device.
/* Unregisters the async-event and socket callbacks, cancels any pending
 * link alarm, and resets both interrupt handles. */
1084 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
1086 struct priv *priv = dev->data->dev_private;
1088 if (dev->data->dev_conf.intr_conf.lsc ||
1089 dev->data->dev_conf.intr_conf.rmv)
1090 rte_intr_callback_unregister(&priv->intr_handle,
1091 mlx5_dev_interrupt_handler, dev);
1092 if (priv->primary_socket)
1093 rte_intr_callback_unregister(&priv->intr_handle_socket,
1094 mlx5_dev_handler_socket, dev);
1095 if (priv->pending_alarm) {
1096 priv->pending_alarm = 0;
1097 rte_eal_alarm_cancel(mlx5_dev_link_status_handler, dev);
/* Invalidate the handles so they cannot be reused by mistake. */
1099 priv->intr_handle.fd = 0;
1100 priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1101 priv->intr_handle_socket.fd = 0;
1102 priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1106 * Install interrupt handler.
1109 * Pointer to Ethernet device.
/* Makes the Verbs async fd non-blocking, registers it as an external
 * interrupt source when LSC/RMV are requested, and sets up the
 * primary-process socket handler. */
1112 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
1114 struct priv *priv = dev->data->dev_private;
1118 assert(priv->ctx->async_fd > 0);
1119 flags = fcntl(priv->ctx->async_fd, F_GETFL);
1120 ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
/* Without a non-blocking fd, interrupt mode is unusable: disable it. */
1123 "port %u failed to change file descriptor async event"
1125 dev->data->port_id);
1126 dev->data->dev_conf.intr_conf.lsc = 0;
1127 dev->data->dev_conf.intr_conf.rmv = 0;
1129 if (dev->data->dev_conf.intr_conf.lsc ||
1130 dev->data->dev_conf.intr_conf.rmv) {
1131 priv->intr_handle.fd = priv->ctx->async_fd;
1132 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1133 rte_intr_callback_register(&priv->intr_handle,
1134 mlx5_dev_interrupt_handler, dev);
1136 ret = mlx5_socket_init(dev);
1138 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1139 dev->data->port_id, strerror(rte_errno))
1140 else if (priv->primary_socket) {
1141 priv->intr_handle_socket.fd = priv->primary_socket;
1142 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1143 rte_intr_callback_register(&priv->intr_handle_socket,
1144 mlx5_dev_handler_socket, dev);
1149 * DPDK callback to bring the link DOWN.
1152 * Pointer to Ethernet device structure.
1155 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Clears IFF_UP on the backing netdev (keep everything else, set nothing). */
1158 mlx5_set_link_down(struct rte_eth_dev *dev)
1160 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1164 * DPDK callback to bring the link UP.
1167 * Pointer to Ethernet device structure.
1170 * 0 on success, a negative errno value otherwise and rte_errno is set.
/* Sets IFF_UP on the backing netdev while preserving the other flags. */
1173 mlx5_set_link_up(struct rte_eth_dev *dev)
1175 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1179 * Configure the TX function to use.
1182 * Pointer to private data structure.
1185 * Pointer to selected Tx burst function.
/* Chooses the Tx burst routine from configured offloads and MPW mode:
 * TSO/VLAN-insert force the default path; otherwise enhanced MPW may use
 * the vectorized variants, and legacy MPW uses the (inline) MPW paths. */
1188 mlx5_select_tx_function(struct rte_eth_dev *dev)
1190 struct priv *priv = dev->data->dev_private;
1191 eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1192 struct mlx5_dev_config *config = &priv->config;
1193 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
1194 int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
1195 DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1196 DEV_TX_OFFLOAD_GRE_TNL_TSO));
1197 int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
1199 assert(priv != NULL);
1200 /* Select appropriate TX function. */
/* TSO and VLAN insertion are only handled by the default burst routine. */
1201 if (vlan_insert || tso)
1202 return tx_pkt_burst;
1203 if (config->mps == MLX5_MPW_ENHANCED) {
1204 if (mlx5_check_vec_tx_support(dev) > 0) {
1205 if (mlx5_check_raw_vec_tx_support(dev) > 0)
1206 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1208 tx_pkt_burst = mlx5_tx_burst_vec;
1210 "port %u selected enhanced MPW Tx vectorized"
1212 dev->data->port_id);
1214 tx_pkt_burst = mlx5_tx_burst_empw;
1216 "port %u selected enhanced MPW Tx function",
1217 dev->data->port_id);
1219 } else if (config->mps && (config->txq_inline > 0)) {
1220 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1221 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1222 dev->data->port_id);
1223 } else if (config->mps) {
1224 tx_pkt_burst = mlx5_tx_burst_mpw;
1225 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1226 dev->data->port_id);
1228 return tx_pkt_burst;
1232 * Configure the RX function to use.
1235 * Pointer to private data structure.
1238 * Pointer to selected Rx burst function.
/* Prefers the vectorized Rx burst routine whenever the device and
 * configuration support it; otherwise uses the scalar default. */
1241 mlx5_select_rx_function(struct rte_eth_dev *dev)
1243 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1245 assert(dev != NULL);
1246 if (mlx5_check_vec_rx_support(dev) > 0) {
1247 rx_pkt_burst = mlx5_rx_burst_vec;
1248 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1249 dev->data->port_id);
1251 return rx_pkt_burst;
1255 * Check if mlx5 device was removed.
1258 * Pointer to Ethernet device structure.
1261 * 1 when device is removed, otherwise 0.
1264 mlx5_is_removed(struct rte_eth_dev *dev)
1266 struct ibv_device_attr device_attr;
1267 struct priv *priv = dev->data->dev_private;
1269 if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)