1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2015 6WIND S.A.
3 * Copyright 2015 Mellanox Technologies, Ltd
19 #include <sys/ioctl.h>
20 #include <sys/socket.h>
21 #include <netinet/in.h>
22 #include <linux/ethtool.h>
23 #include <linux/sockios.h>
29 #include <rte_atomic.h>
30 #include <rte_ethdev_driver.h>
31 #include <rte_bus_pci.h>
33 #include <rte_common.h>
34 #include <rte_interrupts.h>
35 #include <rte_malloc.h>
36 #include <rte_string_fns.h>
37 #include <rte_rwlock.h>
40 #include "mlx5_glue.h"
41 #include "mlx5_rxtx.h"
42 #include "mlx5_utils.h"
44 /* Add defines in case the running kernel is not the same as user headers. */
/*
 * Compatibility shim: when building against kernel headers older than the
 * running kernel, ETHTOOL_GLINKSETTINGS and the associated link-mode bits
 * may be missing; provide them so mlx5_link_update_unlocked_gs() can still
 * issue the new-style ioctl at run time.
 * NOTE(review): several struct members are elided in this excerpt; the
 * layout must match the kernel's struct ethtool_link_settings exactly.
 */
45 #ifndef ETHTOOL_GLINKSETTINGS
46 struct ethtool_link_settings {
55 uint8_t eth_tp_mdix_ctrl;
/* Negative on the first handshake call; kernel reports needed nwords. */
56 int8_t link_mode_masks_nwords;
/* Flexible array member: supported/advertising/peer mode bitmaps. */
58 uint32_t link_mode_masks[];
61 #define ETHTOOL_GLINKSETTINGS 0x0000004c
62 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
63 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
64 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
65 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
66 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
67 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
68 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
69 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
70 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
71 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
72 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
73 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
74 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
75 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
76 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
77 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
/* 25G/50G/100G bits appeared later than the base set; gate separately. */
79 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
80 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
81 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
82 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
84 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
85 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
86 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
88 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
89 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
90 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
91 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
92 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
96 * Get master interface name from private structure.
99 * Pointer to Ethernet device.
101 * Interface name output buffer.
104 * 0 on success, a negative errno value otherwise and rte_errno is set.
107 mlx5_get_master_ifname(const struct rte_eth_dev *dev,
108 char (*ifname)[IF_NAMESIZE])
/*
 * Resolve the master netdev name by scanning sysfs under
 * <ibdev_path>/device/net and matching the entry whose dev_port (or
 * dev_id on old kernels) identifies this port.
 */
110 struct priv *priv = dev->data->dev_private;
/* 0 => read "dev_port" (decimal); nonzero => fall back to "dev_id" (hex). */
113 unsigned int dev_type = 0;
114 unsigned int dev_port_prev = ~0u;
115 char match[IF_NAMESIZE] = "";
118 MKSTR(path, "%s/device/net", priv->ibdev_path);
126 while ((dent = readdir(dir)) != NULL) {
127 char *name = dent->d_name;
129 unsigned int dev_port;
/* Skip "." and ".." directory entries. */
132 if ((name[0] == '.') &&
133 ((name[1] == '\0') ||
134 ((name[1] == '.') && (name[2] == '\0'))))
137 MKSTR(path, "%s/device/net/%s/%s",
138 priv->ibdev_path, name,
139 (dev_type ? "dev_id" : "dev_port"));
141 file = fopen(path, "rb");
146 * Switch to dev_id when dev_port does not exist as
147 * is the case with Linux kernel versions < 3.15.
/* Parse as hex for dev_id, decimal for dev_port. */
158 r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
163 * Switch to dev_id when dev_port returns the same value for
164 * all ports. May happen when using a MOFED release older than
165 * 3.0 with a Linux kernel >= 3.15.
167 if (dev_port == dev_port_prev)
169 dev_port_prev = dev_port;
/* Candidate interface for this port; remember its name. */
171 strlcpy(match, name, sizeof(match));
/* No matching netdev found: error path (elided in this excerpt). */
174 if (match[0] == '\0') {
178 strncpy(*ifname, match, sizeof(*ifname));
183 * Get interface name from private structure.
185 * This is a port representor-aware version of mlx5_get_master_ifname().
188 * Pointer to Ethernet device.
190 * Interface name output buffer.
193 * 0 on success, a negative errno value otherwise and rte_errno is set.
196 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
/*
 * Representor-aware interface name lookup: masters go through sysfs
 * (mlx5_get_master_ifname); representors resolve their ifindex via the
 * RDMA netlink socket and translate it with if_indextoname().
 */
198 struct priv *priv = dev->data->dev_private;
/* ifindex is 0 (invalid) when no RDMA netlink socket is available. */
199 unsigned int ifindex =
200 priv->nl_socket_rdma >= 0 ?
201 mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) : 0;
204 if (!priv->representor)
205 return mlx5_get_master_ifname(dev, ifname);
209 if (if_indextoname(ifindex, &(*ifname)[0]))
216 * Get the interface index from device name.
219 * Pointer to Ethernet device.
222 * Interface index on success, a negative errno value otherwise and
226 mlx5_ifindex(const struct rte_eth_dev *dev)
/* Translate the device's kernel interface name into an ifindex. */
228 char ifname[IF_NAMESIZE];
231 ret = mlx5_get_ifname(dev, &ifname);
/* if_nametoindex() returns 0 on failure (handled in elided code). */
234 ret = if_nametoindex(ifname);
243 * Perform ifreq ioctl() on associated Ethernet device.
246 * Pointer to Ethernet device.
248 * Request number to pass to ioctl().
250 * Interface request structure output buffer.
252 * When device is a port representor, perform request on master device
256 * 0 on success, a negative errno value otherwise and rte_errno is set.
259 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr,
/*
 * Run an ifreq ioctl() against the device's kernel interface using a
 * throwaway AF_INET datagram socket; the extra flag selects whether the
 * request targets the master device (for representors) or the port itself.
 */
262 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
/* Fill ifr_name with master or own interface name before the ioctl. */
270 ret = mlx5_get_master_ifname(dev, &ifr->ifr_name);
272 ret = mlx5_get_ifname(dev, &ifr->ifr_name);
275 ret = ioctl(sock, req, ifr);
291 * Pointer to Ethernet device.
293 * MTU value output buffer.
296 * 0 on success, a negative errno value otherwise and rte_errno is set.
299 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
/* Read the kernel interface MTU via SIOCGIFMTU. */
301 struct ifreq request;
302 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request, 0);
306 *mtu = request.ifr_mtu;
314 * Pointer to Ethernet device.
319 * 0 on success, a negative errno value otherwise and rte_errno is set.
322 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
/* Set the kernel interface MTU via SIOCSIFMTU. */
324 struct ifreq request = { .ifr_mtu = mtu, };
326 return mlx5_ifreq(dev, SIOCSIFMTU, &request, 0);
333 * Pointer to Ethernet device.
335 * Bitmask for flags that must remain untouched.
337 * Bitmask for flags to modify.
340 * 0 on success, a negative errno value otherwise and rte_errno is set.
343 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
/*
 * Read-modify-write the interface flags: bits in 'keep' are preserved
 * from the current state, the rest are taken from 'flags'.
 */
345 struct ifreq request;
346 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request, 0);
350 request.ifr_flags &= keep;
351 request.ifr_flags |= flags & ~keep;
352 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request, 0);
356 * DPDK callback for Ethernet device configuration.
359 * Pointer to Ethernet device structure.
362 * 0 on success, a negative errno value otherwise and rte_errno is set.
365 mlx5_dev_configure(struct rte_eth_dev *dev)
/*
 * dev_configure callback: validate/copy the RSS key, record queue array
 * pointers and counts, and size the RSS indirection table.
 */
367 struct priv *priv = dev->data->dev_private;
368 unsigned int rxqs_n = dev->data->nb_rx_queues;
369 unsigned int txqs_n = dev->data->nb_tx_queues;
372 unsigned int reta_idx_n;
373 const uint8_t use_app_rss_key =
374 !!dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
/* Application-supplied RSS keys must be exactly MLX5_RSS_HASH_KEY_LEN. */
377 if (use_app_rss_key &&
378 (dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len !=
379 MLX5_RSS_HASH_KEY_LEN)) {
380 DRV_LOG(ERR, "port %u RSS key len must be %s Bytes long",
381 dev->data->port_id, RTE_STR(MLX5_RSS_HASH_KEY_LEN));
385 priv->rss_conf.rss_key =
386 rte_realloc(priv->rss_conf.rss_key,
387 MLX5_RSS_HASH_KEY_LEN, 0);
388 if (!priv->rss_conf.rss_key) {
389 DRV_LOG(ERR, "port %u cannot allocate RSS hash key memory (%u)",
390 dev->data->port_id, rxqs_n);
/* Copy the app key if provided, otherwise the driver default key. */
394 memcpy(priv->rss_conf.rss_key,
396 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key :
397 rss_hash_default_key,
398 MLX5_RSS_HASH_KEY_LEN);
399 priv->rss_conf.rss_key_len = MLX5_RSS_HASH_KEY_LEN;
400 priv->rss_conf.rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
401 priv->rxqs = (void *)dev->data->rx_queues;
402 priv->txqs = (void *)dev->data->tx_queues;
403 if (txqs_n != priv->txqs_n) {
404 DRV_LOG(INFO, "port %u Tx queues number update: %u -> %u",
405 dev->data->port_id, priv->txqs_n, txqs_n);
406 priv->txqs_n = txqs_n;
/* Rx queue count is bounded by the HW indirection table size. */
408 if (rxqs_n > priv->config.ind_table_max_size) {
409 DRV_LOG(ERR, "port %u cannot handle this many Rx queues (%u)",
410 dev->data->port_id, rxqs_n);
414 if (rxqs_n == priv->rxqs_n)
416 DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
417 dev->data->port_id, priv->rxqs_n, rxqs_n);
418 priv->rxqs_n = rxqs_n;
419 /* If the requested number of RX queues is not a power of two, use the
420 * maximum indirection table size for better balancing.
421 * The result is always rounded to the next power of two. */
422 reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
423 priv->config.ind_table_max_size :
425 ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
428 /* When the number of RX queues is not a power of two, the remaining
429 * table entries are padded with reused WQs and hashes are not spread
431 for (i = 0, j = 0; (i != reta_idx_n); ++i) {
432 (*priv->reta_idx)[i] = j;
440 * Sets default tuning parameters.
443 * Pointer to Ethernet device.
445 * Info structure output buffer.
448 mlx5_set_default_params(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
/*
 * Fill dev_info with recommended default ring/burst/queue parameters:
 * small rings for minimum CPU use, larger rings and more queues when the
 * port supports 100G or the application already configured >2 queues.
 */
450 struct priv *priv = dev->data->dev_private;
452 /* Minimum CPU utilization. */
453 info->default_rxportconf.ring_size = 256;
454 info->default_txportconf.ring_size = 256;
455 info->default_rxportconf.burst_size = 64;
456 info->default_txportconf.burst_size = 64;
457 if (priv->link_speed_capa & ETH_LINK_SPEED_100G) {
458 info->default_rxportconf.nb_queues = 16;
459 info->default_txportconf.nb_queues = 16;
460 if (dev->data->nb_rx_queues > 2 ||
461 dev->data->nb_tx_queues > 2) {
462 /* Max Throughput. */
463 info->default_rxportconf.ring_size = 2048;
464 info->default_txportconf.ring_size = 2048;
/* Non-100G port: fewer queues, larger rings for max throughput. */
467 info->default_rxportconf.nb_queues = 8;
468 info->default_txportconf.nb_queues = 8;
469 if (dev->data->nb_rx_queues > 2 ||
470 dev->data->nb_tx_queues > 2) {
471 /* Max Throughput. */
472 info->default_rxportconf.ring_size = 4096;
473 info->default_txportconf.ring_size = 4096;
479 * DPDK callback to get information about the device.
482 * Pointer to Ethernet device structure.
484 * Info structure output buffer.
487 mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
/*
 * dev_infos_get callback: report limits, offload capabilities, RSS
 * parameters and switch (representor) information for this port.
 */
489 struct priv *priv = dev->data->dev_private;
490 struct mlx5_dev_config *config = &priv->config;
492 char ifname[IF_NAMESIZE];
494 /* FIXME: we should ask the device for these values. */
495 info->min_rx_bufsize = 32;
496 info->max_rx_pktlen = 65536;
498 * Since we need one CQ per QP, the limit is the minimum number
499 * between the two values.
501 max = RTE_MIN(priv->device_attr.orig_attr.max_cq,
502 priv->device_attr.orig_attr.max_qp);
503 /* If max >= 65535 then max = 0, max_rx_queues is uint16_t. */
506 info->max_rx_queues = max;
507 info->max_tx_queues = max;
508 info->max_mac_addrs = MLX5_MAX_UC_MAC_ADDRESSES;
509 info->rx_queue_offload_capa = mlx5_get_rx_queue_offloads(dev);
510 info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
511 info->rx_queue_offload_capa);
512 info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
513 if (mlx5_get_ifname(dev, &ifname) == 0)
514 info->if_index = if_nametoindex(ifname);
515 info->reta_size = priv->reta_idx_n ?
516 priv->reta_idx_n : config->ind_table_max_size;
517 info->hash_key_size = MLX5_RSS_HASH_KEY_LEN;
518 info->speed_capa = priv->link_speed_capa;
519 info->flow_type_rss_offloads = ~MLX5_RSS_HF_MASK;
520 mlx5_set_default_params(dev, info);
521 info->switch_info.name = dev->data->name;
522 info->switch_info.domain_id = priv->domain_id;
523 info->switch_info.port_id = priv->representor_id;
/* For representors, find the master port sharing our switch domain. */
524 if (priv->representor) {
525 unsigned int i = mlx5_dev_to_port_id(dev->device, NULL, 0);
528 i = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, i), i);
531 rte_eth_devices[port_id[i]].data->dev_private;
/* Skip other representors and ports from foreign domains. */
534 opriv->representor ||
535 opriv->domain_id != priv->domain_id)
538 * Override switch name with that of the master
541 info->switch_info.name = opriv->dev_data->name;
548 * Get supported packet types.
551 * Pointer to Ethernet device structure.
554 * A pointer to the supported Packet types array.
557 mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
/*
 * Return the static table of packet types the Rx burst functions can
 * report, but only when one of the known mlx5 Rx bursts is installed.
 */
559 static const uint32_t ptypes[] = {
560 /* refers to rxq_cq_to_pkt_type() */
562 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
563 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
564 RTE_PTYPE_L4_NONFRAG,
568 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
569 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
570 RTE_PTYPE_INNER_L4_NONFRAG,
571 RTE_PTYPE_INNER_L4_FRAG,
572 RTE_PTYPE_INNER_L4_TCP,
573 RTE_PTYPE_INNER_L4_UDP,
/* Only advertise ptypes when a recognized Rx burst is active. */
577 if (dev->rx_pkt_burst == mlx5_rx_burst ||
578 dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
579 dev->rx_pkt_burst == mlx5_rx_burst_vec)
585 * DPDK callback to retrieve physical link information.
588 * Pointer to Ethernet device structure.
590 * Storage for current link status.
593 * 0 on success, a negative errno value otherwise and rte_errno is set.
596 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
597 struct rte_eth_link *link)
/*
 * Query link state via the legacy ETHTOOL_GSET ioctl (fallback for
 * kernels without ETHTOOL_GLINKSETTINGS); also derives link_speed_capa
 * from the reported "supported" mask.
 */
599 struct priv *priv = dev->data->dev_private;
600 struct ethtool_cmd edata = {
601 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
604 struct rte_eth_link dev_link;
/* Link is "up" only when the interface is both UP and RUNNING. */
608 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
610 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
611 dev->data->port_id, strerror(rte_errno));
614 memset(&dev_link, 0, sizeof(dev_link));
615 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
616 (ifr.ifr_flags & IFF_RUNNING));
617 ifr.ifr_data = (void *)&edata;
618 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
621 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
622 dev->data->port_id, strerror(rte_errno));
/* ethtool reports -1 (SPEED_UNKNOWN) when speed is unavailable. */
625 link_speed = ethtool_cmd_speed(&edata);
626 if (link_speed == -1)
627 dev_link.link_speed = ETH_SPEED_NUM_NONE;
629 dev_link.link_speed = link_speed;
630 priv->link_speed_capa = 0;
631 if (edata.supported & SUPPORTED_Autoneg)
632 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
633 if (edata.supported & (SUPPORTED_1000baseT_Full |
634 SUPPORTED_1000baseKX_Full))
635 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
636 if (edata.supported & SUPPORTED_10000baseKR_Full)
637 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
638 if (edata.supported & (SUPPORTED_40000baseKR4_Full |
639 SUPPORTED_40000baseCR4_Full |
640 SUPPORTED_40000baseSR4_Full |
641 SUPPORTED_40000baseLR4_Full))
642 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
643 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
644 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
645 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
646 ETH_LINK_SPEED_FIXED);
/* Inconsistent speed/status pair: link info not settled yet. */
647 if ((dev_link.link_speed && !dev_link.link_status) ||
648 (!dev_link.link_speed && dev_link.link_status)) {
657 * Retrieve physical link information (unlocked version using new ioctl).
660 * Pointer to Ethernet device structure.
662 * Storage for current link status.
665 * 0 on success, a negative errno value otherwise and rte_errno is set.
668 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
669 struct rte_eth_link *link)
/*
 * Query link state via ETHTOOL_GLINKSETTINGS (two-step handshake: first
 * call learns the link-mode mask word count, second call fetches the
 * masks); derives link_speed_capa from the supported-modes bitmap.
 */
672 struct priv *priv = dev->data->dev_private;
673 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
675 struct rte_eth_link dev_link;
679 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr, 1);
681 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
682 dev->data->port_id, strerror(rte_errno));
685 memset(&dev_link, 0, sizeof(dev_link));
686 dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
687 (ifr.ifr_flags & IFF_RUNNING));
688 ifr.ifr_data = (void *)&gcmd;
689 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
692 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
694 dev->data->port_id, strerror(rte_errno));
/* Kernel returns the negated word count on the handshake call. */
697 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
/* VLA-backed buffer: header + 3 masks (supported/advertising/peer). */
699 alignas(struct ethtool_link_settings)
700 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
701 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
702 struct ethtool_link_settings *ecmd = (void *)data;
705 ifr.ifr_data = (void *)ecmd;
706 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
709 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GLINKSETTINGS)"
711 dev->data->port_id, strerror(rte_errno));
714 dev_link.link_speed = ecmd->speed;
/* Collapse the first 64 supported-mode bits into one mask. */
715 sc = ecmd->link_mode_masks[0] |
716 ((uint64_t)ecmd->link_mode_masks[1] << 32);
717 priv->link_speed_capa = 0;
718 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
719 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
720 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
721 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
722 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
723 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
724 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
725 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
726 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
727 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
728 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
729 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
730 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
731 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
732 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
733 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
734 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
735 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
736 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
737 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
738 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
739 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
740 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
741 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
742 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
743 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
744 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
745 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
746 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
747 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
748 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
749 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
750 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
751 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
752 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
753 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
754 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
755 ETH_LINK_SPEED_FIXED);
/* Inconsistent speed/status pair: link info not settled yet. */
756 if ((dev_link.link_speed && !dev_link.link_status) ||
757 (!dev_link.link_speed && dev_link.link_status)) {
766 * DPDK callback to retrieve physical link information.
769 * Pointer to Ethernet device structure.
770 * @param wait_to_complete
771 * Wait for request completion.
774 * 0 if link status was not updated, positive if it was, a negative errno
775 * value otherwise and rte_errno is set.
778 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
/*
 * link_update callback: try the new GLINKSETTINGS path first, fall back
 * to the legacy GSET path; optionally retry on -EAGAIN until
 * MLX5_LINK_STATUS_TIMEOUT expires when wait_to_complete is set.
 */
781 struct rte_eth_link dev_link;
782 time_t start_time = time(NULL);
785 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
787 ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
790 /* Handle wait to complete situation. */
791 if (wait_to_complete && ret == -EAGAIN) {
792 if (abs((int)difftime(time(NULL), start_time)) <
793 MLX5_LINK_STATUS_TIMEOUT) {
800 } else if (ret < 0) {
803 } while (wait_to_complete);
/* Return 1 when the cached link state actually changed, 0 otherwise. */
804 ret = !!memcmp(&dev->data->dev_link, &dev_link,
805 sizeof(struct rte_eth_link));
806 dev->data->dev_link = dev_link;
811 * DPDK callback to change the MTU.
814 * Pointer to Ethernet device structure.
819 * 0 on success, a negative errno value otherwise and rte_errno is set.
822 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
/*
 * mtu_set callback: set the kernel interface MTU, then read it back to
 * confirm the kernel accepted the new value before updating the PMD.
 */
824 struct priv *priv = dev->data->dev_private;
825 uint16_t kern_mtu = 0;
828 ret = mlx5_get_mtu(dev, &kern_mtu);
831 /* Set kernel interface MTU first. */
832 ret = mlx5_set_mtu(dev, mtu);
835 ret = mlx5_get_mtu(dev, &kern_mtu);
/* Success only if the kernel now reports the requested MTU. */
838 if (kern_mtu == mtu) {
840 DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
841 dev->data->port_id, mtu);
849 * DPDK callback to get flow control status.
852 * Pointer to Ethernet device structure.
853 * @param[out] fc_conf
854 * Flow control output buffer.
857 * 0 on success, a negative errno value otherwise and rte_errno is set.
860 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
/*
 * flow_ctrl_get callback: read pause parameters via ETHTOOL_GPAUSEPARAM
 * and map the rx/tx pause pair onto the RTE_FC_* mode enum.
 */
863 struct ethtool_pauseparam ethpause = {
864 .cmd = ETHTOOL_GPAUSEPARAM
868 ifr.ifr_data = (void *)&ethpause;
869 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 1);
872 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
874 dev->data->port_id, strerror(rte_errno));
877 fc_conf->autoneg = ethpause.autoneg;
878 if (ethpause.rx_pause && ethpause.tx_pause)
879 fc_conf->mode = RTE_FC_FULL;
880 else if (ethpause.rx_pause)
881 fc_conf->mode = RTE_FC_RX_PAUSE;
882 else if (ethpause.tx_pause)
883 fc_conf->mode = RTE_FC_TX_PAUSE;
885 fc_conf->mode = RTE_FC_NONE;
890 * DPDK callback to modify flow control parameters.
893 * Pointer to Ethernet device structure.
895 * Flow control parameters.
898 * 0 on success, a negative errno value otherwise and rte_errno is set.
901 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
/*
 * flow_ctrl_set callback: translate the RTE_FC_* mode into rx/tx pause
 * flags and apply them via ETHTOOL_SPAUSEPARAM.
 */
904 struct ethtool_pauseparam ethpause = {
905 .cmd = ETHTOOL_SPAUSEPARAM
909 ifr.ifr_data = (void *)&ethpause;
910 ethpause.autoneg = fc_conf->autoneg;
/* FULL or RX_PAUSE both enable Rx pause. */
911 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
912 (fc_conf->mode & RTE_FC_RX_PAUSE))
913 ethpause.rx_pause = 1;
915 ethpause.rx_pause = 0;
/* FULL or TX_PAUSE both enable Tx pause. */
917 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
918 (fc_conf->mode & RTE_FC_TX_PAUSE))
919 ethpause.tx_pause = 1;
921 ethpause.tx_pause = 0;
922 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr, 0);
925 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
927 dev->data->port_id, strerror(rte_errno));
934 * Get PCI information from struct ibv_device.
937 * Pointer to Ethernet device structure.
938 * @param[out] pci_addr
939 * PCI bus address output buffer.
942 * 0 on success, a negative errno value otherwise and rte_errno is set.
945 mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
946 struct rte_pci_addr *pci_addr)
/*
 * Derive the PCI address of an ibv_device by parsing the PCI_SLOT_NAME
 * line from its sysfs uevent file.
 */
950 MKSTR(path, "%s/device/uevent", device->ibdev_path);
952 file = fopen(path, "rb");
957 while (fgets(line, sizeof(line), file) == line) {
958 size_t len = strlen(line);
961 /* Truncate long lines. */
962 if (len == (sizeof(line) - 1))
963 while (line[(len - 1)] != '\n') {
967 line[(len - 1)] = ret;
969 /* Extract information. */
/* domain:bus:devid.function, e.g. 0000:03:00.0 */
972 "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
976 &pci_addr->function) == 4) {
986 * Device status handler.
989 * Pointer to Ethernet device.
991 * Pointer to event flags holder.
994 * Events bitmap of callback process which can be called immediately.
997 mlx5_dev_status_handler(struct rte_eth_dev *dev)
/*
 * Drain and acknowledge all pending IBV async events, returning a bitmap
 * of RTE_ETH_EVENT_* callbacks that must be fired (LSC on port
 * active/error, RMV on device fatal).
 */
999 struct priv *priv = dev->data->dev_private;
1000 struct ibv_async_event event;
/* Link state not settled yet (-EAGAIN): handled in elided code. */
1003 if (mlx5_link_update(dev, 0) == -EAGAIN) {
1007 /* Read all message and acknowledge them. */
1009 if (mlx5_glue->get_async_event(priv->ctx, &event))
1011 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
1012 event.event_type == IBV_EVENT_PORT_ERR) &&
1013 (dev->data->dev_conf.intr_conf.lsc == 1))
1014 ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
1015 else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
1016 dev->data->dev_conf.intr_conf.rmv == 1)
1017 ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
1020 "port %u event type %d on not handled",
1021 dev->data->port_id, event.event_type);
/* Every event obtained via get_async_event() must be acked. */
1022 mlx5_glue->ack_async_event(&event);
1028 * Handle interrupts from the NIC.
1030 * @param[in] intr_handle
1031 * Interrupt handler.
1033 * Callback argument.
1036 mlx5_dev_interrupt_handler(void *cb_arg)
/*
 * NIC interrupt callback: collect pending events and dispatch LSC/RMV
 * notifications to registered application callbacks.
 */
1038 struct rte_eth_dev *dev = cb_arg;
1041 events = mlx5_dev_status_handler(dev);
1042 if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
1043 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
1044 if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
1045 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
1049 * Handle interrupts from the socket.
1052 * Callback argument.
1055 mlx5_dev_handler_socket(void *cb_arg)
/* Interrupt callback for the primary-process socket: delegate handling. */
1057 struct rte_eth_dev *dev = cb_arg;
1059 mlx5_socket_handle(dev);
1063 * Uninstall interrupt handler.
1066 * Pointer to Ethernet device.
1069 mlx5_dev_interrupt_handler_uninstall(struct rte_eth_dev *dev)
/*
 * Unregister the async-event and socket interrupt callbacks (when they
 * were installed) and reset both interrupt handles.
 */
1071 struct priv *priv = dev->data->dev_private;
1073 if (dev->data->dev_conf.intr_conf.lsc ||
1074 dev->data->dev_conf.intr_conf.rmv)
1075 rte_intr_callback_unregister(&priv->intr_handle,
1076 mlx5_dev_interrupt_handler, dev);
1077 if (priv->primary_socket)
1078 rte_intr_callback_unregister(&priv->intr_handle_socket,
1079 mlx5_dev_handler_socket, dev);
1080 priv->intr_handle.fd = 0;
1081 priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
1082 priv->intr_handle_socket.fd = 0;
1083 priv->intr_handle_socket.type = RTE_INTR_HANDLE_UNKNOWN;
1087 * Install interrupt handler.
1090 * Pointer to Ethernet device.
1093 mlx5_dev_interrupt_handler_install(struct rte_eth_dev *dev)
/*
 * Install interrupt handlers: make the verbs async fd non-blocking,
 * register the device event callback (LSC/RMV) and, for the primary
 * process, the IPC socket callback.
 */
1095 struct priv *priv = dev->data->dev_private;
1099 assert(priv->ctx->async_fd > 0);
1100 flags = fcntl(priv->ctx->async_fd, F_GETFL);
1101 ret = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
/* Cannot make fd non-blocking: disable LSC/RMV reporting entirely. */
1104 "port %u failed to change file descriptor async event"
1106 dev->data->port_id);
1107 dev->data->dev_conf.intr_conf.lsc = 0;
1108 dev->data->dev_conf.intr_conf.rmv = 0;
1110 if (dev->data->dev_conf.intr_conf.lsc ||
1111 dev->data->dev_conf.intr_conf.rmv) {
1112 priv->intr_handle.fd = priv->ctx->async_fd;
1113 priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1114 rte_intr_callback_register(&priv->intr_handle,
1115 mlx5_dev_interrupt_handler, dev);
1117 ret = mlx5_socket_init(dev);
1119 DRV_LOG(ERR, "port %u cannot initialise socket: %s",
1120 dev->data->port_id, strerror(rte_errno))
1121 else if (priv->primary_socket) {
1122 priv->intr_handle_socket.fd = priv->primary_socket;
1123 priv->intr_handle_socket.type = RTE_INTR_HANDLE_EXT;
1124 rte_intr_callback_register(&priv->intr_handle_socket,
1125 mlx5_dev_handler_socket, dev);
1130 * DPDK callback to bring the link DOWN.
1133 * Pointer to Ethernet device structure.
1136 * 0 on success, a negative errno value otherwise and rte_errno is set.
1139 mlx5_set_link_down(struct rte_eth_dev *dev)
/* Bring the link down by clearing IFF_UP (keep all other flags). */
1141 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
1145 * DPDK callback to bring the link UP.
1148 * Pointer to Ethernet device structure.
1151 * 0 on success, a negative errno value otherwise and rte_errno is set.
1154 mlx5_set_link_up(struct rte_eth_dev *dev)
/* Bring the link up by setting IFF_UP (keep all other flags). */
1156 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
1160 * Configure the TX function to use.
1163 * Pointer to private data structure.
1166 * Pointer to selected Tx burst function.
1169 mlx5_select_tx_function(struct rte_eth_dev *dev)
/*
 * Pick the Tx burst function for this port based on the configured Tx
 * offloads and MPW capability: TSO/SWP/VLAN insertion force the default
 * burst; otherwise prefer vectorized/enhanced MPW, then MPW variants.
 */
1171 struct priv *priv = dev->data->dev_private;
1172 eth_tx_burst_t tx_pkt_burst = mlx5_tx_burst;
1173 struct mlx5_dev_config *config = &priv->config;
1174 uint64_t tx_offloads = dev->data->dev_conf.txmode.offloads;
/* Any TSO flavor requested? */
1175 int tso = !!(tx_offloads & (DEV_TX_OFFLOAD_TCP_TSO |
1176 DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
1177 DEV_TX_OFFLOAD_GRE_TNL_TSO |
1178 DEV_TX_OFFLOAD_IP_TNL_TSO |
1179 DEV_TX_OFFLOAD_UDP_TNL_TSO));
/* Software parser needed (tunnel TSO / outer checksum)? */
1180 int swp = !!(tx_offloads & (DEV_TX_OFFLOAD_IP_TNL_TSO |
1181 DEV_TX_OFFLOAD_UDP_TNL_TSO |
1182 DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM));
1183 int vlan_insert = !!(tx_offloads & DEV_TX_OFFLOAD_VLAN_INSERT);
1185 assert(priv != NULL);
1186 /* Select appropriate TX function. */
1187 if (vlan_insert || tso || swp)
1188 return tx_pkt_burst;
1189 if (config->mps == MLX5_MPW_ENHANCED) {
1190 if (mlx5_check_vec_tx_support(dev) > 0) {
1191 if (mlx5_check_raw_vec_tx_support(dev) > 0)
1192 tx_pkt_burst = mlx5_tx_burst_raw_vec;
1194 tx_pkt_burst = mlx5_tx_burst_vec;
1196 "port %u selected enhanced MPW Tx vectorized"
1198 dev->data->port_id);
1200 tx_pkt_burst = mlx5_tx_burst_empw;
1202 "port %u selected enhanced MPW Tx function",
1203 dev->data->port_id);
1205 } else if (config->mps && (config->txq_inline > 0)) {
1206 tx_pkt_burst = mlx5_tx_burst_mpw_inline;
1207 DRV_LOG(DEBUG, "port %u selected MPW inline Tx function",
1208 dev->data->port_id);
1209 } else if (config->mps) {
1210 tx_pkt_burst = mlx5_tx_burst_mpw;
1211 DRV_LOG(DEBUG, "port %u selected MPW Tx function",
1212 dev->data->port_id);
1214 return tx_pkt_burst;
1218 * Configure the RX function to use.
1221 * Pointer to private data structure.
1224 * Pointer to selected Rx burst function.
1227 mlx5_select_rx_function(struct rte_eth_dev *dev)
/*
 * Pick the Rx burst function: vectorized when supported, multi-packet
 * RQ when enabled, otherwise the scalar default.
 */
1229 eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
1231 assert(dev != NULL);
1232 if (mlx5_check_vec_rx_support(dev) > 0) {
1233 rx_pkt_burst = mlx5_rx_burst_vec;
1234 DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
1235 dev->data->port_id);
1236 } else if (mlx5_mprq_enabled(dev)) {
1237 rx_pkt_burst = mlx5_rx_burst_mprq;
1239 return rx_pkt_burst;
1243 * Check if mlx5 device was removed.
1246 * Pointer to Ethernet device structure.
1249 * 1 when device is removed, otherwise 0.
1252 mlx5_is_removed(struct rte_eth_dev *dev)
/* Probe device removal: an EIO from query_device means it is gone. */
1254 struct ibv_device_attr device_attr;
1255 struct priv *priv = dev->data->dev_private;
1257 if (mlx5_glue->query_device(priv->ctx, &device_attr) == EIO)
1263 * Get port ID list of mlx5 instances sharing a common device.
1266 * Device to look for.
1267 * @param[out] port_list
1268 * Result buffer for collected port IDs.
1269 * @param port_list_n
1270 * Maximum number of entries in result buffer. If 0, @p port_list can be
1274 * Number of matching instances regardless of the @p port_list_n
1275 * parameter, 0 if none were found.
1278 mlx5_dev_to_port_id(const struct rte_device *dev, uint16_t *port_list,
1279 unsigned int port_list_n)
1284 RTE_ETH_FOREACH_DEV(id) {
1285 struct rte_eth_dev *ldev = &rte_eth_devices[id];
1287 if (!ldev->device ||
1288 !ldev->device->driver ||
1289 strcmp(ldev->device->driver->name, MLX5_DRIVER_NAME) ||
1290 ldev->device != dev)
1292 if (n < port_list_n)