1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2015 6WIND S.A.
3 * Copyright 2015 Mellanox Technologies, Ltd
17 #include <sys/ioctl.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
20 #include <linux/ethtool.h>
21 #include <linux/sockios.h>
27 #include <rte_atomic.h>
28 #include <rte_ethdev_driver.h>
29 #include <rte_bus_pci.h>
31 #include <rte_common.h>
32 #include <rte_interrupts.h>
33 #include <rte_malloc.h>
34 #include <rte_string_fns.h>
35 #include <rte_rwlock.h>
36 #include <rte_cycles.h>
38 #include <mlx5_glue.h>
39 #include <mlx5_devx_cmds.h>
40 #include <mlx5_common.h>
43 #include "mlx5_rxtx.h"
44 #include "mlx5_utils.h"
/*
 * Fallback definitions for the legacy ethtool SUPPORTED_* link-mode flags.
 * Each flag is defined here only when the matching HAVE_SUPPORTED_* macro is
 * not defined. Every value is a single-bit mask; the 40G modes use bits 23 to
 * 26 and the 56G modes use bits 27 to 30.
 * NOTE(review): the closing #endif of each guard is not visible in this
 * listing - confirm against the complete file.
 */
46 /* Supported speed values found in /usr/include/linux/ethtool.h */
47 #ifndef HAVE_SUPPORTED_40000baseKR4_Full
48 #define SUPPORTED_40000baseKR4_Full (1 << 23)
50 #ifndef HAVE_SUPPORTED_40000baseCR4_Full
51 #define SUPPORTED_40000baseCR4_Full (1 << 24)
53 #ifndef HAVE_SUPPORTED_40000baseSR4_Full
54 #define SUPPORTED_40000baseSR4_Full (1 << 25)
56 #ifndef HAVE_SUPPORTED_40000baseLR4_Full
57 #define SUPPORTED_40000baseLR4_Full (1 << 26)
59 #ifndef HAVE_SUPPORTED_56000baseKR4_Full
60 #define SUPPORTED_56000baseKR4_Full (1 << 27)
62 #ifndef HAVE_SUPPORTED_56000baseCR4_Full
63 #define SUPPORTED_56000baseCR4_Full (1 << 28)
65 #ifndef HAVE_SUPPORTED_56000baseSR4_Full
66 #define SUPPORTED_56000baseSR4_Full (1 << 29)
68 #ifndef HAVE_SUPPORTED_56000baseLR4_Full
69 #define SUPPORTED_56000baseLR4_Full (1 << 30)
/*
 * Fallback declarations used when the build headers do not provide
 * ETHTOOL_GLINKSETTINGS: a local struct ethtool_link_settings, the command
 * number, and the link-mode bit indices tested by the link update code.
 * The *_BIT macros are bit positions, not masks.
 * NOTE(review): most members of the struct and the #endif lines closing the
 * guards are not visible in this listing.
 */
72 /* Add defines in case the running kernel is not the same as user headers. */
73 #ifndef ETHTOOL_GLINKSETTINGS
74 struct ethtool_link_settings {
83 uint8_t eth_tp_mdix_ctrl;
/* Signed count of 32-bit words per link-mode mask. */
84 int8_t link_mode_masks_nwords;
/* Flexible array holding the link-mode masks. */
86 uint32_t link_mode_masks[];
89 /* The kernel values can be found in /include/uapi/linux/ethtool.h */
90 #define ETHTOOL_GLINKSETTINGS 0x0000004c
91 #define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
92 #define ETHTOOL_LINK_MODE_Autoneg_BIT 6
93 #define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
94 #define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
95 #define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
96 #define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
97 #define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
98 #define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
99 #define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
100 #define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
101 #define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
102 #define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
103 #define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
104 #define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
105 #define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
106 #define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
/* 25G bit indices, defined only when HAVE_ETHTOOL_LINK_MODE_25G is absent. */
108 #ifndef HAVE_ETHTOOL_LINK_MODE_25G
109 #define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
110 #define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
111 #define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
/* 50G bit indices, defined only when HAVE_ETHTOOL_LINK_MODE_50G is absent. */
113 #ifndef HAVE_ETHTOOL_LINK_MODE_50G
114 #define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
115 #define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
/* 100G bit indices, defined only when HAVE_ETHTOOL_LINK_MODE_100G is absent. */
117 #ifndef HAVE_ETHTOOL_LINK_MODE_100G
118 #define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
119 #define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
120 #define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
121 #define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
/*
 * 200G bit indices, defined only when HAVE_ETHTOOL_LINK_MODE_200G is absent.
 * The last three values are stored with 64 already subtracted (see the
 * trailing comments), so they index into a second 64-bit word rather than
 * the first one.
 */
123 #ifndef HAVE_ETHTOOL_LINK_MODE_200G
124 #define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
125 #define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
126 #define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
127 #define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
128 #define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
133 * Get interface name from private structure.
135 * This is a port representor-aware version of mlx5_get_ifname_sysfs().
138 * Pointer to Ethernet device.
140 * Interface name output buffer.
143 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_get_ifname: write the network interface name of the port into the
 * caller-provided buffer.
 *
 * dev    - Ethernet device; private data is read from dev->data->dev_private.
 * ifname - pointer to a char[IF_NAMESIZE] output buffer.
 *
 * NOTE(review): the return type, braces and several statements of this
 * function are absent from this listing, so the nesting of the visible
 * conditions cannot be confirmed from here.
 */
146 mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
148 struct mlx5_priv *priv = dev->data->dev_private;
149 unsigned int ifindex;
/* priv->sh is dereferenced below, so it is asserted to be set. */
152 MLX5_ASSERT(priv->sh);
153 ifindex = mlx5_ifindex(dev);
/*
 * When priv->representor is clear, the result of the sysfs lookup keyed by
 * the IB device path is returned directly.
 */
155 if (!priv->representor)
156 return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
/* Translate the interface index into a name, written at the buffer start. */
161 if (if_indextoname(ifindex, &(*ifname)[0]))
168 * Perform ifreq ioctl() on associated Ethernet device.
171 * Pointer to Ethernet device.
173 * Request number to pass to ioctl().
175 * Interface request structure output buffer.
178 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_ifreq: perform an interface-request ioctl() for the given port.
 *
 * dev - Ethernet device used to resolve the interface name.
 * req - ioctl request number.
 * ifr - request structure; its ifr_name field is filled in here before the
 *       ioctl is issued.
 *
 * NOTE(review): error checks and the close of the socket are not visible in
 * this listing - confirm that every path releases sock.
 */
181 mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
/* A datagram socket is opened only to serve as the ioctl handle. */
183 int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
190 ret = mlx5_get_ifname(dev, &ifr->ifr_name);
193 ret = ioctl(sock, req, ifr);
209 * Pointer to Ethernet device.
211 * MTU value output buffer.
214 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_get_mtu: read the interface MTU with the SIOCGIFMTU request and store
 * it in *mtu.
 *
 * dev - Ethernet device.
 * mtu - output location for the MTU value.
 *
 * NOTE(review): the check of ret and the return statements are not visible
 * in this listing.
 */
217 mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
219 struct ifreq request;
220 int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
/* ifr_mtu is assigned to a uint16_t, so the value is converted on store. */
224 *mtu = request.ifr_mtu;
232 * Pointer to Ethernet device.
237 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_set_mtu: set the interface MTU with the SIOCSIFMTU request.
 *
 * dev - Ethernet device.
 * mtu - MTU value to apply.
 *
 * Returns whatever mlx5_ifreq() returns.
 */
240 mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
/* Only ifr_mtu is initialised explicitly; mlx5_ifreq() fills in the name. */
242 struct ifreq request = { .ifr_mtu = mtu, };
244 return mlx5_ifreq(dev, SIOCSIFMTU, &request);
251 * Pointer to Ethernet device.
253 * Bitmask for flags that must remain untouched.
255 * Bitmask for flags to modify.
258 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_set_flags: read-modify-write of the interface flags.
 *
 * dev   - Ethernet device.
 * keep  - mask of flag bits whose current value is preserved.
 * flags - new values; only bits outside of keep are applied.
 *
 * The current flags are fetched with SIOCGIFFLAGS, combined as
 * (current & keep) | (flags & ~keep), and written back with SIOCSIFFLAGS.
 * NOTE(review): the check of the first ioctl result is not visible in this
 * listing.
 */
261 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
263 struct ifreq request;
264 int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
/* Drop every bit that is not protected by keep ... */
268 request.ifr_flags &= keep;
/* ... then take the unprotected bits from the requested value. */
269 request.ifr_flags |= flags & ~keep;
270 return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
274 * Get device current raw clock counter
277 * Pointer to Ethernet device structure.
279 * Current raw clock counter of the device.
282 * 0 if the clock has correctly been read
283 * The value of errno in case of error
/*
 * mlx5_read_clock: query the device raw clock through the glue layer and
 * store the result in *clock.
 *
 * dev   - Ethernet device; the verbs context is taken from priv->sh->ctx.
 * clock - output location for the counter value.
 *
 * NOTE(review): the declaration of err, the error branch and the return
 * statements are not visible in this listing.
 */
286 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
288 struct mlx5_priv *priv = dev->data->dev_private;
289 struct ibv_context *ctx = priv->sh->ctx;
290 struct ibv_values_ex values;
/* Request only the raw clock from the query. */
293 values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
294 err = mlx5_glue->query_rt_values_ex(ctx, &values);
296 DRV_LOG(WARNING, "Could not query the clock !");
/*
 * Only the tv_nsec member is copied out.
 * NOTE(review): presumably the raw counter is reported in tv_nsec - confirm
 * against the ibv_query_rt_values_ex documentation.
 */
299 *clock = values.raw_clock.tv_nsec;
304 * Retrieve the master device for representor in the same switch domain.
307 * Pointer to representor Ethernet device structure.
310 * Master device structure on success, NULL otherwise.
/*
 * mlx5_find_master_dev: look for another port that shares the switch domain
 * and the shared device context of the given representor.
 *
 * dev - representor Ethernet device (priv->representor is asserted).
 *
 * Iterates over the ports of priv->pci_dev and returns the first entry of
 * rte_eth_devices[] that satisfies the visible conditions.
 * NOTE(review): the first operand of the match condition and the fall-through
 * return are not visible in this listing.
 */
312 static struct rte_eth_dev *
313 mlx5_find_master_dev(struct rte_eth_dev *dev)
315 struct mlx5_priv *priv;
319 priv = dev->data->dev_private;
320 domain_id = priv->domain_id;
321 MLX5_ASSERT(priv->representor);
322 MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
323 struct mlx5_priv *opriv =
324 rte_eth_devices[port_id].data->dev_private;
/* Same switch domain and same shared context as the representor. */
327 opriv->domain_id == domain_id &&
328 opriv->sh == priv->sh)
329 return &rte_eth_devices[port_id];
335 * DPDK callback to retrieve physical link information.
338 * Pointer to Ethernet device structure.
340 * Storage for current link status.
343 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_link_update_unlocked_gset: build the link description of a port with
 * the ETHTOOL_GSET request.
 *
 * dev  - Ethernet device.
 * link - storage for the resulting link description.
 *
 * Visible steps: read the interface flags, query ETHTOOL_GSET (retrying on
 * the master device for representors when the request is not supported),
 * then derive speed, speed capabilities, duplex and autoneg.
 * NOTE(review): the return type, several declarations (ifr, ret, link_speed),
 * the error returns and the final store into *link are not visible in this
 * listing.
 */
346 mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
347 struct rte_eth_link *link)
349 struct mlx5_priv *priv = dev->data->dev_private;
350 struct ethtool_cmd edata = {
351 .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
354 struct rte_eth_link dev_link;
358 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
360 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
361 dev->data->port_id, strerror(rte_errno));
/* The link counts as up only when both IFF_UP and IFF_RUNNING are set. */
364 dev_link = (struct rte_eth_link) {
365 .link_status = ((ifr.ifr_flags & IFF_UP) &&
366 (ifr.ifr_flags & IFF_RUNNING)),
/* Reset the request and attach the ethtool command buffer. */
368 ifr = (struct ifreq) {
369 .ifr_data = (void *)&edata,
371 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
/* Representor fallback: repeat the query on the master device. */
373 if (ret == -ENOTSUP && priv->representor) {
374 struct rte_eth_dev *master;
377 * For representors we can try to inherit link
378 * settings from the master device. Actually
379 * link settings do not make a lot of sense
380 * for representors due to missing physical
381 * link. The old kernel drivers supported
382 * emulated settings query for representors,
383 * the new ones do not, so we have to add
384 * this code for compatibility issues.
386 master = mlx5_find_master_dev(dev);
388 ifr = (struct ifreq) {
389 .ifr_data = (void *)&edata,
391 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
396 "port %u ioctl(SIOCETHTOOL,"
397 " ETHTOOL_GSET) failed: %s",
398 dev->data->port_id, strerror(rte_errno));
/* A speed of -1 is mapped to ETH_SPEED_NUM_NONE. */
402 link_speed = ethtool_cmd_speed(&edata);
403 if (link_speed == -1)
404 dev_link.link_speed = ETH_SPEED_NUM_NONE;
406 dev_link.link_speed = link_speed;
/* Rebuild the capability mask from the SUPPORTED_* bits reported. */
407 priv->link_speed_capa = 0;
408 if (edata.supported & SUPPORTED_Autoneg)
409 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
410 if (edata.supported & (SUPPORTED_1000baseT_Full |
411 SUPPORTED_1000baseKX_Full))
412 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
413 if (edata.supported & SUPPORTED_10000baseKR_Full)
414 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
415 if (edata.supported & (SUPPORTED_40000baseKR4_Full |
416 SUPPORTED_40000baseCR4_Full |
417 SUPPORTED_40000baseSR4_Full |
418 SUPPORTED_40000baseLR4_Full))
419 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
/* Anything other than DUPLEX_HALF is reported as full duplex. */
420 dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
421 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
/* Autoneg is derived from the configuration, not from the ethtool reply. */
422 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
423 ETH_LINK_SPEED_FIXED);
/*
 * Detect an inconsistent snapshot: a speed without link, or link without a
 * speed. NOTE(review): the body of this branch is not visible here.
 */
424 if (((dev_link.link_speed && !dev_link.link_status) ||
425 (!dev_link.link_speed && dev_link.link_status))) {
434 * Retrieve physical link information (unlocked version using new ioctl).
437 * Pointer to Ethernet device structure.
439 * Storage for current link status.
442 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_link_update_unlocked_gs: build the link description of a port with
 * the ETHTOOL_GLINKSETTINGS request.
 *
 * dev  - Ethernet device.
 * link - storage for the resulting link description.
 *
 * Visible steps: read the interface flags, issue a first GLINKSETTINGS query
 * (retrying on the master device for representors when not supported), size
 * a buffer from the reported mask word count, query again into that buffer,
 * then derive speed, speed capabilities, duplex and autoneg.
 * NOTE(review): the return type, several declarations (ifr, ret, sc), the
 * error returns and the final store into *link are not visible in this
 * listing.
 */
445 mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
446 struct rte_eth_link *link)
449 struct mlx5_priv *priv = dev->data->dev_private;
450 struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
452 struct rte_eth_link dev_link;
/* Stays NULL unless the representor fallback below selects a master. */
453 struct rte_eth_dev *master = NULL;
457 ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
459 DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
460 dev->data->port_id, strerror(rte_errno));
/* The link counts as up only when both IFF_UP and IFF_RUNNING are set. */
463 dev_link = (struct rte_eth_link) {
464 .link_status = ((ifr.ifr_flags & IFF_UP) &&
465 (ifr.ifr_flags & IFF_RUNNING)),
467 ifr = (struct ifreq) {
468 .ifr_data = (void *)&gcmd,
470 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
/* Representor fallback: repeat the query on the master device. */
472 if (ret == -ENOTSUP && priv->representor) {
474 * For representors we can try to inherit link
475 * settings from the master device. Actually
476 * link settings do not make a lot of sense
477 * for representors due to missing physical
478 * link. The old kernel drivers supported
479 * emulated settings query for representors,
480 * the new ones do not, so we have to add
481 * this code for compatibility issues.
483 master = mlx5_find_master_dev(dev);
485 ifr = (struct ifreq) {
486 .ifr_data = (void *)&gcmd,
488 ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
493 "port %u ioctl(SIOCETHTOOL,"
494 " ETHTOOL_GLINKSETTINGS) failed: %s",
495 dev->data->port_id, strerror(rte_errno));
/*
 * Flip the sign of the word count returned by the first query.
 * NOTE(review): presumably the first reply carries the required count as a
 * negative number - confirm against the ETHTOOL_GLINKSETTINGS documentation.
 */
499 gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
/* Stack buffer: fixed header plus three masks of nwords 32-bit words each. */
501 alignas(struct ethtool_link_settings)
502 uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
503 sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
504 struct ethtool_link_settings *ecmd = (void *)data;
507 ifr.ifr_data = (void *)ecmd;
/* Second query goes to the master when the fallback above was taken. */
508 ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
511 "port %u ioctl(SIOCETHTOOL,"
512 "ETHTOOL_GLINKSETTINGS) failed: %s",
513 dev->data->port_id, strerror(rte_errno));
/* A speed of UINT32_MAX is mapped to ETH_SPEED_NUM_NONE. */
516 dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
/* Combine the first two 32-bit mask words into one 64-bit value. */
518 sc = ecmd->link_mode_masks[0] |
519 ((uint64_t)ecmd->link_mode_masks[1] << 32);
520 priv->link_speed_capa = 0;
521 if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
522 priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
523 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
524 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
525 priv->link_speed_capa |= ETH_LINK_SPEED_1G;
526 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
527 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
528 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
529 priv->link_speed_capa |= ETH_LINK_SPEED_10G;
530 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
531 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
532 priv->link_speed_capa |= ETH_LINK_SPEED_20G;
533 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
534 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
535 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
536 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
537 priv->link_speed_capa |= ETH_LINK_SPEED_40G;
538 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
539 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
540 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
541 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
542 priv->link_speed_capa |= ETH_LINK_SPEED_56G;
543 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
544 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
545 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
546 priv->link_speed_capa |= ETH_LINK_SPEED_25G;
547 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
548 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
549 priv->link_speed_capa |= ETH_LINK_SPEED_50G;
550 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
551 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
552 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
553 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
554 priv->link_speed_capa |= ETH_LINK_SPEED_100G;
555 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
556 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
557 priv->link_speed_capa |= ETH_LINK_SPEED_200G;
/*
 * Second 64-bit value, built from array words 2 and 3, for the 200G bits
 * whose indices were stored with 64 subtracted.
 * NOTE(review): whether words 2 and 3 hold the continuation of the first
 * mask depends on link_mode_masks_nwords; a condition guarding this section
 * may exist on a line missing from this listing - confirm.
 */
559 sc = ecmd->link_mode_masks[2] |
560 ((uint64_t)ecmd->link_mode_masks[3] << 32);
561 if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
563 (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
564 MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
565 priv->link_speed_capa |= ETH_LINK_SPEED_200G;
/* Anything other than DUPLEX_HALF is reported as full duplex. */
566 dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
567 ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
/* Autoneg is derived from the configuration, not from the ethtool reply. */
568 dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
569 ETH_LINK_SPEED_FIXED);
/*
 * Detect an inconsistent snapshot: a speed without link, or link without a
 * speed. NOTE(review): the body of this branch is not visible here.
 */
570 if (((dev_link.link_speed && !dev_link.link_status) ||
571 (!dev_link.link_speed && dev_link.link_status))) {
580 * DPDK callback to retrieve physical link information.
583 * Pointer to Ethernet device structure.
584 * @param wait_to_complete
585 * Wait for request completion.
588 * 0 if link status was not updated, positive if it was, a negative errno
589 * value otherwise and rte_errno is set.
/*
 * mlx5_link_update: refresh dev->data->dev_link and report whether it
 * changed.
 *
 * dev              - Ethernet device.
 * wait_to_complete - when non-zero the loop keeps going regardless of the
 *                    retry counter (see the loop condition).
 *
 * Both query helpers appear below; the lines that choose between them are
 * not visible. A result of -EAGAIN is retried while the elapsed time is
 * under MLX5_LINK_STATUS_TIMEOUT. The value computed at the end is 1 when
 * the stored link differs from the new one and 0 otherwise.
 * NOTE(review): the return type, the opening of the do-loop, the bodies of
 * the retry branches and the final return are not visible in this listing.
 */
592 mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
595 struct rte_eth_link dev_link;
596 time_t start_time = time(NULL);
597 int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
600 ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
602 ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
605 /* Handle wait to complete situation. */
606 if ((wait_to_complete || retry) && ret == -EAGAIN) {
/* Elapsed wall-clock seconds since entry, compared with the timeout. */
607 if (abs((int)difftime(time(NULL), start_time)) <
608 MLX5_LINK_STATUS_TIMEOUT) {
615 } else if (ret < 0) {
618 } while (wait_to_complete || retry-- > 0);
/* Byte-wise comparison of the old and new link structures. */
619 ret = !!memcmp(&dev->data->dev_link, &dev_link,
620 sizeof(struct rte_eth_link));
621 dev->data->dev_link = dev_link;
626 * DPDK callback to get flow control status.
629 * Pointer to Ethernet device structure.
630 * @param[out] fc_conf
631 * Flow control output buffer.
634 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_dev_get_flow_ctrl: read the pause parameters of the port with the
 * ETHTOOL_GPAUSEPARAM request and translate them into fc_conf.
 *
 * dev     - Ethernet device.
 * fc_conf - output; autoneg and mode are written here.
 *
 * Mode mapping: rx and tx pause -> RTE_FC_FULL, rx only -> RTE_FC_RX_PAUSE,
 * tx only -> RTE_FC_TX_PAUSE, neither -> RTE_FC_NONE.
 * NOTE(review): the return type, the declarations of ifr and ret, the error
 * branch and the return statements are not visible in this listing.
 */
637 mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
640 struct ethtool_pauseparam ethpause = {
641 .cmd = ETHTOOL_GPAUSEPARAM
/* Hand the pause-parameter structure to the ioctl through ifr_data. */
645 ifr.ifr_data = (void *)&ethpause;
646 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
649 "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
651 dev->data->port_id, strerror(rte_errno));
654 fc_conf->autoneg = ethpause.autoneg;
655 if (ethpause.rx_pause && ethpause.tx_pause)
656 fc_conf->mode = RTE_FC_FULL;
657 else if (ethpause.rx_pause)
658 fc_conf->mode = RTE_FC_RX_PAUSE;
659 else if (ethpause.tx_pause)
660 fc_conf->mode = RTE_FC_TX_PAUSE;
662 fc_conf->mode = RTE_FC_NONE;
667 * DPDK callback to modify flow control parameters.
670 * Pointer to Ethernet device structure.
672 * Flow control parameters.
675 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_dev_set_flow_ctrl: apply the pause parameters described by fc_conf
 * with the ETHTOOL_SPAUSEPARAM request.
 *
 * dev     - Ethernet device.
 * fc_conf - requested autoneg setting and flow control mode.
 *
 * rx_pause is enabled for RTE_FC_FULL or when the RTE_FC_RX_PAUSE bit is
 * set; tx_pause is enabled for RTE_FC_FULL or when the RTE_FC_TX_PAUSE bit
 * is set.
 * NOTE(review): the return type, the declarations of ifr and ret, the else
 * keywords, the error branch and the return statements are not visible in
 * this listing.
 */
678 mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
681 struct ethtool_pauseparam ethpause = {
682 .cmd = ETHTOOL_SPAUSEPARAM
/* Hand the pause-parameter structure to the ioctl through ifr_data. */
686 ifr.ifr_data = (void *)&ethpause;
687 ethpause.autoneg = fc_conf->autoneg;
688 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
689 (fc_conf->mode & RTE_FC_RX_PAUSE))
690 ethpause.rx_pause = 1;
692 ethpause.rx_pause = 0;
694 if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
695 (fc_conf->mode & RTE_FC_TX_PAUSE))
696 ethpause.tx_pause = 1;
698 ethpause.tx_pause = 0;
699 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
702 "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
704 dev->data->port_id, strerror(rte_errno));
711 * Handle asynchronous removal event for entire multiport device.
714 * Infiniband device shared context.
/*
 * mlx5_dev_interrupt_device_fatal: walk every port slot of the shared
 * context and raise RTE_ETH_EVENT_INTR_RMV on the ports that enabled removal
 * interrupts (dev_conf.intr_conf.rmv).
 *
 * sh - shared device context; sh->max_port bounds the loop.
 *
 * Slots whose ih_port_id is not below RTE_MAX_ETHPORTS are treated as having
 * no usable port.
 * NOTE(review): the return type, the declaration of i and the body of the
 * skip branch are not visible in this listing.
 */
717 mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
721 for (i = 0; i < sh->max_port; ++i) {
722 struct rte_eth_dev *dev;
724 if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
726 * Or not existing port either no
727 * handler installed for this port.
731 dev = &rte_eth_devices[sh->port[i].ih_port_id];
733 if (dev->data->dev_conf.intr_conf.rmv)
734 _rte_eth_dev_callback_process
735 (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
740 * Handle shared asynchronous events from the NIC (removal event
741 * and link status change). Supports multiport IB device.
/*
 * mlx5_dev_interrupt_handler: drain the asynchronous verbs events of the
 * shared context, acknowledge each one and dispatch it.
 *
 * cb_arg - the struct mlx5_dev_ctx_shared registered with the handler.
 *
 * Visible dispatch rules:
 *  - IBV_EVENT_DEVICE_FATAL with port number 0 is forwarded to every port
 *    through mlx5_dev_interrupt_device_fatal();
 *  - a port number above sh->max_port, or a port slot without an installed
 *    handler, is acknowledged and logged;
 *  - PORT_ACTIVE and PORT_ERR on a port with link-status interrupts enabled
 *    trigger mlx5_link_update() and the RTE_ETH_EVENT_INTR_LSC callback;
 *  - anything else is logged as unknown and acknowledged.
 * NOTE(review): the return type, the loop construct, the declaration of tmp,
 * the DRV_LOG call heads and the continue/break statements are not visible
 * in this listing.
 */
747 mlx5_dev_interrupt_handler(void *cb_arg)
749 struct mlx5_dev_ctx_shared *sh = cb_arg;
750 struct ibv_async_event event;
752 /* Read all message from the IB device and acknowledge them. */
754 struct rte_eth_dev *dev;
757 if (mlx5_glue->get_async_event(sh->ctx, &event))
759 /* Retrieve and check IB port index. */
760 tmp = (uint32_t)event.element.port_num;
761 if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
763 * The DEVICE_FATAL event is called once for
764 * entire device without port specifying.
765 * We should notify all existing ports.
767 mlx5_glue->ack_async_event(&event);
768 mlx5_dev_interrupt_device_fatal(sh);
771 MLX5_ASSERT(tmp && (tmp <= sh->max_port));
773 /* Unsupported device level event. */
774 mlx5_glue->ack_async_event(&event);
776 "unsupported common event (type %d)",
780 if (tmp > sh->max_port) {
781 /* Invalid IB port index. */
782 mlx5_glue->ack_async_event(&event);
/* The continuation literal starts with a space so the words do not run together. */
784 "cannot handle an event (type %d)"
785 " due to invalid IB port index (%u)",
786 event.event_type, tmp);
/* IB port numbers are 1-based while the sh->port[] array is 0-based. */
789 if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
790 /* No handler installed. */
791 mlx5_glue->ack_async_event(&event);
793 "cannot handle an event (type %d)"
794 " due to no handler installed for port %u",
795 event.event_type, tmp);
798 /* Retrieve ethernet device descriptor. */
/* tmp is reused: from here on it holds the ethdev port id. */
799 tmp = sh->port[tmp - 1].ih_port_id;
800 dev = &rte_eth_devices[tmp];
802 if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
803 event.event_type == IBV_EVENT_PORT_ERR) &&
804 dev->data->dev_conf.intr_conf.lsc) {
805 mlx5_glue->ack_async_event(&event);
806 if (mlx5_link_update(dev, 0) == -EAGAIN) {
810 _rte_eth_dev_callback_process
811 (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
815 "port %u cannot handle an unknown event (type %d)",
816 dev->data->port_id, event.event_type);
817 mlx5_glue->ack_async_event(&event);
822 * Unregister callback handler safely. The handler may be active
823 * while we are trying to unregister it, in this case code -EAGAIN
824 * is returned by rte_intr_callback_unregister(). This routine checks
825 * the return code and tries to unregister handler again.
830 * pointer to callback routine
832 * opaque callback parameter
/*
 * mlx5_intr_callback_unregister: unregister an interrupt callback, retrying
 * while rte_intr_callback_unregister() reports -EAGAIN.
 *
 * handle - interrupt handle the callback was registered on.
 * cb_fn  - callback routine.
 * cb_arg - opaque callback argument.
 *
 * Any result other than -EAGAIN that reaches the check below is logged as a
 * failure. Between attempts the code sleeps and, once a full second of timer
 * cycles has passed, logs a retry message and restarts the measurement.
 * NOTE(review): the return type, the loop construct, the declarations of
 * ret, start and twait, and the success path are not visible in this
 * listing.
 */
835 mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
836 rte_intr_callback_fn cb_fn, void *cb_arg)
839 * Try to reduce timeout management overhead by not calling
840 * the timer related routines on the first iteration. If the
841 * unregistering succeeds on first call there will be no
842 * timer calls at all.
850 ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
853 if (ret != -EAGAIN) {
854 DRV_LOG(INFO, "failed to unregister interrupt"
855 " handler (error: %d)", ret);
860 struct timespec onems;
862 /* Wait one millisecond and try again. */
/* NS_PER_S / MS_PER_S is the number of nanoseconds in one millisecond. */
864 onems.tv_nsec = NS_PER_S / MS_PER_S;
865 nanosleep(&onems, 0);
866 /* Check whether one second elapsed. */
867 if ((rte_get_timer_cycles() - start) <= twait)
871 * We get the amount of timer ticks for one second.
872 * If this amount elapsed it means we spent one
873 * second in waiting. This branch is executed once
874 * on first iteration.
876 twait = rte_get_timer_hz();
880 * Timeout elapsed, show message (once a second) and retry.
881 * We have no other acceptable option here, if we ignore
882 * the unregistering return code the handler will not
883 * be unregistered, fd will be closed and we may get the
884 * crush. Hanging and messaging in the loop seems not to be
887 DRV_LOG(INFO, "Retrying to unregister interrupt handler");
888 start = rte_get_timer_cycles();
893 * Handle DEVX interrupts from the NIC.
894 * This function is probably called from the DPDK host thread.
/*
 * mlx5_dev_interrupt_handler_devx: drain DevX asynchronous command
 * completions and pass each one to the flow counter pool query handler.
 *
 * cb_arg - the struct mlx5_dev_ctx_shared registered with the handler.
 *
 * The body is split by HAVE_IBV_DEVX_ASYNC. Each completion forwards the
 * wr_id of the response header and the command status extracted from the
 * payload that follows that header.
 * NOTE(review): the return type, the branch taken when HAVE_IBV_DEVX_ASYNC
 * is not defined, the union or struct wrapping cmd_resp and buf, and the
 * remaining arguments of devx_get_async_cmd_comp() are not visible in this
 * listing.
 */
900 mlx5_dev_interrupt_handler_devx(void *cb_arg)
902 #ifndef HAVE_IBV_DEVX_ASYNC
906 struct mlx5_dev_ctx_shared *sh = cb_arg;
908 struct mlx5dv_devx_async_cmd_hdr cmd_resp;
/* Room for the counter query output, one traffic counter and the header. */
909 uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
910 MLX5_ST_SZ_BYTES(traffic_counter) +
911 sizeof(struct mlx5dv_devx_async_cmd_hdr)];
/* The command output starts right after the response header. */
913 uint8_t *buf = out.buf + sizeof(out.cmd_resp);
/* Loop for as long as fetching a completion returns zero. */
915 while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
918 mlx5_flow_async_pool_query_handle
919 (sh, (uint64_t)out.cmd_resp.wr_id,
920 mlx5_devx_get_out_command_status(buf));
921 #endif /* HAVE_IBV_DEVX_ASYNC */
925 * DPDK callback to bring the link DOWN.
928 * Pointer to Ethernet device structure.
931 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_set_link_down: clear IFF_UP on the interface of the port.
 *
 * dev - Ethernet device.
 *
 * Every flag except IFF_UP is kept (keep = ~IFF_UP). The new value passed
 * has the IFF_UP bit clear, so the only modifiable bit ends up cleared.
 * Returns the result of mlx5_set_flags().
 */
934 mlx5_set_link_down(struct rte_eth_dev *dev)
936 return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
940 * DPDK callback to bring the link UP.
943 * Pointer to Ethernet device structure.
946 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_set_link_up: set IFF_UP on the interface of the port.
 *
 * dev - Ethernet device.
 *
 * Every flag except IFF_UP is kept (keep = ~IFF_UP) and IFF_UP itself is
 * set. Returns the result of mlx5_set_flags().
 */
949 mlx5_set_link_up(struct rte_eth_dev *dev)
951 return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
955 * Check if mlx5 device was removed.
958 * Pointer to Ethernet device structure.
961 * 1 when device is removed, otherwise 0.
/*
 * mlx5_is_removed: probe the device with a query_device() call and test the
 * result against EIO.
 *
 * dev - Ethernet device; the verbs context is taken from priv->sh->ctx.
 *
 * NOTE(review): the return type and both return statements are not visible
 * in this listing; presumably the EIO case reports the device as removed.
 */
964 mlx5_is_removed(struct rte_eth_dev *dev)
/* device_attr only receives the query output; it is not read here. */
966 struct ibv_device_attr device_attr;
967 struct mlx5_priv *priv = dev->data->dev_private;
969 if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
975 * Get switch information associated with network interface.
978 * Network interface index.
980 * Switch information object, populated in case of success.
983 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_sysfs_switch_info: collect E-Switch related information about a
 * network interface from sysfs.
 *
 * ifindex - network interface index.
 * info    - output object.
 *
 * Visible steps: resolve the interface name, read phys_port_name and
 * phys_switch_id under /sys/class/net/<ifname>/, open the device directory,
 * and classify the port with mlx5_sysfs_check_switch_info() when a switch id
 * was parsed. A port classified as both master and representor is logged as
 * an error.
 * NOTE(review): the return type, several declarations (file, dir, ret, c),
 * the fclose/closedir calls, the assignment of device_dir, the copy into
 * *info and the return statements are not visible in this listing.
 */
986 mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
988 char ifname[IF_NAMESIZE];
989 char port_name[IF_NAMESIZE];
991 struct mlx5_switch_info data = {
994 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
999 bool port_switch_id_set = false;
1000 bool device_dir = false;
1004 if (!if_indextoname(ifindex, ifname)) {
1009 MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1011 MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1013 MKSTR(pci_device, "/sys/class/net/%s/device",
1016 file = fopen(phys_port_name, "rb");
/*
 * NOTE(review): the conversion has no field width while port_name holds
 * IF_NAMESIZE bytes - confirm the sysfs value can never exceed the buffer.
 */
1018 ret = fscanf(file, "%s", port_name);
1021 mlx5_translate_port_name(port_name, &data);
1023 file = fopen(phys_switch_id, "rb");
/* The switch id counts as set only when both conversions succeed. */
1028 port_switch_id_set =
1029 fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1032 dir = opendir(pci_device);
1037 if (port_switch_id_set) {
1038 /* We have some E-Switch configuration. */
1039 mlx5_sysfs_check_switch_info(device_dir, &data);
1042 MLX5_ASSERT(!(data.master && data.representor));
1043 if (data.master && data.representor) {
1044 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1045 " and as representor", ifindex);
1053 * Analyze gathered port parameters via sysfs to recognize master
1054 * and representor devices for E-Switch configuration.
1056 * @param[in] device_dir
1057 * flag of presence of "device" directory under port device key.
1058 * @param[inout] switch_info
1059 * Port information, including port name as a number and port name
1060 * type if recognized
1063 * master and representor flags are set in switch_info according to
1064 * recognized parameters (if any).
/*
 * mlx5_sysfs_check_switch_info: set the master and representor flags of
 * switch_info from its port name type and the presence of a device
 * directory.
 *
 * device_dir  - true when the port has a device directory in sysfs.
 * switch_info - in/out; name_type is read, master or representor is written.
 *
 * Visible mapping:
 *   UNKNOWN, NOTSET -> master = device_dir
 *   UPLINK          -> master = 1
 *   LEGACY          -> representor = !device_dir
 *   PFHPF, PFVF     -> representor = 1
 * NOTE(review): the return type, the break statements and any default label
 * are not visible in this listing.
 */
1067 mlx5_sysfs_check_switch_info(bool device_dir,
1068 struct mlx5_switch_info *switch_info)
1070 switch (switch_info->name_type) {
1071 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1073 * Name is not recognized, assume the master,
1074 * check the device directory presence.
1076 switch_info->master = device_dir;
1078 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1080 * Name is not set, this assumes the legacy naming
1081 * schema for master, just check if there is
1082 * a device directory.
1084 switch_info->master = device_dir;
1086 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1087 /* New uplink naming schema recognized. */
1088 switch_info->master = 1;
1090 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1091 /* Legacy representors naming schema. */
1092 switch_info->representor = !device_dir;
/* PFHPF shares the handling of PFVF below. */
1094 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1096 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1097 /* New representors naming schema. */
1098 switch_info->representor = 1;
1104 * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1107 * Pointer to Ethernet device structure.
1108 * @param[out] modinfo
1109 * Storage for plug-in module EEPROM information.
1112 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_get_module_info: query the plug-in module type and EEPROM length with
 * the ETHTOOL_GMODULEINFO request.
 *
 * dev     - Ethernet device.
 * modinfo - output; type and eeprom_len are copied from the ethtool reply.
 *
 * A NULL dev or modinfo is logged as a missing argument.
 * NOTE(review): the return type, the declaration of ret, the error returns
 * and the final return are not visible in this listing.
 */
1115 mlx5_get_module_info(struct rte_eth_dev *dev,
1116 struct rte_eth_dev_module_info *modinfo)
1118 struct ethtool_modinfo info = {
1119 .cmd = ETHTOOL_GMODULEINFO,
1121 struct ifreq ifr = (struct ifreq) {
1122 .ifr_data = (void *)&info,
1126 if (!dev || !modinfo) {
1127 DRV_LOG(WARNING, "missing argument, cannot get module info");
1131 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1133 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1134 dev->data->port_id, strerror(rte_errno));
1137 modinfo->type = info.type;
1138 modinfo->eeprom_len = info.eeprom_len;
1143 * DPDK callback to retrieve plug-in module EEPROM data.
1146 * Pointer to Ethernet device structure.
1148 * Storage for plug-in module EEPROM data.
1151 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * mlx5_get_module_eeprom: read plug-in module EEPROM bytes with the
 * ETHTOOL_GMODULEEEPROM request.
 *
 * dev  - Ethernet device.
 * info - in/out; offset and length select the range, data receives the
 *        bytes.
 *
 * A NULL dev or info is logged as a missing argument. A zeroed request
 * buffer of sizeof(struct ethtool_eeprom) + info->length bytes is allocated
 * for the ioctl, and info->length bytes are copied out of it afterwards.
 * NOTE(review): the declarations of ifr and ret, the error returns, the
 * release of eeprom and the end of the function are not visible in this
 * listing - confirm the buffer is freed on every path.
 */
1153 int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1154 struct rte_dev_eeprom_info *info)
1156 struct ethtool_eeprom *eeprom;
1160 if (!dev || !info) {
1161 DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
/* Header plus payload in a single zero-initialised allocation. */
1165 eeprom = rte_calloc(__func__, 1,
1166 (sizeof(struct ethtool_eeprom) + info->length), 0);
1168 DRV_LOG(WARNING, "port %u cannot allocate memory for "
1169 "eeprom data", dev->data->port_id);
1173 eeprom->cmd = ETHTOOL_GMODULEEEPROM;
1174 eeprom->offset = info->offset;
1175 eeprom->len = info->length;
1176 ifr = (struct ifreq) {
1177 .ifr_data = (void *)eeprom,
1179 ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1181 DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1182 dev->data->port_id, strerror(rte_errno));
1184 rte_memcpy(info->data, eeprom->data, info->length);