net/sfc/base: add APIs for PTP privilege configuration
[dpdk.git] / drivers / net / mlx5 / mlx5_ethdev.c
index d1a70fc..26a1a3a 100644 (file)
@@ -33,6 +33,7 @@
 #include <rte_malloc.h>
 #include <rte_string_fns.h>
 #include <rte_rwlock.h>
+#include <rte_cycles.h>
 
 #include "mlx5.h"
 #include "mlx5_glue.h"
@@ -240,6 +241,51 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
        return -rte_errno;
 }
 
+/**
+ * Get interface name for the specified device, uses the extra base
+ * device resources to perform Netlink requests.
+ *
+ * This is a port representor-aware version of mlx5_get_master_ifname().
+ *
+ * @param[in] base
+ *   Pointer to the Ethernet device whose Netlink socket is
+ *   used to perform requests.
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname_base(const struct rte_eth_dev *base,
+                    const struct rte_eth_dev *dev,
+                    char (*ifname)[IF_NAMESIZE])
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_priv *priv_base = base->data->dev_private;
+       unsigned int ifindex;
+
+       assert(priv);
+       assert(priv->sh);
+       assert(priv_base);
+       ifindex = priv_base->nl_socket_rdma >= 0 ?
+                 mlx5_nl_ifindex(priv_base->nl_socket_rdma,
+                                 priv->sh->ibdev_name,
+                                 priv->ibv_port) : 0;
+       if (!ifindex) {
+               if (!priv->representor)
+                       return mlx5_get_master_ifname(priv->sh->ibdev_path,
+                                                     ifname);
+               rte_errno = ENXIO;
+               return -rte_errno;
+       }
+       if (if_indextoname(ifindex, &(*ifname)[0]))
+               return 0;
+       rte_errno = errno;
+       return -rte_errno;
+}
 /**
  * Get the interface index from device name.
  *
@@ -301,6 +347,51 @@ error:
        return -rte_errno;
 }
 
+/**
+ * Perform ifreq ioctl() on specified Ethernet device,
+ * ifindex, name and other attributes are requested
+ * on the base device to avoid sharing the specified
+ * device's Netlink socket (this is not thread-safe).
+ *
+ * @param[in] base
+ *   Pointer to Ethernet device to get dev attributes.
+ * @param[in] dev
+ *   Pointer to Ethernet device to perform ioctl.
+ * @param req
+ *   Request number to pass to ioctl().
+ * @param[out] ifr
+ *   Interface request structure output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_ifreq_base(const struct rte_eth_dev *base,
+               const struct rte_eth_dev *dev,
+               int req, struct ifreq *ifr)
+{
+       int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+       int ret = 0;
+
+       if (sock == -1) {
+               rte_errno = errno;
+               return -rte_errno;
+       }
+       ret = mlx5_get_ifname_base(base, dev, &ifr->ifr_name);
+       if (ret)
+               goto error;
+       ret = ioctl(sock, req, ifr);
+       if (ret == -1) {
+               rte_errno = errno;
+               goto error;
+       }
+       close(sock);
+       return 0;
+error:
+       close(sock);
+       return -rte_errno;
+}
+
 /**
  * Get device MTU.
  *
@@ -428,27 +519,31 @@ mlx5_dev_configure(struct rte_eth_dev *dev)
                rte_errno = EINVAL;
                return -rte_errno;
        }
-       if (rxqs_n == priv->rxqs_n)
-               return 0;
-       DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
-               dev->data->port_id, priv->rxqs_n, rxqs_n);
-       priv->rxqs_n = rxqs_n;
-       /* If the requested number of RX queues is not a power of two, use the
-        * maximum indirection table size for better balancing.
-        * The result is always rounded to the next power of two. */
-       reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
-                                    priv->config.ind_table_max_size :
-                                    rxqs_n));
-       ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
-       if (ret)
-               return ret;
-       /* When the number of RX queues is not a power of two, the remaining
-        * table entries are padded with reused WQs and hashes are not spread
-        * uniformly. */
-       for (i = 0, j = 0; (i != reta_idx_n); ++i) {
-               (*priv->reta_idx)[i] = j;
-               if (++j == rxqs_n)
-                       j = 0;
+       if (rxqs_n != priv->rxqs_n) {
+               DRV_LOG(INFO, "port %u Rx queues number update: %u -> %u",
+                       dev->data->port_id, priv->rxqs_n, rxqs_n);
+               priv->rxqs_n = rxqs_n;
+               /*
+                * If the requested number of RX queues is not a power of two,
+                * use the maximum indirection table size for better balancing.
+                * The result is always rounded to the next power of two.
+                */
+               reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
+                                            priv->config.ind_table_max_size :
+                                            rxqs_n));
+               ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
+               if (ret)
+                       return ret;
+               /*
+                * When the number of RX queues is not a power of two,
+                * the remaining table entries are padded with reused WQs
+                * and hashes are not spread uniformly.
+                */
+               for (i = 0, j = 0; (i != reta_idx_n); ++i) {
+                       (*priv->reta_idx)[i] = j;
+                       if (++j == rxqs_n)
+                               j = 0;
+               }
        }
        ret = mlx5_proc_priv_init(dev);
        if (ret)
@@ -564,6 +659,36 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
        }
 }
 
+/**
+ * Get device current raw clock counter
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] time
+ *   Current raw clock counter of the device.
+ *
+ * @return
+ *   0 if the clock has correctly been read
+ *   The value of errno in case of error
+ */
+int
+mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct ibv_context *ctx = priv->sh->ctx;
+       struct ibv_values_ex values;
+       int err = 0;
+
+       values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
+       err = mlx5_glue->query_rt_values_ex(ctx, &values);
+       if (err != 0) {
+               DRV_LOG(WARNING, "Could not query the clock !");
+               return err;
+       }
+       *clock = values.raw_clock.tv_nsec;
+       return 0;
+}
+
 /**
  * Get firmware version of a device.
  *
@@ -714,7 +839,15 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
                                ifr = (struct ifreq) {
                                        .ifr_data = (void *)&edata,
                                };
-                               ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+                               /*
+                                * Use special version of mlx5_ifreq()
+                                * to get master device name with local
+                                * device Netlink socket. Using master
+                                * device Netlink socket is not thread
+                                * safe.
+                                */
+                               ret = mlx5_ifreq_base(dev, master,
+                                                     SIOCETHTOOL, &ifr);
                        }
                }
                if (ret) {
@@ -811,7 +944,12 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
                                ifr = (struct ifreq) {
                                        .ifr_data = (void *)&gcmd,
                                };
-                               ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+                               /*
+                                * Avoid using master Netlink socket.
+                                * This is not thread-safe.
+                                */
+                               ret = mlx5_ifreq_base(dev, master,
+                                                     SIOCETHTOOL, &ifr);
                        }
                }
                if (ret) {
@@ -832,7 +970,7 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 
        *ecmd = gcmd;
        ifr.ifr_data = (void *)ecmd;
-       ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
+       ret = mlx5_ifreq_base(dev, master ? master : dev, SIOCETHTOOL, &ifr);
        if (ret) {
                DRV_LOG(DEBUG,
                        "port %u ioctl(SIOCETHTOOL,"
@@ -1111,6 +1249,35 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
        return 0;
 }
 
+/**
+ * Handle asynchronous removal event for entire multiport device.
+ *
+ * @param sh
+ *   Infiniband device shared context.
+ */
+static void
+mlx5_dev_interrupt_device_fatal(struct mlx5_ibv_shared *sh)
+{
+       uint32_t i;
+
+       for (i = 0; i < sh->max_port; ++i) {
+               struct rte_eth_dev *dev;
+
+               if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
+                       /*
+                        * Either the port does not exist or no
+                        * handler is installed for this port.
+                        */
+                       continue;
+               }
+               dev = &rte_eth_devices[sh->port[i].ih_port_id];
+               assert(dev);
+               if (dev->data->dev_conf.intr_conf.rmv)
+                       _rte_eth_dev_callback_process
+                               (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
+       }
+}
+
 /**
  * Handle shared asynchronous events the NIC (removal event
  * and link status change). Supports multiport IB device.
@@ -1133,21 +1300,46 @@ mlx5_dev_interrupt_handler(void *cb_arg)
                        break;
                /* Retrieve and check IB port index. */
                tmp = (uint32_t)event.element.port_num;
-               assert(tmp && (tmp <= sh->max_port));
-               if (!tmp ||
-                   tmp > sh->max_port ||
-                   sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+               if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
                        /*
-                        * Invalid IB port index or no handler
-                        * installed for this port.
+                        * The DEVICE_FATAL event is called once for
+                        * entire device without port specifying.
+                        * We should notify all existing ports.
                         */
                        mlx5_glue->ack_async_event(&event);
+                       mlx5_dev_interrupt_device_fatal(sh);
+                       continue;
+               }
+               assert(tmp && (tmp <= sh->max_port));
+               if (!tmp) {
+                       /* Unsupported device-level event. */
+                       mlx5_glue->ack_async_event(&event);
+                       DRV_LOG(DEBUG,
+                               "unsupported common event (type %d)",
+                               event.event_type);
+                       continue;
+               }
+               if (tmp > sh->max_port) {
+                       /* Invalid IB port index. */
+                       mlx5_glue->ack_async_event(&event);
+                       DRV_LOG(DEBUG,
+                               "cannot handle an event (type %d)"
+                               "due to invalid IB port index (%u)",
+                               event.event_type, tmp);
+                       continue;
+               }
+               if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+                       /* No handler installed. */
+                       mlx5_glue->ack_async_event(&event);
+                       DRV_LOG(DEBUG,
+                               "cannot handle an event (type %d)"
+                               "due to no handler installed for port %u",
+                               event.event_type, tmp);
                        continue;
                }
                /* Retrieve ethernet device descriptor. */
                tmp = sh->port[tmp - 1].ih_port_id;
                dev = &rte_eth_devices[tmp];
-               tmp = 0;
                assert(dev);
                if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
                     event.event_type == IBV_EVENT_PORT_ERR) &&
@@ -1161,23 +1353,87 @@ mlx5_dev_interrupt_handler(void *cb_arg)
                                (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
                        continue;
                }
-               if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
-                   dev->data->dev_conf.intr_conf.rmv) {
-                       mlx5_glue->ack_async_event(&event);
-                       _rte_eth_dev_callback_process
-                               (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
-                       continue;
-               }
                DRV_LOG(DEBUG,
-                       "port %u event type %d on not handled",
+                       "port %u cannot handle an unknown event (type %d)",
                        dev->data->port_id, event.event_type);
                mlx5_glue->ack_async_event(&event);
        }
 }
 
+/**
+ * Unregister callback handler safely. The handler may be active
+ * while we are trying to unregister it, in this case code -EAGAIN
+ * is returned by rte_intr_callback_unregister(). This routine checks
+ * the return code and tries to unregister handler again.
+ *
+ * @param handle
+ *   interrupt handle
+ * @param cb_fn
+ *   pointer to callback routine
+ * @param cb_arg
+ *   opaque callback parameter
+ */
+void
+mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+                             rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+       /*
+        * Try to reduce timeout management overhead by not calling
+        * the timer related routines on the first iteration. If the
+        * unregistering succeeds on first call there will be no
+        * timer calls at all.
+        */
+       uint64_t twait = 0;
+       uint64_t start = 0;
+
+       do {
+               int ret;
+
+               ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
+               if (ret >= 0)
+                       return;
+               if (ret != -EAGAIN) {
+                       DRV_LOG(INFO, "failed to unregister interrupt"
+                                     " handler (error: %d)", ret);
+                       assert(false);
+                       return;
+               }
+               if (twait) {
+                       struct timespec onems;
+
+                       /* Wait one millisecond and try again. */
+                       onems.tv_sec = 0;
+                       onems.tv_nsec = NS_PER_S / MS_PER_S;
+                       nanosleep(&onems, 0);
+                       /* Check whether one second elapsed. */
+                       if ((rte_get_timer_cycles() - start) <= twait)
+                               continue;
+               } else {
+                       /*
+                        * We get the amount of timer ticks for one second.
+                        * If this amount elapsed it means we spent one
+                        * second in waiting. This branch is executed once
+                        * on first iteration.
+                        */
+                       twait = rte_get_timer_hz();
+                       assert(twait);
+               }
+               /*
+                * Timeout elapsed, show message (once a second) and retry.
+                * We have no other acceptable option here, if we ignore
+                * the unregistering return code the handler will not
+                * be unregistered, fd will be closed and we may get the
+                * crash. Hanging and messaging in the loop seems not to be
+                * the worst choice.
+                */
+               DRV_LOG(INFO, "Retrying to unregister interrupt handler");
+               start = rte_get_timer_cycles();
+       } while (true);
+}
+
 /**
  * Uninstall shared asynchronous device events handler.
- * This function is implemeted to support event sharing
+ * This function is implemented to support event sharing
  * between multiple ports of single IB device.
  *
  * @param dev
@@ -1203,7 +1459,7 @@ mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
        sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
        if (!sh->intr_cnt || --sh->intr_cnt)
                goto exit;
-       rte_intr_callback_unregister(&sh->intr_handle,
+       mlx5_intr_callback_unregister(&sh->intr_handle,
                                     mlx5_dev_interrupt_handler, sh);
        sh->intr_handle.fd = 0;
        sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
@@ -1212,8 +1468,8 @@ exit:
 }
 
 /**
- * Install shared asyncronous device events handler.
- * This function is implemeted to support event sharing
+ * Install shared asynchronous device events handler.
+ * This function is implemented to support event sharing
  * between multiple ports of single IB device.
  *
  * @param dev