ethdev: add namespace
[dpdk.git] / drivers / net / mlx5 / linux / mlx5_ethdev_os.c
index 21105f6..9d0e491 100644 (file)
@@ -24,8 +24,7 @@
 #include <sys/un.h>
 #include <time.h>
 
-#include <rte_atomic.h>
-#include <rte_ethdev_driver.h>
+#include <ethdev_driver.h>
 #include <rte_bus_pci.h>
 #include <rte_mbuf.h>
 #include <rte_common.h>
@@ -38,6 +37,7 @@
 #include <mlx5_glue.h>
 #include <mlx5_devx_cmds.h>
 #include <mlx5_common.h>
+#include <mlx5_malloc.h>
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -128,6 +128,17 @@ struct ethtool_link_settings {
 #define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
 #endif
 
+/**
+ * Get interface index from SubFunction device name.
+ *
+ * @param[in] sf_name
+ *   SubFunction device name.
+ *
+ * @return
+ *   The value returned by if_nametoindex() for the name found under "/net",
+ *   or -rte_errno when mlx5_auxiliary_get_child_name() fails.
+ */
+int
+mlx5_auxiliary_get_ifindex(const char *sf_name)
+{
+       char if_name[IF_NAMESIZE] = { 0 };
+
+       if (mlx5_auxiliary_get_child_name(sf_name, "/net",
+                                         if_name, sizeof(if_name)) != 0)
+               return -rte_errno;
+       /*
+        * NOTE(review): if_nametoindex() reports failure as 0 per POSIX, so
+        * a 0 result presumably means "no such netdev" - confirm callers
+        * handle it.
+        */
+       return if_nametoindex(if_name);
+}
 
 /**
  * Get interface name from private structure.
@@ -143,13 +154,17 @@ struct ethtool_link_settings {
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
 {
        struct mlx5_priv *priv = dev->data->dev_private;
        unsigned int ifindex;
 
        MLX5_ASSERT(priv);
        MLX5_ASSERT(priv->sh);
+       if (priv->master && priv->sh->bond.ifindex > 0) {
+               memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
+               return 0;
+       }
        ifindex = mlx5_ifindex(dev);
        if (!ifindex) {
                if (!priv->representor)
@@ -165,10 +180,10 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
 }
 
 /**
- * Perform ifreq ioctl() on associated Ethernet device.
+ * Perform ifreq ioctl() on associated netdev ifname.
  *
- * @param[in] dev
- *   Pointer to Ethernet device.
+ * @param[in] ifname
+ *   Pointer to netdev name.
  * @param req
  *   Request number to pass to ioctl().
  * @param[out] ifr
@@ -177,8 +192,8 @@ mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-int
-mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+static int
+mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
 {
        int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
        int ret = 0;
@@ -187,9 +202,7 @@ mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
                rte_errno = errno;
                return -rte_errno;
        }
-       ret = mlx5_get_ifname(dev, &ifr->ifr_name);
-       if (ret)
-               goto error;
+       rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
        ret = ioctl(sock, req, ifr);
        if (ret == -1) {
                rte_errno = errno;
@@ -202,6 +215,31 @@ error:
        return -rte_errno;
 }
 
+/**
+ * Perform ifreq ioctl() on associated Ethernet device.
+ *
+ * The netdev name is resolved with mlx5_get_ifname() and the request is
+ * then forwarded to mlx5_ifreq_by_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param req
+ *   Request number to pass to ioctl().
+ * @param[out] ifr
+ *   Interface request structure output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+{
+       /*
+        * Scratch name buffer sized like ifr->ifr_name.
+        * NOTE(review): mlx5_get_ifname() takes char (*)[MLX5_NAMESIZE];
+        * presumably MLX5_NAMESIZE equals sizeof(ifr->ifr_name) - confirm.
+        */
+       char ifname[sizeof(ifr->ifr_name)];
+       int ret;
+
+       ret = mlx5_get_ifname(dev, &ifname);
+       if (ret)
+               return -rte_errno;
+       return mlx5_ifreq_by_ifname(ifname, req, ifr);
+}
+
 /**
  * Get device MTU.
  *
@@ -257,7 +295,7 @@ mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-int
+static int
 mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 {
        struct ifreq request;
@@ -286,7 +324,7 @@ int
 mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
-       struct ibv_context *ctx = priv->sh->ctx;
+       struct ibv_context *ctx = priv->sh->cdev->ctx;
        struct ibv_values_ex values;
        int err = 0;
 
@@ -319,7 +357,7 @@ mlx5_find_master_dev(struct rte_eth_dev *dev)
        priv = dev->data->dev_private;
        domain_id = priv->domain_id;
        MLX5_ASSERT(priv->representor);
-       MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
+       MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
                struct mlx5_priv *opriv =
                        rte_eth_devices[port_id].data->dev_private;
                if (opriv &&
@@ -401,31 +439,24 @@ mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
        }
        link_speed = ethtool_cmd_speed(&edata);
        if (link_speed == -1)
-               dev_link.link_speed = ETH_SPEED_NUM_NONE;
+               dev_link.link_speed = RTE_ETH_SPEED_NUM_UNKNOWN;
        else
                dev_link.link_speed = link_speed;
        priv->link_speed_capa = 0;
-       if (edata.supported & SUPPORTED_Autoneg)
-               priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
        if (edata.supported & (SUPPORTED_1000baseT_Full |
                               SUPPORTED_1000baseKX_Full))
-               priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_1G;
        if (edata.supported & SUPPORTED_10000baseKR_Full)
-               priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_10G;
        if (edata.supported & (SUPPORTED_40000baseKR4_Full |
                               SUPPORTED_40000baseCR4_Full |
                               SUPPORTED_40000baseSR4_Full |
                               SUPPORTED_40000baseLR4_Full))
-               priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_40G;
        dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
-                               ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+                               RTE_ETH_LINK_HALF_DUPLEX : RTE_ETH_LINK_FULL_DUPLEX);
        dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
-                       ETH_LINK_SPEED_FIXED);
-       if (((dev_link.link_speed && !dev_link.link_status) ||
-            (!dev_link.link_speed && dev_link.link_status))) {
-               rte_errno = EAGAIN;
-               return -rte_errno;
-       }
+                       RTE_ETH_LINK_SPEED_FIXED);
        *link = dev_link;
        return 0;
 }
@@ -513,48 +544,46 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
                        dev->data->port_id, strerror(rte_errno));
                return ret;
        }
-       dev_link.link_speed = (ecmd->speed == UINT32_MAX) ? ETH_SPEED_NUM_NONE :
-                                                           ecmd->speed;
+       dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
+                               RTE_ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
        sc = ecmd->link_mode_masks[0] |
                ((uint64_t)ecmd->link_mode_masks[1] << 32);
        priv->link_speed_capa = 0;
-       if (sc & MLX5_BITSHIFT(ETHTOOL_LINK_MODE_Autoneg_BIT))
-               priv->link_speed_capa |= ETH_LINK_SPEED_AUTONEG;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_1G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_10G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_20G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_20G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_40G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_56G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_56G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_25G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_25G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_50G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_50G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_100G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_100G;
        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_200G;
 
        sc = ecmd->link_mode_masks[2] |
                ((uint64_t)ecmd->link_mode_masks[3] << 32);
@@ -562,16 +591,11 @@ mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
                  MLX5_BITSHIFT
                       (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
-               priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+               priv->link_speed_capa |= RTE_ETH_LINK_SPEED_200G;
        dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
-                               ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+                               RTE_ETH_LINK_HALF_DUPLEX : RTE_ETH_LINK_FULL_DUPLEX);
        dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
-                                 ETH_LINK_SPEED_FIXED);
-       if (((dev_link.link_speed && !dev_link.link_status) ||
-            (!dev_link.link_speed && dev_link.link_status))) {
-               rte_errno = EAGAIN;
-               return -rte_errno;
-       }
+                                 RTE_ETH_LINK_SPEED_FIXED);
        *link = dev_link;
        return 0;
 }
@@ -653,13 +677,13 @@ mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
        }
        fc_conf->autoneg = ethpause.autoneg;
        if (ethpause.rx_pause && ethpause.tx_pause)
-               fc_conf->mode = RTE_FC_FULL;
+               fc_conf->mode = RTE_ETH_FC_FULL;
        else if (ethpause.rx_pause)
-               fc_conf->mode = RTE_FC_RX_PAUSE;
+               fc_conf->mode = RTE_ETH_FC_RX_PAUSE;
        else if (ethpause.tx_pause)
-               fc_conf->mode = RTE_FC_TX_PAUSE;
+               fc_conf->mode = RTE_ETH_FC_TX_PAUSE;
        else
-               fc_conf->mode = RTE_FC_NONE;
+               fc_conf->mode = RTE_ETH_FC_NONE;
        return 0;
 }
 
@@ -685,14 +709,14 @@ mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 
        ifr.ifr_data = (void *)&ethpause;
        ethpause.autoneg = fc_conf->autoneg;
-       if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
-           (fc_conf->mode & RTE_FC_RX_PAUSE))
+       if (((fc_conf->mode & RTE_ETH_FC_FULL) == RTE_ETH_FC_FULL) ||
+           (fc_conf->mode & RTE_ETH_FC_RX_PAUSE))
                ethpause.rx_pause = 1;
        else
                ethpause.rx_pause = 0;
 
-       if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
-           (fc_conf->mode & RTE_FC_TX_PAUSE))
+       if (((fc_conf->mode & RTE_ETH_FC_FULL) == RTE_ETH_FC_FULL) ||
+           (fc_conf->mode & RTE_ETH_FC_TX_PAUSE))
                ethpause.tx_pause = 1;
        else
                ethpause.tx_pause = 0;
@@ -731,7 +755,7 @@ mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
                dev = &rte_eth_devices[sh->port[i].ih_port_id];
                MLX5_ASSERT(dev);
                if (dev->data->dev_conf.intr_conf.rmv)
-                       _rte_eth_dev_callback_process
+                       rte_eth_dev_callback_process
                                (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
        }
 }
@@ -754,7 +778,7 @@ mlx5_dev_interrupt_handler(void *cb_arg)
                struct rte_eth_dev *dev;
                uint32_t tmp;
 
-               if (mlx5_glue->get_async_event(sh->ctx, &event))
+               if (mlx5_glue->get_async_event(sh->cdev->ctx, &event))
                        break;
                /* Retrieve and check IB port index. */
                tmp = (uint32_t)event.element.port_num;
@@ -807,7 +831,7 @@ mlx5_dev_interrupt_handler(void *cb_arg)
                                usleep(0);
                                continue;
                        }
-                       _rte_eth_dev_callback_process
+                       rte_eth_dev_callback_process
                                (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
                        continue;
                }
@@ -966,11 +990,67 @@ mlx5_is_removed(struct rte_eth_dev *dev)
        struct ibv_device_attr device_attr;
        struct mlx5_priv *priv = dev->data->dev_private;
 
-       if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
+       if (mlx5_glue->query_device(priv->sh->cdev->ctx, &device_attr) == EIO)
                return 1;
        return 0;
 }
 
+/**
+ * Analyze gathered port parameters via sysfs to recognize master
+ * and representor devices for E-Switch configuration.
+ *
+ * The function returns nothing. Depending on switch_info->name_type it
+ * writes either the master flag or the representor flag of switch_info;
+ * the other flag is left untouched.
+ *
+ * @param[in] device_dir
+ *   flag of presence of "device" directory under port device key.
+ * @param[inout] switch_info
+ *   Port information, including port name as a number and port name
+ *   type if recognized
+ */
+static void
+mlx5_sysfs_check_switch_info(bool device_dir,
+                            struct mlx5_switch_info *switch_info)
+{
+       switch (switch_info->name_type) {
+       case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+               /*
+                * Name is not recognized, assume the master,
+                * check the device directory presence.
+                */
+               switch_info->master = device_dir;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+               /*
+                * Name is not set, this assumes the legacy naming
+                * schema for master, just check if there is
+                * a device directory.
+                */
+               switch_info->master = device_dir;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+               /* New uplink naming schema recognized. */
+               switch_info->master = 1;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+               /* Legacy representors naming schema. */
+               switch_info->representor = !device_dir;
+               break;
+       case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+               /* Fallthrough */
+       case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+               /* Fallthrough */
+       case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+               /* New representors naming schema. */
+               switch_info->representor = 1;
+               break;
+       default:
+               /* Any other name type is handled like an unrecognized name. */
+               switch_info->master = device_dir;
+               break;
+       }
+}
+
 /**
  * Get switch information associated with network interface.
  *
@@ -1015,7 +1095,7 @@ mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
 
        file = fopen(phys_port_name, "rb");
        if (file != NULL) {
-               ret = fscanf(file, "%s", port_name);
+               ret = fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", port_name);
                fclose(file);
                if (ret == 1)
                        mlx5_translate_port_name(port_name, &data);
@@ -1050,52 +1130,55 @@ mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
 }
 
 /**
- * Analyze gathered port parameters via sysfs to recognize master
- * and representor devices for E-Switch configuration.
+ * Get bond information associated with network interface.
  *
- * @param[in] device_dir
- *   flag of presence of "device" directory under port device key.
- * @param[inout] switch_info
- *   Port information, including port name as a number and port name
- *   type if recognized
+ * @param pf_ifindex
+ *   Network interface index of bond slave interface
+ * @param[out] ifindex
+ *   Pointer to bond ifindex.
+ * @param[out] ifname
+ *   Pointer to bond ifname.
  *
  * @return
- *   master and representor flags are set in switch_info according to
- *   recognized parameters (if any).
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-void
-mlx5_sysfs_check_switch_info(bool device_dir,
-                            struct mlx5_switch_info *switch_info)
+int
+mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
+                    char *ifname)
 {
-       switch (switch_info->name_type) {
-       case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
-               /*
-                * Name is not recognized, assume the master,
-                * check the device directory presence.
-                */
-               switch_info->master = device_dir;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
-               /*
-                * Name is not set, this assumes the legacy naming
-                * schema for master, just check if there is
-                * a device directory.
-                */
-               switch_info->master = device_dir;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
-               /* New uplink naming schema recognized. */
-               switch_info->master = 1;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
-               /* Legacy representors naming schema. */
-               switch_info->representor = !device_dir;
-               break;
-       case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
-               /* New representors naming schema. */
-               switch_info->representor = 1;
-               break;
+       char name[IF_NAMESIZE];
+       FILE *file;
+       unsigned int index;
+       int ret;
+
+       if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
+               rte_errno = errno;
+               return -rte_errno;
+       }
+       MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
+       /* Read the bond ifindex from the slave's "master/ifindex" sysfs entry. */
+       file = fopen(bond_if, "rb");
+       if (file == NULL) {
+               rte_errno = errno;
+               return -rte_errno;
        }
+       ret = fscanf(file, "%u", &index);
+       fclose(file);
+       if (ret <= 0) {
+               rte_errno = errno;
+               return -rte_errno;
+       }
+       if (ifindex)
+               *ifindex = index;
+
+       /*
+        * Resolve the bond device name from its interface index.
+        * NOTE(review): if_indextoname() presumably needs ifname to hold at
+        * least IF_NAMESIZE bytes - confirm against callers.
+        */
+       if (ifname) {
+               if (!if_indextoname(index, ifname)) {
+                       rte_errno = errno;
+                       return -rte_errno;
+               }
+       }
+       return 0;
 }
 
 /**
@@ -1121,7 +1204,7 @@ mlx5_get_module_info(struct rte_eth_dev *dev,
        };
        int ret = 0;
 
-       if (!dev || !modinfo) {
+       if (!dev) {
                DRV_LOG(WARNING, "missing argument, cannot get module info");
                rte_errno = EINVAL;
                return -rte_errno;
@@ -1155,13 +1238,14 @@ int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
        struct ifreq ifr;
        int ret = 0;
 
-       if (!dev || !info) {
+       if (!dev) {
                DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
                rte_errno = EINVAL;
                return -rte_errno;
        }
-       eeprom = rte_calloc(__func__, 1,
-                           (sizeof(struct ethtool_eeprom) + info->length), 0);
+       eeprom = mlx5_malloc(MLX5_MEM_ZERO,
+                            (sizeof(struct ethtool_eeprom) + info->length), 0,
+                            SOCKET_ID_ANY);
        if (!eeprom) {
                DRV_LOG(WARNING, "port %u cannot allocate memory for "
                        "eeprom data", dev->data->port_id);
@@ -1180,6 +1264,369 @@ int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
                        dev->data->port_id, strerror(rte_errno));
        else
                rte_memcpy(info->data, eeprom->data, info->length);
-       rte_free(eeprom);
+       mlx5_free(eeprom);
+       return ret;
+}
+
+/**
+ * Read device counters table.
+ *
+ * Issues one ETHTOOL_GSTATS request and adds the returned values to the
+ * matching entries of stats. Values are accumulated (+=), not stored, so
+ * the table has to be cleared by the caller before the first call.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[in] pf
+ *   PF index in case of bonding device, -1 otherwise
+ * @param[out] stats
+ *   Counters table output buffer.
+ *
+ * @return
+ *   0 on success and stats is filled, negative errno value otherwise and
+ *   rte_errno is set.
+ */
+static int
+_mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+       unsigned int i;
+       struct ifreq ifr;
+       unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
+       /* On-stack request buffer: ethtool header followed by stats_n values. */
+       unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
+       struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
+       int ret;
+
+       et_stats->cmd = ETHTOOL_GSTATS;
+       et_stats->n_stats = xstats_ctrl->stats_n;
+       ifr.ifr_data = (caddr_t)et_stats;
+       /* A non-negative pf selects the bond member port to query by name. */
+       if (pf >= 0)
+               ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
+                                          SIOCETHTOOL, &ifr);
+       else
+               ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+       if (ret) {
+               DRV_LOG(WARNING,
+                       "port %u unable to read statistic values from device",
+                       dev->data->port_id);
+               return ret;
+       }
+       for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
+               /* Counters flagged as "dev" are not taken from this reply. */
+               if (xstats_ctrl->info[i].dev)
+                       continue;
+               stats[i] += (uint64_t)
+                           et_stats->data[xstats_ctrl->dev_table_idx[i]];
+       }
+       return 0;
+}
+
+/**
+ * Read device counters.
+ *
+ * The output table is cleared, filled with the ethtool (ifreq) counters
+ * and then with the counters flagged as "dev" in the xstats control
+ * table, which are read through mlx5_os_read_dev_stat(). Every "dev"
+ * counter read successfully is cached in xstats_ctrl->xstats[]; when such
+ * a read fails, the cached value is reported instead.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] stats
+ *   Counters table output buffer, must hold at least
+ *   xstats_ctrl->mlx5_stats_n entries.
+ *
+ * @return
+ *   Result of the last counter read performed: 0 on success and stats is
+ *   filled, a negative value otherwise. A failure while summing bonding
+ *   member ports is returned immediately.
+ */
+int
+mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+       int ret = 0, i;
+
+       /* The per-port helper accumulates, so start from a clean table. */
+       memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
+       /* Read ifreq counters. */
+       if (priv->master && priv->pf_bond >= 0) {
+               /* Sum xstats from bonding device member ports. */
+               for (i = 0; i < priv->sh->bond.n_port; i++) {
+                       ret = _mlx5_os_read_dev_counters(dev, i, stats);
+                       if (ret)
+                               return ret;
+               }
+       } else {
+               ret = _mlx5_os_read_dev_counters(dev, -1, stats);
+       }
+       /* Read IB counters. */
+       for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
+               if (!xstats_ctrl->info[i].dev)
+                       continue;
+               ret = mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
+                                           &stats[i]);
+               /*
+                * Remember the value on a successful read; return the last
+                * cached xstats counter if the read fails.
+                */
+               if (ret == 0)
+                       xstats_ctrl->xstats[i] = stats[i];
+               else
+                       stats[i] = xstats_ctrl->xstats[i];
+       }
        return ret;
 }
+
+/**
+ * Query the number of statistics provided by ETHTOOL.
+ *
+ * Issues an ETHTOOL_GDRVINFO request and reports the n_stats field of
+ * the reply.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   Number of statistics on success, negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int
+mlx5_os_get_stats_n(struct rte_eth_dev *dev)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct ethtool_drvinfo drvinfo;
+       struct ifreq ifr;
+       int ret;
+
+       drvinfo.cmd = ETHTOOL_GDRVINFO;
+       ifr.ifr_data = (caddr_t)&drvinfo;
+       /* For a bonding master the first member port (ports[0]) is queried. */
+       if (priv->master && priv->pf_bond >= 0)
+               /* Bonding PF. */
+               ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
+                                          SIOCETHTOOL, &ifr);
+       else
+               ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+       if (ret) {
+               DRV_LOG(WARNING, "port %u unable to query number of statistics",
+                       dev->data->port_id);
+               return ret;
+       }
+       return drvinfo.n_stats;
+}
+
+/*
+ * Table of supported extended counters.
+ *
+ * ctr_name is the device-side counter name: mlx5_os_stats_init() compares
+ * it with the statistic names returned by ETHTOOL_GSTRINGS. dpdk_name is
+ * presumably the name reported to the application - TODO confirm.
+ * Entries with .dev set are additionally appended by mlx5_os_stats_init()
+ * regardless of the ethtool names and are read through
+ * mlx5_os_read_dev_stat() using ctr_name.
+ */
+static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
+       {
+               .dpdk_name = "rx_unicast_bytes",
+               .ctr_name = "rx_vport_unicast_bytes",
+       },
+       {
+               .dpdk_name = "rx_multicast_bytes",
+               .ctr_name = "rx_vport_multicast_bytes",
+       },
+       {
+               .dpdk_name = "rx_broadcast_bytes",
+               .ctr_name = "rx_vport_broadcast_bytes",
+       },
+       {
+               .dpdk_name = "rx_unicast_packets",
+               .ctr_name = "rx_vport_unicast_packets",
+       },
+       {
+               .dpdk_name = "rx_multicast_packets",
+               .ctr_name = "rx_vport_multicast_packets",
+       },
+       {
+               .dpdk_name = "rx_broadcast_packets",
+               .ctr_name = "rx_vport_broadcast_packets",
+       },
+       {
+               .dpdk_name = "tx_unicast_bytes",
+               .ctr_name = "tx_vport_unicast_bytes",
+       },
+       {
+               .dpdk_name = "tx_multicast_bytes",
+               .ctr_name = "tx_vport_multicast_bytes",
+       },
+       {
+               .dpdk_name = "tx_broadcast_bytes",
+               .ctr_name = "tx_vport_broadcast_bytes",
+       },
+       {
+               .dpdk_name = "tx_unicast_packets",
+               .ctr_name = "tx_vport_unicast_packets",
+       },
+       {
+               .dpdk_name = "tx_multicast_packets",
+               .ctr_name = "tx_vport_multicast_packets",
+       },
+       {
+               .dpdk_name = "tx_broadcast_packets",
+               .ctr_name = "tx_vport_broadcast_packets",
+       },
+       {
+               .dpdk_name = "rx_wqe_errors",
+               .ctr_name = "rx_wqe_err",
+       },
+       {
+               .dpdk_name = "rx_phy_crc_errors",
+               .ctr_name = "rx_crc_errors_phy",
+       },
+       {
+               .dpdk_name = "rx_phy_in_range_len_errors",
+               .ctr_name = "rx_in_range_len_errors_phy",
+       },
+       {
+               .dpdk_name = "rx_phy_symbol_errors",
+               .ctr_name = "rx_symbol_err_phy",
+       },
+       {
+               .dpdk_name = "tx_phy_errors",
+               .ctr_name = "tx_errors_phy",
+       },
+       {
+               .dpdk_name = "rx_out_of_buffer",
+               .ctr_name = "out_of_buffer",
+               .dev = 1,
+       },
+       {
+               .dpdk_name = "tx_phy_packets",
+               .ctr_name = "tx_packets_phy",
+       },
+       {
+               .dpdk_name = "rx_phy_packets",
+               .ctr_name = "rx_packets_phy",
+       },
+       {
+               .dpdk_name = "tx_phy_discard_packets",
+               .ctr_name = "tx_discards_phy",
+       },
+       {
+               .dpdk_name = "rx_phy_discard_packets",
+               .ctr_name = "rx_discards_phy",
+       },
+       {
+               .dpdk_name = "tx_phy_bytes",
+               .ctr_name = "tx_bytes_phy",
+       },
+       {
+               .dpdk_name = "rx_phy_bytes",
+               .ctr_name = "rx_bytes_phy",
+       },
+       /* Representor only */
+       {
+               .dpdk_name = "rx_vport_packets",
+               .ctr_name = "vport_rx_packets",
+       },
+       {
+               .dpdk_name = "rx_vport_bytes",
+               .ctr_name = "vport_rx_bytes",
+       },
+       {
+               .dpdk_name = "tx_vport_packets",
+               .ctr_name = "vport_tx_packets",
+       },
+       {
+               .dpdk_name = "tx_vport_bytes",
+               .ctr_name = "vport_tx_bytes",
+       },
+};
+
+/* Number of entries in mlx5_counters_init[]. */
+static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
+
+/**
+ * Init the structures to read device counters.
+ *
+ * Queries the number of ethtool statistics, fetches their names with
+ * ETHTOOL_GSTRINGS, records which of them appear in mlx5_counters_init[]
+ * and then takes an initial snapshot of the counters into
+ * xstats_ctrl->base and stats_ctrl->imissed_base. On any failure the
+ * function logs a warning and returns without a status.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx5_os_stats_init(struct rte_eth_dev *dev)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+       struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
+       unsigned int i;
+       unsigned int j;
+       struct ifreq ifr;
+       struct ethtool_gstrings *strings = NULL;
+       unsigned int dev_stats_n;
+       unsigned int str_sz;
+       int ret;
+
+       /* So that it won't aggregate for each init. */
+       xstats_ctrl->mlx5_stats_n = 0;
+       ret = mlx5_os_get_stats_n(dev);
+       if (ret < 0) {
+               DRV_LOG(WARNING, "port %u no extended statistics available",
+                       dev->data->port_id);
+               return;
+       }
+       dev_stats_n = ret;
+       /* Allocate memory to grab stat names and values. */
+       str_sz = dev_stats_n * ETH_GSTRING_LEN;
+       strings = (struct ethtool_gstrings *)
+                 mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
+                             SOCKET_ID_ANY);
+       if (!strings) {
+               DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
+                    dev->data->port_id);
+               return;
+       }
+       strings->cmd = ETHTOOL_GSTRINGS;
+       strings->string_set = ETH_SS_STATS;
+       strings->len = dev_stats_n;
+       ifr.ifr_data = (caddr_t)strings;
+       if (priv->master && priv->pf_bond >= 0)
+               /* Bonding master. */
+               ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
+                                          SIOCETHTOOL, &ifr);
+       else
+               ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+       if (ret) {
+               DRV_LOG(WARNING, "port %u unable to get statistic names",
+                       dev->data->port_id);
+               goto free;
+       }
+       /*
+        * For every name reported by the device, look for the table entry
+        * with the same ctr_name and remember the ethtool index (i) it was
+        * found at.
+        */
+       for (i = 0; i != dev_stats_n; ++i) {
+               const char *curr_string = (const char *)
+                       &strings->data[i * ETH_GSTRING_LEN];
+
+               for (j = 0; j != xstats_n; ++j) {
+                       if (!strcmp(mlx5_counters_init[j].ctr_name,
+                                   curr_string)) {
+                               unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+                               xstats_ctrl->dev_table_idx[idx] = i;
+                               xstats_ctrl->info[idx] = mlx5_counters_init[j];
+                               break;
+                       }
+               }
+       }
+       /* Add dev counters. */
+       for (i = 0; i != xstats_n; ++i) {
+               if (mlx5_counters_init[i].dev) {
+                       unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+                       xstats_ctrl->info[idx] = mlx5_counters_init[i];
+                       xstats_ctrl->hw_stats[idx] = 0;
+               }
+       }
+       /*
+        * NOTE(review): the bound is only asserted here, after the tables
+        * above have already been written - confirm the arrays are sized
+        * for MLX5_MAX_XSTATS entries.
+        */
+       MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
+       xstats_ctrl->stats_n = dev_stats_n;
+       /* Copy to base at first time. */
+       ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
+       if (ret)
+               DRV_LOG(ERR, "port %u cannot read device counters: %s",
+                       dev->data->port_id, strerror(rte_errno));
+       mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
+       stats_ctrl->imissed = 0;
+free:
+       /* The names buffer is only needed while building the tables. */
+       mlx5_free(strings);
+}
+
+/**
+ * Get MAC address by querying netdevice.
+ *
+ * Issues a SIOCGIFHWADDR request through mlx5_ifreq() and copies the
+ * first RTE_ETHER_ADDR_LEN bytes of the returned hardware address.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] mac
+ *   MAC address output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
+{
+       struct ifreq request;
+       int ret;
+
+       ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+       if (ret)
+               return ret;
+       /* mac is left untouched when the request fails. */
+       memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
+       return 0;
+}