power: add ethdev power management
author Liang Ma <liang.j.ma@intel.com>
Fri, 22 Jan 2021 17:12:16 +0000 (17:12 +0000)
committer Thomas Monjalon <thomas@monjalon.net>
Fri, 29 Jan 2021 14:29:48 +0000 (15:29 +0100)
Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when the number reaches a certain threshold, entering an
architecture-defined optimized power state that will wait until either
a TSC timestamp expires or packets arrive.

This API mandates a core-to-single-queue mapping (that is, multiple
queues per device are supported, but they have to be polled on different
cores).

This design is using PMD RX callbacks.

1. UMWAIT/UMONITOR:

   When a certain threshold of empty polls is reached, the core will go
   into a power optimized sleep while waiting on an address of next RX
   descriptor to be written to.

2. TPAUSE/Pause instruction

   This method uses the pause (or TPAUSE, if available) instruction to
   avoid busy polling.

3. Frequency scaling
   Reuse existing DPDK power library to scale up/down core frequency
   depending on traffic volume.

Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: David Hunt <david.hunt@intel.com>
doc/guides/prog_guide/power_man.rst
doc/guides/rel_notes/release_21_02.rst
lib/librte_power/meson.build
lib/librte_power/rte_power_pmd_mgmt.c [new file with mode: 0644]
lib/librte_power/rte_power_pmd_mgmt.h [new file with mode: 0644]
lib/librte_power/version.map

index 0a3755a..c70ae12 100644 (file)
@@ -192,6 +192,48 @@ User Cases
 ----------
 The mechanism can applied to any device which is based on polling. e.g. NIC, FPGA.
 
+Ethernet PMD Power Management API
+---------------------------------
+
+Abstract
+~~~~~~~~
+
+Existing power management mechanisms require developers
+to change application design or change code to make use of them.
+The PMD power management API provides a convenient alternative
+by utilizing Ethernet PMD RX callbacks,
+and triggering power saving whenever empty poll count reaches a certain number.
+
+Monitor
+   This power saving scheme will put the CPU into optimized power state
+   and use the ``rte_power_monitor()`` function
+   to monitor the Ethernet PMD RX descriptor address,
+   and wake the CPU up whenever there's new traffic.
+
+Pause
+   This power saving scheme will avoid busy polling
+   by either entering power-optimized sleep state
+   with ``rte_power_pause()`` function,
+   or, if it's not available, use ``rte_pause()``.
+
+Frequency scaling
+   This power saving scheme will use ``librte_power`` library
+   functionality to scale the core frequency up/down
+   depending on traffic volume.
+
+.. note::
+
+   Currently, this power management API is limited to mandatory mapping
+   of 1 queue to 1 core (multiple queues are supported,
+   but they must be polled from different cores).
+
+API Overview for Ethernet PMD Power Management
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* **Queue Enable**: Enable a specific power scheme for a given queue/port/core.
+
+* **Queue Disable**: Disable the power scheme for a given queue/port/core.
+
 References
 ----------
 
index ae36b6a..8ea9577 100644 (file)
@@ -60,6 +60,16 @@ New Features
   Added ``rte_eth_get_monitor_addr()``, to be used in conjunction with
   ``rte_power_monitor()`` to enable automatic power management for PMDs.
 
+* **Added Ethernet PMD power management helper API.**
+
+  A new helper API has been added to make using Ethernet PMD power management
+  easier for the user: ``rte_power_ethdev_pmgmt_queue_enable()``. Three power
+  management schemes are supported initially:
+
+  * Power saving based on UMWAIT instruction (x86 only)
+  * Power saving based on ``rte_pause()`` (generic) or TPAUSE instruction (x86 only)
+  * Power saving based on frequency scaling through the ``librte_power`` library
+
 * **Added GENEVE TLV option in rte_flow.**
 
   Added support for matching and raw encap/decap of GENEVE TLV option.
index 5415695..9a2dcbf 100644 (file)
@@ -9,7 +9,9 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
                'power_kvm_vm.c', 'guest_channel.c',
                'rte_power_empty_poll.c',
                'power_pstate_cpufreq.c',
+               'rte_power_pmd_mgmt.c',
                'power_common.c')
 headers = files('rte_power.h','rte_power_empty_poll.h',
+       'rte_power_pmd_mgmt.h',
        'rte_power_guest_channel.h')
-deps += ['timer']
+deps += ['timer', 'ethdev']
diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
new file mode 100644 (file)
index 0000000..454ef70
--- /dev/null
@@ -0,0 +1,365 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_cpuflags.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_power_intrinsics.h>
+
+#include "rte_power_pmd_mgmt.h"
+
+#define EMPTYPOLL_MAX  512
+
+/*
+ * Internal state shared by all queues: the CPU power intrinsics that are
+ * available and the timing constants used by the pause-based scheme.
+ */
+static struct pmd_conf_data {
+       /**
+        * Which power intrinsics does this CPU support? Filled in by
+        * rte_cpu_get_intrinsics_support() on every queue enable call.
+        */
+       struct rte_cpu_intrinsics intrinsics_support;
+       /**
+        * Pre-calculated TSC diff for 1us (timer hz / US_PER_S).
+        * Stays zero until calc_tsc() has run.
+        */
+       uint64_t tsc_per_us;
+       /**
+        * How many rte_pause() calls fit in a microsecond? Only measured by
+        * calc_tsc() when the power_pause intrinsic is not supported.
+        */
+       uint64_t pause_per_us;
+} global_data;
+
+/**
+ * Possible power management states of an ethdev Rx queue.
+ *
+ * One value is stored per queue in struct pmd_queue_cfg. DISABLED is zero,
+ * so a zero-initialized queue entry starts out disabled.
+ */
+enum pmd_mgmt_state {
+       /** Queue power management is disabled. */
+       PMD_MGMT_DISABLED = 0,
+       /** Queue power management is enabled. */
+       PMD_MGMT_ENABLED
+};
+
+struct pmd_queue_cfg {
+       volatile enum pmd_mgmt_state pwr_mgmt_state;
+       /**< State of power management for this queue; written by the enable
+        * and disable calls and checked by the monitor callback right before
+        * it goes to sleep
+        */
+       enum rte_power_pmd_mgmt_type cb_mode;
+       /**< Callback mode for this queue, as passed to the enable call */
+       const struct rte_eth_rxtx_callback *cur_cb;
+       /**< Callback instance returned by rte_eth_add_rx_callback();
+        * reset to NULL by the disable call
+        */
+       volatile bool umwait_in_progress;
+       /**< are we currently sleeping? Set by the monitor callback around its
+        * rte_power_monitor() call; the disable call keeps sending wakeups
+        * for as long as this is true
+        */
+       uint64_t empty_poll_stats;
+       /**< Number of consecutive empty polls; the callbacks reset it to zero
+        * whenever a poll returns packets
+        */
+} __rte_cache_aligned;
+
+static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
+
+/*
+ * Pre-compute the timing constants kept in global_data.
+ *
+ * Always stores the number of timer ticks per microsecond. When the
+ * power_pause intrinsic is not supported, additionally times a fixed batch
+ * of rte_pause() calls to estimate how many of them fit in one microsecond.
+ */
+static void
+calc_tsc(void)
+{
+       const uint32_t pause_samples = 10000;
+       uint64_t ticks_per_us, tsc_begin, tsc_finish;
+       double elapsed_us, us_each;
+       uint32_t k;
+
+       /* one microsecond expressed in timer ticks */
+       ticks_per_us = rte_get_timer_hz() / US_PER_S;
+       global_data.tsc_per_us = ticks_per_us;
+
+       /* the rte_pause() calibration is only needed without tpause */
+       if (global_data.intrinsics_support.power_pause)
+               return;
+
+       /* time a fixed batch of rte_pause() calls */
+       tsc_begin = rte_rdtsc_precise();
+       for (k = 0; k < pause_samples; k++)
+               rte_pause();
+       tsc_finish = rte_rdtsc_precise();
+
+       /* microseconds per single pause, then invert to pauses per us */
+       elapsed_us = (tsc_finish - tsc_begin) / (double)ticks_per_us;
+       us_each = elapsed_us / pause_samples;
+
+       global_data.pause_per_us = (uint64_t)(1.0 / us_each);
+}
+
+/*
+ * Rx callback implementing RTE_POWER_MGMT_TYPE_MONITOR.
+ *
+ * Counts consecutive empty polls of the queue and, once the count exceeds
+ * EMPTYPOLL_MAX, sleeps in rte_power_monitor() on the condition reported by
+ * rte_eth_get_monitor_addr() for that queue. A poll that returns packets
+ * resets the count. The received burst is not touched; nb_rx is returned
+ * unchanged.
+ */
+static uint16_t
+clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
+               uint16_t nb_rx, uint16_t max_pkts __rte_unused,
+               void *addr __rte_unused)
+{
+       struct pmd_queue_cfg *q_conf;
+
+       q_conf = &port_cfg[port_id][qidx];
+
+       if (unlikely(nb_rx == 0)) {
+               q_conf->empty_poll_stats++;
+               if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+                       struct rte_power_monitor_cond pmc;
+                       /*
+                        * rte_eth_get_monitor_addr() reports failure as a
+                        * negative errno, so keep the result in a signed int
+                        * rather than truncating it to 16 bits.
+                        */
+                       int ret;
+
+                       /*
+                        * we might get a cancellation request while being
+                        * inside the callback, in which case the wakeup
+                        * wouldn't work because it would've arrived too early.
+                        *
+                        * to get around this, we notify the other thread that
+                        * we're sleeping, so that it can spin until we're done.
+                        * unsolicited wakeups are perfectly safe.
+                        */
+                       q_conf->umwait_in_progress = true;
+
+                       /* publish the flag before reading the queue state */
+                       rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+                       /* check if we need to cancel sleep */
+                       if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
+                               /* use monitoring condition to sleep */
+                               ret = rte_eth_get_monitor_addr(port_id, qidx,
+                                               &pmc);
+                               /* timeout is the largest 64-bit TSC value */
+                               if (ret == 0)
+                                       rte_power_monitor(&pmc, -1ULL);
+                       }
+                       q_conf->umwait_in_progress = false;
+
+                       /* let a waiting disable call see that we are done */
+                       rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+               }
+       } else
+               q_conf->empty_poll_stats = 0;
+
+       return nb_rx;
+}
+
+/*
+ * Rx callback implementing RTE_POWER_MGMT_TYPE_PAUSE.
+ *
+ * Once more than EMPTYPOLL_MAX consecutive polls came back empty, every
+ * further empty poll waits for roughly one microsecond, using
+ * rte_power_pause() when the intrinsic is supported and a calibrated number
+ * of rte_pause() calls otherwise. nb_rx is returned unchanged.
+ */
+static uint16_t
+clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
+               uint16_t nb_rx, uint16_t max_pkts __rte_unused,
+               void *addr __rte_unused)
+{
+       struct pmd_queue_cfg *const cfg = &port_cfg[port_id][qidx];
+       uint64_t n;
+
+       /* traffic seen: restart the empty poll count */
+       if (likely(nb_rx != 0)) {
+               cfg->empty_poll_stats = 0;
+               return nb_rx;
+       }
+
+       /* still below the threshold: keep polling at full speed */
+       cfg->empty_poll_stats++;
+       if (likely(cfg->empty_poll_stats <= EMPTYPOLL_MAX))
+               return nb_rx;
+
+       /* sleep for 1 microsecond, with tpause if we have it */
+       if (global_data.intrinsics_support.power_pause) {
+               rte_power_pause(rte_rdtsc() + global_data.tsc_per_us);
+       } else {
+               for (n = 0; n < global_data.pause_per_us; n++)
+                       rte_pause();
+       }
+
+       return nb_rx;
+}
+
+/*
+ * Rx callback implementing RTE_POWER_MGMT_TYPE_SCALE.
+ *
+ * Requests the maximum frequency for the calling lcore on every poll that
+ * returns packets, and the minimum frequency on every empty poll once more
+ * than EMPTYPOLL_MAX consecutive polls were empty. nb_rx is returned
+ * unchanged.
+ */
+static uint16_t
+clb_scale_freq(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct pmd_queue_cfg *const cfg = &port_cfg[port_id][qidx];
+
+       if (likely(nb_rx != 0)) {
+               /* traffic is back: reset the count and scale up freq */
+               cfg->empty_poll_stats = 0;
+               rte_power_freq_max(rte_lcore_id());
+               return nb_rx;
+       }
+
+       cfg->empty_poll_stats++;
+       /* idle for long enough: scale down freq */
+       if (unlikely(cfg->empty_poll_stats > EMPTYPOLL_MAX))
+               rte_power_freq_min(rte_lcore_id());
+
+       return nb_rx;
+}
+
+/*
+ * Reset the per-queue state, mark the queue enabled and attach the given
+ * Rx callback. If the callback cannot be attached, the queue is put back
+ * into the disabled state and a negative errno is returned.
+ */
+static int
+queue_cfg_arm(struct pmd_queue_cfg *queue_cfg, uint16_t port_id,
+               uint16_t queue_id, enum rte_power_pmd_mgmt_type mode,
+               rte_rx_callback_fn clb)
+{
+       /* initialize data before enabling the callback */
+       queue_cfg->empty_poll_stats = 0;
+       queue_cfg->cb_mode = mode;
+       queue_cfg->umwait_in_progress = false;
+       queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+
+       /* ensure we update our state before callback starts */
+       rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+       queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+                       clb, NULL);
+       if (queue_cfg->cur_cb != NULL)
+               return 0;
+
+       /* nothing was attached, so the queue must not look enabled */
+       queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+       rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+       RTE_LOG(DEBUG, POWER, "Failed to add Rx callback\n");
+       /*
+        * NOTE(review): port and queue were validated by the caller, so an
+        * allocation failure is presumed to be the cause - confirm against
+        * rte_errno if a more precise code is wanted.
+        */
+       return -ENOMEM;
+}
+
+/*
+ * Enable a power management scheme on one Rx queue polled by one lcore.
+ *
+ * Validates port, queue and lcore, refuses queues that are already enabled,
+ * checks that the requested scheme is usable, then installs the matching
+ * Rx callback.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments (including an unknown
+ * mode), an already enabled queue or a failed rte_power_init(), -ENOTSUP
+ * when the scheme is not supported, and -ENOMEM when the Rx callback could
+ * not be added.
+ */
+int
+rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
+               uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
+{
+       struct pmd_queue_cfg *queue_cfg;
+       struct rte_eth_dev_info info;
+       int ret;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+       if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE)
+               return -EINVAL;
+
+       if (rte_eth_dev_info_get(port_id, &info) < 0)
+               return -EINVAL;
+
+       /* check if queue id is valid */
+       if (queue_id >= info.nb_rx_queues)
+               return -EINVAL;
+
+       queue_cfg = &port_cfg[port_id][queue_id];
+
+       /* a queue can only be enabled once until it is disabled again */
+       if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED)
+               return -EINVAL;
+
+       /* we need this in various places */
+       rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);
+
+       switch (mode) {
+       case RTE_POWER_MGMT_TYPE_MONITOR:
+       {
+               struct rte_power_monitor_cond dummy;
+
+               /* check if rte_power_monitor is supported */
+               if (!global_data.intrinsics_support.power_monitor) {
+                       RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
+                       return -ENOTSUP;
+               }
+
+               /* check if the device supports the necessary PMD API */
+               if (rte_eth_get_monitor_addr(port_id, queue_id,
+                               &dummy) == -ENOTSUP) {
+                       RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
+                       return -ENOTSUP;
+               }
+
+               ret = queue_cfg_arm(queue_cfg, port_id, queue_id, mode,
+                               clb_umwait);
+               break;
+       }
+       case RTE_POWER_MGMT_TYPE_SCALE:
+       {
+               enum power_management_env env;
+               /* only PSTATE and ACPI modes are supported */
+               if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
+                               !rte_power_check_env_supported(
+                                       PM_ENV_PSTATE_CPUFREQ)) {
+                       RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
+                       return -ENOTSUP;
+               }
+               /* ensure we could initialize the power library */
+               if (rte_power_init(lcore_id))
+                       return -EINVAL;
+
+               /* ensure we initialized the correct env */
+               env = rte_power_get_env();
+               if (env != PM_ENV_ACPI_CPUFREQ &&
+                               env != PM_ENV_PSTATE_CPUFREQ) {
+                       RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
+                       return -ENOTSUP;
+               }
+
+               ret = queue_cfg_arm(queue_cfg, port_id, queue_id, mode,
+                               clb_scale_freq);
+               /* undo rte_power_init() if the callback was not attached */
+               if (ret != 0)
+                       rte_power_exit(lcore_id);
+               break;
+       }
+       case RTE_POWER_MGMT_TYPE_PAUSE:
+               /* figure out various time-to-tsc conversions */
+               if (global_data.tsc_per_us == 0)
+                       calc_tsc();
+
+               ret = queue_cfg_arm(queue_cfg, port_id, queue_id, mode,
+                               clb_pause);
+               break;
+       default:
+               /* unknown scheme: do not report success for a no-op */
+               RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+/*
+ * Disable power management on one Rx queue.
+ *
+ * Marks the queue disabled, wakes up the polling lcore if it is sleeping in
+ * the monitor callback, and removes the Rx callback. For the frequency
+ * scaling scheme the lcore is first set to maximum frequency and the power
+ * library is shut down for it afterwards.
+ *
+ * Returns 0 on success, or -EINVAL on invalid arguments or when power
+ * management is not enabled on the queue.
+ */
+int
+rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
+               uint16_t port_id, uint16_t queue_id)
+{
+       struct pmd_queue_cfg *cfg;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+       if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE)
+               return -EINVAL;
+
+       /* a wrong queue id can never have been enabled, so this covers it */
+       cfg = &port_cfg[port_id][queue_id];
+       if (cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
+               return -EINVAL;
+
+       /* stop any callbacks from progressing ... */
+       cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+       /* ... and make sure that is visible before we go on */
+       rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+       switch (cfg->cb_mode) {
+       case RTE_POWER_MGMT_TYPE_SCALE:
+               rte_power_freq_max(lcore_id);
+               rte_eth_remove_rx_callback(port_id, queue_id, cfg->cur_cb);
+               rte_power_exit(lcore_id);
+               break;
+       case RTE_POWER_MGMT_TYPE_MONITOR:
+               /*
+                * the polling thread may have just entered the callback
+                * without sleeping yet, in which case a single wakeup could
+                * arrive too early; keep waking it up until it reports that
+                * it is done sleeping.
+                */
+               while (cfg->umwait_in_progress)
+                       rte_power_monitor_wakeup(lcore_id);
+               /* fall-through */
+       case RTE_POWER_MGMT_TYPE_PAUSE:
+               rte_eth_remove_rx_callback(port_id, queue_id, cfg->cur_cb);
+               break;
+       }
+
+       /*
+        * the RX callback is only forgotten, not freed: freeing is unsafe
+        * unless all data plane threads are known to have stopped.
+        */
+       cfg->cur_cb = NULL;
+
+       return 0;
+}
diff --git a/lib/librte_power/rte_power_pmd_mgmt.h b/lib/librte_power/rte_power_pmd_mgmt.h
new file mode 100644 (file)
index 0000000..7a0ac24
--- /dev/null
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_PMD_MGMT_H
+#define _RTE_POWER_PMD_MGMT_H
+
+/**
+ * @file
+ * RTE PMD Power Management
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_power.h>
+#include <rte_atomic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * PMD Power Management Type
+ *
+ * Selects which power saving scheme is applied to an Rx queue once it has
+ * seen a run of empty polls. Values start at 1, so zero is not a valid type.
+ */
+enum rte_power_pmd_mgmt_type {
+       /** Use power-optimized monitoring to wait for incoming traffic
+        * (sleeps in rte_power_monitor() on the queue's monitor address)
+        */
+       RTE_POWER_MGMT_TYPE_MONITOR = 1,
+       /** Use power-optimized sleep to avoid busy polling
+        * (rte_power_pause() if supported, rte_pause() otherwise)
+        */
+       RTE_POWER_MGMT_TYPE_PAUSE,
+       /** Use frequency scaling when traffic is low
+        * (rte_power_freq_min() when idle, rte_power_freq_max() on traffic)
+        */
+       RTE_POWER_MGMT_TYPE_SCALE,
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Enable power management on a specified Ethernet device Rx queue and lcore.
+ *
+ * The chosen scheme is driven from an Rx callback attached to the queue and
+ * reacts to the number of consecutive empty polls seen on that queue.
+ *
+ * @note This function is not thread-safe.
+ * @note A queue that already has power management enabled has to be
+ *   disabled before it can be enabled again.
+ *
+ * @param lcore_id
+ *   The lcore the Rx queue will be polled from.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The queue identifier of the Ethernet device.
+ * @param mode
+ *   The power management scheme to use for specified Rx queue.
+ * @return
+ *   0 on success
+ *   <0 on error, for example -EINVAL for an invalid port, queue or lcore
+ *   or for a queue that is already enabled, and -ENOTSUP when the requested
+ *   scheme is not supported
+ */
+__rte_experimental
+int
+rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id,
+               uint16_t port_id, uint16_t queue_id,
+               enum rte_power_pmd_mgmt_type mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Disable power management on a specified Ethernet device Rx queue and lcore.
+ *
+ * The Rx callback installed by the enable call is removed from the queue.
+ * For the frequency scaling scheme the lcore is also set back to maximum
+ * frequency and rte_power_exit() is called for it.
+ *
+ * @note This function is not thread-safe.
+ *
+ * @param lcore_id
+ *   The lcore the Rx queue is polled from.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The queue identifier of the Ethernet device.
+ * @return
+ *   0 on success
+ *   <0 on error, for example -EINVAL for an invalid port, queue or lcore
+ *   or when power management is not enabled on the queue
+ */
+__rte_experimental
+int
+rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
+               uint16_t port_id, uint16_t queue_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
index 13f0af3..3ba9390 100644 (file)
@@ -36,6 +36,8 @@ EXPERIMENTAL {
        rte_power_poll_stat_update;
 
        # added in 21.02
+       rte_power_ethdev_pmgmt_queue_disable;
+       rte_power_ethdev_pmgmt_queue_enable;
        rte_power_guest_channel_receive_msg;
        rte_power_guest_channel_send_msg;
 };