1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2020 Intel Corporation
6 #include <rte_cycles.h>
7 #include <rte_cpuflags.h>
8 #include <rte_malloc.h>
9 #include <rte_ethdev.h>
10 #include <rte_power_intrinsics.h>
12 #include "rte_power_pmd_mgmt.h"
13 #include "power_common.h"
/* File-scope tunables set through the public rte_power_pmd_mgmt_set_*() APIs. */
15 unsigned int emptypoll_max;
16 unsigned int pause_duration;
/* Per-lcore frequency-scaling bounds; defaults (0 / UINT32_MAX) mean "not set"
 * — see the RTE_INIT constructor at the bottom of this file. */
17 unsigned int scale_freq_min[RTE_MAX_LCORE];
18 unsigned int scale_freq_max[RTE_MAX_LCORE];
/* NOTE(review): several lines of these type definitions appear to have been
 * dropped by extraction (e.g. the tsc_per_us member used by calc_tsc below,
 * the enum/struct opening lines, the queue/n_sleeps/n_queues members referenced
 * by later functions) — verify against the original file. */
20 /* store some internal state */
21 static struct pmd_conf_data {
22 /** what do we support? */
23 struct rte_cpu_intrinsics intrinsics_support;
24 /** pre-calculated tsc diff for 1us */
26 /** how many rte_pause can we fit in a microsecond? */
27 uint64_t pause_per_us;
31 * Possible power management states of an ethdev port.
34 /** Device power management is disabled. */
35 PMD_MGMT_DISABLED = 0,
36 /** Device power management is enabled. */
/* Per-queue bookkeeping entry, kept on the owning lcore's tailq. */
48 struct queue_list_entry {
49 TAILQ_ENTRY(queue_list_entry) next;
51 uint64_t n_empty_polls;
53 const struct rte_eth_rxtx_callback *cb;
/* Per-lcore power-management configuration (struct pmd_core_cfg). */
57 TAILQ_HEAD(queue_list_head, queue_list_entry) head;
58 /**< List of queues associated with this lcore */
60 /**< How many queues are in the list? */
61 volatile enum pmd_mgmt_state pwr_mgmt_state;
62 /**< State of power management for this queue */
63 enum rte_power_pmd_mgmt_type cb_mode;
64 /**< Callback mode for this queue */
65 uint64_t n_queues_ready_to_sleep;
66 /**< Number of queues ready to enter power optimized state */
67 uint64_t sleep_target;
68 /**< Prevent a queue from triggering sleep multiple times */
69 } __rte_cache_aligned;
/* One configuration slot per possible lcore. */
70 static struct pmd_core_cfg lcore_cfgs[RTE_MAX_LCORE];
/* Two queues are equal when their packed value matches — 'union queue'
 * presumably overlays a (portid, qid) pair with a single 'val' (see the
 * initializers in the enable/disable APIs below); TODO confirm, the union
 * definition is outside the visible span. */
73 queue_equal(const union queue *l, const union queue *r)
75 return l->val == r->val;
/* Copy a queue identifier from src to dst. */
79 queue_copy(union queue *dst, const union queue *src)
/* Linear search of the lcore's queue list for 'q'; returns the entry or
 * (presumably) NULL if absent — the return path is outside the visible span. */
84 static struct queue_list_entry *
85 queue_list_find(const struct pmd_core_cfg *cfg, const union queue *q)
87 struct queue_list_entry *cur;
89 TAILQ_FOREACH(cur, &cfg->head, next) {
90 if (queue_equal(&cur->queue, q))
/* Allocate a zeroed entry for queue 'q' and append it to this lcore's list.
 * Rejects duplicates; NOTE(review): error-return statements (duplicate found,
 * malloc failure) appear dropped by extraction. */
97 queue_list_add(struct pmd_core_cfg *cfg, const union queue *q)
99 struct queue_list_entry *qle;
101 /* is it already in the list? */
102 if (queue_list_find(cfg, q) != NULL)
105 qle = malloc(sizeof(*qle));
108 memset(qle, 0, sizeof(*qle));
110 queue_copy(&qle->queue, q);
111 TAILQ_INSERT_TAIL(&cfg->head, qle, next);
/* Unlink the entry for queue 'q' from the list WITHOUT freeing it —
 * ownership of the memory transfers to the caller. */
117 static struct queue_list_entry *
118 queue_list_take(struct pmd_core_cfg *cfg, const union queue *q)
120 struct queue_list_entry *found;
122 found = queue_list_find(cfg, q);
126 TAILQ_REMOVE(&cfg->head, found, next);
129 /* freeing is responsibility of the caller */
/* Populate 'pmc' (capacity 'len') with one monitor condition per queue on this
 * lcore, obtained from rte_eth_get_monitor_addr(). Fails if the list holds more
 * queues than 'len' slots. */
134 get_monitor_addresses(struct pmd_core_cfg *cfg,
135 struct rte_power_monitor_cond *pmc, size_t len)
137 const struct queue_list_entry *qle;
141 TAILQ_FOREACH(qle, &cfg->head, next) {
142 const union queue *q = &qle->queue;
143 struct rte_power_monitor_cond *cur;
145 /* attempted out of bounds access */
147 RTE_LOG(ERR, POWER, "Too many queues being monitored\n");
/* per-queue monitor condition comes from the ethdev driver */
152 ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
/* Body of the one-time timing-calibration helper (its signature lies outside
 * the visible span). Derives TSC ticks per microsecond and, when the TPAUSE
 * intrinsic is unavailable, empirically measures how many rte_pause() calls
 * fit into one microsecond so PAUSE-mode sleeps can be approximated. */
162 const uint64_t hz = rte_get_timer_hz();
163 const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */
165 global_data.tsc_per_us = tsc_per_us;
167 /* only do this if we don't have tpause */
168 if (!global_data.intrinsics_support.power_pause) {
169 const uint64_t start = rte_rdtsc_precise();
170 const uint32_t n_pauses = 10000;
171 double us, us_per_pause;
175 /* estimate number of rte_pause() calls per us*/
176 for (i = 0; i < n_pauses; i++)
179 end = rte_rdtsc_precise();
180 us = (end - start) / (double)tsc_per_us;
181 us_per_pause = us / n_pauses;
183 global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
/* Return a queue to the "traffic seen" state: clear its empty-poll counter
 * and, if it had been marked ready to sleep, remove it from the lcore's
 * ready-to-sleep tally. */
188 queue_reset(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
/* queue was marked ready iff its sleep counter caught up with the target */
190 const bool is_ready_to_sleep = qcfg->n_sleeps == cfg->sleep_target;
192 /* reset empty poll counter for this queue */
193 qcfg->n_empty_polls = 0;
194 /* reset the queue sleep counter as well */
196 /* remove the queue from list of queues ready to sleep */
197 if (is_ready_to_sleep)
198 cfg->n_queues_ready_to_sleep--;
200 * no need change the lcore sleep target counter because this lcore will
201 * reach the n_sleeps anyway, and the other cores are already counted so
202 * there's no need to do anything else.
/* Called on every empty poll of a queue. Counts empty polls and, once the
 * emptypoll_max threshold is crossed, marks the queue as ready to sleep
 * (exactly once per sleep cycle, via the sleep_target comparison). */
207 queue_can_sleep(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
209 /* this function is called - that means we have an empty poll */
210 qcfg->n_empty_polls++;
212 /* if we haven't reached threshold for empty polls, we can't sleep */
213 if (qcfg->n_empty_polls <= emptypoll_max)
217 * we've reached a point where we are able to sleep, but we still need
218 * to check if this queue has already been marked for sleeping.
220 if (qcfg->n_sleeps == cfg->sleep_target)
223 /* mark this queue as ready for sleep */
224 qcfg->n_sleeps = cfg->sleep_target;
225 cfg->n_queues_ready_to_sleep++;
/* The lcore may sleep only when every queue it polls is ready to sleep.
 * On success the ready counter is reset so the next sleep cycle starts fresh;
 * per-queue empty-poll counters are deliberately left alone (see comment). */
231 lcore_can_sleep(struct pmd_core_cfg *cfg)
233 /* are all queues ready to sleep? */
234 if (cfg->n_queues_ready_to_sleep != cfg->n_queues)
237 /* we've reached an iteration where we can sleep, reset sleep counter */
238 cfg->n_queues_ready_to_sleep = 0;
241 * we do not reset any individual queue empty poll counters, because
242 * we want to keep sleeping on every poll until we actually get traffic.
/* Rx callback for MONITOR mode with multi-queue support: when every queue on
 * this lcore has seen enough empty polls, gather a monitor condition per queue
 * and block on all of them at once with rte_power_monitor_multi(). Non-empty
 * polls reset the queue's sleep bookkeeping. */
249 clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
250 struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
251 uint16_t max_pkts __rte_unused, void *arg)
253 const unsigned int lcore = rte_lcore_id();
254 struct queue_list_entry *queue_conf = arg;
255 struct pmd_core_cfg *lcore_conf;
256 const bool empty = nb_rx == 0;
258 lcore_conf = &lcore_cfgs[lcore];
/* packets arrived — queue is active again */
263 queue_reset(lcore_conf, queue_conf);
/* VLA sized by the number of monitored queues on this lcore */
265 struct rte_power_monitor_cond pmc[lcore_conf->n_queues];
268 /* can this queue sleep? */
269 if (!queue_can_sleep(lcore_conf, queue_conf))
272 /* can this lcore sleep? */
273 if (!lcore_can_sleep(lcore_conf))
276 /* gather all monitoring conditions */
277 ret = get_monitor_addresses(lcore_conf, pmc,
278 lcore_conf->n_queues);
/* sleep until any monitored address is written to (no timeout) */
282 rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX);
/* Rx callback for single-queue MONITOR mode: after emptypoll_max consecutive
 * empty polls, fetch this queue's monitor condition and block on it with
 * rte_power_monitor(). A non-empty poll resets the empty-poll counter. */
289 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
290 uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg)
292 struct queue_list_entry *queue_conf = arg;
294 /* this callback can't do more than one queue, omit multiqueue logic */
295 if (unlikely(nb_rx == 0)) {
296 queue_conf->n_empty_polls++;
297 if (unlikely(queue_conf->n_empty_polls > emptypoll_max)) {
298 struct rte_power_monitor_cond pmc;
301 /* use monitoring condition to sleep */
302 ret = rte_eth_get_monitor_addr(port_id, qidx,
/* sleep until the monitored address changes (no timeout) */
305 rte_power_monitor(&pmc, UINT64_MAX);
/* traffic seen — restart the empty-poll count */
308 queue_conf->n_empty_polls = 0;
/* Rx callback for PAUSE mode: once all of this lcore's queues are ready to
 * sleep, stall for 'pause_duration' microseconds using TPAUSE when available,
 * otherwise a calibrated loop of rte_pause() calls. */
314 clb_pause(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
315 struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
316 uint16_t max_pkts __rte_unused, void *arg)
318 const unsigned int lcore = rte_lcore_id();
319 struct queue_list_entry *queue_conf = arg;
320 struct pmd_core_cfg *lcore_conf;
321 const bool empty = nb_rx == 0;
/* NOTE(review): this local shadows the file-scope 'pause_duration' global;
 * the value is read through the public getter instead — confirm intentional. */
322 uint32_t pause_duration = rte_power_pmd_mgmt_get_pause_duration();
324 lcore_conf = &lcore_cfgs[lcore];
/* packets arrived — queue is active again */
328 queue_reset(lcore_conf, queue_conf);
330 /* can this queue sleep? */
331 if (!queue_can_sleep(lcore_conf, queue_conf))
334 /* can this lcore sleep? */
335 if (!lcore_can_sleep(lcore_conf))
338 /* sleep for 1 microsecond, use tpause if we have it */
339 if (global_data.intrinsics_support.power_pause) {
340 const uint64_t cur = rte_rdtsc();
341 const uint64_t wait_tsc =
342 cur + global_data.tsc_per_us * pause_duration;
343 rte_power_pause(wait_tsc);
/* fallback: calibrated rte_pause() loop (see calc_tsc above) */
346 for (i = 0; i < global_data.pause_per_us * pause_duration; i++)
/* Rx callback for SCALE mode: scale the lcore frequency to max as soon as
 * traffic arrives, and down to min once every queue on the lcore has been
 * idle past the empty-poll threshold. */
355 clb_scale_freq(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
356 struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
357 uint16_t max_pkts __rte_unused, void *arg)
359 const unsigned int lcore = rte_lcore_id();
360 const bool empty = nb_rx == 0;
361 struct pmd_core_cfg *lcore_conf = &lcore_cfgs[lcore];
362 struct queue_list_entry *queue_conf = arg;
364 if (likely(!empty)) {
/* traffic seen — reset sleep bookkeeping for this queue */
366 queue_reset(lcore_conf, queue_conf);
368 /* scale up freq immediately */
369 rte_power_freq_max(rte_lcore_id());
371 /* can this queue sleep? */
372 if (!queue_can_sleep(lcore_conf, queue_conf))
375 /* can this lcore sleep? */
376 if (!lcore_can_sleep(lcore_conf))
/* whole lcore is idle — drop to minimum frequency */
379 rte_power_freq_min(rte_lcore_id());
/* Query the ethdev layer for a queue's state; true iff it is stopped.
 * NOTE(review): the error-handling path for the info-get call is outside the
 * visible span. */
386 queue_stopped(const uint16_t port_id, const uint16_t queue_id)
388 struct rte_eth_rxq_info qinfo;
390 int ret = rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo);
398 return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
/* Check that every queue configured on this lcore is stopped. */
402 cfg_queues_stopped(struct pmd_core_cfg *queue_cfg)
404 const struct queue_list_entry *entry;
406 TAILQ_FOREACH(entry, &queue_cfg->head, next) {
407 const union queue *q = &entry->queue;
408 int ret = queue_stopped(q->portid, q->qid);
/* Verify that frequency scaling can be used on this lcore: an ACPI or PSTATE
 * cpufreq environment must be supported, the power library must initialize,
 * and the resulting environment must actually be one of those two. */
416 check_scale(unsigned int lcore)
418 enum power_management_env env;
420 /* only PSTATE and ACPI modes are supported */
421 if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
422 !rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) {
423 RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
426 /* ensure we could initialize the power library */
427 if (rte_power_init(lcore))
430 /* ensure we initialized the correct env */
431 env = rte_power_get_env();
432 if (env != PM_ENV_ACPI_CPUFREQ && env != PM_ENV_PSTATE_CPUFREQ) {
433 RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
/* Verify that MONITOR mode can be used for this queue: the CPU must support
 * the monitor intrinsic, multiple queues require the multi-monitor intrinsic,
 * and the PMD must implement rte_eth_get_monitor_addr(). */
442 check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
444 struct rte_power_monitor_cond dummy;
445 bool multimonitor_supported;
447 /* check if rte_power_monitor is supported */
448 if (!global_data.intrinsics_support.power_monitor) {
449 RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
452 /* check if multi-monitor is supported */
453 multimonitor_supported =
454 global_data.intrinsics_support.power_monitor_multi;
456 /* if we're adding a new queue, do we support multiple queues? */
457 if (cfg->n_queues > 0 && !multimonitor_supported) {
458 RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
462 /* check if the device supports the necessary PMD API */
463 if (rte_eth_get_monitor_addr(qdata->portid, qdata->qid,
464 &dummy) == -ENOTSUP) {
465 RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
/* Pick the MONITOR-mode Rx callback: the multi-queue variant when the CPU
 * supports monitoring several addresses at once, the single-queue one otherwise. */
473 static inline rte_rx_callback_fn
474 get_monitor_callback(void)
476 return global_data.intrinsics_support.power_monitor_multi ?
477 clb_multiwait : clb_umwait;
/* Public API: enable power management on (port_id, queue_id) for 'lcore_id'
 * in the given mode. Validates ids, requires the queue (and every other queue
 * already managed on the lcore) to be stopped, forbids mixing modes on one
 * lcore, runs the per-mode feasibility check, adds the queue to the lcore's
 * list, and installs the Rx callback.
 * NOTE(review): several error-return lines and the switch's break/callback
 * lines are outside the visible span. */
481 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
482 uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
484 const union queue qdata = {.portid = port_id, .qid = queue_id};
485 struct pmd_core_cfg *lcore_cfg;
486 struct queue_list_entry *queue_cfg;
487 struct rte_eth_dev_info info;
488 rte_rx_callback_fn clb;
491 RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
493 if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
498 if (rte_eth_dev_info_get(port_id, &info) < 0) {
503 /* check if queue id is valid */
504 if (queue_id >= info.nb_rx_queues) {
509 /* check if the queue is stopped */
510 ret = queue_stopped(port_id, queue_id);
512 /* error means invalid queue, 0 means queue wasn't stopped */
513 ret = ret < 0 ? -EINVAL : -EBUSY;
517 lcore_cfg = &lcore_cfgs[lcore_id];
519 /* check if other queues are stopped as well */
520 ret = cfg_queues_stopped(lcore_cfg);
522 /* error means invalid queue, 0 means queue wasn't stopped */
523 ret = ret < 0 ? -EINVAL : -EBUSY;
527 /* if callback was already enabled, check current callback type */
528 if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED &&
529 lcore_cfg->cb_mode != mode) {
534 /* we need this in various places */
535 rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);
/* per-mode feasibility checks and callback selection */
538 case RTE_POWER_MGMT_TYPE_MONITOR:
539 /* check if we can add a new queue */
540 ret = check_monitor(lcore_cfg, &qdata);
544 clb = get_monitor_callback();
546 case RTE_POWER_MGMT_TYPE_SCALE:
547 clb = clb_scale_freq;
549 /* we only have to check this when enabling first queue */
550 if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED)
552 /* check if we can add a new queue */
553 ret = check_scale(lcore_id);
557 case RTE_POWER_MGMT_TYPE_PAUSE:
558 /* figure out various time-to-tsc conversions */
559 if (global_data.tsc_per_us == 0)
565 RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
569 /* add this queue to the list */
570 ret = queue_list_add(lcore_cfg, &qdata);
572 RTE_LOG(DEBUG, POWER, "Failed to add queue to list: %s\n",
576 /* new queue is always added last */
577 queue_cfg = TAILQ_LAST(&lcore_cfg->head, queue_list_head);
579 /* when enabling first queue, ensure sleep target is not 0 */
580 if (lcore_cfg->n_queues == 1 && lcore_cfg->sleep_target == 0)
581 lcore_cfg->sleep_target = 1;
583 /* initialize data before enabling the callback */
584 if (lcore_cfg->n_queues == 1) {
585 lcore_cfg->cb_mode = mode;
586 lcore_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
/* install the chosen Rx callback on the queue */
588 queue_cfg->cb = rte_eth_add_rx_callback(port_id, queue_id,
/* Public API: disable power management on (port_id, queue_id) for 'lcore_id'.
 * Requires the queue (and the lcore's other managed queues) to be stopped,
 * removes the queue from the list, uninstalls the Rx callback, and — for
 * SCALE mode, once the last queue is gone — restores max frequency and shuts
 * down the power library on the lcore. */
597 rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
598 uint16_t port_id, uint16_t queue_id)
600 const union queue qdata = {.portid = port_id, .qid = queue_id};
601 struct pmd_core_cfg *lcore_cfg;
602 struct queue_list_entry *queue_cfg;
605 RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
607 if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
610 /* check if the queue is stopped */
611 ret = queue_stopped(port_id, queue_id);
613 /* error means invalid queue, 0 means queue wasn't stopped */
614 return ret < 0 ? -EINVAL : -EBUSY;
617 /* no need to check queue id as wrong queue id would not be enabled */
618 lcore_cfg = &lcore_cfgs[lcore_id];
620 /* check if other queues are stopped as well */
621 ret = cfg_queues_stopped(lcore_cfg);
623 /* error means invalid queue, 0 means queue wasn't stopped */
624 return ret < 0 ? -EINVAL : -EBUSY;
/* nothing to disable if management was never enabled on this lcore */
627 if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
631 * There is no good/easy way to do this without race conditions, so we
632 * are just going to throw our hands in the air and hope that the user
633 * has read the documentation and has ensured that ports are stopped at
634 * the time we enter the API functions.
636 queue_cfg = queue_list_take(lcore_cfg, &qdata);
637 if (queue_cfg == NULL)
640 /* if we've removed all queues from the lists, set state to disabled */
641 if (lcore_cfg->n_queues == 0)
642 lcore_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
644 switch (lcore_cfg->cb_mode) {
645 case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
646 case RTE_POWER_MGMT_TYPE_PAUSE:
647 rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
649 case RTE_POWER_MGMT_TYPE_SCALE:
650 rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
651 /* disable power library on this lcore if this was last queue */
652 if (lcore_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) {
653 rte_power_freq_max(lcore_id);
654 rte_power_exit(lcore_id);
659 * the API doc mandates that the user stops all processing on affected
660 * ports before calling any of these API's, so we can assume that the
661 * callbacks can be freed. we're intentionally casting away const-ness.
663 rte_free((void *)queue_cfg->cb);
/* Public accessors for the empty-poll threshold (number of consecutive empty
 * polls before a queue is considered idle). */
670 rte_power_pmd_mgmt_set_emptypoll_max(unsigned int max)
676 rte_power_pmd_mgmt_get_emptypoll_max(void)
678 return emptypoll_max;
682 rte_power_pmd_mgmt_set_pause_duration(unsigned int duration)
685 RTE_LOG(ERR, POWER, "Pause duration must be greater than 0, value unchanged");
688 pause_duration = duration;
/* Public getter for the configured PAUSE-mode sleep duration (microseconds). */
694 rte_power_pmd_mgmt_get_pause_duration(void)
696 return pause_duration;
700 rte_power_pmd_mgmt_set_scaling_freq_min(unsigned int lcore, unsigned int min)
702 if (lcore >= RTE_MAX_LCORE) {
703 RTE_LOG(ERR, POWER, "Invalid lcore ID: %u\n", lcore);
707 if (min > scale_freq_max[lcore]) {
708 RTE_LOG(ERR, POWER, "Invalid min frequency: Cannot be greater than max frequency");
711 scale_freq_min[lcore] = min;
717 rte_power_pmd_mgmt_set_scaling_freq_max(unsigned int lcore, unsigned int max)
719 if (lcore >= RTE_MAX_LCORE) {
720 RTE_LOG(ERR, POWER, "Invalid lcore ID: %u\n", lcore);
724 /* Zero means 'not set'. Use UINT32_MAX to enable RTE_MIN/MAX macro use when scaling. */
727 if (max < scale_freq_min[lcore]) {
728 RTE_LOG(ERR, POWER, "Invalid max frequency: Cannot be less than min frequency");
732 scale_freq_max[lcore] = max;
738 rte_power_pmd_mgmt_get_scaling_freq_min(unsigned int lcore)
740 if (lcore >= RTE_MAX_LCORE) {
741 RTE_LOG(ERR, POWER, "Invalid lcore ID: %u\n", lcore);
745 if (scale_freq_max[lcore] == 0)
746 RTE_LOG(DEBUG, POWER, "Scaling freq min config not set. Using sysfs min freq.\n");
748 return scale_freq_min[lcore];
/* Public getter for the configured maximum scaling frequency of an lcore.
 * UINT32_MAX is the internal "not set" sentinel (see the setter above);
 * NOTE(review): the return statement inside this branch is outside the
 * visible span — presumably 0 is returned to mean "use sysfs max". */
752 rte_power_pmd_mgmt_get_scaling_freq_max(unsigned int lcore)
754 if (lcore >= RTE_MAX_LCORE) {
755 RTE_LOG(ERR, POWER, "Invalid lcore ID: %u\n", lcore);
759 if (scale_freq_max[lcore] == UINT32_MAX) {
760 RTE_LOG(DEBUG, POWER, "Scaling freq max config not set. Using sysfs max freq.\n");
764 return scale_freq_max[lcore];
/* Constructor: runs at EAL load time. Initializes every lcore's queue tailq
 * and sets the scaling-bound defaults to out-of-range sentinels (0 min,
 * UINT32_MAX max) so they are inert until explicitly configured. */
767 RTE_INIT(rte_power_ethdev_pmgmt_init) {
771 /* initialize all tailqs */
772 for (i = 0; i < RTE_DIM(lcore_cfgs); i++) {
773 struct pmd_core_cfg *cfg = &lcore_cfgs[i];
774 TAILQ_INIT(&cfg->head);
777 /* initialize config defaults */
780 /* scaling defaults out of range to ensure not used unless set by user or app */
781 for (j = 0; j < RTE_MAX_LCORE; j++) {
782 scale_freq_min[j] = 0;
783 scale_freq_max[j] = UINT32_MAX;