/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

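/*
 * Power-managed Rx polling for ethdev ports: per-queue Rx callbacks are
 * installed that let an lcore enter a power-optimized state (monitor/umwait,
 * pause, or frequency scaling) once all of its queues see enough empty polls.
 */
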
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>

#include "rte_power.h"
#include "rte_power_pmd_mgmt.h"

unsigned int emptypoll_max;

/* store some internal state */
static struct pmd_conf_data {
	/** what do we support? */
	struct rte_cpu_intrinsics intrinsics_support;
	/** pre-calculated tsc diff for 1us */
	uint64_t tsc_per_us;
	/** how many rte_pause can we fit in a microsecond? */
	uint64_t pause_per_us;
} global_data;

/**
 * Possible power management states of an ethdev port.
 */
enum pmd_mgmt_state {
	/** Device power management is disabled. */
	PMD_MGMT_DISABLED = 0,
	/** Device power management is enabled. */
	PMD_MGMT_ENABLED
};

union queue {
	uint32_t val;
	struct {
		uint16_t portid;
		uint16_t qid;
	};
};

struct queue_list_entry {
	TAILQ_ENTRY(queue_list_entry) next;
	union queue queue;
	uint64_t n_empty_polls;
	uint64_t n_sleeps;
	const struct rte_eth_rxtx_callback *cb;
};

struct pmd_core_cfg {
	TAILQ_HEAD(queue_list_head, queue_list_entry) head;
	/**< List of queues associated with this lcore */
	uint64_t n_queues;
	/**< How many queues are in the list? */
	volatile enum pmd_mgmt_state pwr_mgmt_state;
	/**< State of power management for this lcore */
	enum rte_power_pmd_mgmt_type cb_mode;
	/**< Callback mode for this lcore */
	uint64_t n_queues_ready_to_sleep;
	/**< Number of queues ready to enter power optimized state */
	uint64_t sleep_target;
	/**< Prevent a queue from triggering sleep multiple times */
} __rte_cache_aligned;

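/* per-lcore configuration, indexed by lcore id */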
static struct pmd_core_cfg lcore_cfgs[RTE_MAX_LCORE];

static inline bool
queue_equal(const union queue *l, const union queue *r)
{
	return l->val == r->val;
}

static inline void
queue_copy(union queue *dst, const union queue *src)
{
	dst->val = src->val;
}

static struct queue_list_entry *
queue_list_find(const struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *cur;

	TAILQ_FOREACH(cur, &cfg->head, next) {
		if (queue_equal(&cur->queue, q))
			return cur;
	}
	return NULL;
}

static int
queue_list_add(struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *qle;

	/* is it already in the list? */
	if (queue_list_find(cfg, q) != NULL)
		return -EEXIST;

	qle = malloc(sizeof(*qle));
	if (qle == NULL)
		return -ENOMEM;
	memset(qle, 0, sizeof(*qle));

	queue_copy(&qle->queue, q);
	TAILQ_INSERT_TAIL(&cfg->head, qle, next);
	cfg->n_queues++;

	return 0;
}

static struct queue_list_entry *
queue_list_take(struct pmd_core_cfg *cfg, const union queue *q)
{
	struct queue_list_entry *found;

	found = queue_list_find(cfg, q);
	if (found == NULL)
		return NULL;

	TAILQ_REMOVE(&cfg->head, found, next);
	cfg->n_queues--;

	/* freeing is responsibility of the caller */
	return found;
}

static inline int
get_monitor_addresses(struct pmd_core_cfg *cfg,
		struct rte_power_monitor_cond *pmc, size_t len)
{
	const struct queue_list_entry *qle;
	size_t i = 0;
	int ret;

	TAILQ_FOREACH(qle, &cfg->head, next) {
		const union queue *q = &qle->queue;
		struct rte_power_monitor_cond *cur;

		/* attempted out of bounds access */
		if (i >= len) {
			RTE_LOG(ERR, POWER, "Too many queues being monitored\n");
			return -1;
		}

		cur = &pmc[i++];
		ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
		if (ret < 0)
			return ret;
	}
	return 0;
}

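/*
 * Pre-compute the TSC-per-microsecond ratio and, when the TPAUSE instruction
 * is not available, calibrate roughly how many rte_pause() calls fit into one
 * microsecond, so that clb_pause() can busy-wait for a comparable interval.
 */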
static void
calc_tsc(void)
{
	const uint64_t hz = rte_get_timer_hz();
	const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */

	global_data.tsc_per_us = tsc_per_us;

	/* only do this if we don't have tpause */
	if (!global_data.intrinsics_support.power_pause) {
		const uint64_t start = rte_rdtsc_precise();
		const uint32_t n_pauses = 10000;
		double us, us_per_pause;
		uint64_t end;
		unsigned int i;

		/* estimate number of rte_pause() calls per us */
		for (i = 0; i < n_pauses; i++)
			rte_pause();

		end = rte_rdtsc_precise();
		us = (end - start) / (double)tsc_per_us;
		us_per_pause = us / n_pauses;

		global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
	}
}

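/*
 * Sleep-target bookkeeping: each queue counts its own empty polls. Once a
 * queue exceeds emptypoll_max, it "votes" for sleep by setting its n_sleeps
 * to the lcore's current sleep_target. When every queue on the lcore has
 * voted (n_queues_ready_to_sleep == n_queues), the lcore sleeps once and
 * bumps sleep_target, so every queue must vote again before the next sleep.
 * Any non-empty poll retracts that queue's vote via queue_reset().
 */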
static inline void
queue_reset(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
{
	const bool is_ready_to_sleep = qcfg->n_sleeps == cfg->sleep_target;

	/* reset empty poll counter for this queue */
	qcfg->n_empty_polls = 0;
	/* reset the queue sleep counter as well */
	qcfg->n_sleeps = 0;
	/* remove the queue from list of queues ready to sleep */
	if (is_ready_to_sleep)
		cfg->n_queues_ready_to_sleep--;
	/*
	 * no need to change the lcore sleep target counter because this lcore
	 * will reach the n_sleeps anyway, and the other cores are already
	 * counted so there's no need to do anything else.
	 */
}

static inline bool
queue_can_sleep(struct pmd_core_cfg *cfg, struct queue_list_entry *qcfg)
{
	/* this function is called - that means we have an empty poll */
	qcfg->n_empty_polls++;

	/* if we haven't reached the empty poll threshold, we can't sleep */
	if (qcfg->n_empty_polls <= emptypoll_max)
		return false;

	/*
	 * we've reached a point where we are able to sleep, but we still need
	 * to check if this queue has already been marked for sleeping.
	 */
	if (qcfg->n_sleeps == cfg->sleep_target)
		return true;

	/* mark this queue as ready for sleep */
	qcfg->n_sleeps = cfg->sleep_target;
	cfg->n_queues_ready_to_sleep++;

	return true;
}

static inline bool
lcore_can_sleep(struct pmd_core_cfg *cfg)
{
	/* are all queues ready to sleep? */
	if (cfg->n_queues_ready_to_sleep != cfg->n_queues)
		return false;

	/* we've reached an iteration where we can sleep, reset sleep counter */
	cfg->n_queues_ready_to_sleep = 0;
	/* the next sleep must be voted for again by every queue */
	cfg->sleep_target++;
	/*
	 * we do not reset any individual queue empty poll counters, because
	 * we want to keep sleeping on every poll until we actually get traffic.
	 */

	return true;
}

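/*
 * Rx callbacks. One of these is installed on every managed Rx queue and runs
 * after each rte_eth_rx_burst(). On a non-empty poll the queue's counters are
 * reset; on an empty poll, once every queue on the lcore agrees, the lcore
 * sleeps (monitor/pause modes) or scales its frequency down (scale mode).
 */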
static uint16_t
clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *arg)
{
	const unsigned int lcore = rte_lcore_id();
	struct queue_list_entry *queue_conf = arg;
	struct pmd_core_cfg *lcore_conf;
	const bool empty = nb_rx == 0;

	lcore_conf = &lcore_cfgs[lcore];

	if (likely(!empty))
		/* early exit */
		queue_reset(lcore_conf, queue_conf);
	else {
		/* VLA sized to the number of queues on this lcore */
		struct rte_power_monitor_cond pmc[lcore_conf->n_queues];
		int ret;

		/* can this queue sleep? */
		if (!queue_can_sleep(lcore_conf, queue_conf))
			return nb_rx;

		/* can this lcore sleep? */
		if (!lcore_can_sleep(lcore_conf))
			return nb_rx;

		/* gather all monitoring conditions */
		ret = get_monitor_addresses(lcore_conf, pmc,
				lcore_conf->n_queues);
		if (ret < 0)
			return nb_rx;

		rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX);
	}

	return nb_rx;
}

static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg)
{
	struct queue_list_entry *queue_conf = arg;

	/* this callback can't do more than one queue, omit multiqueue logic */
	if (unlikely(nb_rx == 0)) {
		queue_conf->n_empty_polls++;
		if (unlikely(queue_conf->n_empty_polls > emptypoll_max)) {
			struct rte_power_monitor_cond pmc;
			int ret;

			/* use monitoring condition to sleep */
			ret = rte_eth_get_monitor_addr(port_id, qidx,
					&pmc);
			if (ret == 0)
				rte_power_monitor(&pmc, UINT64_MAX);
		}
	} else
		queue_conf->n_empty_polls = 0;

	return nb_rx;
}

static uint16_t
clb_pause(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *arg)
{
	const unsigned int lcore = rte_lcore_id();
	struct queue_list_entry *queue_conf = arg;
	struct pmd_core_cfg *lcore_conf;
	const bool empty = nb_rx == 0;

	lcore_conf = &lcore_cfgs[lcore];

	if (likely(!empty))
		/* early exit */
		queue_reset(lcore_conf, queue_conf);
	else {
		/* can this queue sleep? */
		if (!queue_can_sleep(lcore_conf, queue_conf))
			return nb_rx;

		/* can this lcore sleep? */
		if (!lcore_can_sleep(lcore_conf))
			return nb_rx;

		/* sleep for 1 microsecond, use tpause if we have it */
		if (global_data.intrinsics_support.power_pause) {
			const uint64_t cur = rte_rdtsc();
			const uint64_t wait_tsc =
					cur + global_data.tsc_per_us;
			rte_power_pause(wait_tsc);
		} else {
			uint64_t i;

			/* fall back to the calibrated rte_pause() loop */
			for (i = 0; i < global_data.pause_per_us; i++)
				rte_pause();
		}
	}

	return nb_rx;
}

static uint16_t
clb_scale_freq(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *arg)
{
	const unsigned int lcore = rte_lcore_id();
	const bool empty = nb_rx == 0;
	struct pmd_core_cfg *lcore_conf = &lcore_cfgs[lcore];
	struct queue_list_entry *queue_conf = arg;

	if (likely(!empty)) {
		/* early exit */
		queue_reset(lcore_conf, queue_conf);

		/* scale up freq immediately */
		rte_power_freq_max(rte_lcore_id());
	} else {
		/* can this queue sleep? */
		if (!queue_can_sleep(lcore_conf, queue_conf))
			return nb_rx;

		/* can this lcore sleep? */
		if (!lcore_can_sleep(lcore_conf))
			return nb_rx;

		rte_power_freq_min(rte_lcore_id());
	}

	return nb_rx;
}

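/*
 * Sanity-checking helpers: the enable/disable API below requires that every
 * affected Rx queue is stopped, and that the chosen power management mode is
 * actually supported on this CPU and device.
 */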
static int
queue_stopped(const uint16_t port_id, const uint16_t queue_id)
{
	struct rte_eth_rxq_info qinfo;

	int ret = rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo);
	if (ret < 0) {
		/* if the driver doesn't support queue info, assume stopped */
		if (ret == -ENOTSUP)
			return 1;
		return -1;
	}

	return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
}

static int
cfg_queues_stopped(struct pmd_core_cfg *queue_cfg)
{
	const struct queue_list_entry *entry;

	TAILQ_FOREACH(entry, &queue_cfg->head, next) {
		const union queue *q = &entry->queue;
		int ret = queue_stopped(q->portid, q->qid);
		if (ret != 1)
			return ret;
	}
	return 1;
}

static int
check_scale(unsigned int lcore)
{
	enum power_management_env env;

	/* only PSTATE and ACPI modes are supported */
	if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
			!rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) {
		RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
		return -ENOTSUP;
	}
	/* ensure we could initialize the power library */
	if (rte_power_init(lcore))
		return -EINVAL;

	/* ensure we initialized the correct env */
	env = rte_power_get_env();
	if (env != PM_ENV_ACPI_CPUFREQ && env != PM_ENV_PSTATE_CPUFREQ) {
		RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
		return -ENOTSUP;
	}

	return 0;
}

static int
check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
{
	struct rte_power_monitor_cond dummy;
	bool multimonitor_supported;

	/* check if rte_power_monitor is supported */
	if (!global_data.intrinsics_support.power_monitor) {
		RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
		return -ENOTSUP;
	}
	/* check if multi-monitor is supported */
	multimonitor_supported =
			global_data.intrinsics_support.power_monitor_multi;

	/* if we're adding a new queue, do we support multiple queues? */
	if (cfg->n_queues > 0 && !multimonitor_supported) {
		RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
		return -ENOTSUP;
	}

	/* check if the device supports the necessary PMD API */
	if (rte_eth_get_monitor_addr(qdata->portid, qdata->qid,
			&dummy) == -ENOTSUP) {
		RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
		return -ENOTSUP;
	}

	return 0;
}

static inline rte_rx_callback_fn
get_monitor_callback(void)
{
	return global_data.intrinsics_support.power_monitor_multi ?
		clb_multiwait : clb_umwait;
}

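/*
 * Example usage (a sketch; error handling omitted): with the port configured
 * but its queues stopped, enable management on each Rx queue polled by a
 * given lcore, e.g.:
 *
 *	ret = rte_power_ethdev_pmgmt_queue_enable(lcore_id, port_id, queue_id,
 *			RTE_POWER_MGMT_TYPE_MONITOR);
 *
 * and later tear it down, again with the queue stopped, via
 * rte_power_ethdev_pmgmt_queue_disable(lcore_id, port_id, queue_id).
 */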
int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
	const union queue qdata = {.portid = port_id, .qid = queue_id};
	struct pmd_core_cfg *lcore_cfg;
	struct queue_list_entry *queue_cfg;
	struct rte_eth_dev_info info;
	rte_rx_callback_fn clb;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
		ret = -EINVAL;
		goto end;
	}
	if (rte_eth_dev_info_get(port_id, &info) < 0) {
		ret = -EINVAL;
		goto end;
	}

	/* check if queue id is valid */
	if (queue_id >= info.nb_rx_queues) {
		ret = -EINVAL;
		goto end;
	}

	/* check if the queue is stopped */
	ret = queue_stopped(port_id, queue_id);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		ret = ret < 0 ? -EINVAL : -EBUSY;
		goto end;
	}

	lcore_cfg = &lcore_cfgs[lcore_id];

	/* check if other queues are stopped as well */
	ret = cfg_queues_stopped(lcore_cfg);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		ret = ret < 0 ? -EINVAL : -EBUSY;
		goto end;
	}

	/* if callback was already enabled, check current callback type */
	if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED &&
			lcore_cfg->cb_mode != mode) {
		ret = -EINVAL;
		goto end;
	}

	/* we need this in various places */
	rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);

	switch (mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
		/* check if we can add a new queue */
		ret = check_monitor(lcore_cfg, &qdata);
		if (ret < 0)
			goto end;

		clb = get_monitor_callback();
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		clb = clb_scale_freq;

		/* we only have to check this when enabling first queue */
		if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED)
			break;
		/* check if we can add a new queue */
		ret = check_scale(lcore_id);
		if (ret < 0)
			goto end;
		break;
	case RTE_POWER_MGMT_TYPE_PAUSE:
		/* figure out various time-to-tsc conversions */
		if (global_data.tsc_per_us == 0)
			calc_tsc();

		clb = clb_pause;
		break;
	default:
		RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
		ret = -EINVAL;
		goto end;
	}
	/* add this queue to the list */
	ret = queue_list_add(lcore_cfg, &qdata);
	if (ret < 0) {
		RTE_LOG(DEBUG, POWER, "Failed to add queue to list: %s\n",
				strerror(-ret));
		goto end;
	}
	/* new queue is always added last */
	queue_cfg = TAILQ_LAST(&lcore_cfg->head, queue_list_head);

	/* when enabling first queue, ensure sleep target is not 0 */
	if (lcore_cfg->n_queues == 1 && lcore_cfg->sleep_target == 0)
		lcore_cfg->sleep_target = 1;

	/* initialize data before enabling the callback */
	if (lcore_cfg->n_queues == 1) {
		lcore_cfg->cb_mode = mode;
		lcore_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
	}
	queue_cfg->cb = rte_eth_add_rx_callback(port_id, queue_id,
			clb, queue_cfg);

	ret = 0;
end:
	return ret;
}

int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
		uint16_t port_id, uint16_t queue_id)
{
	const union queue qdata = {.portid = port_id, .qid = queue_id};
	struct pmd_core_cfg *lcore_cfg;
	struct queue_list_entry *queue_cfg;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
		return -EINVAL;

	/* check if the queue is stopped */
	ret = queue_stopped(port_id, queue_id);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		return ret < 0 ? -EINVAL : -EBUSY;
	}

	/* no need to check queue id as wrong queue id would not be enabled */
	lcore_cfg = &lcore_cfgs[lcore_id];

	/* check if other queues are stopped as well */
	ret = cfg_queues_stopped(lcore_cfg);
	if (ret != 1) {
		/* error means invalid queue, 0 means queue wasn't stopped */
		return ret < 0 ? -EINVAL : -EBUSY;
	}

	if (lcore_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
		return -EINVAL;

	/*
	 * There is no good/easy way to do this without race conditions, so we
	 * are just going to throw our hands in the air and hope that the user
	 * has read the documentation and has ensured that ports are stopped at
	 * the time we enter the API functions.
	 */
	queue_cfg = queue_list_take(lcore_cfg, &qdata);
	if (queue_cfg == NULL)
		return -ENOENT;

	/* if we've removed all queues from the lists, set state to disabled */
	if (lcore_cfg->n_queues == 0)
		lcore_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;

	switch (lcore_cfg->cb_mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR: /* fall-through */
	case RTE_POWER_MGMT_TYPE_PAUSE:
		rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		rte_eth_remove_rx_callback(port_id, queue_id, queue_cfg->cb);
		/* disable power library on this lcore if this was last queue */
		if (lcore_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) {
			rte_power_freq_max(lcore_id);
			rte_power_exit(lcore_id);
		}
		break;
	}
	/*
	 * the API doc mandates that the user stops all processing on affected
	 * ports before calling any of these APIs, so we can assume that the
	 * callbacks can be freed. we're intentionally casting away const-ness.
	 */
	rte_free((void *)queue_cfg->cb);
	free(queue_cfg);

	return 0;
}

void
rte_power_pmd_mgmt_set_emptypoll_max(unsigned int max)
{
	emptypoll_max = max;
}

unsigned int
rte_power_pmd_mgmt_get_emptypoll_max(void)
{
	return emptypoll_max;
}

RTE_INIT(rte_power_ethdev_pmgmt_init) {
	size_t i;

	/* initialize all tailqs */
	for (i = 0; i < RTE_DIM(lcore_cfgs); i++) {
		struct pmd_core_cfg *cfg = &lcore_cfgs[i];
		TAILQ_INIT(&cfg->head);
	}

	/* initialize config defaults */
	emptypoll_max = 512;
}