1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2018 Intel Corporation
17 #include <rte_memcpy.h>
18 #include <rte_memory.h>
19 #include <rte_string_fns.h>
21 #include "power_pstate_cpufreq.h"
22 #include "power_common.h"
24 /* macros used for rounding frequency to nearest 100000 */
/* Half of the rounding step; added before the integer division so the result rounds to nearest. */
25 #define FREQ_ROUNDING_DELTA 50000
/* Rounding step; the current frequency is divided by and then multiplied by this value. */
26 #define ROUND_FREQ_TO_N_100000 100000
/* Spacing between adjacent frequency buckets; ratios are multiplied by it to obtain frequencies. */
28 #define BUS_FREQ 100000
/* Governor name requested through power_set_governor() during init. */
30 #define POWER_GOVERNOR_PERF "performance"
/*
 * cpufreq sysfs attribute paths. %u is a CPU number (presumably the lcore id;
 * confirm at the open_core_sysfs_file() call sites).
 * scaling_min_freq/scaling_max_freq are opened for writing; the others are only read.
 */
31 #define POWER_SYSFILE_MAX_FREQ \
32 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq"
33 #define POWER_SYSFILE_MIN_FREQ \
34 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_min_freq"
35 #define POWER_SYSFILE_CUR_FREQ \
36 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
37 #define POWER_SYSFILE_BASE_MAX_FREQ \
38 "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_max_freq"
39 #define POWER_SYSFILE_BASE_MIN_FREQ \
40 "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
/* May be absent on some kernels; the code treats a missing base_frequency file as non-fatal. */
41 #define POWER_SYSFILE_BASE_FREQ \
42 "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
/* Scaling driver name passed to cpufreq_check_scaling_driver(). */
43 #define POWER_PSTATE_DRIVER "intel_pstate"
/* MSR device node path; formatted with the lcore id in power_rdmsr(). */
44 #define POWER_MSR_PATH "/dev/cpu/%u/msr"
/* MSR offset read by power_rdmsr() to obtain the maximum non-turbo ratio. */
49 #define PLATFORM_INFO 0x0CE
/* Mask and shift that extract bits 15:8 of the PLATFORM_INFO value (the max non-turbo ratio). */
50 #define NON_TURBO_MASK 0xFF00
51 #define NON_TURBO_OFFSET 0x8
/* Per-lcore bookkeeping for intel_pstate frequency control; one entry per lcore, cache aligned. */
61 struct pstate_power_info {
62 unsigned int lcore_id; /**< Logical core id, set by power_pstate_cpufreq_init() */
63 uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array, ordered from high to low */
64 uint32_t nb_freqs; /**< Number of valid entries in freqs */
65 FILE *f_cur_min; /**< stdio stream (not a raw fd) for scaling_min_freq; closed on exit */
66 FILE *f_cur_max; /**< stdio stream (not a raw fd) for scaling_max_freq; closed on exit */
67 char governor_ori[32]; /**< Original governor name, saved so it can be restored on exit */
68 uint32_t curr_idx; /**< Index of the current frequency in the freqs array */
69 uint32_t non_turbo_max_ratio; /**< Max non-turbo ratio, taken from the PLATFORM_INFO MSR */
70 uint32_t sys_max_freq; /**< System wide max freq, read from cpuinfo_max_freq */
71 uint32_t core_base_freq; /**< Core base freq (base ratio * BUS_FREQ) */
72 uint32_t state; /**< Power in use state; guard variable updated with atomic compare-exchange */
73 uint16_t turbo_available; /**< 1 when the base max freq is below sys_max_freq, else 0 */
74 uint16_t turbo_enable; /**< Turbo Boost enable/disable, set by the enable/disable turbo calls */
75 uint16_t priority_core; /**< 1 when the base ratio exceeds the max non-turbo ratio */
76 } __rte_cache_aligned;
/* Power state table indexed by lcore id; callers bounds-check the id against RTE_MAX_LCORE first. */
79 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
82 * Read the specified MSR of the given lcore.
86 power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
89 char fullpath[PATH_MAX];
91 snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
93 fd = open(fullpath, O_RDONLY);
96 RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
101 ret = pread(fd, val, sizeof(uint64_t), msr);
104 RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
109 POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
110 fullpath, msr, lcore_id);
112 POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
119 * Open the sys files that are used later for setting the lcore frequency.
122 power_init_for_setting_freq(struct pstate_power_info *pi)
124 FILE *f_base = NULL, *f_base_max = NULL, *f_min = NULL, *f_max = NULL;
125 uint32_t base_ratio, base_max_ratio;
126 uint64_t max_non_turbo;
129 /* open all files we expect to have open */
130 open_core_sysfs_file(&f_base_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
132 if (f_base_max == NULL) {
133 RTE_LOG(ERR, POWER, "failed to open %s\n",
134 POWER_SYSFILE_BASE_MAX_FREQ);
138 open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
141 RTE_LOG(ERR, POWER, "failed to open %s\n",
142 POWER_SYSFILE_MIN_FREQ);
146 open_core_sysfs_file(&f_max, "rw+", POWER_SYSFILE_MAX_FREQ,
149 RTE_LOG(ERR, POWER, "failed to open %s\n",
150 POWER_SYSFILE_MAX_FREQ);
154 open_core_sysfs_file(&f_base, "r", POWER_SYSFILE_BASE_FREQ,
156 /* base ratio file may not exist in some kernels, so no error check */
158 /* read base max ratio */
159 ret = read_core_sysfs_u32(f_base_max, &base_max_ratio);
161 RTE_LOG(ERR, POWER, "Failed to read %s\n",
162 POWER_SYSFILE_BASE_MAX_FREQ);
166 /* base ratio may not exist */
167 if (f_base != NULL) {
168 ret = read_core_sysfs_u32(f_base, &base_ratio);
170 RTE_LOG(ERR, POWER, "Failed to read %s\n",
171 POWER_SYSFILE_BASE_FREQ);
178 /* Add MSR read to detect turbo status */
179 if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0)
181 /* no errors after this point */
183 /* convert ratios to bins */
184 base_max_ratio /= BUS_FREQ;
185 base_ratio /= BUS_FREQ;
187 /* assign file handles */
188 pi->f_cur_min = f_min;
189 pi->f_cur_max = f_max;
191 max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
193 POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
195 pi->non_turbo_max_ratio = (uint32_t)max_non_turbo;
198 * If base_frequency is reported as greater than the maximum
199 * turbo frequency, that's a known issue with some kernels.
200 * Set base_frequency to max_non_turbo as a workaround.
202 if (base_ratio > base_max_ratio) {
203 /* base_ratio is greater than max turbo. Kernel bug. */
204 pi->priority_core = 0;
209 * If base_frequency is reported as greater than the maximum
210 * non-turbo frequency, then mark it as a high priority core.
212 if (base_ratio > max_non_turbo)
213 pi->priority_core = 1;
215 pi->priority_core = 0;
216 pi->core_base_freq = base_ratio * BUS_FREQ;
222 /* f_min and f_max are stored, no need to close */
228 if (f_base_max != NULL)
238 set_freq_internal(struct pstate_power_info *pi, uint32_t idx)
240 uint32_t target_freq = 0;
242 if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
243 RTE_LOG(ERR, POWER, "Invalid frequency index %u, which "
244 "should be less than %u\n", idx, pi->nb_freqs);
248 /* Check if it is the same as current */
249 if (idx == pi->curr_idx)
252 /* The Intel P-state driver only lets the user change the min/max hints,
253 * so the user needs to set min and max to the same value.
255 if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) {
256 RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
257 "for setting frequency for lcore %u\n",
262 if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
263 RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
264 "for setting frequency for lcore %u\n",
269 /* Turbo is available and enabled, first freq bucket is sys max freq */
270 if (pi->turbo_available && idx == 0) {
271 if (pi->turbo_enable)
272 target_freq = pi->sys_max_freq;
274 RTE_LOG(ERR, POWER, "Turbo is off, frequency can't be scaled up more %u\n",
279 target_freq = pi->freqs[idx];
281 /* Decrease freq, the min freq should be updated first */
282 if (idx > pi->curr_idx) {
284 if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
285 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
286 "lcore %u\n", pi->lcore_id);
290 if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
291 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
292 "lcore %u\n", pi->lcore_id);
296 POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
297 target_freq, pi->lcore_id);
299 fflush(pi->f_cur_min);
300 fflush(pi->f_cur_max);
304 /* Increase freq, the max freq should be updated first */
305 if (idx < pi->curr_idx) {
307 if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
308 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
309 "lcore %u\n", pi->lcore_id);
313 if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
314 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
315 "lcore %u\n", pi->lcore_id);
319 POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
320 target_freq, pi->lcore_id);
322 fflush(pi->f_cur_max);
323 fflush(pi->f_cur_min);
332 * Check the current scaling governor by reading the sys file and, if it is
333 * not already 'performance', set it to 'performance' by writing the sys file.
334 * The original governor is saved so that it can be rolled back.
337 power_set_governor_performance(struct pstate_power_info *pi)
339 return power_set_governor(pi->lcore_id, POWER_GOVERNOR_PERF,
340 pi->governor_ori, sizeof(pi->governor_ori));
344 * Check the governor and, if needed, set the original governor back by
345 * writing the sys file.
348 power_set_governor_original(struct pstate_power_info *pi)
350 return power_set_governor(pi->lcore_id, pi->governor_ori, NULL, 0);
354 * Get the available frequencies of the specific lcore by reading the
358 power_get_available_freqs(struct pstate_power_info *pi)
360 FILE *f_min = NULL, *f_max = NULL;
362 uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
363 uint32_t i, num_freqs = 0;
366 open_core_sysfs_file(&f_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
369 RTE_LOG(ERR, POWER, "failed to open %s\n",
370 POWER_SYSFILE_BASE_MAX_FREQ);
374 open_core_sysfs_file(&f_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
377 RTE_LOG(ERR, POWER, "failed to open %s\n",
378 POWER_SYSFILE_BASE_MIN_FREQ);
382 /* read base ratios */
383 ret = read_core_sysfs_u32(f_max, &sys_max_freq);
385 RTE_LOG(ERR, POWER, "Failed to read %s\n",
386 POWER_SYSFILE_BASE_MAX_FREQ);
390 ret = read_core_sysfs_u32(f_min, &sys_min_freq);
392 RTE_LOG(ERR, POWER, "Failed to read %s\n",
393 POWER_SYSFILE_BASE_MIN_FREQ);
397 if (sys_max_freq < sys_min_freq)
400 pi->sys_max_freq = sys_max_freq;
402 if (pi->priority_core == 1)
403 base_max_freq = pi->core_base_freq;
405 base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ;
407 POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n",
412 if (base_max_freq < sys_max_freq)
413 pi->turbo_available = 1;
415 pi->turbo_available = 0;
417 /* If turbo is available then there is one extra freq bucket
418 * for the sys max freq; the value stored in that bucket is base_max + 1
420 num_freqs = (base_max_freq - sys_min_freq) / BUS_FREQ + 1 +
422 if (num_freqs >= RTE_MAX_LCORE_FREQS) {
423 RTE_LOG(ERR, POWER, "Too many available frequencies: %d\n",
428 /* Generate the freq bucket array.
429 * If turbo is available the freq bucket[0] value is base_max +1
430 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
432 * If turbo is not available bucket[0] is base_max and so on
434 for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
435 if ((i == 0) && pi->turbo_available)
436 pi->freqs[pi->nb_freqs++] = base_max_freq + 1;
438 pi->freqs[pi->nb_freqs++] =
439 base_max_freq - (i - pi->turbo_available) * BUS_FREQ;
444 POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n",
445 num_freqs, pi->lcore_id);
457 power_get_cur_idx(struct pstate_power_info *pi)
461 uint32_t sys_cur_freq = 0;
464 open_core_sysfs_file(&f_cur, "r", POWER_SYSFILE_CUR_FREQ,
467 RTE_LOG(ERR, POWER, "failed to open %s\n",
468 POWER_SYSFILE_CUR_FREQ);
472 ret = read_core_sysfs_u32(f_cur, &sys_cur_freq);
474 RTE_LOG(ERR, POWER, "Failed to read %s\n",
475 POWER_SYSFILE_CUR_FREQ);
479 /* convert the frequency to nearest 100000 value
480 * Ex: if sys_cur_freq=1396789 then freq_conv=1400000
481 * Ex: if sys_cur_freq=800030 then freq_conv=800000
482 * Ex: if sys_cur_freq=850000 then freq_conv=900000
484 unsigned int freq_conv = 0;
485 freq_conv = (sys_cur_freq + FREQ_ROUNDING_DELTA)
486 / ROUND_FREQ_TO_N_100000;
487 freq_conv = freq_conv * ROUND_FREQ_TO_N_100000;
489 for (i = 0; i < pi->nb_freqs; i++) {
490 if (freq_conv == pi->freqs[i]) {
504 power_pstate_cpufreq_check_supported(void)
506 return cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
510 power_pstate_cpufreq_init(unsigned int lcore_id)
512 struct pstate_power_info *pi;
515 if (lcore_id >= RTE_MAX_LCORE) {
516 RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
517 lcore_id, RTE_MAX_LCORE - 1U);
521 pi = &lcore_power_info[lcore_id];
522 exp_state = POWER_IDLE;
523 /* The power in use state works as a guard variable between
524 * the CPU frequency control initialization and exit process.
525 * The ACQUIRE memory ordering here pairs with the RELEASE
526 * ordering below as lock to make sure the frequency operations
527 * in the critical section are done under the correct state.
529 if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
531 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
532 RTE_LOG(INFO, POWER, "Power management of lcore %u is "
533 "in use\n", lcore_id);
537 pi->lcore_id = lcore_id;
538 /* Check and set the governor */
539 if (power_set_governor_performance(pi) < 0) {
540 RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to "
541 "performance\n", lcore_id);
544 /* Init for setting lcore frequency */
545 if (power_init_for_setting_freq(pi) < 0) {
546 RTE_LOG(ERR, POWER, "Cannot init for setting frequency for "
547 "lcore %u\n", lcore_id);
551 /* Get the available frequencies */
552 if (power_get_available_freqs(pi) < 0) {
553 RTE_LOG(ERR, POWER, "Cannot get available frequencies of "
554 "lcore %u\n", lcore_id);
558 if (power_get_cur_idx(pi) < 0) {
559 RTE_LOG(ERR, POWER, "Cannot get current frequency "
560 "index of lcore %u\n", lcore_id);
564 /* Set freq to max by default */
565 if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
566 RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u "
567 "to max\n", lcore_id);
571 RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
572 "power management\n", lcore_id);
573 exp_state = POWER_ONGOING;
574 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_USED,
575 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
580 exp_state = POWER_ONGOING;
581 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
582 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
588 power_pstate_cpufreq_exit(unsigned int lcore_id)
590 struct pstate_power_info *pi;
593 if (lcore_id >= RTE_MAX_LCORE) {
594 RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
595 lcore_id, RTE_MAX_LCORE - 1U);
598 pi = &lcore_power_info[lcore_id];
600 exp_state = POWER_USED;
601 /* The power in use state works as a guard variable between
602 * the CPU frequency control initialization and exit process.
603 * The ACQUIRE memory ordering here pairs with the RELEASE
604 * ordering below as lock to make sure the frequency operations
605 * in the critical section are done under the correct state.
607 if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
609 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
610 RTE_LOG(INFO, POWER, "Power management of lcore %u is "
611 "not used\n", lcore_id);
615 /* Close FD of setting freq */
616 fclose(pi->f_cur_min);
617 fclose(pi->f_cur_max);
618 pi->f_cur_min = NULL;
619 pi->f_cur_max = NULL;
621 /* Set the governor back to the original */
622 if (power_set_governor_original(pi) < 0) {
623 RTE_LOG(ERR, POWER, "Cannot set the governor of %u back "
624 "to the original\n", lcore_id);
628 RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
629 "'performance' mode and been set back to the "
630 "original\n", lcore_id);
631 exp_state = POWER_ONGOING;
632 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_IDLE,
633 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
638 exp_state = POWER_ONGOING;
639 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
640 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
647 power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
649 struct pstate_power_info *pi;
651 if (lcore_id >= RTE_MAX_LCORE) {
652 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
657 RTE_LOG(ERR, POWER, "NULL buffer supplied\n");
661 pi = &lcore_power_info[lcore_id];
662 if (num < pi->nb_freqs) {
663 RTE_LOG(ERR, POWER, "Buffer size is not enough\n");
666 rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
672 power_pstate_cpufreq_get_freq(unsigned int lcore_id)
674 if (lcore_id >= RTE_MAX_LCORE) {
675 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
676 return RTE_POWER_INVALID_FREQ_INDEX;
679 return lcore_power_info[lcore_id].curr_idx;
684 power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index)
686 if (lcore_id >= RTE_MAX_LCORE) {
687 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
691 return set_freq_internal(&(lcore_power_info[lcore_id]), index);
695 power_pstate_cpufreq_freq_up(unsigned int lcore_id)
697 struct pstate_power_info *pi;
699 if (lcore_id >= RTE_MAX_LCORE) {
700 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
704 pi = &lcore_power_info[lcore_id];
705 if (pi->curr_idx == 0 ||
706 (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable))
709 /* Frequencies in the array are from high to low. */
710 return set_freq_internal(pi, pi->curr_idx - 1);
714 power_pstate_cpufreq_freq_down(unsigned int lcore_id)
716 struct pstate_power_info *pi;
718 if (lcore_id >= RTE_MAX_LCORE) {
719 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
723 pi = &lcore_power_info[lcore_id];
724 if (pi->curr_idx + 1 == pi->nb_freqs)
727 /* Frequencies in the array are from high to low. */
728 return set_freq_internal(pi, pi->curr_idx + 1);
732 power_pstate_cpufreq_freq_max(unsigned int lcore_id)
734 if (lcore_id >= RTE_MAX_LCORE) {
735 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
739 /* Frequencies in the array are from high to low. */
740 if (lcore_power_info[lcore_id].turbo_available) {
741 if (lcore_power_info[lcore_id].turbo_enable)
743 return set_freq_internal(
744 &lcore_power_info[lcore_id], 0);
746 /* Set to max non-turbo */
747 return set_freq_internal(
748 &lcore_power_info[lcore_id], 1);
750 return set_freq_internal(&lcore_power_info[lcore_id], 0);
755 power_pstate_cpufreq_freq_min(unsigned int lcore_id)
757 struct pstate_power_info *pi;
759 if (lcore_id >= RTE_MAX_LCORE) {
760 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
764 pi = &lcore_power_info[lcore_id];
766 /* Frequencies in the array are from high to low. */
767 return set_freq_internal(pi, pi->nb_freqs - 1);
772 power_pstate_turbo_status(unsigned int lcore_id)
774 struct pstate_power_info *pi;
776 if (lcore_id >= RTE_MAX_LCORE) {
777 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
781 pi = &lcore_power_info[lcore_id];
783 return pi->turbo_enable;
787 power_pstate_enable_turbo(unsigned int lcore_id)
789 struct pstate_power_info *pi;
791 if (lcore_id >= RTE_MAX_LCORE) {
792 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
796 pi = &lcore_power_info[lcore_id];
798 if (pi->turbo_available)
799 pi->turbo_enable = 1;
801 pi->turbo_enable = 0;
803 "Failed to enable turbo on lcore %u\n",
813 power_pstate_disable_turbo(unsigned int lcore_id)
815 struct pstate_power_info *pi;
817 if (lcore_id >= RTE_MAX_LCORE) {
818 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
822 pi = &lcore_power_info[lcore_id];
824 pi->turbo_enable = 0;
826 if (pi->turbo_available && pi->curr_idx <= 1) {
827 /* Try to set freq to max by default coming out of turbo */
828 if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
830 "Failed to set frequency of lcore %u to max\n",
840 int power_pstate_get_capabilities(unsigned int lcore_id,
841 struct rte_power_core_capabilities *caps)
843 struct pstate_power_info *pi;
845 if (lcore_id >= RTE_MAX_LCORE) {
846 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
850 RTE_LOG(ERR, POWER, "Invalid argument\n");
854 pi = &lcore_power_info[lcore_id];
855 caps->capabilities = 0;
856 caps->turbo = !!(pi->turbo_available);
857 caps->priority = pi->priority_core;