1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2018 Intel Corporation
17 #include <rte_memcpy.h>
18 #include <rte_memory.h>
19 #include <rte_string_fns.h>
21 #include "power_pstate_cpufreq.h"
22 #include "power_common.h"
/*
 * Rounding helpers: adding FREQ_ROUNDING_DELTA (half a step) before an
 * integer division by ROUND_FREQ_TO_N_100000 rounds a frequency to the
 * nearest multiple of 100000.
 */
#define FREQ_ROUNDING_DELTA 50000
#define ROUND_FREQ_TO_N_100000 100000

/* Scale between a ratio and a frequency; also the step between buckets */
#define BUS_FREQ 100000

/* Governor requested while this library controls an lcore */
#define POWER_GOVERNOR_PERF "performance"

/* Per-cpu cpufreq sysfs files; %u is replaced by the lcore id */
#define POWER_SYSFILE_MAX_FREQ \
		"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq"
#define POWER_SYSFILE_MIN_FREQ \
		"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_min_freq"
#define POWER_SYSFILE_CUR_FREQ \
		"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
#define POWER_SYSFILE_BASE_MAX_FREQ \
		"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_max_freq"
#define POWER_SYSFILE_BASE_MIN_FREQ \
		"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
#define POWER_SYSFILE_BASE_FREQ \
		"/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"

/* Scaling driver this backend supports */
#define POWER_PSTATE_DRIVER "intel_pstate"

/* Per-cpu MSR device; %u is replaced by the lcore id */
#define POWER_MSR_PATH "/dev/cpu/%u/msr"

/*
 * MSR read to detect turbo: the max non-turbo ratio is
 * (value & NON_TURBO_MASK) >> NON_TURBO_OFFSET.
 */
#define PLATFORM_INFO 0x0CE
#define NON_TURBO_MASK 0xFF00
#define NON_TURBO_OFFSET 0x8
61 struct pstate_power_info {
62 unsigned int lcore_id; /**< Logical core id */
63 uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
64 uint32_t nb_freqs; /**< number of available freqs */
65 FILE *f_cur_min; /**< FD of scaling_min */
66 FILE *f_cur_max; /**< FD of scaling_max */
67 char governor_ori[32]; /**< Original governor name */
68 uint32_t curr_idx; /**< Freq index in freqs array */
69 uint32_t non_turbo_max_ratio; /**< Non Turbo Max ratio */
70 uint32_t sys_max_freq; /**< system wide max freq */
71 uint32_t core_base_freq; /**< core base freq */
72 uint32_t state; /**< Power in use state */
73 uint16_t turbo_available; /**< Turbo Boost available */
74 uint16_t turbo_enable; /**< Turbo Boost enable/disable */
75 uint16_t priority_core; /**< High Performance core */
76 } __rte_cache_aligned;
79 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
82 * It is to read the specific MSR.
86 power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
89 char fullpath[PATH_MAX];
91 snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
93 fd = open(fullpath, O_RDONLY);
96 RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
101 ret = pread(fd, val, sizeof(uint64_t), msr);
104 RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
109 POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
110 fullpath, msr, lcore_id);
112 POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
119 * It is to fopen the sys file for the future setting the lcore frequency.
122 power_init_for_setting_freq(struct pstate_power_info *pi)
124 FILE *f_base = NULL, *f_base_max = NULL, *f_min = NULL, *f_max = NULL;
125 uint32_t base_ratio, base_max_ratio;
126 uint64_t max_non_turbo;
129 /* open all files we expect to have open */
130 open_core_sysfs_file(&f_base_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
132 if (f_base_max == NULL) {
133 RTE_LOG(ERR, POWER, "failed to open %s\n",
134 POWER_SYSFILE_BASE_MAX_FREQ);
138 open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
141 RTE_LOG(ERR, POWER, "failed to open %s\n",
142 POWER_SYSFILE_MIN_FREQ);
146 open_core_sysfs_file(&f_max, "rw+", POWER_SYSFILE_MAX_FREQ,
149 RTE_LOG(ERR, POWER, "failed to open %s\n",
150 POWER_SYSFILE_MAX_FREQ);
154 open_core_sysfs_file(&f_base, "r", POWER_SYSFILE_BASE_FREQ,
156 /* base ratio file may not exist in some kernels, so no error check */
158 /* read base max ratio */
159 ret = read_core_sysfs_u32(f_base_max, &base_max_ratio);
161 RTE_LOG(ERR, POWER, "Failed to read %s\n",
162 POWER_SYSFILE_BASE_MAX_FREQ);
166 /* base ratio may not exist */
167 if (f_base != NULL) {
168 ret = read_core_sysfs_u32(f_base, &base_ratio);
170 RTE_LOG(ERR, POWER, "Failed to read %s\n",
171 POWER_SYSFILE_BASE_FREQ);
178 /* Add MSR read to detect turbo status */
179 if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0)
181 /* no errors after this point */
183 /* convert ratios to bins */
184 base_max_ratio /= BUS_FREQ;
185 base_ratio /= BUS_FREQ;
187 /* assign file handles */
188 pi->f_cur_min = f_min;
189 pi->f_cur_max = f_max;
191 max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
193 POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
195 pi->non_turbo_max_ratio = (uint32_t)max_non_turbo;
198 * If base_frequency is reported as greater than the maximum
199 * turbo frequency, that's a known issue with some kernels.
200 * Set base_frequency to max_non_turbo as a workaround.
202 if (base_ratio > base_max_ratio) {
203 /* base_ratio is greater than max turbo. Kernel bug. */
204 pi->priority_core = 0;
209 * If base_frequency is reported as greater than the maximum
210 * non-turbo frequency, then mark it as a high priority core.
212 if (base_ratio > max_non_turbo)
213 pi->priority_core = 1;
215 pi->priority_core = 0;
216 pi->core_base_freq = base_ratio * BUS_FREQ;
222 /* f_min and f_max are stored, no need to close */
228 if (f_base_max != NULL)
238 set_freq_internal(struct pstate_power_info *pi, uint32_t idx)
240 uint32_t target_freq = 0;
242 if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
243 RTE_LOG(ERR, POWER, "Invalid frequency index %u, which "
244 "should be less than %u\n", idx, pi->nb_freqs);
248 /* Check if it is the same as current */
249 if (idx == pi->curr_idx)
252 /* Because Intel Pstate Driver only allow user change min/max hint
253 * User need change the min/max as same value.
255 if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) {
256 RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
257 "for setting frequency for lcore %u\n",
262 if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
263 RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
264 "for setting frequency for lcore %u\n",
269 /* Turbo is available and enabled, first freq bucket is sys max freq */
270 if (pi->turbo_available && idx == 0) {
271 if (pi->turbo_enable)
272 target_freq = pi->sys_max_freq;
274 RTE_LOG(ERR, POWER, "Turbo is off, frequency can't be scaled up more %u\n",
279 target_freq = pi->freqs[idx];
281 /* Decrease freq, the min freq should be updated first */
282 if (idx > pi->curr_idx) {
284 if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
285 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
286 "lcore %u\n", pi->lcore_id);
290 if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
291 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
292 "lcore %u\n", pi->lcore_id);
296 POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
297 target_freq, pi->lcore_id);
299 fflush(pi->f_cur_min);
300 fflush(pi->f_cur_max);
304 /* Increase freq, the max freq should be updated first */
305 if (idx < pi->curr_idx) {
307 if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
308 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
309 "lcore %u\n", pi->lcore_id);
313 if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
314 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
315 "lcore %u\n", pi->lcore_id);
319 POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
320 target_freq, pi->lcore_id);
322 fflush(pi->f_cur_max);
323 fflush(pi->f_cur_min);
332 * It is to check the current scaling governor by reading sys file, and then
333 * set it into 'performance' if it is not by writing the sys file. The original
334 * governor will be saved for rolling back.
337 power_set_governor_performance(struct pstate_power_info *pi)
339 return power_set_governor(pi->lcore_id, POWER_GOVERNOR_PERF,
340 pi->governor_ori, sizeof(pi->governor_ori));
344 * It is to check the governor and then set the original governor back if
345 * needed by writing the sys file.
348 power_set_governor_original(struct pstate_power_info *pi)
350 return power_set_governor(pi->lcore_id, pi->governor_ori, NULL, 0);
354 * It is to get the available frequencies of the specific lcore by reading the
358 power_get_available_freqs(struct pstate_power_info *pi)
360 FILE *f_min = NULL, *f_max = NULL;
362 uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
363 uint32_t i, num_freqs = 0;
366 open_core_sysfs_file(&f_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
369 RTE_LOG(ERR, POWER, "failed to open %s\n",
370 POWER_SYSFILE_BASE_MAX_FREQ);
374 open_core_sysfs_file(&f_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
377 RTE_LOG(ERR, POWER, "failed to open %s\n",
378 POWER_SYSFILE_BASE_MIN_FREQ);
382 /* read base ratios */
383 ret = read_core_sysfs_u32(f_max, &sys_max_freq);
385 RTE_LOG(ERR, POWER, "Failed to read %s\n",
386 POWER_SYSFILE_BASE_MAX_FREQ);
390 ret = read_core_sysfs_u32(f_min, &sys_min_freq);
392 RTE_LOG(ERR, POWER, "Failed to read %s\n",
393 POWER_SYSFILE_BASE_MIN_FREQ);
397 if (sys_max_freq < sys_min_freq)
400 pi->sys_max_freq = sys_max_freq;
402 if (pi->priority_core == 1)
403 base_max_freq = pi->core_base_freq;
405 base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ;
407 POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n",
412 if (base_max_freq < sys_max_freq)
413 pi->turbo_available = 1;
415 pi->turbo_available = 0;
417 /* If turbo is available then there is one extra freq bucket
418 * to store the sys max freq which value is base_max +1
420 num_freqs = (base_max_freq - sys_min_freq) / BUS_FREQ + 1 +
423 /* Generate the freq bucket array.
424 * If turbo is available the freq bucket[0] value is base_max +1
425 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
427 * If turbo is not available bucket[0] is base_max and so on
429 for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
430 if ((i == 0) && pi->turbo_available)
431 pi->freqs[pi->nb_freqs++] = base_max_freq + 1;
433 pi->freqs[pi->nb_freqs++] =
434 base_max_freq - (i - pi->turbo_available) * BUS_FREQ;
439 POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n",
440 num_freqs, pi->lcore_id);
452 power_get_cur_idx(struct pstate_power_info *pi)
456 uint32_t sys_cur_freq = 0;
459 open_core_sysfs_file(&f_cur, "r", POWER_SYSFILE_CUR_FREQ,
462 RTE_LOG(ERR, POWER, "failed to open %s\n",
463 POWER_SYSFILE_CUR_FREQ);
467 ret = read_core_sysfs_u32(f_cur, &sys_cur_freq);
469 RTE_LOG(ERR, POWER, "Failed to read %s\n",
470 POWER_SYSFILE_CUR_FREQ);
474 /* convert the frequency to nearest 100000 value
475 * Ex: if sys_cur_freq=1396789 then freq_conv=1400000
476 * Ex: if sys_cur_freq=800030 then freq_conv=800000
477 * Ex: if sys_cur_freq=800030 then freq_conv=800000
479 unsigned int freq_conv = 0;
480 freq_conv = (sys_cur_freq + FREQ_ROUNDING_DELTA)
481 / ROUND_FREQ_TO_N_100000;
482 freq_conv = freq_conv * ROUND_FREQ_TO_N_100000;
484 for (i = 0; i < pi->nb_freqs; i++) {
485 if (freq_conv == pi->freqs[i]) {
499 power_pstate_cpufreq_check_supported(void)
501 return cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
505 power_pstate_cpufreq_init(unsigned int lcore_id)
507 struct pstate_power_info *pi;
510 if (lcore_id >= RTE_MAX_LCORE) {
511 RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
512 lcore_id, RTE_MAX_LCORE - 1U);
516 pi = &lcore_power_info[lcore_id];
517 exp_state = POWER_IDLE;
518 /* The power in use state works as a guard variable between
519 * the CPU frequency control initialization and exit process.
520 * The ACQUIRE memory ordering here pairs with the RELEASE
521 * ordering below as lock to make sure the frequency operations
522 * in the critical section are done under the correct state.
524 if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
526 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
527 RTE_LOG(INFO, POWER, "Power management of lcore %u is "
528 "in use\n", lcore_id);
532 pi->lcore_id = lcore_id;
533 /* Check and set the governor */
534 if (power_set_governor_performance(pi) < 0) {
535 RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to "
536 "performance\n", lcore_id);
539 /* Init for setting lcore frequency */
540 if (power_init_for_setting_freq(pi) < 0) {
541 RTE_LOG(ERR, POWER, "Cannot init for setting frequency for "
542 "lcore %u\n", lcore_id);
546 /* Get the available frequencies */
547 if (power_get_available_freqs(pi) < 0) {
548 RTE_LOG(ERR, POWER, "Cannot get available frequencies of "
549 "lcore %u\n", lcore_id);
553 if (power_get_cur_idx(pi) < 0) {
554 RTE_LOG(ERR, POWER, "Cannot get current frequency "
555 "index of lcore %u\n", lcore_id);
559 /* Set freq to max by default */
560 if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
561 RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u "
562 "to max\n", lcore_id);
566 RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
567 "power management\n", lcore_id);
568 exp_state = POWER_ONGOING;
569 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_USED,
570 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
575 exp_state = POWER_ONGOING;
576 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
577 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
583 power_pstate_cpufreq_exit(unsigned int lcore_id)
585 struct pstate_power_info *pi;
588 if (lcore_id >= RTE_MAX_LCORE) {
589 RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
590 lcore_id, RTE_MAX_LCORE - 1U);
593 pi = &lcore_power_info[lcore_id];
595 exp_state = POWER_USED;
596 /* The power in use state works as a guard variable between
597 * the CPU frequency control initialization and exit process.
598 * The ACQUIRE memory ordering here pairs with the RELEASE
599 * ordering below as lock to make sure the frequency operations
600 * in the critical section are under done the correct state.
602 if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
604 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
605 RTE_LOG(INFO, POWER, "Power management of lcore %u is "
606 "not used\n", lcore_id);
610 /* Close FD of setting freq */
611 fclose(pi->f_cur_min);
612 fclose(pi->f_cur_max);
613 pi->f_cur_min = NULL;
614 pi->f_cur_max = NULL;
616 /* Set the governor back to the original */
617 if (power_set_governor_original(pi) < 0) {
618 RTE_LOG(ERR, POWER, "Cannot set the governor of %u back "
619 "to the original\n", lcore_id);
623 RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
624 "'performance' mode and been set back to the "
625 "original\n", lcore_id);
626 exp_state = POWER_ONGOING;
627 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_IDLE,
628 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
633 exp_state = POWER_ONGOING;
634 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
635 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
642 power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
644 struct pstate_power_info *pi;
646 if (lcore_id >= RTE_MAX_LCORE) {
647 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
652 RTE_LOG(ERR, POWER, "NULL buffer supplied\n");
656 pi = &lcore_power_info[lcore_id];
657 if (num < pi->nb_freqs) {
658 RTE_LOG(ERR, POWER, "Buffer size is not enough\n");
661 rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
667 power_pstate_cpufreq_get_freq(unsigned int lcore_id)
669 if (lcore_id >= RTE_MAX_LCORE) {
670 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
671 return RTE_POWER_INVALID_FREQ_INDEX;
674 return lcore_power_info[lcore_id].curr_idx;
679 power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index)
681 if (lcore_id >= RTE_MAX_LCORE) {
682 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
686 return set_freq_internal(&(lcore_power_info[lcore_id]), index);
690 power_pstate_cpufreq_freq_up(unsigned int lcore_id)
692 struct pstate_power_info *pi;
694 if (lcore_id >= RTE_MAX_LCORE) {
695 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
699 pi = &lcore_power_info[lcore_id];
700 if (pi->curr_idx == 0 ||
701 (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable))
704 /* Frequencies in the array are from high to low. */
705 return set_freq_internal(pi, pi->curr_idx - 1);
709 power_pstate_cpufreq_freq_down(unsigned int lcore_id)
711 struct pstate_power_info *pi;
713 if (lcore_id >= RTE_MAX_LCORE) {
714 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
718 pi = &lcore_power_info[lcore_id];
719 if (pi->curr_idx + 1 == pi->nb_freqs)
722 /* Frequencies in the array are from high to low. */
723 return set_freq_internal(pi, pi->curr_idx + 1);
727 power_pstate_cpufreq_freq_max(unsigned int lcore_id)
729 if (lcore_id >= RTE_MAX_LCORE) {
730 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
734 /* Frequencies in the array are from high to low. */
735 if (lcore_power_info[lcore_id].turbo_available) {
736 if (lcore_power_info[lcore_id].turbo_enable)
738 return set_freq_internal(
739 &lcore_power_info[lcore_id], 0);
741 /* Set to max non-turbo */
742 return set_freq_internal(
743 &lcore_power_info[lcore_id], 1);
745 return set_freq_internal(&lcore_power_info[lcore_id], 0);
750 power_pstate_cpufreq_freq_min(unsigned int lcore_id)
752 struct pstate_power_info *pi;
754 if (lcore_id >= RTE_MAX_LCORE) {
755 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
759 pi = &lcore_power_info[lcore_id];
761 /* Frequencies in the array are from high to low. */
762 return set_freq_internal(pi, pi->nb_freqs - 1);
767 power_pstate_turbo_status(unsigned int lcore_id)
769 struct pstate_power_info *pi;
771 if (lcore_id >= RTE_MAX_LCORE) {
772 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
776 pi = &lcore_power_info[lcore_id];
778 return pi->turbo_enable;
782 power_pstate_enable_turbo(unsigned int lcore_id)
784 struct pstate_power_info *pi;
786 if (lcore_id >= RTE_MAX_LCORE) {
787 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
791 pi = &lcore_power_info[lcore_id];
793 if (pi->turbo_available)
794 pi->turbo_enable = 1;
796 pi->turbo_enable = 0;
798 "Failed to enable turbo on lcore %u\n",
808 power_pstate_disable_turbo(unsigned int lcore_id)
810 struct pstate_power_info *pi;
812 if (lcore_id >= RTE_MAX_LCORE) {
813 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
817 pi = &lcore_power_info[lcore_id];
819 pi->turbo_enable = 0;
821 if (pi->turbo_available && pi->curr_idx <= 1) {
822 /* Try to set freq to max by default coming out of turbo */
823 if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
825 "Failed to set frequency of lcore %u to max\n",
835 int power_pstate_get_capabilities(unsigned int lcore_id,
836 struct rte_power_core_capabilities *caps)
838 struct pstate_power_info *pi;
840 if (lcore_id >= RTE_MAX_LCORE) {
841 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
845 RTE_LOG(ERR, POWER, "Invalid argument\n");
849 pi = &lcore_power_info[lcore_id];
850 caps->capabilities = 0;
851 caps->turbo = !!(pi->turbo_available);
852 caps->priority = pi->priority_core;