1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2018 Intel Corporation
17 #include <rte_memcpy.h>
18 #include <rte_memory.h>
19 #include <rte_string_fns.h>
21 #include "power_pstate_cpufreq.h"
22 #include "power_common.h"
24 /* macros used for rounding frequency to nearest 100000 */
25 #define FREQ_ROUNDING_DELTA 50000
26 #define ROUND_FREQ_TO_N_100000 100000
/* Spacing between adjacent entries of the frequency table; also the factor
 * used to convert between a ratio and a frequency value.
 */
28 #define BUS_FREQ 100000
/* Governor name requested for an lcore while it is under power management */
30 #define POWER_GOVERNOR_PERF "performance"
/* Per-CPU cpufreq sysfs files. The %u placeholder selects the CPU
 * (presumably filled with the lcore id -- confirm against the callers).
 */
31 #define POWER_SYSFILE_MAX_FREQ \
32 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq"
33 #define POWER_SYSFILE_MIN_FREQ \
34 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_min_freq"
35 #define POWER_SYSFILE_CUR_FREQ \
36 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
37 #define POWER_SYSFILE_BASE_MAX_FREQ \
38 "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_max_freq"
39 #define POWER_SYSFILE_BASE_MIN_FREQ \
40 "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
41 #define POWER_SYSFILE_BASE_FREQ \
42 "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
/* Scaling driver name checked by power_pstate_cpufreq_check_supported() */
43 #define POWER_PSTATE_DRIVER "intel_pstate"
/* Per-CPU MSR device file; power_rdmsr() formats it with the lcore id */
44 #define POWER_MSR_PATH "/dev/cpu/%u/msr"
/* MSR offset passed to power_rdmsr(), plus the mask and shift that extract
 * the maximum non-turbo ratio from the value read.
 */
49 #define PLATFORM_INFO 0x0CE
50 #define NON_TURBO_MASK 0xFF00
51 #define NON_TURBO_OFFSET 0x8
/**
 * Per-lcore bookkeeping for the pstate cpufreq backend. Instances live in
 * lcore_power_info[], which is indexed by lcore id.
 */
61 struct pstate_power_info {
62 unsigned int lcore_id; /**< Logical core id */
63 uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array, ordered high to low */
64 uint32_t nb_freqs; /**< number of valid entries in freqs */
65 FILE *f_cur_min; /**< Stream opened on scaling_min_freq */
66 FILE *f_cur_max; /**< Stream opened on scaling_max_freq */
67 char governor_ori[32]; /**< Original governor name */
68 uint32_t curr_idx; /**< Freq index in freqs array */
69 uint32_t non_turbo_max_ratio; /**< Non Turbo Max ratio */
70 uint32_t sys_max_freq; /**< system wide max freq */
71 uint32_t core_base_freq; /**< core base freq */
72 uint32_t state; /**< Power in use state, changed via atomic compare-exchange */
73 uint16_t turbo_available; /**< Turbo Boost available */
74 uint16_t turbo_enable; /**< Turbo Boost enable/disable */
75 uint16_t priority_core; /**< High Performance core */
76 } __rte_cache_aligned;
/* One state slot per possible lcore */
79 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
82 * Read the specified MSR of the given lcore.
/**
 * Read one 64-bit MSR value for an lcore through its MSR device file.
 *
 * @param msr
 *   Offset handed to pread() on the device file, i.e. the MSR to read.
 * @param val
 *   Output location; sizeof(uint64_t) bytes are read into it.
 * @param lcore_id
 *   Substituted into POWER_MSR_PATH to select the device file.
 *
 * NOTE(review): the error checks and return statements are not visible in
 * this view. The caller treats a negative return as failure; presumably the
 * open()/pread() result is returned -- confirm.
 */
86 power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
89 char fullpath[PATH_MAX];
91 snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
93 fd = open(fullpath, O_RDONLY);
96 RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
101 ret = pread(fd, val, sizeof(uint64_t), msr);
104 RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
109 POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
110 fullpath, msr, lcore_id);
112 POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
119 * Open the sys files that are later used to set the lcore frequency.
/**
 * Prepare an lcore for frequency changes.
 *
 * Opens cpuinfo_max_freq and (optionally) base_frequency for reading and
 * scaling_min_freq / scaling_max_freq for updating, reads the base max value
 * and, when the file exists, the base value, and reads the PLATFORM_INFO MSR.
 * It then stores the min/max streams in pi, records the non-turbo max ratio,
 * and sets pi->priority_core and pi->core_base_freq.
 *
 * @param pi
 *   Per-lcore state; pi->lcore_id selects the MSR device to read.
 *
 * NOTE(review): the mode string "rw+" is not one of the standard fopen()
 * modes ("r+" is). If open_core_sysfs_file() forwards it to fopen(), the
 * behaviour depends on the C library -- confirm.
 * NOTE(review): the error paths, the else branches, and the return value are
 * not visible in this view; the caller treats a negative return as failure.
 */
122 power_init_for_setting_freq(struct pstate_power_info *pi)
124 FILE *f_base = NULL, *f_base_max = NULL, *f_min = NULL, *f_max = NULL;
125 uint32_t base_ratio, base_max_ratio;
126 uint64_t max_non_turbo;
129 /* open all files we expect to have open */
130 open_core_sysfs_file(&f_base_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
132 if (f_base_max == NULL) {
133 RTE_LOG(ERR, POWER, "failed to open %s\n",
134 POWER_SYSFILE_BASE_MAX_FREQ);
138 open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
141 RTE_LOG(ERR, POWER, "failed to open %s\n",
142 POWER_SYSFILE_MIN_FREQ);
146 open_core_sysfs_file(&f_max, "rw+", POWER_SYSFILE_MAX_FREQ,
149 RTE_LOG(ERR, POWER, "failed to open %s\n",
150 POWER_SYSFILE_MAX_FREQ);
154 open_core_sysfs_file(&f_base, "r", POWER_SYSFILE_BASE_FREQ,
156 /* base ratio file may not exist in some kernels, so no error check */
158 /* read base max ratio */
159 ret = read_core_sysfs_u32(f_base_max, &base_max_ratio);
161 RTE_LOG(ERR, POWER, "Failed to read %s\n",
162 POWER_SYSFILE_BASE_MAX_FREQ);
166 /* base ratio may not exist */
167 if (f_base != NULL) {
168 ret = read_core_sysfs_u32(f_base, &base_ratio);
170 RTE_LOG(ERR, POWER, "Failed to read %s\n",
171 POWER_SYSFILE_BASE_FREQ);
178 /* Add MSR read to detect turbo status */
179 if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0)
181 /* no errors after this point */
183 /* convert ratios to bins */
184 base_max_ratio /= BUS_FREQ;
185 base_ratio /= BUS_FREQ;
187 /* assign file handles */
188 pi->f_cur_min = f_min;
189 pi->f_cur_max = f_max;
/* Isolate the max non-turbo ratio field of the PLATFORM_INFO value */
191 max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
193 POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
195 pi->non_turbo_max_ratio = (uint32_t)max_non_turbo;
198 * If base_frequency is reported as greater than the maximum
199 * turbo frequency, that's a known issue with some kernels.
200 * Set base_frequency to max_non_turbo as a workaround.
202 if (base_ratio > base_max_ratio) {
203 /* base_ratio is greater than max turbo. Kernel bug. */
204 pi->priority_core = 0;
209 * If base_frequency is reported as greater than the maximum
210 * non-turbo frequency, then mark it as a high priority core.
212 if (base_ratio > max_non_turbo)
213 pi->priority_core = 1;
215 pi->priority_core = 0;
216 pi->core_base_freq = base_ratio * BUS_FREQ;
222 /* f_min and f_max are stored, no need to close */
228 if (f_base_max != NULL)
/**
 * Move an lcore to the frequency bucket at index idx by writing the same
 * target value to both the scaling_min and scaling_max streams.
 *
 * The index is rejected when it is outside RTE_MAX_LCORE_FREQS or
 * pi->nb_freqs, and nothing is written when idx already equals
 * pi->curr_idx. Both streams are rewound before writing. For idx 0 on a
 * turbo-capable lcore the target is pi->sys_max_freq, and only when turbo is
 * enabled; otherwise the target is pi->freqs[idx]. When lowering the
 * frequency (idx > curr_idx) the min file is written first; when raising it
 * (idx < curr_idx) the max file is written first. Both streams are flushed
 * afterwards.
 *
 * @param pi
 *   Per-lcore state holding the streams and the frequency table.
 * @param idx
 *   Index into pi->freqs of the requested frequency.
 *
 * NOTE(review): the return statements and the update of pi->curr_idx are not
 * visible in this view -- confirm the exact return values.
 */
238 set_freq_internal(struct pstate_power_info *pi, uint32_t idx)
240 uint32_t target_freq = 0;
242 if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
243 RTE_LOG(ERR, POWER, "Invalid frequency index %u, which "
244 "should be less than %u\n", idx, pi->nb_freqs);
248 /* Check if it is the same as current */
249 if (idx == pi->curr_idx)
252 /* Because Intel Pstate Driver only allow user change min/max hint
253 * User need change the min/max as same value.
255 if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) {
256 RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
257 "for setting frequency for lcore %u\n",
262 if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
263 RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
264 "for setting frequency for lcore %u\n",
269 /* Turbo is available and enabled, first freq bucket is sys max freq */
270 if (pi->turbo_available && idx == 0) {
271 if (pi->turbo_enable)
272 target_freq = pi->sys_max_freq;
274 RTE_LOG(ERR, POWER, "Turbo is off, frequency can't be scaled up more %u\n",
279 target_freq = pi->freqs[idx];
281 /* Decrease freq, the min freq should be updated first */
282 if (idx > pi->curr_idx) {
284 if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
285 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
286 "lcore %u\n", pi->lcore_id);
290 if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
291 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
292 "lcore %u\n", pi->lcore_id);
296 POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
297 target_freq, pi->lcore_id);
299 fflush(pi->f_cur_min);
300 fflush(pi->f_cur_max);
304 /* Increase freq, the max freq should be updated first */
305 if (idx < pi->curr_idx) {
307 if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
308 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
309 "lcore %u\n", pi->lcore_id);
313 if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
314 RTE_LOG(ERR, POWER, "Fail to write new frequency for "
315 "lcore %u\n", pi->lcore_id);
319 POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
320 target_freq, pi->lcore_id);
322 fflush(pi->f_cur_max);
323 fflush(pi->f_cur_min);
332 * Check the current scaling governor by reading the sys file and, if it is
333 * not already 'performance', set it to 'performance' by writing the sys file.
334 * The original governor is saved so that it can be rolled back later.
/**
 * Request the "performance" governor for the lcore described by pi.
 *
 * Delegates to power_set_governor(), passing pi->governor_ori and its size
 * as the buffer for the previous governor name.
 *
 * @param pi
 *   Per-lcore state; supplies the lcore id and the save buffer.
 * @return
 *   Whatever power_set_governor() returns; the caller treats a negative
 *   value as failure.
 */
337 power_set_governor_performance(struct pstate_power_info *pi)
339 return power_set_governor(pi->lcore_id, POWER_GOVERNOR_PERF,
340 pi->governor_ori, sizeof(pi->governor_ori));
344 * Check the governor and, if needed, restore the original governor by
345 * writing the sys file.
/**
 * Ask power_set_governor() to apply the governor name saved in
 * pi->governor_ori to the lcore. No buffer is supplied for the previous
 * governor name (NULL, 0).
 *
 * @param pi
 *   Per-lcore state holding the lcore id and the saved governor name.
 * @return
 *   Whatever power_set_governor() returns; the caller treats a negative
 *   value as failure.
 */
348 power_set_governor_original(struct pstate_power_info *pi)
350 return power_set_governor(pi->lcore_id, pi->governor_ori, NULL, 0);
354 * It is to get the available frequencies of the specific lcore by reading the
/**
 * Build the table of selectable frequencies for an lcore.
 *
 * Reads cpuinfo_max_freq and cpuinfo_min_freq, stores the maximum in
 * pi->sys_max_freq, and picks the non-turbo ceiling (base_max_freq) from
 * pi->core_base_freq for a priority core or from
 * pi->non_turbo_max_ratio * BUS_FREQ otherwise. Turbo is marked available
 * when that ceiling is below the system maximum. pi->freqs is then filled
 * from high to low in BUS_FREQ steps; with turbo available, entry 0 is the
 * extra bucket base_max_freq + 1.
 *
 * @param pi
 *   Per-lcore state; priority_core, core_base_freq and non_turbo_max_ratio
 *   must already be set.
 *
 * NOTE(review): no check of num_freqs against RTE_MAX_LCORE_FREQS is visible
 * before pi->freqs is written -- confirm that the table cannot overflow.
 * NOTE(review): base_max_freq - sys_min_freq is unsigned and would wrap if
 * base_max_freq were below sys_min_freq; no guard is visible -- confirm.
 * NOTE(review): error paths and the return value are not visible in this
 * view; the caller treats a negative return as failure.
 */
358 power_get_available_freqs(struct pstate_power_info *pi)
360 FILE *f_min = NULL, *f_max = NULL;
362 uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
363 uint32_t i, num_freqs = 0;
366 open_core_sysfs_file(&f_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
369 RTE_LOG(ERR, POWER, "failed to open %s\n",
370 POWER_SYSFILE_BASE_MAX_FREQ);
374 open_core_sysfs_file(&f_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
377 RTE_LOG(ERR, POWER, "failed to open %s\n",
378 POWER_SYSFILE_BASE_MIN_FREQ);
382 /* read base ratios */
383 ret = read_core_sysfs_u32(f_max, &sys_max_freq);
385 RTE_LOG(ERR, POWER, "Failed to read %s\n",
386 POWER_SYSFILE_BASE_MAX_FREQ);
390 ret = read_core_sysfs_u32(f_min, &sys_min_freq);
392 RTE_LOG(ERR, POWER, "Failed to read %s\n",
393 POWER_SYSFILE_BASE_MIN_FREQ);
397 if (sys_max_freq < sys_min_freq)
400 pi->sys_max_freq = sys_max_freq;
402 if (pi->priority_core == 1)
403 base_max_freq = pi->core_base_freq;
405 base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ;
407 POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n",
412 if (base_max_freq < sys_max_freq)
413 pi->turbo_available = 1;
415 pi->turbo_available = 0;
417 /* If turbo is available then there is one extra freq bucket
418 * to store the sys max freq which value is base_max +1
420 num_freqs = (base_max_freq - sys_min_freq) / BUS_FREQ + 1 +
423 /* Generate the freq bucket array.
424 * If turbo is available the freq bucket[0] value is base_max +1
425 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
427 * If turbo is not available bucket[0] is base_max and so on
429 for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
430 if ((i == 0) && pi->turbo_available)
431 pi->freqs[pi->nb_freqs++] = base_max_freq + 1;
433 pi->freqs[pi->nb_freqs++] =
434 base_max_freq - (i - pi->turbo_available) * BUS_FREQ;
439 POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n",
440 num_freqs, pi->lcore_id);
/**
 * Locate the lcore's current frequency in the frequency table.
 *
 * Reads scaling_cur_freq, rounds the value to the nearest multiple of
 * ROUND_FREQ_TO_N_100000 (adding FREQ_ROUNDING_DELTA before the integer
 * division), and scans pi->freqs for an entry equal to the rounded value.
 *
 * @param pi
 *   Per-lcore state whose freqs/nb_freqs have already been populated.
 *
 * NOTE(review): what happens on a match (presumably pi->curr_idx is set) and
 * the return value are not visible in this view -- confirm. The caller
 * treats a negative return as failure.
 */
450 power_get_cur_idx(struct pstate_power_info *pi)
454 uint32_t sys_cur_freq = 0;
457 open_core_sysfs_file(&f_cur, "r", POWER_SYSFILE_CUR_FREQ,
460 RTE_LOG(ERR, POWER, "failed to open %s\n",
461 POWER_SYSFILE_CUR_FREQ);
465 ret = read_core_sysfs_u32(f_cur, &sys_cur_freq);
467 RTE_LOG(ERR, POWER, "Failed to read %s\n",
468 POWER_SYSFILE_CUR_FREQ);
472 /* convert the frequency to nearest 100000 value
473 * Ex: if sys_cur_freq=1396789 then freq_conv=1400000
474 * Ex: if sys_cur_freq=800030 then freq_conv=800000
475 * Ex: if sys_cur_freq=800030 then freq_conv=800000
477 unsigned int freq_conv = 0;
478 freq_conv = (sys_cur_freq + FREQ_ROUNDING_DELTA)
479 / ROUND_FREQ_TO_N_100000;
480 freq_conv = freq_conv * ROUND_FREQ_TO_N_100000;
482 for (i = 0; i < pi->nb_freqs; i++) {
483 if (freq_conv == pi->freqs[i]) {
/**
 * Report whether the pstate backend can be used, by asking
 * cpufreq_check_scaling_driver() about the POWER_PSTATE_DRIVER name.
 *
 * @return
 *   The result of cpufreq_check_scaling_driver(), unchanged.
 */
497 power_pstate_cpufreq_check_supported(void)
499 return cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
/**
 * Bring an lcore under pstate power management.
 *
 * After validating lcore_id, the per-lcore state word is claimed with an
 * atomic compare-exchange that expects POWER_IDLE. The function then, in
 * order: switches the governor to performance, opens the frequency files,
 * builds the frequency table, finds the current index, and sets the
 * frequency to the maximum. On success the state moves from POWER_ONGOING
 * to POWER_USED; on the failure path it moves from POWER_ONGOING to
 * POWER_UNKNOWN.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 *
 * NOTE(review): the value stored by the first compare-exchange (presumably
 * POWER_ONGOING), the jumps to the failure path, and the return values are
 * not visible in this view -- confirm.
 */
503 power_pstate_cpufreq_init(unsigned int lcore_id)
505 struct pstate_power_info *pi;
508 if (lcore_id >= RTE_MAX_LCORE) {
509 RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
510 lcore_id, RTE_MAX_LCORE - 1U);
514 pi = &lcore_power_info[lcore_id];
515 exp_state = POWER_IDLE;
516 /* The power in use state works as a guard variable between
517 * the CPU frequency control initialization and exit process.
518 * The ACQUIRE memory ordering here pairs with the RELEASE
519 * ordering below as lock to make sure the frequency operations
520 * in the critical section are done under the correct state.
522 if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
524 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
525 RTE_LOG(INFO, POWER, "Power management of lcore %u is "
526 "in use\n", lcore_id);
530 pi->lcore_id = lcore_id;
531 /* Check and set the governor */
532 if (power_set_governor_performance(pi) < 0) {
533 RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to "
534 "performance\n", lcore_id);
537 /* Init for setting lcore frequency */
538 if (power_init_for_setting_freq(pi) < 0) {
539 RTE_LOG(ERR, POWER, "Cannot init for setting frequency for "
540 "lcore %u\n", lcore_id);
544 /* Get the available frequencies */
545 if (power_get_available_freqs(pi) < 0) {
546 RTE_LOG(ERR, POWER, "Cannot get available frequencies of "
547 "lcore %u\n", lcore_id);
551 if (power_get_cur_idx(pi) < 0) {
552 RTE_LOG(ERR, POWER, "Cannot get current frequency "
553 "index of lcore %u\n", lcore_id);
557 /* Set freq to max by default */
558 if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
559 RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u "
560 "to max\n", lcore_id);
564 RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
565 "power management\n", lcore_id);
566 exp_state = POWER_ONGOING;
567 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_USED,
568 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
573 exp_state = POWER_ONGOING;
574 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
575 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
/**
 * Release an lcore from pstate power management.
 *
 * After validating lcore_id, the per-lcore state word is claimed with an
 * atomic compare-exchange that expects POWER_USED. The scaling min/max
 * streams are closed and their pointers cleared, then the original governor
 * is restored. On success the state moves from POWER_ONGOING to POWER_IDLE;
 * on the failure path it moves from POWER_ONGOING to POWER_UNKNOWN.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 *
 * NOTE(review): the value stored by the first compare-exchange (presumably
 * POWER_ONGOING), the jump to the failure path, and the return values are
 * not visible in this view -- confirm.
 */
581 power_pstate_cpufreq_exit(unsigned int lcore_id)
583 struct pstate_power_info *pi;
586 if (lcore_id >= RTE_MAX_LCORE) {
587 RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
588 lcore_id, RTE_MAX_LCORE - 1U);
591 pi = &lcore_power_info[lcore_id];
593 exp_state = POWER_USED;
594 /* The power in use state works as a guard variable between
595 * the CPU frequency control initialization and exit process.
596 * The ACQUIRE memory ordering here pairs with the RELEASE
597 * ordering below as lock to make sure the frequency operations
598 * in the critical section are under done the correct state.
600 if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
602 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
603 RTE_LOG(INFO, POWER, "Power management of lcore %u is "
604 "not used\n", lcore_id);
608 /* Close FD of setting freq */
609 fclose(pi->f_cur_min);
610 fclose(pi->f_cur_max);
611 pi->f_cur_min = NULL;
612 pi->f_cur_max = NULL;
614 /* Set the governor back to the original */
615 if (power_set_governor_original(pi) < 0) {
616 RTE_LOG(ERR, POWER, "Cannot set the governor of %u back "
617 "to the original\n", lcore_id);
621 RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
622 "'performance' mode and been set back to the "
623 "original\n", lcore_id);
624 exp_state = POWER_ONGOING;
625 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_IDLE,
626 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
631 exp_state = POWER_ONGOING;
632 __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
633 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
/**
 * Copy the lcore's frequency table into a caller-supplied buffer.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @param freqs
 *   Destination buffer; a NULL buffer is reported as an error.
 * @param num
 *   Capacity of freqs in entries; must be at least the lcore's nb_freqs.
 *
 * Exactly nb_freqs entries of uint32_t are copied with rte_memcpy().
 *
 * NOTE(review): the return statements are not visible in this view
 * (presumably the number of entries copied, or 0 on error) -- confirm.
 */
640 power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
642 struct pstate_power_info *pi;
644 if (lcore_id >= RTE_MAX_LCORE) {
645 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
650 RTE_LOG(ERR, POWER, "NULL buffer supplied\n");
654 pi = &lcore_power_info[lcore_id];
655 if (num < pi->nb_freqs) {
656 RTE_LOG(ERR, POWER, "Buffer size is not enough\n");
659 rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
/**
 * Return the index of the lcore's current frequency in its frequency table.
 *
 * @param lcore_id
 *   Logical core id.
 * @return
 *   The stored curr_idx, or RTE_POWER_INVALID_FREQ_INDEX when lcore_id is
 *   not below RTE_MAX_LCORE.
 */
665 power_pstate_cpufreq_get_freq(unsigned int lcore_id)
667 if (lcore_id >= RTE_MAX_LCORE) {
668 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
669 return RTE_POWER_INVALID_FREQ_INDEX;
672 return lcore_power_info[lcore_id].curr_idx;
/**
 * Set the lcore's frequency to the table entry at the given index.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @param index
 *   Index into the lcore's frequency table, validated by
 *   set_freq_internal().
 * @return
 *   The result of set_freq_internal() for a valid lcore id.
 *   NOTE(review): the value returned for an invalid lcore id is not visible
 *   in this view -- confirm.
 */
677 power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index)
679 if (lcore_id >= RTE_MAX_LCORE) {
680 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
684 return set_freq_internal(&(lcore_power_info[lcore_id]), index);
/**
 * Step the lcore one bucket up in frequency (one index lower in the table).
 *
 * No step is taken when the lcore is already at index 0, or at index 1 on a
 * turbo-capable lcore whose turbo is disabled, because index 0 is then the
 * turbo bucket.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @return
 *   The result of set_freq_internal() when a step is taken.
 *   NOTE(review): the values returned for an invalid lcore id and for the
 *   "already at the top" case are not visible in this view -- confirm.
 */
688 power_pstate_cpufreq_freq_up(unsigned int lcore_id)
690 struct pstate_power_info *pi;
692 if (lcore_id >= RTE_MAX_LCORE) {
693 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
697 pi = &lcore_power_info[lcore_id];
698 if (pi->curr_idx == 0 ||
699 (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable))
702 /* Frequencies in the array are from high to low. */
703 return set_freq_internal(pi, pi->curr_idx - 1);
/**
 * Step the lcore one bucket down in frequency (one index higher in the
 * table). No step is taken when the lcore is already at the last entry.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @return
 *   The result of set_freq_internal() when a step is taken.
 *   NOTE(review): the values returned for an invalid lcore id and for the
 *   "already at the bottom" case are not visible in this view -- confirm.
 */
707 power_pstate_cpufreq_freq_down(unsigned int lcore_id)
709 struct pstate_power_info *pi;
711 if (lcore_id >= RTE_MAX_LCORE) {
712 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
716 pi = &lcore_power_info[lcore_id];
717 if (pi->curr_idx + 1 == pi->nb_freqs)
720 /* Frequencies in the array are from high to low. */
721 return set_freq_internal(pi, pi->curr_idx + 1);
/**
 * Set the lcore to its highest permitted frequency.
 *
 * With turbo available and enabled, index 0 (the turbo bucket) is selected;
 * with turbo available but disabled, index 1 (max non-turbo); without turbo,
 * index 0.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @return
 *   The result of set_freq_internal() for a valid lcore id.
 *   NOTE(review): the value returned for an invalid lcore id is not visible
 *   in this view -- confirm.
 */
725 power_pstate_cpufreq_freq_max(unsigned int lcore_id)
727 if (lcore_id >= RTE_MAX_LCORE) {
728 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
732 /* Frequencies in the array are from high to low. */
733 if (lcore_power_info[lcore_id].turbo_available) {
734 if (lcore_power_info[lcore_id].turbo_enable)
736 return set_freq_internal(
737 &lcore_power_info[lcore_id], 0);
739 /* Set to max non-turbo */
740 return set_freq_internal(
741 &lcore_power_info[lcore_id], 1);
743 return set_freq_internal(&lcore_power_info[lcore_id], 0);
/**
 * Set the lcore to its lowest frequency, i.e. the last table entry.
 *
 * If nb_freqs is 0 the computed index wraps around; set_freq_internal()
 * rejects any index that is not below nb_freqs.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @return
 *   The result of set_freq_internal() for a valid lcore id.
 *   NOTE(review): the value returned for an invalid lcore id is not visible
 *   in this view -- confirm.
 */
748 power_pstate_cpufreq_freq_min(unsigned int lcore_id)
750 struct pstate_power_info *pi;
752 if (lcore_id >= RTE_MAX_LCORE) {
753 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
757 pi = &lcore_power_info[lcore_id];
759 /* Frequencies in the array are from high to low. */
760 return set_freq_internal(pi, pi->nb_freqs - 1);
/**
 * Report whether turbo is currently enabled for the lcore.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @return
 *   The lcore's turbo_enable flag for a valid lcore id.
 *   NOTE(review): the value returned for an invalid lcore id is not visible
 *   in this view -- confirm.
 */
765 power_pstate_turbo_status(unsigned int lcore_id)
767 struct pstate_power_info *pi;
769 if (lcore_id >= RTE_MAX_LCORE) {
770 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
774 pi = &lcore_power_info[lcore_id];
776 return pi->turbo_enable;
/**
 * Enable turbo for the lcore if the lcore supports it.
 *
 * turbo_enable is set to 1 when turbo_available is set; otherwise it is
 * cleared and a "Failed to enable turbo" message is emitted. Only the flag
 * is changed in the visible lines; the frequency itself is not.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 *
 * NOTE(review): the return statements are not visible in this view --
 * confirm the values returned on success and failure.
 */
780 power_pstate_enable_turbo(unsigned int lcore_id)
782 struct pstate_power_info *pi;
784 if (lcore_id >= RTE_MAX_LCORE) {
785 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
789 pi = &lcore_power_info[lcore_id];
791 if (pi->turbo_available)
792 pi->turbo_enable = 1;
794 pi->turbo_enable = 0;
796 "Failed to enable turbo on lcore %u\n",
/**
 * Disable turbo for the lcore.
 *
 * Clears turbo_enable. If the lcore is turbo-capable and currently at index
 * 0 or 1, power_pstate_cpufreq_freq_max() is called so the lcore is set to
 * the highest frequency allowed with turbo off; a failure of that call is
 * logged.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 *
 * NOTE(review): the return statements are not visible in this view --
 * confirm the values returned on success and failure.
 */
806 power_pstate_disable_turbo(unsigned int lcore_id)
808 struct pstate_power_info *pi;
810 if (lcore_id >= RTE_MAX_LCORE) {
811 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
815 pi = &lcore_power_info[lcore_id];
817 pi->turbo_enable = 0;
819 if (pi->turbo_available && pi->curr_idx <= 1) {
820 /* Try to set freq to max by default coming out of turbo */
821 if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
823 "Failed to set frequency of lcore %u to max\n",
/**
 * Fill in the power capabilities of an lcore.
 *
 * caps->capabilities is reset to 0, caps->turbo is set to 1 or 0 from the
 * lcore's turbo_available flag, and caps->priority is copied from
 * priority_core.
 *
 * @param lcore_id
 *   Logical core id; must be below RTE_MAX_LCORE.
 * @param caps
 *   Output structure; an invalid argument is reported as an error.
 *
 * NOTE(review): the argument check on caps and the return statements are
 * not visible in this view -- confirm.
 */
833 int power_pstate_get_capabilities(unsigned int lcore_id,
834 struct rte_power_core_capabilities *caps)
836 struct pstate_power_info *pi;
838 if (lcore_id >= RTE_MAX_LCORE) {
839 RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
843 RTE_LOG(ERR, POWER, "Invalid argument\n");
847 pi = &lcore_power_info[lcore_id];
848 caps->capabilities = 0;
849 caps->turbo = !!(pi->turbo_available);
850 caps->priority = pi->priority_core;