#include <errno.h>
#include <inttypes.h>
-#include <rte_atomic.h>
#include <rte_memcpy.h>
#include <rte_memory.h>
#include <rte_string_fns.h>
} \
} while (0)
+/* macros used for rounding frequency to nearest 100000 */
+#define FREQ_ROUNDING_DELTA 50000
+#define ROUND_FREQ_TO_N_100000 100000
#define POWER_CONVERT_TO_DECIMAL 10
#define BUS_FREQ 100000
"/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
#define POWER_SYSFILE_BASE_FREQ \
"/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
+#define POWER_PSTATE_DRIVER "intel_pstate"
#define POWER_MSR_PATH "/dev/cpu/%u/msr"
/*
uint32_t non_turbo_max_ratio; /**< Non Turbo Max ratio */
uint32_t sys_max_freq; /**< system wide max freq */
uint32_t core_base_freq; /**< core base freq */
- volatile uint32_t state; /**< Power in use state */
+ uint32_t state; /**< Power in use state */
uint16_t turbo_available; /**< Turbo Boost available */
uint16_t turbo_enable; /**< Turbo Boost enable/disable */
uint16_t priority_core; /**< High Performance core */
static int
power_init_for_setting_freq(struct pstate_power_info *pi)
{
- FILE *f_min, *f_max, *f_base;
+ FILE *f_min, *f_max, *f_base = NULL, *f_base_max;
char fullpath_min[PATH_MAX];
char fullpath_max[PATH_MAX];
char fullpath_base[PATH_MAX];
+ char fullpath_base_max[PATH_MAX];
char buf_base[BUFSIZ];
char *s_base;
+ char *s_base_max;
uint32_t base_ratio = 0;
+ uint32_t base_max_ratio = 0;
uint64_t max_non_turbo = 0;
int ret_val = 0;
- snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ,
+ snprintf(fullpath_base_max,
+ sizeof(fullpath_base_max),
+ POWER_SYSFILE_BASE_MAX_FREQ,
pi->lcore_id);
+ f_base_max = fopen(fullpath_base_max, "r");
+ FOPEN_OR_ERR_RET(f_base_max, -1);
+ if (f_base_max != NULL) {
+ s_base_max = fgets(buf_base, sizeof(buf_base), f_base_max);
+
+ /* close the file unconditionally */
+ fclose(f_base_max);
+ f_base_max = NULL;
+
+ FOPS_OR_NULL_GOTO(s_base_max, out);
+
+ buf_base[BUFSIZ-1] = '\0';
+ if (strlen(buf_base))
+ /* Strip off terminating '\n' */
+ strtok(buf_base, "\n");
+
+ base_max_ratio =
+ strtoul(buf_base, NULL, POWER_CONVERT_TO_DECIMAL)
+ / BUS_FREQ;
+ }
+ snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ,
+ pi->lcore_id);
f_min = fopen(fullpath_min, "rw+");
FOPEN_OR_ERR_RET(f_min, -1);
snprintf(fullpath_max, sizeof(fullpath_max), POWER_SYSFILE_MAX_FREQ,
pi->lcore_id);
-
f_max = fopen(fullpath_max, "rw+");
if (f_max == NULL)
fclose(f_min);
-
FOPEN_OR_ERR_RET(f_max, -1);
pi->f_cur_min = f_min;
pi->non_turbo_max_ratio = max_non_turbo;
+ /*
+ * If base_frequency is reported as greater than the maximum
+ * turbo frequency, that's a known issue with some kernels.
+ * Set base_frequency to max_non_turbo as a workaround.
+ */
+ if (base_ratio > base_max_ratio) {
+ /* base_ratio is greater than max turbo. Kernel bug. */
+ pi->priority_core = 0;
+ goto out;
+ }
+
/*
* If base_frequency is reported as greater than the maximum
* non-turbo frequency, then mark it as a high priority core.
/* Strip off terminating '\n' */
strtok(buf, "\n");
+ /* Save the original governor */
+ rte_strscpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
+
/* Check if current governor is performance */
if (strncmp(buf, POWER_GOVERNOR_PERF,
sizeof(POWER_GOVERNOR_PERF)) == 0) {
"already performance\n", pi->lcore_id);
goto out;
}
- /* Save the original governor */
- strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
/* Write 'performance' to the governor */
val = fseek(f, 0, SEEK_SET);
return ret;
}
+/*
+ * Read the lcore's current frequency from its sysfs file, round it to the
+ * nearest multiple of ROUND_FREQ_TO_N_100000 and, if that value appears in
+ * pi->freqs[], record the matching position in pi->curr_idx.
+ *
+ * Returns 0 once the value has been read; pi->curr_idx is left untouched
+ * when no table entry matches. A NULL result from fgets() is handed to
+ * FOPS_OR_NULL_GOTO, which is expected to branch to the cleanup label so
+ * that the initial -1 is returned (macro defined elsewhere - confirm).
+ */
+static int
+power_get_cur_idx(struct pstate_power_info *pi)
+{
+	char path[PATH_MAX];
+	char line[BUFSIZ];
+	char *got;
+	char *eol;
+	FILE *fp;
+	uint32_t cur_freq = 0;
+	unsigned int rounded = 0;
+	unsigned int idx;
+	int rc = -1;
+
+	snprintf(path, sizeof(path), POWER_SYSFILE_CUR_FREQ, pi->lcore_id);
+	fp = fopen(path, "r");
+	FOPEN_OR_ERR_RET(fp, rc);
+
+	got = fgets(line, sizeof(line), fp);
+	FOPS_OR_NULL_GOTO(got, done);
+
+	/* Cut the line at its newline, if there is one, before parsing. */
+	eol = strchr(line, '\n');
+	if (eol != NULL)
+		*eol = 0;
+	cur_freq = strtoul(line, NULL, POWER_CONVERT_TO_DECIMAL);
+
+	/*
+	 * Round to the nearest multiple of ROUND_FREQ_TO_N_100000,
+	 * e.g. 1396789 becomes 1400000 and 800030 becomes 800000.
+	 */
+	rounded = ((cur_freq + FREQ_ROUNDING_DELTA) / ROUND_FREQ_TO_N_100000)
+			* ROUND_FREQ_TO_N_100000;
+
+	/* Look for the first table entry equal to the rounded value. */
+	idx = 0;
+	while (idx < pi->nb_freqs && rounded != pi->freqs[idx])
+		idx++;
+	if (idx < pi->nb_freqs)
+		pi->curr_idx = idx;
+
+	rc = 0;
+done:
+	fclose(fp);
+	return rc;
+}
+
+/*
+ * Ask cpufreq_check_scaling_driver() about the POWER_PSTATE_DRIVER
+ * ("intel_pstate") scaling driver and hand its result back unchanged.
+ */
+int
+power_pstate_cpufreq_check_supported(void)
+{
+	int driver_check = cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
+
+	return driver_check;
+}
+
int
power_pstate_cpufreq_init(unsigned int lcore_id)
{
struct pstate_power_info *pi;
+ uint32_t exp_state;
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
}
pi = &lcore_power_info[lcore_id];
- if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING)
- == 0) {
+ exp_state = POWER_IDLE;
+ /* The power in use state works as a guard variable between
+ * the CPU frequency control initialization and exit process.
+ * The ACQUIRE memory ordering here pairs with the RELEASE
+ * ordering below as lock to make sure the frequency operations
+ * in the critical section are done under the correct state.
+ */
+ if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
+ POWER_ONGOING, 0,
+ __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
RTE_LOG(INFO, POWER, "Power management of lcore %u is "
"in use\n", lcore_id);
return -1;
goto fail;
}
+ if (power_get_cur_idx(pi) < 0) {
+ RTE_LOG(ERR, POWER, "Cannot get current frequency "
+ "index of lcore %u\n", lcore_id);
+ goto fail;
+ }
/* Set freq to max by default */
if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
"power management\n", lcore_id);
- rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED);
+ exp_state = POWER_ONGOING;
+ __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_USED,
+ 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
return 0;
fail:
- rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN);
+ exp_state = POWER_ONGOING;
+ __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
+ 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
return -1;
}
power_pstate_cpufreq_exit(unsigned int lcore_id)
{
struct pstate_power_info *pi;
+ uint32_t exp_state;
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
}
pi = &lcore_power_info[lcore_id];
- if (rte_atomic32_cmpset(&(pi->state), POWER_USED, POWER_ONGOING)
- == 0) {
+ exp_state = POWER_USED;
+ /* The power in use state works as a guard variable between
+ * the CPU frequency control initialization and exit process.
+ * The ACQUIRE memory ordering here pairs with the RELEASE
+ * ordering below as lock to make sure the frequency operations
+ * in the critical section are done under the correct state.
+ */
+ if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
+ POWER_ONGOING, 0,
+ __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
RTE_LOG(INFO, POWER, "Power management of lcore %u is "
"not used\n", lcore_id);
return -1;
RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
"'performance' mode and been set back to the "
"original\n", lcore_id);
- rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE);
+ exp_state = POWER_ONGOING;
+ __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_IDLE,
+ 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
return 0;
fail:
- rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN);
+ exp_state = POWER_ONGOING;
+ __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
+ 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
return -1;
}