doc: add Meson coding style to contributors guide
[dpdk.git] / lib / librte_power / power_pstate_cpufreq.c
index 2d8a949..2cfc54a 100644 (file)
@@ -14,7 +14,6 @@
 #include <errno.h>
 #include <inttypes.h>
 
-#include <rte_atomic.h>
 #include <rte_memcpy.h>
 #include <rte_memory.h>
 #include <rte_string_fns.h>
@@ -52,6 +51,9 @@
                } \
 } while (0)
 
+/* macros used for rounding frequency to nearest 100000 */
+#define FREQ_ROUNDING_DELTA 50000
+#define ROUND_FREQ_TO_N_100000 100000
 
 #define POWER_CONVERT_TO_DECIMAL 10
 #define BUS_FREQ     100000
@@ -71,6 +73,7 @@
                "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
 #define POWER_SYSFILE_BASE_FREQ  \
                "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
+#define POWER_PSTATE_DRIVER "intel_pstate"
 #define POWER_MSR_PATH  "/dev/cpu/%u/msr"
 
 /*
@@ -99,7 +102,7 @@ struct pstate_power_info {
        uint32_t non_turbo_max_ratio;        /**< Non Turbo Max ratio  */
        uint32_t sys_max_freq;               /**< system wide max freq  */
        uint32_t core_base_freq;             /**< core base freq  */
-       volatile uint32_t state;             /**< Power in use state */
+       uint32_t state;                      /**< Power in use state */
        uint16_t turbo_available;            /**< Turbo Boost available */
        uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
        uint16_t priority_core;              /**< High Performance core */
@@ -151,29 +154,54 @@ out:      close(fd);
 static int
 power_init_for_setting_freq(struct pstate_power_info *pi)
 {
-       FILE *f_min, *f_max, *f_base;
+       FILE *f_min, *f_max, *f_base = NULL, *f_base_max;
        char fullpath_min[PATH_MAX];
        char fullpath_max[PATH_MAX];
        char fullpath_base[PATH_MAX];
+       char fullpath_base_max[PATH_MAX];
        char buf_base[BUFSIZ];
        char *s_base;
+       char *s_base_max;
        uint32_t base_ratio = 0;
+       uint32_t base_max_ratio = 0;
        uint64_t max_non_turbo = 0;
        int  ret_val = 0;
 
-       snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ,
+       snprintf(fullpath_base_max,
+                       sizeof(fullpath_base_max),
+                       POWER_SYSFILE_BASE_MAX_FREQ,
                        pi->lcore_id);
+       f_base_max = fopen(fullpath_base_max, "r");
+       FOPEN_OR_ERR_RET(f_base_max, -1);
+       if (f_base_max != NULL) {
+               s_base_max = fgets(buf_base, sizeof(buf_base), f_base_max);
+
+               /* close the file unconditionally */
+               fclose(f_base_max);
+               f_base_max = NULL;
+
+               FOPS_OR_NULL_GOTO(s_base_max, out);
+
+               buf_base[BUFSIZ-1] = '\0';
+               if (strlen(buf_base))
+                       /* Strip off terminating '\n' */
+                       strtok(buf_base, "\n");
+
+               base_max_ratio =
+                       strtoul(buf_base, NULL, POWER_CONVERT_TO_DECIMAL)
+                               / BUS_FREQ;
+       }
 
+       snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ,
+                       pi->lcore_id);
        f_min = fopen(fullpath_min, "rw+");
        FOPEN_OR_ERR_RET(f_min, -1);
 
        snprintf(fullpath_max, sizeof(fullpath_max), POWER_SYSFILE_MAX_FREQ,
                        pi->lcore_id);
-
        f_max = fopen(fullpath_max, "rw+");
        if (f_max == NULL)
                fclose(f_min);
-
        FOPEN_OR_ERR_RET(f_max, -1);
 
        pi->f_cur_min = f_min;
@@ -212,6 +240,17 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
 
        pi->non_turbo_max_ratio = max_non_turbo;
 
+       /*
+        * If base_frequency is reported as greater than the maximum
+        * turbo frequency, that's a known issue with some kernels.
+        * Set base_frequency to max_non_turbo as a workaround.
+        */
+       if (base_ratio > base_max_ratio) {
+               /* base_ratio is greater than max turbo. Kernel bug. */
+               pi->priority_core = 0;
+               goto out;
+       }
+
        /*
         * If base_frequency is reported as greater than the maximum
         * non-turbo frequency, then mark it as a high priority core.
@@ -347,6 +386,9 @@ power_set_governor_performance(struct pstate_power_info *pi)
        /* Strip off terminating '\n' */
        strtok(buf, "\n");
 
+       /* Save the original governor */
+       rte_strscpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
+
        /* Check if current governor is performance */
        if (strncmp(buf, POWER_GOVERNOR_PERF,
                        sizeof(POWER_GOVERNOR_PERF)) == 0) {
@@ -355,8 +397,6 @@ power_set_governor_performance(struct pstate_power_info *pi)
                                "already performance\n", pi->lcore_id);
                goto out;
        }
-       /* Save the original governor */
-       strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
 
        /* Write 'performance' to the governor */
        val = fseek(f, 0, SEEK_SET);
@@ -531,10 +571,68 @@ out:
        return ret;
 }
 
+static int
+power_get_cur_idx(struct pstate_power_info *pi)
+{
+       FILE *f_cur;
+       int ret = -1;
+       char *p_cur;
+       char buf_cur[BUFSIZ];
+       char fullpath_cur[PATH_MAX];
+       char *s_cur;
+       uint32_t sys_cur_freq = 0;
+       unsigned int i;
+
+       snprintf(fullpath_cur, sizeof(fullpath_cur),
+                       POWER_SYSFILE_CUR_FREQ,
+                       pi->lcore_id);
+       f_cur = fopen(fullpath_cur, "r");
+       FOPEN_OR_ERR_RET(f_cur, ret);
+
+       /* initialize the cur_idx to matching current frequency freq index */
+       s_cur = fgets(buf_cur, sizeof(buf_cur), f_cur);
+       FOPS_OR_NULL_GOTO(s_cur, fail);
+
+       p_cur = strchr(buf_cur, '\n');
+       if (p_cur != NULL)
+               *p_cur = 0;
+       sys_cur_freq = strtoul(buf_cur, &p_cur, POWER_CONVERT_TO_DECIMAL);
+
+       /* convert the frequency to nearest 100000 value
+        * Ex: if sys_cur_freq=1396789 then freq_conv=1400000
+        * Ex: if sys_cur_freq=800030 then freq_conv=800000
+        * Ex: if sys_cur_freq=800030 then freq_conv=800000
+        */
+       unsigned int freq_conv = 0;
+       freq_conv = (sys_cur_freq + FREQ_ROUNDING_DELTA)
+                               / ROUND_FREQ_TO_N_100000;
+       freq_conv = freq_conv * ROUND_FREQ_TO_N_100000;
+
+       for (i = 0; i < pi->nb_freqs; i++) {
+               if (freq_conv == pi->freqs[i]) {
+                       pi->curr_idx = i;
+                       break;
+               }
+       }
+
+       fclose(f_cur);
+       return 0;
+fail:
+       fclose(f_cur);
+       return ret;
+}
+
+int
+power_pstate_cpufreq_check_supported(void)
+{
+       return cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
+}
+
 int
 power_pstate_cpufreq_init(unsigned int lcore_id)
 {
        struct pstate_power_info *pi;
+       uint32_t exp_state;
 
        if (lcore_id >= RTE_MAX_LCORE) {
                RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
@@ -543,8 +641,16 @@ power_pstate_cpufreq_init(unsigned int lcore_id)
        }
 
        pi = &lcore_power_info[lcore_id];
-       if (rte_atomic32_cmpset(&(pi->state), POWER_IDLE, POWER_ONGOING)
-                       == 0) {
+       exp_state = POWER_IDLE;
+       /* The power in use state works as a guard variable between
+        * the CPU frequency control initialization and exit process.
+        * The ACQUIRE memory ordering here pairs with the RELEASE
+        * ordering below as lock to make sure the frequency operations
+        * in the critical section are done under the correct state.
+        */
+       if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
+                                       POWER_ONGOING, 0,
+                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                RTE_LOG(INFO, POWER, "Power management of lcore %u is "
                                "in use\n", lcore_id);
                return -1;
@@ -571,6 +677,11 @@ power_pstate_cpufreq_init(unsigned int lcore_id)
                goto fail;
        }
 
+       if (power_get_cur_idx(pi) < 0) {
+               RTE_LOG(ERR, POWER, "Cannot get current frequency "
+                               "index of lcore %u\n", lcore_id);
+               goto fail;
+       }
 
        /* Set freq to max by default */
        if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
@@ -581,12 +692,16 @@ power_pstate_cpufreq_init(unsigned int lcore_id)
 
        RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
                        "power management\n", lcore_id);
-       rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_USED);
+       exp_state = POWER_ONGOING;
+       __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_USED,
+                                   0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
 
        return 0;
 
 fail:
-       rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN);
+       exp_state = POWER_ONGOING;
+       __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
+                                   0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
 
        return -1;
 }
@@ -595,6 +710,7 @@ int
 power_pstate_cpufreq_exit(unsigned int lcore_id)
 {
        struct pstate_power_info *pi;
+       uint32_t exp_state;
 
        if (lcore_id >= RTE_MAX_LCORE) {
                RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
@@ -603,8 +719,16 @@ power_pstate_cpufreq_exit(unsigned int lcore_id)
        }
        pi = &lcore_power_info[lcore_id];
 
-       if (rte_atomic32_cmpset(&(pi->state), POWER_USED, POWER_ONGOING)
-                       == 0) {
+       exp_state = POWER_USED;
+       /* The power in use state works as a guard variable between
+        * the CPU frequency control initialization and exit process.
+        * The ACQUIRE memory ordering here pairs with the RELEASE
+        * ordering below as lock to make sure the frequency operations
+        * in the critical section are under done the correct state.
+        */
+       if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
+                                       POWER_ONGOING, 0,
+                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                RTE_LOG(INFO, POWER, "Power management of lcore %u is "
                                "not used\n", lcore_id);
                return -1;
@@ -626,12 +750,16 @@ power_pstate_cpufreq_exit(unsigned int lcore_id)
        RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
                        "'performance' mode and been set back to the "
                        "original\n", lcore_id);
-       rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_IDLE);
+       exp_state = POWER_ONGOING;
+       __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_IDLE,
+                                   0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
 
        return 0;
 
 fail:
-       rte_atomic32_cmpset(&(pi->state), POWER_ONGOING, POWER_UNKNOWN);
+       exp_state = POWER_ONGOING;
+       __atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
+                                   0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
 
        return -1;
 }