power: add ethdev power management
lib/librte_power/rte_power_pmd_mgmt.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>

#include "rte_power_pmd_mgmt.h"

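/*
 * Number of consecutive empty polls it takes before a queue's callback
 * starts invoking its power-saving action (sleep, pause or frequency
 * scale-down).
 */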
#define EMPTYPOLL_MAX  512

/* store some internal state */
static struct pmd_conf_data {
	/** what do we support? */
	struct rte_cpu_intrinsics intrinsics_support;
	/** pre-calculated tsc diff for 1us */
	uint64_t tsc_per_us;
	/** how many rte_pause() calls fit in a microsecond? */
	uint64_t pause_per_us;
} global_data;

/**
 * Possible power management states of an ethdev port.
 */
enum pmd_mgmt_state {
	/** Device power management is disabled. */
	PMD_MGMT_DISABLED = 0,
	/** Device power management is enabled. */
	PMD_MGMT_ENABLED
};

struct pmd_queue_cfg {
	volatile enum pmd_mgmt_state pwr_mgmt_state;
	/**< State of power management for this queue */
	enum rte_power_pmd_mgmt_type cb_mode;
	/**< Callback mode for this queue */
	const struct rte_eth_rxtx_callback *cur_cb;
	/**< Callback instance */
	volatile bool umwait_in_progress;
	/**< are we currently sleeping? */
	uint64_t empty_poll_stats;
	/**< Number of empty polls */
} __rte_cache_aligned;

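/* per-port, per-queue power management state */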
static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];

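/*
 * Calibrate time conversions: pre-compute the number of TSC cycles per
 * microsecond and, on CPUs without the TPAUSE instruction, estimate how
 * many rte_pause() calls add up to roughly one microsecond of waiting.
 */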
static void
calc_tsc(void)
{
	const uint64_t hz = rte_get_timer_hz();
	const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */

	global_data.tsc_per_us = tsc_per_us;

	/* only do this if we don't have tpause */
	if (!global_data.intrinsics_support.power_pause) {
		const uint64_t start = rte_rdtsc_precise();
		const uint32_t n_pauses = 10000;
		double us, us_per_pause;
		uint64_t end;
		unsigned int i;

		/* estimate the number of rte_pause() calls per us */
		for (i = 0; i < n_pauses; i++)
			rte_pause();

		end = rte_rdtsc_precise();
		us = (end - start) / (double)tsc_per_us;
		us_per_pause = us / n_pauses;

		global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
	}
}

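/*
 * RX callback for RTE_POWER_MGMT_TYPE_MONITOR: once the queue has seen
 * more than EMPTYPOLL_MAX consecutive empty polls, ask the driver for a
 * monitor condition and put the core into a hardware wait state until
 * the NIC writes to the monitored address or the sleep is woken up
 * externally.
 */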
static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
		void *addr __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
			struct rte_power_monitor_cond pmc;
			int ret;

			/*
			 * we might get a cancellation request while being
			 * inside the callback, in which case the wakeup
			 * wouldn't work because it would've arrived too early.
			 *
			 * to get around this, we notify the other thread that
			 * we're sleeping, so that it can spin until we're done.
			 * unsolicited wakeups are perfectly safe.
			 */
			q_conf->umwait_in_progress = true;

			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

			/* check if we need to cancel sleep */
			if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
				/* use monitoring condition to sleep */
				ret = rte_eth_get_monitor_addr(port_id, qidx,
						&pmc);
				if (ret == 0)
					rte_power_monitor(&pmc, -1ULL);
			}
			q_conf->umwait_in_progress = false;

			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
		}
	} else
		q_conf->empty_poll_stats = 0;

	return nb_rx;
}

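/*
 * RX callback for RTE_POWER_MGMT_TYPE_PAUSE: once the queue has seen more
 * than EMPTYPOLL_MAX consecutive empty polls, stall for roughly one
 * microsecond, using TPAUSE where available and a calibrated rte_pause()
 * loop otherwise.
 */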
static uint16_t
clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
		void *addr __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		/* sleep for 1 microsecond */
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
			/* use tpause if we have it */
			if (global_data.intrinsics_support.power_pause) {
				const uint64_t cur = rte_rdtsc();
				const uint64_t wait_tsc =
						cur + global_data.tsc_per_us;
				rte_power_pause(wait_tsc);
			} else {
				uint64_t i;
				for (i = 0; i < global_data.pause_per_us; i++)
					rte_pause();
			}
		}
	} else
		q_conf->empty_poll_stats = 0;

	return nb_rx;
}

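/*
 * RX callback for RTE_POWER_MGMT_TYPE_SCALE: scale the polling lcore's
 * frequency down to its minimum after EMPTYPOLL_MAX consecutive empty
 * polls, and back up to its maximum as soon as traffic reappears.
 */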
static uint16_t
clb_scale_freq(uint16_t port_id, uint16_t qidx,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
			/* scale down freq */
			rte_power_freq_min(rte_lcore_id());
	} else {
		q_conf->empty_poll_stats = 0;
		/* scale up freq */
		rte_power_freq_max(rte_lcore_id());
	}

	return nb_rx;
}

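/*
 * Illustrative usage sketch (the port, queue and lcore ids below are
 * hypothetical; the queue must be set up before management is enabled on
 * it, and must be polled from the given lcore afterwards):
 *
 *	int ret = rte_power_ethdev_pmgmt_queue_enable(rte_lcore_id(),
 *			port_id, queue_id, RTE_POWER_MGMT_TYPE_MONITOR);
 *	if (ret < 0)
 *		rte_exit(EXIT_FAILURE, "Cannot enable power management\n");
 */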
int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
	struct pmd_queue_cfg *queue_cfg;
	struct rte_eth_dev_info info;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
		ret = -EINVAL;
		goto end;
	}

	if (rte_eth_dev_info_get(port_id, &info) < 0) {
		ret = -EINVAL;
		goto end;
	}

	/* check if queue id is valid */
	if (queue_id >= info.nb_rx_queues) {
		ret = -EINVAL;
		goto end;
	}

	queue_cfg = &port_cfg[port_id][queue_id];

	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
		ret = -EINVAL;
		goto end;
	}

	/* we need this in various places */
	rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);

	switch (mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
	{
		struct rte_power_monitor_cond dummy;

		/* check if rte_power_monitor is supported */
		if (!global_data.intrinsics_support.power_monitor) {
			RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
			ret = -ENOTSUP;
			goto end;
		}

		/* check if the device supports the necessary PMD API */
		if (rte_eth_get_monitor_addr(port_id, queue_id,
				&dummy) == -ENOTSUP) {
			RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->umwait_in_progress = false;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* ensure we update our state before callback starts */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
				clb_umwait, NULL);
		break;
	}
	case RTE_POWER_MGMT_TYPE_SCALE:
	{
		enum power_management_env env;
		/* only PSTATE and ACPI modes are supported */
		if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
				!rte_power_check_env_supported(
					PM_ENV_PSTATE_CPUFREQ)) {
			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* ensure we can initialize the power library */
		if (rte_power_init(lcore_id)) {
			ret = -EINVAL;
			goto end;
		}
		/* ensure we initialized the correct env */
		env = rte_power_get_env();
		if (env != PM_ENV_ACPI_CPUFREQ &&
				env != PM_ENV_PSTATE_CPUFREQ) {
			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/*
		 * the fence is not strictly necessary here, as there is no
		 * sleep to cancel in this mode, but keep it for consistency
		 */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
				queue_id, clb_scale_freq, NULL);
		break;
	}
	case RTE_POWER_MGMT_TYPE_PAUSE:
		/* figure out various time-to-tsc conversions */
		if (global_data.tsc_per_us == 0)
			calc_tsc();

		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/*
		 * the fence is not strictly necessary here, as there is no
		 * sleep to cancel in this mode, but keep it for consistency
		 */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
				clb_pause, NULL);
		break;
	default:
		/* reject unknown modes instead of silently succeeding */
		ret = -EINVAL;
		goto end;
	}
	ret = 0;
end:
	return ret;
}

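/*
 * Disabling mirrors enabling: mark the queue as disabled first so the
 * callback stops going to sleep, wake up any in-progress hardware wait,
 * then remove the callback and undo any frequency scaling.
 */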
int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
		uint16_t port_id, uint16_t queue_id)
{
	struct pmd_queue_cfg *queue_cfg;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
		return -EINVAL;

	/* no need to check queue id as wrong queue id would not be enabled */
	queue_cfg = &port_cfg[port_id][queue_id];

	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
		return -EINVAL;

	/* stop any callbacks from progressing */
	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;

	/* ensure we update our state before continuing */
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

	switch (queue_cfg->cb_mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
	{
		bool exit = false;
		do {
			/*
			 * we may request cancellation while the other thread
			 * has just entered the callback but hasn't started
			 * sleeping yet, so keep waking it up until we know it's
			 * done sleeping.
			 */
			if (queue_cfg->umwait_in_progress)
				rte_power_monitor_wakeup(lcore_id);
			else
				exit = true;
		} while (!exit);
	}
	/* fall-through */
	case RTE_POWER_MGMT_TYPE_PAUSE:
		rte_eth_remove_rx_callback(port_id, queue_id,
				queue_cfg->cur_cb);
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		rte_power_freq_max(lcore_id);
		rte_eth_remove_rx_callback(port_id, queue_id,
				queue_cfg->cur_cb);
		rte_power_exit(lcore_id);
		break;
	}
	/*
	 * we don't free the RX callback here because it is unsafe to do so
	 * unless we know for a fact that all data plane threads have stopped.
	 */
	queue_cfg->cur_cb = NULL;

	return 0;
}