/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>

#include <rte_atomic.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_per_lcore.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_spinlock.h>
#include <rte_random.h>
#include <rte_pause.h>

#include "rte_timer.h"

LIST_HEAD(rte_timer_list, rte_timer);

struct priv_timer {
        struct rte_timer pending_head;  /**< dummy timer instance to head up list */
        rte_spinlock_t list_lock;       /**< lock to protect list access */

        /** per-core variable that is true if a timer was updated on this
         *  core since the last reset of the variable */
        uint16_t updated;

        /** track the current depth of the skiplist */
        unsigned curr_skiplist_depth;

        unsigned prev_lcore;            /**< used for lcore round robin */

        /** running timer on this lcore now */
        struct rte_timer *running_tim;

#ifdef RTE_LIBRTE_TIMER_DEBUG
        /** per-lcore statistics */
        struct rte_timer_debug_stats stats;
#endif
} __rte_cache_aligned;

/** per-lcore private info for timers */
static struct priv_timer priv_timer[RTE_MAX_LCORE];
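
/*
 * Layout sketch (illustrative, not part of the original comments): each
 * lcore owns one skiplist of pending timers, ordered by expiry time and
 * headed by the dummy pending_head entry, e.g. at level 0:
 *
 *   priv_timer[lcore].pending_head -> tim_a -> tim_b -> tim_c -> NULL
 *
 * where tim_a, tim_b and tim_c are hypothetical timers with ascending
 * expire values. Higher skiplist levels link a subset of these nodes so
 * that timer_add() can find an insertion point without a full linear scan.
 */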

/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(name, n) do {                                  \
                unsigned __lcore_id = rte_lcore_id();                   \
                if (__lcore_id < RTE_MAX_LCORE)                         \
                        priv_timer[__lcore_id].stats.name += (n);       \
        } while (0)
#else
#define __TIMER_STAT_ADD(name, n) do {} while (0)
#endif
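
/*
 * Usage note (added for clarity): the statistics macro is invoked below as,
 * for example, __TIMER_STAT_ADD(reset, 1) or __TIMER_STAT_ADD(pending, -1).
 * When RTE_LIBRTE_TIMER_DEBUG is not defined it expands to an empty
 * statement, so the counters add no cost to normal builds.
 */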

/* Init the timer library. */
void
rte_timer_subsystem_init(void)
{
        unsigned lcore_id;

        /* since priv_timer is static, it's zeroed by default, so only init
         * some fields.
         */
        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
                rte_spinlock_init(&priv_timer[lcore_id].list_lock);
                priv_timer[lcore_id].prev_lcore = lcore_id;
        }
}

/* Initialize the timer handle tim for use */
void
rte_timer_init(struct rte_timer *tim)
{
        union rte_timer_status status;

        status.state = RTE_TIMER_STOP;
        status.owner = RTE_TIMER_NO_OWNER;
        tim->status.u32 = status.u32;
}
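
/*
 * Example (illustrative sketch): a timer handle must be initialized once
 * before it is first reset or stopped, e.g.:
 *
 *   struct rte_timer my_tim;
 *   rte_timer_init(&my_tim);
 *
 * 'my_tim' is a hypothetical variable name; after the call the timer is in
 * the STOP state with no owner.
 */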

/*
 * if timer is pending or stopped (or running on the same core as us),
 * mark timer as configuring, and on success return the previous
 * status of the timer
 */
static int
timer_set_config_state(struct rte_timer *tim,
                       union rte_timer_status *ret_prev_status)
{
        union rte_timer_status prev_status, status;
        int success = 0;
        unsigned lcore_id;

        lcore_id = rte_lcore_id();

        /* wait until the timer is in the correct state before the update,
         * and mark it as being configured */
        while (success == 0) {
                prev_status.u32 = tim->status.u32;

                /* timer is running on another core
                 * or ready to run on local core, exit
                 */
                if (prev_status.state == RTE_TIMER_RUNNING &&
                    (prev_status.owner != (uint16_t)lcore_id ||
                     tim != priv_timer[lcore_id].running_tim))
                        return -1;

                /* timer is being configured on another core */
                if (prev_status.state == RTE_TIMER_CONFIG)
                        return -1;

                /* here, we know that timer is stopped or pending,
                 * mark it atomically as being configured */
                status.state = RTE_TIMER_CONFIG;
                status.owner = (int16_t)lcore_id;
                success = rte_atomic32_cmpset(&tim->status.u32,
                                              prev_status.u32,
                                              status.u32);
        }

        ret_prev_status->u32 = prev_status.u32;
        return 0;
}
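
/*
 * State-machine summary (descriptive note derived from the functions in
 * this file): status.state takes one of four values. STOP and PENDING are
 * the stable states; CONFIG is held briefly while rte_timer_reset()/stop()
 * reconfigure a timer, and RUNNING is held while rte_timer_manage()
 * executes the callback. timer_set_config_state() above refuses to touch a
 * timer that another core currently holds in RUNNING or CONFIG, which is
 * what makes concurrent reset/stop/manage calls safe.
 */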

/*
 * if timer is pending, mark timer as running
 */
static int
timer_set_running_state(struct rte_timer *tim)
{
        union rte_timer_status prev_status, status;
        unsigned lcore_id = rte_lcore_id();
        int success = 0;

        /* wait until the timer is in the correct state before the update,
         * and mark it as running */
        while (success == 0) {
                prev_status.u32 = tim->status.u32;

                /* timer is not pending anymore */
                if (prev_status.state != RTE_TIMER_PENDING)
                        return -1;

                /* here, we know that timer is pending,
                 * mark it atomically as running */
                status.state = RTE_TIMER_RUNNING;
                status.owner = (int16_t)lcore_id;
                success = rte_atomic32_cmpset(&tim->status.u32,
                                              prev_status.u32,
                                              status.u32);
        }

        return 0;
}

/*
 * Return a skiplist level for a new entry.
 * This probabilistically gives a level with p=1/4 that an entry at level n
 * will also appear at level n+1.
 */
static uint32_t
timer_get_skiplist_level(unsigned curr_depth)
{
#ifdef RTE_LIBRTE_TIMER_DEBUG
        static uint32_t i, count = 0;
        static uint32_t levels[MAX_SKIPLIST_DEPTH] = {0};
#endif

        /* probability value is 1/4, i.e. all at level 0, 1 in 4 is at level 1,
         * 1 in 16 at level 2, 1 in 64 at level 3, etc. Calculated using lowest
         * bit position of a (pseudo)random number.
         */
        uint32_t rand = rte_rand() & (UINT32_MAX - 1);
        uint32_t level = rand == 0 ? MAX_SKIPLIST_DEPTH : (rte_bsf32(rand)-1) / 2;

        /* limit the levels used to one above our current level, so we don't,
         * for instance, have a level 0 and a level 7 without anything between
         */
        if (level > curr_depth)
                level = curr_depth;
        if (level >= MAX_SKIPLIST_DEPTH)
                level = MAX_SKIPLIST_DEPTH-1;

#ifdef RTE_LIBRTE_TIMER_DEBUG
        /* track the count of each level during debug */
        levels[level]++;
        count++;
        if (count % 10000 == 0)
                for (i = 0; i < MAX_SKIPLIST_DEPTH; i++)
                        printf("Level %u: %u\n", (unsigned)i, (unsigned)levels[i]);
#endif

        return level;
}
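
/*
 * Worked example (added for clarity): bit 0 of 'rand' is masked off, so the
 * lowest set bit found by rte_bsf32() is at some position b >= 1 and
 * level = (b - 1) / 2. Bit positions 1-2 map to level 0, 3-4 to level 1,
 * 5-6 to level 2, and so on, giving the intended 1/4 chance of promotion
 * from each level to the next.
 */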

/*
 * For a given time value, get the entries at each level which
 * are <= that time value.
 */
static void
timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
                       struct rte_timer **prev)
{
        unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
        prev[lvl] = &priv_timer[tim_lcore].pending_head;
        while (lvl != 0) {
                lvl--;
                prev[lvl] = prev[lvl+1];
                while (prev[lvl]->sl_next[lvl] &&
                       prev[lvl]->sl_next[lvl]->expire <= time_val)
                        prev[lvl] = prev[lvl]->sl_next[lvl];
        }
}

/*
 * Given a timer node in the skiplist, find the previous entries for it at
 * all skiplist levels.
 */
static void
timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
                                struct rte_timer **prev)
{
        int i;

        /* to get a specific entry in the list, look for an entry just lower
         * than the time value, then step forward on each level individually
         * as necessary
         */
        timer_get_prev_entries(tim->expire - 1, tim_lcore, prev);
        for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
                while (prev[i]->sl_next[i] != NULL &&
                       prev[i]->sl_next[i] != tim &&
                       prev[i]->sl_next[i]->expire <= tim->expire)
                        prev[i] = prev[i]->sl_next[i];
        }
}
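
/*
 * Illustration (assumed example): for a node with expire == 100, the call
 * above positions prev[] just before the first entry with expire >= 100 at
 * every level; the loop then steps over any other entries that share the
 * same expire value until prev[i]->sl_next[i] is either the node itself or
 * a later entry, so the caller can unlink the node at each level where it
 * is linked.
 */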

/*
 * add in list, lock if needed
 * timer must be in config state
 * timer must not be in a list
 */
static void
timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
{
        unsigned lcore_id = rte_lcore_id();
        unsigned lvl;
        struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];

        /* if timer needs to be scheduled on another core, we need to
         * lock the list; if it is on local core, we need to lock if
         * we are not called from rte_timer_manage() */
        if (tim_lcore != lcore_id || !local_is_locked)
                rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);

        /* find where exactly this element goes in the list of elements
         * for each depth. */
        timer_get_prev_entries(tim->expire, tim_lcore, prev);

        /* now assign it a new level and add at that level */
        const unsigned tim_level = timer_get_skiplist_level(
                        priv_timer[tim_lcore].curr_skiplist_depth);
        if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
                priv_timer[tim_lcore].curr_skiplist_depth++;

        lvl = tim_level;
        while (lvl > 0) {
                tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
                prev[lvl]->sl_next[lvl] = tim;
                lvl--;
        }
        tim->sl_next[0] = prev[0]->sl_next[0];
        prev[0]->sl_next[0] = tim;

        /* save the lowest list entry into the expire field of the dummy hdr
         * NOTE: this is not atomic on 32-bit */
        priv_timer[tim_lcore].pending_head.expire =
                        priv_timer[tim_lcore].pending_head.sl_next[0]->expire;

        if (tim_lcore != lcore_id || !local_is_locked)
                rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
}

/*
 * del from list, lock if needed
 * timer must be in config state
 * timer must be in a list
 */
static void
timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
          int local_is_locked)
{
        unsigned lcore_id = rte_lcore_id();
        unsigned prev_owner = prev_status.owner;
        int i;
        struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];

        /* if the timer is pending on another core, we need to lock the
         * list; if it is on local core, we need to lock if we are not
         * called from rte_timer_manage() */
        if (prev_owner != lcore_id || !local_is_locked)
                rte_spinlock_lock(&priv_timer[prev_owner].list_lock);

        /* save the lowest list entry into the expire field of the dummy hdr.
         * NOTE: this is not atomic on 32-bit */
        if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
                priv_timer[prev_owner].pending_head.expire =
                                ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);

        /* adjust pointers from previous entries to point past this */
        timer_get_prev_entries_for_node(tim, prev_owner, prev);
        for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
                if (prev[i]->sl_next[i] == tim)
                        prev[i]->sl_next[i] = tim->sl_next[i];
        }

        /* in case we deleted last entry at a level, adjust down max level */
        for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
                if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
                        priv_timer[prev_owner].curr_skiplist_depth--;
                else
                        break;

        if (prev_owner != lcore_id || !local_is_locked)
                rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
}

/* Reset and start the timer associated with the timer handle (private func) */
static int
__rte_timer_reset(struct rte_timer *tim, uint64_t expire,
                  uint64_t period, unsigned tim_lcore,
                  rte_timer_cb_t fct, void *arg,
                  int local_is_locked)
{
        union rte_timer_status prev_status, status;
        int ret;
        unsigned lcore_id = rte_lcore_id();

        /* round robin for tim_lcore */
        if (tim_lcore == (unsigned)LCORE_ID_ANY) {
                if (lcore_id < RTE_MAX_LCORE) {
                        /* EAL thread with valid lcore_id */
                        tim_lcore = rte_get_next_lcore(
                                priv_timer[lcore_id].prev_lcore,
                                0, 1);
                        priv_timer[lcore_id].prev_lcore = tim_lcore;
                } else
                        /* non-EAL threads do not run rte_timer_manage(),
                         * so schedule the timer on the first enabled lcore. */
                        tim_lcore = rte_get_next_lcore(LCORE_ID_ANY, 0, 1);
        }

        /* wait until the timer is in the correct state before the update,
         * and mark it as being configured */
        ret = timer_set_config_state(tim, &prev_status);
        if (ret < 0)
                return -1;

        __TIMER_STAT_ADD(reset, 1);
        if (prev_status.state == RTE_TIMER_RUNNING &&
            lcore_id < RTE_MAX_LCORE) {
                priv_timer[lcore_id].updated = 1;
        }

        /* remove it from list */
        if (prev_status.state == RTE_TIMER_PENDING) {
                timer_del(tim, prev_status, local_is_locked);
                __TIMER_STAT_ADD(pending, -1);
        }

        tim->period = period;
        tim->expire = expire;
        tim->f = fct;
        tim->arg = arg;

        __TIMER_STAT_ADD(pending, 1);
        timer_add(tim, tim_lcore, local_is_locked);

        /* update state: as we are in CONFIG state, only we can modify
         * the state, so we don't need to use cmpset() here */
        rte_wmb();
        status.state = RTE_TIMER_PENDING;
        status.owner = (int16_t)tim_lcore;
        tim->status.u32 = status.u32;

        return 0;
}

/* Reset and start the timer associated with the timer handle tim */
int
rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
                enum rte_timer_type type, unsigned tim_lcore,
                rte_timer_cb_t fct, void *arg)
{
        uint64_t cur_time = rte_get_timer_cycles();
        uint64_t period;

        if (unlikely((tim_lcore != (unsigned)LCORE_ID_ANY) &&
                        !(rte_lcore_is_enabled(tim_lcore) ||
                          rte_lcore_has_role(tim_lcore, ROLE_SERVICE))))
                return -1;

        if (type == PERIODICAL)
                period = ticks;
        else
                period = 0;

        return __rte_timer_reset(tim, cur_time + ticks, period, tim_lcore,
                        fct, arg, 0);
}
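
/*
 * Usage sketch (illustrative; 'my_cb' and 'my_tim' are hypothetical names):
 *
 *   static void my_cb(struct rte_timer *tim, void *arg) { ... }
 *
 *   struct rte_timer my_tim;
 *   rte_timer_init(&my_tim);
 *   rte_timer_reset(&my_tim, rte_get_timer_hz(), PERIODICAL,
 *                   rte_lcore_id(), my_cb, NULL);
 *
 * With rte_get_timer_hz() ticks the callback fires roughly once per second
 * on the calling lcore, provided that lcore calls rte_timer_manage()
 * regularly.
 */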

/* loop until rte_timer_reset() succeeds */
void
rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks,
                     enum rte_timer_type type, unsigned tim_lcore,
                     rte_timer_cb_t fct, void *arg)
{
        while (rte_timer_reset(tim, ticks, type, tim_lcore,
                               fct, arg) != 0)
                rte_pause();
}

/* Stop the timer associated with the timer handle tim */
int
rte_timer_stop(struct rte_timer *tim)
{
        union rte_timer_status prev_status, status;
        unsigned lcore_id = rte_lcore_id();
        int ret;

        /* wait until the timer is in the correct state before the update,
         * and mark it as being configured */
        ret = timer_set_config_state(tim, &prev_status);
        if (ret < 0)
                return -1;

        __TIMER_STAT_ADD(stop, 1);
        if (prev_status.state == RTE_TIMER_RUNNING &&
            lcore_id < RTE_MAX_LCORE) {
                priv_timer[lcore_id].updated = 1;
        }

        /* remove it from list */
        if (prev_status.state == RTE_TIMER_PENDING) {
                timer_del(tim, prev_status, 0);
                __TIMER_STAT_ADD(pending, -1);
        }

        /* mark timer as stopped */
        rte_wmb();
        status.state = RTE_TIMER_STOP;
        status.owner = RTE_TIMER_NO_OWNER;
        tim->status.u32 = status.u32;

        return 0;
}

/* loop until rte_timer_stop() succeeds */
void
rte_timer_stop_sync(struct rte_timer *tim)
{
        while (rte_timer_stop(tim) != 0)
                rte_pause();
}

/* Test the PENDING status of the timer handle tim */
int
rte_timer_pending(struct rte_timer *tim)
{
        return tim->status.state == RTE_TIMER_PENDING;
}

/* must be called periodically, run all timers that have expired */
void rte_timer_manage(void)
{
        union rte_timer_status status;
        struct rte_timer *tim, *next_tim;
        struct rte_timer *run_first_tim, **pprev;
        unsigned lcore_id = rte_lcore_id();
        struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
        uint64_t cur_time;
        int i, ret;

        /* timer manager only runs on EAL thread with valid lcore_id */
        assert(lcore_id < RTE_MAX_LCORE);

        __TIMER_STAT_ADD(manage, 1);
        /* optimize for the case where per-cpu list is empty */
        if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
                return;
        cur_time = rte_get_timer_cycles();

#ifdef RTE_ARCH_X86_64
        /* on 64-bit the value cached in pending_head.expire will be updated
         * atomically, so we can consult that for a quick check here outside
         * the lock */
        if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
                return;
#endif

        /* browse ordered list, add expired timers in 'expired' list */
        rte_spinlock_lock(&priv_timer[lcore_id].list_lock);

        /* if nothing to do just unlock and return */
        if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
            priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
                rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
                return;
        }

        /* save start of list of expired timers */
        tim = priv_timer[lcore_id].pending_head.sl_next[0];

        /* break the existing list at current time point */
        timer_get_prev_entries(cur_time, lcore_id, prev);
        for (i = priv_timer[lcore_id].curr_skiplist_depth - 1; i >= 0; i--) {
                if (prev[i] == &priv_timer[lcore_id].pending_head)
                        continue;
                priv_timer[lcore_id].pending_head.sl_next[i] =
                        prev[i]->sl_next[i];
                if (prev[i]->sl_next[i] == NULL)
                        priv_timer[lcore_id].curr_skiplist_depth--;
                prev[i]->sl_next[i] = NULL;
        }

        /* transition run-list from PENDING to RUNNING */
        run_first_tim = tim;
        pprev = &run_first_tim;

        for ( ; tim != NULL; tim = next_tim) {
                next_tim = tim->sl_next[0];

                ret = timer_set_running_state(tim);
                if (likely(ret == 0)) {
                        pprev = &tim->sl_next[0];
                } else {
                        /* another core is trying to re-config this one,
                         * remove it from local expired list
                         */
                        *pprev = next_tim;
                }
        }

        /* update the next to expire timer value */
        priv_timer[lcore_id].pending_head.expire =
                (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
                        priv_timer[lcore_id].pending_head.sl_next[0]->expire;

        rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);

        /* now scan expired list and call callbacks */
        for (tim = run_first_tim; tim != NULL; tim = next_tim) {
                next_tim = tim->sl_next[0];
                priv_timer[lcore_id].updated = 0;
                priv_timer[lcore_id].running_tim = tim;

                /* execute callback function with list unlocked */
                tim->f(tim, tim->arg);

                __TIMER_STAT_ADD(pending, -1);
                /* the timer was stopped or reloaded by the callback
                 * function, we have nothing to do here */
                if (priv_timer[lcore_id].updated == 1)
                        continue;

                if (tim->period == 0) {
                        /* remove from done list and mark timer as stopped */
                        status.state = RTE_TIMER_STOP;
                        status.owner = RTE_TIMER_NO_OWNER;
                        rte_wmb();
                        tim->status.u32 = status.u32;
                }
                else {
                        /* keep it in list and mark timer as pending */
                        rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
                        status.state = RTE_TIMER_PENDING;
                        __TIMER_STAT_ADD(pending, 1);
                        status.owner = (int16_t)lcore_id;
                        rte_wmb();
                        tim->status.u32 = status.u32;
                        __rte_timer_reset(tim, tim->expire + tim->period,
                                tim->period, lcore_id, tim->f, tim->arg, 1);
                        rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
                }
        }
        priv_timer[lcore_id].running_tim = NULL;
}
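
/*
 * Usage sketch (illustrative): every EAL lcore that owns timers is expected
 * to call rte_timer_manage() from its main loop, e.g.
 *
 *   while (!quit) {
 *           do_work();
 *           rte_timer_manage();
 *   }
 *
 * 'quit' and 'do_work()' are hypothetical; callback latency is bounded by
 * how often the loop comes back around to rte_timer_manage().
 */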

/* dump statistics about timers */
void rte_timer_dump_stats(FILE *f)
{
#ifdef RTE_LIBRTE_TIMER_DEBUG
        struct rte_timer_debug_stats sum;
        unsigned lcore_id;

        memset(&sum, 0, sizeof(sum));
        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
                sum.reset += priv_timer[lcore_id].stats.reset;
                sum.stop += priv_timer[lcore_id].stats.stop;
                sum.manage += priv_timer[lcore_id].stats.manage;
                sum.pending += priv_timer[lcore_id].stats.pending;
        }
        fprintf(f, "Timer statistics:\n");
        fprintf(f, "  reset = %"PRIu64"\n", sum.reset);
        fprintf(f, "  stop = %"PRIu64"\n", sum.stop);
        fprintf(f, "  manage = %"PRIu64"\n", sum.manage);
        fprintf(f, "  pending = %"PRIu64"\n", sum.pending);
#else
        fprintf(f, "No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n");
#endif
}