timer: fix race condition
author: Robert Sanford <rsanford@akamai.com>
Mon, 27 Jul 2015 22:46:06 +0000 (18:46 -0400)
committer: Thomas Monjalon <thomas.monjalon@6wind.com>
Mon, 3 Aug 2015 10:43:01 +0000 (12:43 +0200)
Eliminate a problematic race condition in rte_timer_manage() that can
lead to corruption of per-lcore pending-lists (implemented as
skip-lists). The race condition occurs when rte_timer_manage() expires
multiple timers on lcore A, while lcore B simultaneously invokes
rte_timer_reset() on one of the expiring timers (any one other than the
first).

Lcore A splits its pending-list, creating a local list of expired timers
linked through their sl_next[0] pointers, and sets the first expired
timer to the RUNNING state, all during one list-lock round trip.
Lcore A then unlocks the list-lock to run the first callback, and that
is when A and B can have different interpretations of the subsequent
expired timers' true state. Lcore B sees an expired timer still in the
PENDING state, atomically changes the timer to the CONFIG state, locks
lcore A's list-lock, and reinserts the timer into A's pending-list.
The two lcores try to use the same next-pointers to maintain both lists!

Our solution is to remove expired timers from the pending-list and try
to set them all to the RUNNING state in one atomic step, i.e.,
rte_timer_manage() should perform both of these actions while holding
the list-lock, without releasing it in between.

After splitting the pending-list at the current point in time and trying
to set all expired timers to the RUNNING state, we must put back into
the pending-list any timers that we failed to set to the RUNNING state,
all while still holding the list-lock. It is then safe to release the
lock and run the callback functions for all expired timers that remain
on our local run-list.

Signed-off-by: Robert Sanford <rsanford@akamai.com>
lib/librte_timer/rte_timer.c

index 8e9243a..3dcdab5 100644 (file)
@@ -504,6 +504,7 @@ void rte_timer_manage(void)
 {
        union rte_timer_status status;
        struct rte_timer *tim, *next_tim;
+       struct rte_timer *run_first_tim, **pprev;
        unsigned lcore_id = rte_lcore_id();
        struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
        uint64_t cur_time;
@@ -519,9 +520,9 @@ void rte_timer_manage(void)
        cur_time = rte_get_timer_cycles();
 
 #ifdef RTE_ARCH_X86_64
-       /* on 64-bit the value cached in the pending_head.expired will be updated
-        * atomically, so we can consult that for a quick check here outside the
-        * lock */
+       /* on 64-bit the value cached in the pending_head.expired will be
+        * updated atomically, so we can consult that for a quick check here
+        * outside the lock */
        if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
                return;
 #endif
@@ -531,8 +532,10 @@ void rte_timer_manage(void)
 
        /* if nothing to do just unlock and return */
        if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
-                       priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time)
-               goto done;
+           priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
+               rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+               return;
+       }
 
        /* save start of list of expired timers */
        tim = priv_timer[lcore_id].pending_head.sl_next[0];
@@ -540,30 +543,47 @@ void rte_timer_manage(void)
        /* break the existing list at current time point */
        timer_get_prev_entries(cur_time, lcore_id, prev);
        for (i = priv_timer[lcore_id].curr_skiplist_depth -1; i >= 0; i--) {
-               priv_timer[lcore_id].pending_head.sl_next[i] = prev[i]->sl_next[i];
+               priv_timer[lcore_id].pending_head.sl_next[i] =
+                   prev[i]->sl_next[i];
                if (prev[i]->sl_next[i] == NULL)
                        priv_timer[lcore_id].curr_skiplist_depth--;
                prev[i] ->sl_next[i] = NULL;
        }
 
-       /* now scan expired list and call callbacks */
+       /* transition run-list from PENDING to RUNNING */
+       run_first_tim = tim;
+       pprev = &run_first_tim;
+
        for ( ; tim != NULL; tim = next_tim) {
                next_tim = tim->sl_next[0];
 
                ret = timer_set_running_state(tim);
+               if (likely(ret == 0)) {
+                       pprev = &tim->sl_next[0];
+               } else {
+                       /* another core is trying to re-config this one,
+                        * remove it from local expired list and put it
+                        * back on the priv_timer[] skip list */
+                       *pprev = next_tim;
+                       timer_add(tim, lcore_id, 1);
+               }
+       }
 
-               /* this timer was not pending, continue */
-               if (ret < 0)
-                       continue;
+       /* update the next to expire timer value */
+       priv_timer[lcore_id].pending_head.expire =
+           (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
+               priv_timer[lcore_id].pending_head.sl_next[0]->expire;
 
-               rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+       rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
 
+       /* now scan expired list and call callbacks */
+       for (tim = run_first_tim; tim != NULL; tim = next_tim) {
+               next_tim = tim->sl_next[0];
                priv_timer[lcore_id].updated = 0;
 
                /* execute callback function with list unlocked */
                tim->f(tim, tim->arg);
 
-               rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
                __TIMER_STAT_ADD(pending, -1);
                /* the timer was stopped or reloaded by the callback
                 * function, we have nothing to do here */
@@ -579,23 +599,17 @@ void rte_timer_manage(void)
                }
                else {
                        /* keep it in list and mark timer as pending */
+                       rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
                        status.state = RTE_TIMER_PENDING;
                        __TIMER_STAT_ADD(pending, 1);
                        status.owner = (int16_t)lcore_id;
                        rte_wmb();
                        tim->status.u32 = status.u32;
                        __rte_timer_reset(tim, cur_time + tim->period,
-                                       tim->period, lcore_id, tim->f, tim->arg, 1);
+                               tim->period, lcore_id, tim->f, tim->arg, 1);
+                       rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
                }
        }
-
-       /* update the next to expire timer value */
-       priv_timer[lcore_id].pending_head.expire =
-                       (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
-                                       priv_timer[lcore_id].pending_head.sl_next[0]->expire;
-done:
-       /* job finished, unlock the list lock */
-       rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
 }
 
 /* dump statistics about timers */