remove experimental tags from all symbol definitions
diff --git a/lib/librte_timer/rte_timer.c b/lib/librte_timer/rte_timer.c
index a944bee..eaeafd7 100644
--- a/lib/librte_timer/rte_timer.c
+++ b/lib/librte_timer/rte_timer.c
-/*-
- *   BSD LICENSE
- * 
- *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
- *   All rights reserved.
- * 
- *   Redistribution and use in source and binary forms, with or without 
- *   modification, are permitted provided that the following conditions 
- *   are met:
- * 
- *     * Redistributions of source code must retain the above copyright 
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright 
- *       notice, this list of conditions and the following disclaimer in 
- *       the documentation and/or other materials provided with the 
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its 
- *       contributors may be used to endorse or promote products derived 
- *       from this software without specific prior written permission.
- * 
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * 
- *  version: DPDK.L.1.2.3-3
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
  */
 
 #include <string.h>
 #include <stdio.h>
 #include <stdint.h>
-#include <sys/queue.h>
+#include <stdbool.h>
 #include <inttypes.h>
+#include <assert.h>
+#include <sys/queue.h>
 
 #include <rte_atomic.h>
 #include <rte_common.h>
 #include <rte_cycles.h>
 #include <rte_per_lcore.h>
 #include <rte_memory.h>
-#include <rte_memzone.h>
 #include <rte_launch.h>
-#include <rte_tailq.h>
 #include <rte_eal.h>
-#include <rte_per_lcore.h>
 #include <rte_lcore.h>
 #include <rte_branch_prediction.h>
 #include <rte_spinlock.h>
+#include <rte_random.h>
+#include <rte_pause.h>
+#include <rte_memzone.h>
+#include <rte_malloc.h>
+#include <rte_compat.h>
+#include <rte_errno.h>
 
 #include "rte_timer.h"
 
-LIST_HEAD(rte_timer_list, rte_timer);
-
+/**
+ * Per-lcore info for timers.
+ */
 struct priv_timer {
-       struct rte_timer_list pending;  /**< list of pending timers */
-       struct rte_timer_list expired;  /**< list of expired timers */
-       struct rte_timer_list done;     /**< list of done timers */
+       struct rte_timer pending_head;  /**< dummy timer instance to head up list */
        rte_spinlock_t list_lock;       /**< lock to protect list access */
 
        /** per-core variable that is true if a timer was updated on this
         *  core since last reset of the variable */
        int updated;
 
+       /** track the current depth of the skiplist */
+       unsigned curr_skiplist_depth;
+
        unsigned prev_lcore;              /**< used for lcore round robin */
 
+       /** running timer on this lcore now */
+       struct rte_timer *running_tim;
+
 #ifdef RTE_LIBRTE_TIMER_DEBUG
        /** per-lcore statistics */
        struct rte_timer_debug_stats stats;
 #endif
 } __rte_cache_aligned;
 
-/** per-lcore private info for timers */
-static struct priv_timer priv_timer[RTE_MAX_LCORE];
+#define FL_ALLOCATED   (1 << 0)
+struct rte_timer_data {
+       struct priv_timer priv_timer[RTE_MAX_LCORE];
+       uint8_t internal_flags;
+};
+
+#define RTE_MAX_DATA_ELS 64
+static struct rte_timer_data *rte_timer_data_arr;
+static const uint32_t default_data_id;
+static uint32_t rte_timer_subsystem_initialized;
+
+/* For maintaining older interfaces for a period */
+static struct rte_timer_data default_timer_data;
 
 /* when debug is enabled, store some statistics */
 #ifdef RTE_LIBRTE_TIMER_DEBUG
-#define __TIMER_STAT_ADD(name, n) do {                         \
-               unsigned __lcore_id = rte_lcore_id();           \
-               priv_timer[__lcore_id].stats.name += (n);       \
+#define __TIMER_STAT_ADD(priv_timer, name, n) do {                     \
+               unsigned __lcore_id = rte_lcore_id();                   \
+               if (__lcore_id < RTE_MAX_LCORE)                         \
+                       priv_timer[__lcore_id].stats.name += (n);       \
        } while(0)
 #else
-#define __TIMER_STAT_ADD(name, n) do {} while(0)
+#define __TIMER_STAT_ADD(priv_timer, name, n) do {} while (0)
 #endif
 
-/* this macro allow to modify var while browsing the list */
-#define LIST_FOREACH_SAFE(var, var2, head, field)                     \
-       for ((var) = ((head)->lh_first),                               \
-                    (var2) = ((var) ? ((var)->field.le_next) : NULL); \
-            (var);                                                    \
-            (var) = (var2),                                           \
-                    (var2) = ((var) ? ((var)->field.le_next) : NULL))
+static inline int
+timer_data_valid(uint32_t id)
+{
+       return !!(rte_timer_data_arr[id].internal_flags & FL_ALLOCATED);
+}
+
+/* validate ID and retrieve timer data pointer, or return error value */
+#define TIMER_DATA_VALID_GET_OR_ERR_RET(id, timer_data, retval) do {   \
+       if (id >= RTE_MAX_DATA_ELS || !timer_data_valid(id))            \
+               return retval;                                          \
+       timer_data = &rte_timer_data_arr[id];                           \
+} while (0)
 
+int
+rte_timer_data_alloc(uint32_t *id_ptr)
+{
+       int i;
+       struct rte_timer_data *data;
+
+       if (!rte_timer_subsystem_initialized)
+               return -ENOMEM;
+
+       for (i = 0; i < RTE_MAX_DATA_ELS; i++) {
+               data = &rte_timer_data_arr[i];
+               if (!(data->internal_flags & FL_ALLOCATED)) {
+                       data->internal_flags |= FL_ALLOCATED;
+
+                       if (id_ptr)
+                               *id_ptr = i;
+
+                       return 0;
+               }
+       }
+
+       return -ENOSPC;
+}
+
+int
+rte_timer_data_dealloc(uint32_t id)
+{
+       struct rte_timer_data *timer_data;
+       TIMER_DATA_VALID_GET_OR_ERR_RET(id, timer_data, -EINVAL);
+
+       timer_data->internal_flags &= ~(FL_ALLOCATED);
+
+       return 0;
+}
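
For context, the allocation API added here is meant to be driven like the
following minimal sketch (not part of the patch; the callback name and error
handling are illustrative): an application reserves a timer-data instance
once, then passes the returned id to the rte_timer_alt_*() calls.

    static void app_cb(struct rte_timer *tim, void *arg) { /* app work */ }

    uint32_t tim_data_id;
    struct rte_timer tim;

    if (rte_timer_data_alloc(&tim_data_id) < 0)
            rte_exit(EXIT_FAILURE, "no free timer data instance\n");

    rte_timer_init(&tim);
    /* arm a periodic 1s timer on the calling lcore, against the
     * freshly allocated instance */
    rte_timer_alt_reset(tim_data_id, &tim, rte_get_timer_hz(), PERIODICAL,
                        rte_lcore_id(), app_cb, NULL);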
 
-/* Init the timer library. */
 void
-rte_timer_subsystem_init(void)
+rte_timer_subsystem_init_v20(void)
 {
        unsigned lcore_id;
+       struct priv_timer *priv_timer = default_timer_data.priv_timer;
 
+       /* since priv_timer is static, it's zeroed by default, so only init some
+        * fields.
+        */
        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++) {
-               LIST_INIT(&priv_timer[lcore_id].pending);
-               LIST_INIT(&priv_timer[lcore_id].expired);
-               LIST_INIT(&priv_timer[lcore_id].done);
                rte_spinlock_init(&priv_timer[lcore_id].list_lock);
                priv_timer[lcore_id].prev_lcore = lcore_id;
        }
 }
+VERSION_SYMBOL(rte_timer_subsystem_init, _v20, 2.0);
+
+/* Init the timer library. Allocate an array of timer data structs in shared
+ * memory, and allocate the zeroth entry for use with original timer
+ * APIs. Since the intersection of the sets of lcore ids in primary and
+ * secondary processes should be empty, the zeroth entry can be shared by
+ * multiple processes.
+ */
+int
+rte_timer_subsystem_init_v1905(void)
+{
+       const struct rte_memzone *mz;
+       struct rte_timer_data *data;
+       int i, lcore_id;
+       static const char *mz_name = "rte_timer_mz";
+       const size_t data_arr_size =
+                               RTE_MAX_DATA_ELS * sizeof(*rte_timer_data_arr);
+       bool do_full_init = true;
+
+       if (rte_timer_subsystem_initialized)
+               return -EALREADY;
+
+reserve:
+       rte_errno = 0;
+       mz = rte_memzone_reserve_aligned(mz_name, data_arr_size, SOCKET_ID_ANY,
+                                        0, RTE_CACHE_LINE_SIZE);
+       if (mz == NULL) {
+               if (rte_errno == EEXIST) {
+                       mz = rte_memzone_lookup(mz_name);
+                       if (mz == NULL)
+                               goto reserve;
+
+                       do_full_init = false;
+               } else
+                       return -ENOMEM;
+       }
+
+       rte_timer_data_arr = mz->addr;
+
+       if (do_full_init) {
+               for (i = 0; i < RTE_MAX_DATA_ELS; i++) {
+                       data = &rte_timer_data_arr[i];
+
+                       for (lcore_id = 0; lcore_id < RTE_MAX_LCORE;
+                            lcore_id++) {
+                               rte_spinlock_init(
+                                       &data->priv_timer[lcore_id].list_lock);
+                               data->priv_timer[lcore_id].prev_lcore =
+                                       lcore_id;
+                       }
+               }
+       }
+
+       rte_timer_data_arr[default_data_id].internal_flags |= FL_ALLOCATED;
+
+       rte_timer_subsystem_initialized = 1;
+
+       return 0;
+}
+MAP_STATIC_SYMBOL(int rte_timer_subsystem_init(void),
+                 rte_timer_subsystem_init_v1905);
+BIND_DEFAULT_SYMBOL(rte_timer_subsystem_init, _v1905, 19.05);
+
+void
+rte_timer_subsystem_finalize(void)
+{
+       if (!rte_timer_subsystem_initialized)
+               return;
+
+       rte_timer_subsystem_initialized = 0;
+}
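
Under the 19.05 binding the init call returns a status code, so a typical
start-up sequence might look like this sketch (assumes an EAL application;
the error strings are illustrative):

    int ret = rte_eal_init(argc, argv);
    if (ret < 0)
            rte_exit(EXIT_FAILURE, "EAL init failed\n");

    /* reserves the shared "rte_timer_mz" memzone on first call */
    if (rte_timer_subsystem_init() < 0)
            rte_exit(EXIT_FAILURE, "timer subsystem init failed\n");

    /* ... run; lcores call rte_timer_manage() periodically ... */

    rte_timer_subsystem_finalize();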
 
 /* Initialize the timer handle tim for use */
 void
@@ -130,7 +231,8 @@ rte_timer_init(struct rte_timer *tim)
  */
 static int
 timer_set_config_state(struct rte_timer *tim,
-                      union rte_timer_status *ret_prev_status)
+                      union rte_timer_status *ret_prev_status,
+                      struct priv_timer *priv_timer)
 {
        union rte_timer_status prev_status, status;
        int success = 0;
@@ -139,21 +241,24 @@ timer_set_config_state(struct rte_timer *tim,
        lcore_id = rte_lcore_id();
 
        /* wait until the timer is in the correct status before update,
-        * and mark it as beeing configured */
+        * and mark it as being configured */
        while (success == 0) {
                prev_status.u32 = tim->status.u32;
 
-               /* timer is running on another core, exit */
+               /* timer is running on another core
+                * or ready to run on local core, exit
+                */
                if (prev_status.state == RTE_TIMER_RUNNING &&
-                   (unsigned)prev_status.owner != lcore_id)
+                   (prev_status.owner != (uint16_t)lcore_id ||
+                    tim != priv_timer[lcore_id].running_tim))
                        return -1;
 
-               /* timer is beeing configured on another core */
+               /* timer is being configured on another core */
                if (prev_status.state == RTE_TIMER_CONFIG)
                        return -1;
 
                /* here, we know that timer is stopped or pending,
-                * mark it atomically as beeing configured */
+                * mark it atomically as being configured */
                status.state = RTE_TIMER_CONFIG;
                status.owner = (int16_t)lcore_id;
                success = rte_atomic32_cmpset(&tim->status.u32,
@@ -185,7 +290,7 @@ timer_set_running_state(struct rte_timer *tim)
                        return -1;
 
                /* here, we know that timer is stopped or pending,
-                * mark it atomically as beeing configured */
+                * mark it atomically as being configured */
                status.state = RTE_TIMER_RUNNING;
                status.owner = (int16_t)lcore_id;
                success = rte_atomic32_cmpset(&tim->status.u32,
@@ -197,50 +302,119 @@ timer_set_running_state(struct rte_timer *tim)
 }
 
 /*
- * add in list, lock if needed
- * timer must be in config state
- * timer must not be in a list
+ * Return a skiplist level for a new entry.
+ * This probabilistically gives a level with p=1/4 that an entry at level n
+ * will also appear at level n+1.
  */
-static void
-timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
+static uint32_t
+timer_get_skiplist_level(unsigned curr_depth)
 {
-       uint64_t cur_time = rte_get_hpet_cycles();
-       unsigned lcore_id = rte_lcore_id();
-       struct rte_timer *t, *t_prev;
+#ifdef RTE_LIBRTE_TIMER_DEBUG
+       static uint32_t i, count = 0;
+       static uint32_t levels[MAX_SKIPLIST_DEPTH] = {0};
+#endif
 
-       /* if timer needs to be scheduled on another core, we need to
-        * lock the list; if it is on local core, we need to lock if
-        * we are not called from rte_timer_manage() */
-       if (tim_lcore != lcore_id || !local_is_locked)
-               rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
+       /* probability value is 1/4, i.e. all at level 0, 1 in 4 is at level 1,
+        * 1 in 16 at level 2, 1 in 64 at level 3, etc. Calculated using lowest
+        * bit position of a (pseudo)random number.
+        */
+       uint32_t rand = rte_rand() & (UINT32_MAX - 1);
+       uint32_t level = rand == 0 ? MAX_SKIPLIST_DEPTH : (rte_bsf32(rand)-1) / 2;
+
+       /* limit the levels used to one above our current level, so we don't,
+        * for instance, have a level 0 and a level 7 without anything between
+        */
+       if (level > curr_depth)
+               level = curr_depth;
+       if (level >= MAX_SKIPLIST_DEPTH)
+               level = MAX_SKIPLIST_DEPTH-1;
+#ifdef RTE_LIBRTE_TIMER_DEBUG
+       count++;
+       levels[level]++;
+       if (count % 10000 == 0)
+               for (i = 0; i < MAX_SKIPLIST_DEPTH; i++)
+                       printf("Level %u: %u\n", (unsigned)i, (unsigned)levels[i]);
+#endif
+       return level;
+}
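
The level distribution can be checked in isolation. The sketch below
reproduces the lowest-set-bit trick with __builtin_ctz() standing in for
rte_bsf32() and the clamp against the current list depth omitted; the
printed histogram should decay by roughly a factor of 4 per level.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define DEPTH 10        /* mirrors MAX_SKIPLIST_DEPTH */

    static uint32_t level_of(uint32_t r)
    {
            r &= UINT32_MAX - 1;                    /* clear bit 0 */
            uint32_t lvl = (r == 0) ? DEPTH : (__builtin_ctz(r) - 1) / 2;
            return lvl >= DEPTH ? DEPTH - 1 : lvl;  /* clamp like the real code */
    }

    int main(void)
    {
            unsigned hist[DEPTH] = {0};
            for (int i = 0; i < 1000000; i++)
                    hist[level_of(((uint32_t)rand() << 16) ^ (uint32_t)rand())]++;
            for (int l = 0; l < DEPTH; l++)
                    printf("level %d: %u\n", l, hist[l]);
            return 0;
    }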
 
-       t = LIST_FIRST(&priv_timer[tim_lcore].pending);
+/*
+ * For a given time value, get the entries at each level which
+ * are <= that time value.
+ */
+static void
+timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
+                      struct rte_timer **prev, struct priv_timer *priv_timer)
+{
+       unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
+       prev[lvl] = &priv_timer[tim_lcore].pending_head;
+       while (lvl != 0) {
+               lvl--;
+               prev[lvl] = prev[lvl+1];
+               while (prev[lvl]->sl_next[lvl] &&
+                               prev[lvl]->sl_next[lvl]->expire <= time_val)
+                       prev[lvl] = prev[lvl]->sl_next[lvl];
+       }
+}
 
-       /* list is empty or 'tim' will expire before 't' */
-       if (t == NULL || ((int64_t)(tim->expire - cur_time) <
-                         (int64_t)(t->expire - cur_time))) {
-               LIST_INSERT_HEAD(&priv_timer[tim_lcore].pending, tim, next);
+/*
+ * Given a timer node in the skiplist, find the previous entries for it at
+ * all skiplist levels.
+ */
+static void
+timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
+                               struct rte_timer **prev,
+                               struct priv_timer *priv_timer)
+{
+       int i;
+
+       /* to get a specific entry in the list, look for entries with expiry
+        * just lower than its time value, then step forward on each level
+        * individually as needed
+        */
+       timer_get_prev_entries(tim->expire - 1, tim_lcore, prev, priv_timer);
+       for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
+               while (prev[i]->sl_next[i] != NULL &&
+                               prev[i]->sl_next[i] != tim &&
+                               prev[i]->sl_next[i]->expire <= tim->expire)
+                       prev[i] = prev[i]->sl_next[i];
        }
-       else {
-               t_prev = t;
-
-               /* find an element that will expire after 'tim' */
-               LIST_FOREACH(t, &priv_timer[tim_lcore].pending, next) {
-                       if ((int64_t)(tim->expire - cur_time) <
-                           (int64_t)(t->expire - cur_time)) {
-                               LIST_INSERT_BEFORE(t, tim, next);
-                               break;
-                       }
-                       t_prev = t;
-               }
+}
 
-               /* not found, insert at the end of the list */
-               if (t == NULL)
-                       LIST_INSERT_AFTER(t_prev, tim, next);
+/* call with lock held as necessary
+ * add in list
+ * timer must be in config state
+ * timer must not be in a list
+ */
+static void
+timer_add(struct rte_timer *tim, unsigned int tim_lcore,
+         struct priv_timer *priv_timer)
+{
+       unsigned lvl;
+       struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
+
+       /* find where exactly this element goes in the list of elements
+        * for each depth. */
+       timer_get_prev_entries(tim->expire, tim_lcore, prev, priv_timer);
+
+       /* now assign it a new level and add at that level */
+       const unsigned tim_level = timer_get_skiplist_level(
+                       priv_timer[tim_lcore].curr_skiplist_depth);
+       if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
+               priv_timer[tim_lcore].curr_skiplist_depth++;
+
+       lvl = tim_level;
+       while (lvl > 0) {
+               tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
+               prev[lvl]->sl_next[lvl] = tim;
+               lvl--;
        }
+       tim->sl_next[0] = prev[0]->sl_next[0];
+       prev[0]->sl_next[0] = tim;
 
-       if (tim_lcore != lcore_id || !local_is_locked)
-               rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
+       /* save the lowest list entry into the expire field of the dummy hdr
+        * NOTE: this is not atomic on 32-bit */
+       priv_timer[tim_lcore].pending_head.expire =
+                       priv_timer[tim_lcore].pending_head.sl_next[0]->expire;
 }
 
 /*
@@ -249,9 +423,13 @@ timer_add(struct rte_timer *tim, unsigned tim_lcore, int local_is_locked)
  * timer must be in a list
  */
 static void
-timer_del(struct rte_timer *tim, unsigned prev_owner, int local_is_locked)
+timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
+         int local_is_locked, struct priv_timer *priv_timer)
 {
        unsigned lcore_id = rte_lcore_id();
+       unsigned prev_owner = prev_status.owner;
+       int i;
+       struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
 
        /* if the timer is pending on another core, we need to lock the
         * list; if it is on the local core, we need to lock if we are not
@@ -259,7 +437,25 @@ timer_del(struct rte_timer *tim, unsigned prev_owner, int local_is_locked)
        if (prev_owner != lcore_id || !local_is_locked)
                rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
 
-       LIST_REMOVE(tim, next);
+       /* save the lowest list entry into the expire field of the dummy hdr.
+        * NOTE: this is not atomic on 32-bit */
+       if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
+               priv_timer[prev_owner].pending_head.expire =
+                               ((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
+
+       /* adjust pointers from previous entries to point past this */
+       timer_get_prev_entries_for_node(tim, prev_owner, prev, priv_timer);
+       for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
+               if (prev[i]->sl_next[i] == tim)
+                       prev[i]->sl_next[i] = tim->sl_next[i];
+       }
+
+       /* in case we deleted last entry at a level, adjust down max level */
+       for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
+               if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
+                       priv_timer[prev_owner].curr_skiplist_depth--;
+               else
+                       break;
 
        if (prev_owner != lcore_id || !local_is_locked)
                rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
@@ -270,33 +466,44 @@ static int
 __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
                  uint64_t period, unsigned tim_lcore,
                  rte_timer_cb_t fct, void *arg,
-                 int local_is_locked)
+                 int local_is_locked,
+                 struct rte_timer_data *timer_data)
 {
        union rte_timer_status prev_status, status;
        int ret;
        unsigned lcore_id = rte_lcore_id();
+       struct priv_timer *priv_timer = timer_data->priv_timer;
 
        /* round robin for tim_lcore */
        if (tim_lcore == (unsigned)LCORE_ID_ANY) {
-               tim_lcore = rte_get_next_lcore(priv_timer[lcore_id].prev_lcore,
-                                              0, 1);
-               priv_timer[lcore_id].prev_lcore = tim_lcore;
+               if (lcore_id < RTE_MAX_LCORE) {
+                       /* EAL thread with valid lcore_id */
+                       tim_lcore = rte_get_next_lcore(
+                               priv_timer[lcore_id].prev_lcore,
+                               0, 1);
+                       priv_timer[lcore_id].prev_lcore = tim_lcore;
+               } else
+                       /* non-EAL threads do not run rte_timer_manage(),
+                        * so schedule the timer on the first enabled lcore. */
+                       tim_lcore = rte_get_next_lcore(LCORE_ID_ANY, 0, 1);
        }
 
        /* wait until the timer is in the correct status before update,
-        * and mark it as beeing configured */
-       ret = timer_set_config_state(tim, &prev_status);
+        * and mark it as being configured */
+       ret = timer_set_config_state(tim, &prev_status, priv_timer);
        if (ret < 0)
                return -1;
 
-       __TIMER_STAT_ADD(reset, 1);
-       priv_timer[lcore_id].updated = 1;
+       __TIMER_STAT_ADD(priv_timer, reset, 1);
+       if (prev_status.state == RTE_TIMER_RUNNING &&
+           lcore_id < RTE_MAX_LCORE) {
+               priv_timer[lcore_id].updated = 1;
+       }
 
        /* remove it from list */
-       if (prev_status.state == RTE_TIMER_PENDING ||
-           prev_status.state == RTE_TIMER_RUNNING) {
-               timer_del(tim, prev_status.owner, local_is_locked);
-               __TIMER_STAT_ADD(pending, -1);
+       if (prev_status.state == RTE_TIMER_PENDING) {
+               timer_del(tim, prev_status, local_is_locked, priv_timer);
+               __TIMER_STAT_ADD(priv_timer, pending, -1);
        }
 
        tim->period = period;
@@ -304,8 +511,15 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
        tim->f = fct;
        tim->arg = arg;
 
-       __TIMER_STAT_ADD(pending, 1);
-       timer_add(tim, tim_lcore, local_is_locked);
+       /* if timer needs to be scheduled on another core, we need to
+        * lock the destination list; if it is on local core, we need to lock if
+        * we are not called from rte_timer_manage()
+        */
+       if (tim_lcore != lcore_id || !local_is_locked)
+               rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
+
+       __TIMER_STAT_ADD(priv_timer, pending, 1);
+       timer_add(tim, tim_lcore, priv_timer);
 
        /* update state: as we are in CONFIG state, only us can modify
         * the state so we don't need to use cmpset() here */
@@ -314,20 +528,24 @@ __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
        status.owner = (int16_t)tim_lcore;
        tim->status.u32 = status.u32;
 
+       if (tim_lcore != lcore_id || !local_is_locked)
+               rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
+
        return 0;
 }
 
 /* Reset and start the timer associated with the timer handle tim */
 int
-rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
-               enum rte_timer_type type, unsigned tim_lcore,
-               rte_timer_cb_t fct, void *arg)
+rte_timer_reset_v20(struct rte_timer *tim, uint64_t ticks,
+                   enum rte_timer_type type, unsigned int tim_lcore,
+                   rte_timer_cb_t fct, void *arg)
 {
-       uint64_t cur_time = rte_get_hpet_cycles();
+       uint64_t cur_time = rte_get_timer_cycles();
        uint64_t period;
 
        if (unlikely((tim_lcore != (unsigned)LCORE_ID_ANY) &&
-                       !rte_lcore_is_enabled(tim_lcore)))
+                       !(rte_lcore_is_enabled(tim_lcore) ||
+                         rte_lcore_has_role(tim_lcore, ROLE_SERVICE))))
                return -1;
 
        if (type == PERIODICAL)
@@ -335,10 +553,44 @@ rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
        else
                period = 0;
 
-       __rte_timer_reset(tim,  cur_time + ticks, period, tim_lcore,
-                         fct, arg, 0);
+       return __rte_timer_reset(tim, cur_time + ticks, period, tim_lcore,
+                                fct, arg, 0, &default_timer_data);
+}
+VERSION_SYMBOL(rte_timer_reset, _v20, 2.0);
 
-       return 0;
+int
+rte_timer_reset_v1905(struct rte_timer *tim, uint64_t ticks,
+                     enum rte_timer_type type, unsigned int tim_lcore,
+                     rte_timer_cb_t fct, void *arg)
+{
+       return rte_timer_alt_reset(default_data_id, tim, ticks, type,
+                                  tim_lcore, fct, arg);
+}
+MAP_STATIC_SYMBOL(int rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
+                                     enum rte_timer_type type,
+                                     unsigned int tim_lcore,
+                                     rte_timer_cb_t fct, void *arg),
+                 rte_timer_reset_v1905);
+BIND_DEFAULT_SYMBOL(rte_timer_reset, _v1905, 19.05);
+
+int
+rte_timer_alt_reset(uint32_t timer_data_id, struct rte_timer *tim,
+                   uint64_t ticks, enum rte_timer_type type,
+                   unsigned int tim_lcore, rte_timer_cb_t fct, void *arg)
+{
+       uint64_t cur_time = rte_get_timer_cycles();
+       uint64_t period;
+       struct rte_timer_data *timer_data;
+
+       TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
+
+       if (type == PERIODICAL)
+               period = ticks;
+       else
+               period = 0;
+
+       return __rte_timer_reset(tim, cur_time + ticks, period, tim_lcore,
+                                fct, arg, 0, timer_data);
 }
 
 /* loop until rte_timer_reset() succeeds */
@@ -348,31 +600,35 @@ rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks,
                     rte_timer_cb_t fct, void *arg)
 {
        while (rte_timer_reset(tim, ticks, type, tim_lcore,
-                              fct, arg) != 0);
+                              fct, arg) != 0)
+               rte_pause();
 }
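
Classic usage stays unchanged by this rework: every lcore that owns timers
must still call rte_timer_manage() often enough. A minimal sketch, with
illustrative function names:

    static void
    hb_cb(struct rte_timer *tim __rte_unused, void *arg __rte_unused)
    {
            printf("heartbeat on lcore %u\n", rte_lcore_id());
    }

    static int
    lcore_loop(void *arg __rte_unused)
    {
            struct rte_timer hb;

            rte_timer_init(&hb);
            /* periodic 1s timer pinned to this lcore */
            rte_timer_reset_sync(&hb, rte_get_timer_hz(), PERIODICAL,
                                 rte_lcore_id(), hb_cb, NULL);
            for (;;)
                    rte_timer_manage();
            return 0;
    }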
 
-/* Stop the timer associated with the timer handle tim */
-int
-rte_timer_stop(struct rte_timer *tim)
+static int
+__rte_timer_stop(struct rte_timer *tim, int local_is_locked,
+                struct rte_timer_data *timer_data)
 {
        union rte_timer_status prev_status, status;
        unsigned lcore_id = rte_lcore_id();
        int ret;
+       struct priv_timer *priv_timer = timer_data->priv_timer;
 
        /* wait until the timer is in the correct status before update,
-        * and mark it as beeing configured */
-       ret = timer_set_config_state(tim, &prev_status);
+        * and mark it as being configured */
+       ret = timer_set_config_state(tim, &prev_status, priv_timer);
        if (ret < 0)
                return -1;
 
-       __TIMER_STAT_ADD(stop, 1);
-       priv_timer[lcore_id].updated = 1;
+       __TIMER_STAT_ADD(priv_timer, stop, 1);
+       if (prev_status.state == RTE_TIMER_RUNNING &&
+           lcore_id < RTE_MAX_LCORE) {
+               priv_timer[lcore_id].updated = 1;
+       }
 
        /* remove it from list */
-       if (prev_status.state == RTE_TIMER_PENDING ||
-           prev_status.state == RTE_TIMER_RUNNING) {
-               timer_del(tim, prev_status.owner, 0);
-               __TIMER_STAT_ADD(pending, -1);
+       if (prev_status.state == RTE_TIMER_PENDING) {
+               timer_del(tim, prev_status, local_is_locked, priv_timer);
+               __TIMER_STAT_ADD(priv_timer, pending, -1);
        }
 
        /* mark timer as stopped */
@@ -384,11 +640,39 @@ rte_timer_stop(struct rte_timer *tim)
        return 0;
 }
 
+/* Stop the timer associated with the timer handle tim */
+int
+rte_timer_stop_v20(struct rte_timer *tim)
+{
+       return __rte_timer_stop(tim, 0, &default_timer_data);
+}
+VERSION_SYMBOL(rte_timer_stop, _v20, 2.0);
+
+int
+rte_timer_stop_v1905(struct rte_timer *tim)
+{
+       return rte_timer_alt_stop(default_data_id, tim);
+}
+MAP_STATIC_SYMBOL(int rte_timer_stop(struct rte_timer *tim),
+                 rte_timer_stop_v1905);
+BIND_DEFAULT_SYMBOL(rte_timer_stop, _v1905, 19.05);
+
+int
+rte_timer_alt_stop(uint32_t timer_data_id, struct rte_timer *tim)
+{
+       struct rte_timer_data *timer_data;
+
+       TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
+
+       return __rte_timer_stop(tim, 0, timer_data);
+}
+
 /* loop until rte_timer_stop() succeeds */
 void
 rte_timer_stop_sync(struct rte_timer *tim)
 {
-       while (rte_timer_stop(tim) != 0);
+       while (rte_timer_stop(tim) != 0)
+               rte_pause();
 }
 
 /* Test the PENDING status of the timer handle tim */
@@ -399,49 +683,95 @@ rte_timer_pending(struct rte_timer *tim)
 }
 
 /* must be called periodically, run all timers that have expired */
-void rte_timer_manage(void)
+static void
+__rte_timer_manage(struct rte_timer_data *timer_data)
 {
        union rte_timer_status status;
-       struct rte_timer *tim, *tim2;
+       struct rte_timer *tim, *next_tim;
+       struct rte_timer *run_first_tim, **pprev;
        unsigned lcore_id = rte_lcore_id();
-       uint64_t cur_time = rte_get_hpet_cycles();
-       int ret;
-
-       __TIMER_STAT_ADD(manage, 1);
+       struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
+       uint64_t cur_time;
+       int i, ret;
+       struct priv_timer *priv_timer = timer_data->priv_timer;
+
+       /* timer manager only runs on EAL thread with valid lcore_id */
+       assert(lcore_id < RTE_MAX_LCORE);
+
+       __TIMER_STAT_ADD(priv_timer, manage, 1);
+       /* optimize for the case where per-cpu list is empty */
+       if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
+               return;
+       cur_time = rte_get_timer_cycles();
+
+#ifdef RTE_ARCH_64
+       /* on 64-bit the value cached in pending_head.expire will be
+        * updated atomically, so we can consult that for a quick check here
+        * outside the lock */
+       if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
+               return;
+#endif
 
        /* browse ordered list, add expired timers in 'expired' list */
        rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
 
-       LIST_FOREACH_SAFE(tim, tim2, &priv_timer[lcore_id].pending, next) {
-               if ((int64_t)(cur_time - tim->expire) < 0)
-                       break;
+       /* if nothing to do just unlock and return */
+       if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
+           priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
+               rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+               return;
+       }
+
+       /* save start of list of expired timers */
+       tim = priv_timer[lcore_id].pending_head.sl_next[0];
 
-               LIST_REMOVE(tim, next);
-               LIST_INSERT_HEAD(&priv_timer[lcore_id].expired, tim, next);
+       /* break the existing list at current time point */
+       timer_get_prev_entries(cur_time, lcore_id, prev, priv_timer);
+       for (i = priv_timer[lcore_id].curr_skiplist_depth - 1; i >= 0; i--) {
+               if (prev[i] == &priv_timer[lcore_id].pending_head)
+                       continue;
+               priv_timer[lcore_id].pending_head.sl_next[i] =
+                   prev[i]->sl_next[i];
+               if (prev[i]->sl_next[i] == NULL)
+                       priv_timer[lcore_id].curr_skiplist_depth--;
+               prev[i]->sl_next[i] = NULL;
        }
 
+       /* transition run-list from PENDING to RUNNING */
+       run_first_tim = tim;
+       pprev = &run_first_tim;
 
-       /* for each timer of 'expired' list, check state and execute callback */
-       while ((tim = LIST_FIRST(&priv_timer[lcore_id].expired)) != NULL) {
-               ret = timer_set_running_state(tim);
+       for ( ; tim != NULL; tim = next_tim) {
+               next_tim = tim->sl_next[0];
 
-               /* remove from expired list, and add it in done list */
-               LIST_REMOVE(tim, next);
-               LIST_INSERT_HEAD(&priv_timer[lcore_id].done, tim, next);
+               ret = timer_set_running_state(tim);
+               if (likely(ret == 0)) {
+                       pprev = &tim->sl_next[0];
+               } else {
+                       /* another core is trying to re-config this one,
+                        * remove it from local expired list
+                        */
+                       *pprev = next_tim;
+               }
+       }
 
-               /* this timer was not pending, continue */
-               if (ret < 0)
-                       continue;
+       /* update the next to expire timer value */
+       priv_timer[lcore_id].pending_head.expire =
+           (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
+               priv_timer[lcore_id].pending_head.sl_next[0]->expire;
 
-               rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+       rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
 
+       /* now scan expired list and call callbacks */
+       for (tim = run_first_tim; tim != NULL; tim = next_tim) {
+               next_tim = tim->sl_next[0];
                priv_timer[lcore_id].updated = 0;
+               priv_timer[lcore_id].running_tim = tim;
 
                /* execute callback function with list unlocked */
                tim->f(tim, tim->arg);
 
-               rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
-
+               __TIMER_STAT_ADD(priv_timer, pending, -1);
                /* the timer was stopped or reloaded by the callback
                 * function, we have nothing to do here */
                if (priv_timer[lcore_id].updated == 1)
@@ -449,44 +779,267 @@ void rte_timer_manage(void)
 
                if (tim->period == 0) {
                        /* mark timer as stopped */
-                       LIST_REMOVE(tim, next);
-                       __TIMER_STAT_ADD(pending, -1);
                        status.state = RTE_TIMER_STOP;
                        status.owner = RTE_TIMER_NO_OWNER;
                        rte_wmb();
                        tim->status.u32 = status.u32;
                }
                else {
-                       /* keep it in done list and mark timer as pending */
+                       /* keep it in list and mark timer as pending */
+                       rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
                        status.state = RTE_TIMER_PENDING;
+                       __TIMER_STAT_ADD(priv_timer, pending, 1);
                        status.owner = (int16_t)lcore_id;
                        rte_wmb();
                        tim->status.u32 = status.u32;
+                       __rte_timer_reset(tim, tim->expire + tim->period,
+                               tim->period, lcore_id, tim->f, tim->arg, 1,
+                               timer_data);
+                       rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
                }
        }
+       priv_timer[lcore_id].running_tim = NULL;
+}
+
+void
+rte_timer_manage_v20(void)
+{
+       __rte_timer_manage(&default_timer_data);
+}
+VERSION_SYMBOL(rte_timer_manage, _v20, 2.0);
+
+int
+rte_timer_manage_v1905(void)
+{
+       struct rte_timer_data *timer_data;
+
+       TIMER_DATA_VALID_GET_OR_ERR_RET(default_data_id, timer_data, -EINVAL);
+
+       __rte_timer_manage(timer_data);
+
+       return 0;
+}
+MAP_STATIC_SYMBOL(int rte_timer_manage(void), rte_timer_manage_v1905);
+BIND_DEFAULT_SYMBOL(rte_timer_manage, _v1905, 19.05);
+
+int
+rte_timer_alt_manage(uint32_t timer_data_id,
+                    unsigned int *poll_lcores,
+                    int nb_poll_lcores,
+                    rte_timer_alt_manage_cb_t f)
+{
+       unsigned int default_poll_lcores[] = {rte_lcore_id()};
+       union rte_timer_status status;
+       struct rte_timer *tim, *next_tim, **pprev;
+       struct rte_timer *run_first_tims[RTE_MAX_LCORE];
+       unsigned int this_lcore = rte_lcore_id();
+       struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
+       uint64_t cur_time;
+       int i, j, ret;
+       int nb_runlists = 0;
+       struct rte_timer_data *data;
+       struct priv_timer *privp;
+       uint32_t poll_lcore;
+
+       TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, data, -EINVAL);
+
+       /* timer manager only runs on EAL thread with valid lcore_id */
+       assert(this_lcore < RTE_MAX_LCORE);
+
+       __TIMER_STAT_ADD(data->priv_timer, manage, 1);
+
+       if (poll_lcores == NULL) {
+               poll_lcores = default_poll_lcores;
+               nb_poll_lcores = RTE_DIM(default_poll_lcores);
+       }
+
+       for (i = 0; i < nb_poll_lcores; i++) {
+               poll_lcore = poll_lcores[i];
+               privp = &data->priv_timer[poll_lcore];
 
-       /* finally, browse done list, some timer may have to be
-        * rescheduled automatically */
-       LIST_FOREACH_SAFE(tim, tim2, &priv_timer[lcore_id].done, next) {
+               /* optimize for the case where per-cpu list is empty */
+               if (privp->pending_head.sl_next[0] == NULL)
+                       continue;
+               cur_time = rte_get_timer_cycles();
+
+#ifdef RTE_ARCH_64
+               /* on 64-bit the value cached in pending_head.expire will
+                * be updated atomically, so we can consult that for a quick
+                * check here outside the lock
+                */
+               if (likely(privp->pending_head.expire > cur_time))
+                       continue;
+#endif
+
+               /* browse ordered list, collect expired timers into run list */
+               rte_spinlock_lock(&privp->list_lock);
 
-               /* reset may fail if timer is beeing modified, in this
-                * case the timer will remain in 'done' list until the
-                * core that is modifying it remove it */
-               __rte_timer_reset(tim, cur_time + tim->period,
-                                 tim->period, lcore_id, tim->f,
-                                 tim->arg, 1);
+               /* if nothing to do just unlock and continue */
+               if (privp->pending_head.sl_next[0] == NULL ||
+                   privp->pending_head.sl_next[0]->expire > cur_time) {
+                       rte_spinlock_unlock(&privp->list_lock);
+                       continue;
+               }
+
+               /* save start of list of expired timers */
+               tim = privp->pending_head.sl_next[0];
+
+               /* break the existing list at current time point */
+               timer_get_prev_entries(cur_time, poll_lcore, prev,
+                                      data->priv_timer);
+               for (j = privp->curr_skiplist_depth - 1; j >= 0; j--) {
+                       if (prev[j] == &privp->pending_head)
+                               continue;
+                       privp->pending_head.sl_next[j] =
+                               prev[j]->sl_next[j];
+                       if (prev[j]->sl_next[j] == NULL)
+                               privp->curr_skiplist_depth--;
+
+                       prev[j]->sl_next[j] = NULL;
+               }
+
+               /* transition run-list from PENDING to RUNNING */
+               run_first_tims[nb_runlists] = tim;
+               pprev = &run_first_tims[nb_runlists];
+               nb_runlists++;
+
+               for ( ; tim != NULL; tim = next_tim) {
+                       next_tim = tim->sl_next[0];
+
+                       ret = timer_set_running_state(tim);
+                       if (likely(ret == 0)) {
+                               pprev = &tim->sl_next[0];
+                       } else {
+                               /* another core is trying to re-config this one,
+                                * remove it from local expired list
+                                */
+                               *pprev = next_tim;
+                       }
+               }
+
+               /* update the next to expire timer value */
+               privp->pending_head.expire =
+                   (privp->pending_head.sl_next[0] == NULL) ? 0 :
+                       privp->pending_head.sl_next[0]->expire;
+
+               rte_spinlock_unlock(&privp->list_lock);
        }
 
-       /* job finished, unlock the list lock */
-       rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
+       /* Now process the run lists */
+       while (1) {
+               bool done = true;
+               uint64_t min_expire = UINT64_MAX;
+               int min_idx = 0;
+
+               /* Find the next oldest timer to process */
+               for (i = 0; i < nb_runlists; i++) {
+                       tim = run_first_tims[i];
+
+                       if (tim != NULL && tim->expire < min_expire) {
+                               min_expire = tim->expire;
+                               min_idx = i;
+                               done = false;
+                       }
+               }
+
+               if (done)
+                       break;
+
+               tim = run_first_tims[min_idx];
+
+               /* Move down the runlist from which we picked a timer to
+                * execute
+                */
+               run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
+
+               data->priv_timer[this_lcore].updated = 0;
+               data->priv_timer[this_lcore].running_tim = tim;
+
+               /* Call the provided callback function */
+               f(tim);
+
+               __TIMER_STAT_ADD(data->priv_timer, pending, -1);
+
+               /* the timer was stopped or reloaded by the callback
+                * function, we have nothing to do here
+                */
+               if (data->priv_timer[this_lcore].updated == 1)
+                       continue;
+
+               if (tim->period == 0) {
+                       /* mark timer as stopped */
+                       status.state = RTE_TIMER_STOP;
+                       status.owner = RTE_TIMER_NO_OWNER;
+                       rte_wmb();
+                       tim->status.u32 = status.u32;
+               } else {
+                       /* keep it in list and mark timer as pending */
+                       rte_spinlock_lock(
+                               &data->priv_timer[this_lcore].list_lock);
+                       status.state = RTE_TIMER_PENDING;
+                       __TIMER_STAT_ADD(data->priv_timer, pending, 1);
+                       status.owner = (int16_t)this_lcore;
+                       rte_wmb();
+                       tim->status.u32 = status.u32;
+                       __rte_timer_reset(tim, tim->expire + tim->period,
+                               tim->period, this_lcore, tim->f, tim->arg, 1,
+                               data);
+                       rte_spinlock_unlock(
+                               &data->priv_timer[this_lcore].list_lock);
+               }
+
+               data->priv_timer[this_lcore].running_tim = NULL;
+       }
+
+       return 0;
+}
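
rte_timer_alt_manage() lets a single thread drain lists armed on several
lcores and decide itself what to do with each expired timer. A sketch
follows (the quit flag, tim_data_id, and lcore ids are illustrative); the
simplest callback just runs the timer's own handler, as rte_timer_manage()
would:

    static void
    expired_cb(struct rte_timer *tim)
    {
            tim->f(tim, tim->arg);
    }

    unsigned int poll_lcores[] = {1, 2, 3};

    while (!quit)
            rte_timer_alt_manage(tim_data_id, poll_lcores,
                                 RTE_DIM(poll_lcores), expired_cb);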
+
+/* Walk pending lists, stopping timers and calling user-specified function */
+int
+rte_timer_stop_all(uint32_t timer_data_id, unsigned int *walk_lcores,
+                  int nb_walk_lcores,
+                  rte_timer_stop_all_cb_t f, void *f_arg)
+{
+       int i;
+       struct priv_timer *priv_timer;
+       uint32_t walk_lcore;
+       struct rte_timer *tim, *next_tim;
+       struct rte_timer_data *timer_data;
+
+       TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
+
+       for (i = 0; i < nb_walk_lcores; i++) {
+               walk_lcore = walk_lcores[i];
+               priv_timer = &timer_data->priv_timer[walk_lcore];
+
+               rte_spinlock_lock(&priv_timer->list_lock);
+
+               for (tim = priv_timer->pending_head.sl_next[0];
+                    tim != NULL;
+                    tim = next_tim) {
+                       next_tim = tim->sl_next[0];
+
+                       /* Call timer_stop with lock held */
+                       __rte_timer_stop(tim, 1, timer_data);
+
+                       if (f)
+                               f(tim, f_arg);
+               }
+
+               rte_spinlock_unlock(&priv_timer->list_lock);
+       }
+
+       return 0;
 }
 
 /* dump statistics about timers */
-void rte_timer_dump_stats(void)
+static void
+__rte_timer_dump_stats(struct rte_timer_data *timer_data __rte_unused, FILE *f)
 {
 #ifdef RTE_LIBRTE_TIMER_DEBUG
        struct rte_timer_debug_stats sum;
        unsigned lcore_id;
+       struct priv_timer *priv_timer = timer_data->priv_timer;
 
        memset(&sum, 0, sizeof(sum));
        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
@@ -495,12 +1048,40 @@ void rte_timer_dump_stats(void)
                sum.manage += priv_timer[lcore_id].stats.manage;
                sum.pending += priv_timer[lcore_id].stats.pending;
        }
-       printf("Timer statistics:\n");
-       printf("  reset = %"PRIu64"\n", sum.reset);
-       printf("  stop = %"PRIu64"\n", sum.stop);
-       printf("  manage = %"PRIu64"\n", sum.manage);
-       printf("  pending = %"PRIu64"\n", sum.pending);
+       fprintf(f, "Timer statistics:\n");
+       fprintf(f, "  reset = %"PRIu64"\n", sum.reset);
+       fprintf(f, "  stop = %"PRIu64"\n", sum.stop);
+       fprintf(f, "  manage = %"PRIu64"\n", sum.manage);
+       fprintf(f, "  pending = %"PRIu64"\n", sum.pending);
 #else
-       printf("No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n");
+       fprintf(f, "No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n");
 #endif
 }
+
+void
+rte_timer_dump_stats_v20(FILE *f)
+{
+       __rte_timer_dump_stats(&default_timer_data, f);
+}
+VERSION_SYMBOL(rte_timer_dump_stats, _v20, 2.0);
+
+int
+rte_timer_dump_stats_v1905(FILE *f)
+{
+       return rte_timer_alt_dump_stats(default_data_id, f);
+}
+MAP_STATIC_SYMBOL(int rte_timer_dump_stats(FILE *f),
+                 rte_timer_dump_stats_v1905);
+BIND_DEFAULT_SYMBOL(rte_timer_dump_stats, _v1905, 19.05);
+
+int
+rte_timer_alt_dump_stats(uint32_t timer_data_id, FILE *f)
+{
+       struct rte_timer_data *timer_data;
+
+       TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
+
+       __rte_timer_dump_stats(timer_data, f);
+
+       return 0;
+}