@@ -78,6 +78,8 @@ struct tvec_root {
struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
+ struct list_head migration_list;
+ struct tvec_base *preferred_target;
unsigned long timer_jiffies;
unsigned long next_timer;
unsigned long active_timers;
@@ -795,10 +797,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
* We are trying to schedule the timer on the local CPU.
* However we can't change timer's base while it is running,
* otherwise del_timer_sync() can't detect that the timer's
- * handler yet has not finished. This also guarantees that
- * the timer is serialized wrt itself.
+ * handler yet has not finished.
+ *
+ * Move timer to migration_list which can be processed after all
+ * expired timers are serviced. This also guarantees that the
+ * timer is serialized wrt itself.
*/
- if (likely(base->running_timer != timer)) {
+ if (unlikely(base->running_timer == timer)) {
+ timer->expires = expires;
+ base->preferred_target = new_base;
+ list_add_tail(&timer->entry, &base->migration_list);
+ goto out_unlock;
+ } else {
/* See the comment in lock_timer_base() */
timer_set_base(timer, NULL);
spin_unlock(&base->lock);
@@ -1228,6 +1238,41 @@ static inline void __run_timers(struct tvec_base *base)
}
}
base->running_timer = NULL;
+
+ /*
+ * Process timers from migration list, as their handlers have finished
+ * now.
+ */
+ if (unlikely(!list_empty(&base->migration_list))) {
+ struct tvec_base *new_base = base->preferred_target;
+
+ if (!idle_cpu(base->cpu)) {
+ /* Local CPU isn't idle anymore */
+ new_base = base;
+ } else if (idle_cpu(new_base->cpu)) {
+ /* Re-evaluate base, target CPU has gone idle */
+ new_base = per_cpu(tvec_bases, get_nohz_timer_target(false));
+ }
+
+ do {
+ timer = list_first_entry(&base->migration_list,
+ struct timer_list, entry);
+
+ __list_del(timer->entry.prev, timer->entry.next);
+
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&base->lock);
+
+ spin_lock(&new_base->lock);
+ timer_set_base(timer, new_base);
+ internal_add_timer(new_base, timer);
+ spin_unlock(&new_base->lock);
+
+ spin_lock(&base->lock);
+ } while (!list_empty(&base->migration_list));
+ }
+
spin_unlock_irq(&base->lock);
}
@@ -1635,6 +1680,7 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu)
for (j = 0; j < TVR_SIZE; j++)
INIT_LIST_HEAD(base->tv1.vec + j);
+ INIT_LIST_HEAD(&base->migration_list);
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
}
Currently, when adding new timers, we try to migrate them to a non-idle core if the local core is idle. However, we don't do this for timers that are re-armed from their handler. These running timers can wake up an idle core when they could've been serviced by any non-idle core, thus potentially impacting power savings by preventing deep idling of cores. Migration of a running timer is race-prone in two ways: 1. Serialization of the timer with itself: If migrated, it is possible that timer may fire on the new base, before the timer handler has finished execution on the old base. 2. Deletion of timer with del_timer_sync(): del_timer_sync() deletes timer if the timer handler isn't running currently - this is checked by comparing timer against the running_timer of its base. If we migrate the timer to a new base, del_timer_sync() might delete it while its handler is still running on the old base because it will check against the running_timer of the new base and not find it running. We can fix these problems by deferring the re-queuing of the timer until its handler has finished. By moving these timers to a special migration list, we can process it after all expired timers are processed. To optimize finding a suitable new base for timers in the migration list, preserve the preferred_target base in __mod_timer(). Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- V1->V2: - Completely new approach and the correct one. - Re-queue timer only after its handler has finished. Tested on Exynos (Dual ARM Cortex-A9) board, with ubuntu. System was fairly idle and 'dmesg > /dev/null' was run in an infinite loop on one of the CPUs, to get it out of idle. It was seen multiple times that migration does happen and sometimes while processing migration-list, it is found that local CPU isn't idle anymore. And in such cases we end up adding timer on local CPU. kernel/time/timer.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-)