Message ID | 20171201180157.18937-2-brendan.jackman@arm.com |
---|---|
State | New |
Headers | show |
Series | [v2,1/2] sched: force update of blocked load of idle cpus | expand |
On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: > @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void) > if (ilb_cpu >= nr_cpu_ids) > return; > > - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) > + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) { > + if (!only_update) { > + /* > + * There's a pending/ongoing nohz kick/balance. If it's > + * just for stats, convert it to a proper load balance. > + */ > + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); > + } > return; > + } > + > + if (only_update) > + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); > + > /* > * Use smp_send_reschedule() instead of resched_cpu(). > * This way we generate a sched IPI on the target cpu which This looks racy.. if its not we don't need atomic ops, if it is but is still fine it needs a comment.
On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: > @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) > cpu_load_update_idle(rq); > rq_unlock_irq(rq, &rf); > > - rebalance_domains(rq, CPU_IDLE); > + update_blocked_averages(balance_cpu); > + /* > + * This idle load balance softirq may have been > + * triggered only to update the blocked load and shares > + * of idle CPUs (which we have just done for > + * balance_cpu). In that case skip the actual balance. > + */ > + if (!in_nohz_stats_kick(this_cpu)) > + rebalance_domains(rq, idle); > } > > if (time_after(next_balance, rq->next_balance)) { > @@ -9336,7 +9396,12 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) > * and abort nohz_idle_balance altogether if we pull some load. > */ > nohz_idle_balance(this_rq, idle); > - rebalance_domains(this_rq, idle); > + update_blocked_averages(this_rq->cpu); > + if (!in_nohz_stats_kick(this_rq->cpu)) > + rebalance_domains(this_rq, idle); > +#ifdef CONFIG_NO_HZ_COMMON > + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)); > +#endif > } > > /* You're doing the same thing to both (all) callsites of rebalance_domains(), does that not suggest doing it inside and leaving update_blocked_averages() where it is?
On 20 December 2017 at 15:03, Peter Zijlstra <peterz@infradead.org> wrote: > On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void) >> if (ilb_cpu >= nr_cpu_ids) >> return; >> >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) { >> + if (!only_update) { >> + /* >> + * There's a pending/ongoing nohz kick/balance. If it's >> + * just for stats, convert it to a proper load balance. >> + */ >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); >> + } >> return; >> + } >> + >> + if (only_update) >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); >> + >> /* >> * Use smp_send_reschedule() instead of resched_cpu(). >> * This way we generate a sched IPI on the target cpu which > > This looks racy.. if its not we don't need atomic ops, if it is but is > still fine it needs a comment. NOHZ_STATS_KICK modification is protected by test_and_set_bit(NOHZ_BALANCE_KICK. You're right that we don't need atomics ops and __set_bit() is enough
On 20 December 2017 at 15:09, Peter Zijlstra <peterz@infradead.org> wrote: > On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: > >> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) >> cpu_load_update_idle(rq); >> rq_unlock_irq(rq, &rf); >> >> - rebalance_domains(rq, CPU_IDLE); >> + update_blocked_averages(balance_cpu); >> + /* >> + * This idle load balance softirq may have been >> + * triggered only to update the blocked load and shares >> + * of idle CPUs (which we have just done for >> + * balance_cpu). In that case skip the actual balance. >> + */ >> + if (!in_nohz_stats_kick(this_cpu)) >> + rebalance_domains(rq, idle); >> } >> >> if (time_after(next_balance, rq->next_balance)) { > >> @@ -9336,7 +9396,12 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) >> * and abort nohz_idle_balance altogether if we pull some load. >> */ >> nohz_idle_balance(this_rq, idle); >> - rebalance_domains(this_rq, idle); >> + update_blocked_averages(this_rq->cpu); >> + if (!in_nohz_stats_kick(this_rq->cpu)) >> + rebalance_domains(this_rq, idle); >> +#ifdef CONFIG_NO_HZ_COMMON >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)); >> +#endif >> } >> >> /* > > You're doing the same thing to both (all) callsites of > rebalance_domains(), does that not suggest doing it inside and leaving > update_blocked_averages() where it is? The goal of moving update_blocked_averages() outside rebalance_domains is to not add a new parameter or use a special cpu_idle_type value in rebalance_domains parameters in order to abort the rebalance sequence just after updating blocked load > >
On Wed, Dec 20, 2017 at 03:23:01PM +0100, Vincent Guittot wrote: > On 20 December 2017 at 15:03, Peter Zijlstra <peterz@infradead.org> wrote: > > On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: > >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void) > >> if (ilb_cpu >= nr_cpu_ids) > >> return; > >> > >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) > >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) { > >> + if (!only_update) { > >> + /* > >> + * There's a pending/ongoing nohz kick/balance. If it's > >> + * just for stats, convert it to a proper load balance. > >> + */ > >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); > >> + } > >> return; > >> + } > >> + > >> + if (only_update) > >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); > >> + > >> /* > >> * Use smp_send_reschedule() instead of resched_cpu(). > >> * This way we generate a sched IPI on the target cpu which > > > > This looks racy.. if its not we don't need atomic ops, if it is but is > > still fine it needs a comment. > > NOHZ_STATS_KICK modification is protected by test_and_set_bit(NOHZ_BALANCE_KICK. > You're right that we don't need atomics ops and __set_bit() is enough Well, you shouldn't mix atomic and non-atomic ops to the same word, that's asking for trouble. But why don't you do something like: nohz_kick() flags = NOHZ_STAT; if (!only_update) flags |= NOHZ_BALANCE; atomic_long_or(flags, &nohz_cpu(cpu)); nohz_idle_balance() unsigned long do_flags = atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, &nohz_flags(cpu)); if (do_flags & NOHZ_STAT) update_blocked_stuff(); if (do_flags & NOHZ_BALANCE) rebalance_domains(); That way its far more readable.
On Wed, Dec 20, 2017 at 04:01:10PM +0100, Peter Zijlstra wrote: > Well, you shouldn't mix atomic and non-atomic ops to the same word, > that's asking for trouble. > > But why don't you do something like: > > nohz_kick() > > flags = NOHZ_STAT; > if (!only_update) > flags |= NOHZ_BALANCE; > > atomic_long_or(flags, &nohz_cpu(cpu)); > > > nohz_idle_balance() > > unsigned long do_flags = atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, &nohz_flags(cpu)); > > if (do_flags & NOHZ_STAT) > update_blocked_stuff(); > > if (do_flags & NOHZ_BALANCE) > rebalance_domains(); > > That way its far more readable. we could use atomic_t too, there's not that many flags in there, the only reason its long is because of that bitmap crud.
Le Wednesday 20 Dec 2017 à 16:01:10 (+0100), Peter Zijlstra a écrit : > On Wed, Dec 20, 2017 at 03:23:01PM +0100, Vincent Guittot wrote: > > On 20 December 2017 at 15:03, Peter Zijlstra <peterz@infradead.org> wrote: > > > On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: > > >> @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void) > > >> if (ilb_cpu >= nr_cpu_ids) > > >> return; > > >> > > >> - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) > > >> + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) { > > >> + if (!only_update) { > > >> + /* > > >> + * There's a pending/ongoing nohz kick/balance. If it's > > >> + * just for stats, convert it to a proper load balance. > > >> + */ > > >> + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); > > >> + } > > >> return; > > >> + } > > >> + > > >> + if (only_update) > > >> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); > > >> + > > >> /* > > >> * Use smp_send_reschedule() instead of resched_cpu(). > > >> * This way we generate a sched IPI on the target cpu which > > > > > > This looks racy.. if its not we don't need atomic ops, if it is but is > > > still fine it needs a comment. > > > > NOHZ_STATS_KICK modification is protected by test_and_set_bit(NOHZ_BALANCE_KICK. > > You're right that we don't need atomics ops and __set_bit() is enough > > Well, you shouldn't mix atomic and non-atomic ops to the same word, > that's asking for trouble. 
In fact, the atomic operation is needed, I forgot that set/clear of NOHZ_TICK_STOPPED can happen simultaneously on the ilb_cpu so we must keep set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); As comment we can add: /* * Update of NOHZ_STATS_KICK itself is protected by test_and_set_bit but * clear/set of NOHZ_TICK_STOPPED can happen simultaneously so we need * atomic operation */ > > But why don't you do something like: > > nohz_kick() > > flags = NOHZ_STAT; > if (!only_update) > flags |= NOHZ_BALANCE; > > atomic_long_or(flags, &nohz_cpu(cpu)); > > > nohz_idle_balance() > > unsigned long do_flags = atomic_long_fetch_andnot(NOHZ_BALANCE|NOHZ_STAT, &nohz_flags(cpu)); > > if (do_flags & NOHZ_STAT) > update_blocked_stuff(); > > if (do_flags & NOHZ_BALANCE) > rebalance_domains(); > > That way its far more readable. >
On 20 December 2017 at 15:27, Vincent Guittot <vincent.guittot@linaro.org> wrote: > On 20 December 2017 at 15:09, Peter Zijlstra <peterz@infradead.org> wrote: >> On Fri, Dec 01, 2017 at 06:01:56PM +0000, Brendan Jackman wrote: >> >>> @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) >>> cpu_load_update_idle(rq); >>> rq_unlock_irq(rq, &rf); >>> >>> - rebalance_domains(rq, CPU_IDLE); >>> + update_blocked_averages(balance_cpu); >>> + /* >>> + * This idle load balance softirq may have been >>> + * triggered only to update the blocked load and shares >>> + * of idle CPUs (which we have just done for >>> + * balance_cpu). In that case skip the actual balance. >>> + */ >>> + if (!in_nohz_stats_kick(this_cpu)) >>> + rebalance_domains(rq, idle); >>> } >>> >>> if (time_after(next_balance, rq->next_balance)) { >> >>> @@ -9336,7 +9396,12 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) >>> * and abort nohz_idle_balance altogether if we pull some load. >>> */ >>> nohz_idle_balance(this_rq, idle); >>> - rebalance_domains(this_rq, idle); >>> + update_blocked_averages(this_rq->cpu); >>> + if (!in_nohz_stats_kick(this_rq->cpu)) >>> + rebalance_domains(this_rq, idle); >>> +#ifdef CONFIG_NO_HZ_COMMON >>> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)); >>> +#endif >>> } >>> >>> /* >> >> You're doing the same thing to both (all) callsites of >> rebalance_domains(), does that not suggest doing it inside and leaving >> update_blocked_averages() where it is? > > The goal of moving update_blocked_averages() outside rebalance_domains > is to not add a new parameter or use a special cpu_idle_type value in > rebalance_domains parameters in order to abort the rebalance sequence > just after updating blocked load Peter, Is the reason above reasonable or you prefer update_blocked_averages to stay in rebalance_domains ? > >> >>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4037e19bbca2..f83e8f0d4f06 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6258,6 +6258,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return min_cap * 1024 < task_util(p) * capacity_margin; } +static inline bool nohz_kick_needed(struct rq *rq, bool only_update); +static void nohz_balancer_kick(bool only_update); + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -6334,6 +6337,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } rcu_read_unlock(); +#ifdef CONFIG_NO_HZ_COMMON + if (nohz_kick_needed(cpu_rq(new_cpu), true)) + nohz_balancer_kick(true); +#endif + return new_cpu; } @@ -8927,6 +8935,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; unsigned long next_balance; /* in jiffy units */ + unsigned long next_update; /* in jiffy units */ } nohz ____cacheline_aligned; static inline int find_new_ilb(void) @@ -8944,7 +8953,7 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(bool only_update) { int ilb_cpu; @@ -8955,8 +8964,20 @@ static void nohz_balancer_kick(void) if (ilb_cpu >= nr_cpu_ids) return; - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) { + if (!only_update) { + /* + * There's a pending/ongoing nohz kick/balance. If it's + * just for stats, convert it to a proper load balance. + */ + clear_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); + } return; + } + + if (only_update) + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu)); + /* * Use smp_send_reschedule() instead of resched_cpu(). 
* This way we generate a sched IPI on the target cpu which @@ -9044,6 +9065,8 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } +#else +static inline void nohz_balancer_kick(bool only_update) {} #endif static DEFINE_SPINLOCK(balancing); @@ -9075,8 +9098,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) int need_serialize, need_decay = 0; u64 max_cost = 0; - update_blocked_averages(cpu); - rcu_read_lock(); for_each_domain(cpu, sd) { /* @@ -9167,6 +9188,13 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } #ifdef CONFIG_NO_HZ_COMMON + +/* True iff @cpu was kicked for nohz balance, but just to update blocked load */ +static inline bool in_nohz_stats_kick(int cpu) +{ + return test_bit(NOHZ_STATS_KICK, nohz_flags(cpu)); +} + /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. @@ -9175,6 +9203,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { int this_cpu = this_rq->cpu; struct rq *rq; + struct sched_domain *sd; int balance_cpu; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; @@ -9184,6 +9213,23 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; + /* + * This cpu is going to update the blocked load of idle CPUs either + * before doing a rebalancing or just to keep metrics up to date. we + * can safely update the next update timestamp + */ + rcu_read_lock(); + sd = rcu_dereference(this_rq->sd); + /* + * Check whether there is a sched_domain available for this cpu. + * The last other cpu can have been unplugged since the ILB has been + * triggered and the sched_domain can now be null. 
The idle balance + * sequence will quickly be aborted as there is no more idle CPUs + */ + if (sd) + nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD); + rcu_read_unlock(); + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -9210,7 +9256,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) cpu_load_update_idle(rq); rq_unlock_irq(rq, &rf); - rebalance_domains(rq, CPU_IDLE); + update_blocked_averages(balance_cpu); + /* + * This idle load balance softirq may have been + * triggered only to update the blocked load and shares + * of idle CPUs (which we have just done for + * balance_cpu). In that case skip the actual balance. + */ + if (!in_nohz_stats_kick(this_cpu)) + rebalance_domains(rq, idle); } if (time_after(next_balance, rq->next_balance)) { @@ -9241,7 +9295,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. 
*/ -static inline bool nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { unsigned long now = jiffies; struct sched_domain_shared *sds; @@ -9249,7 +9303,7 @@ static inline bool nohz_kick_needed(struct rq *rq) int nr_busy, i, cpu = rq->cpu; bool kick = false; - if (unlikely(rq->idle_balance)) + if (unlikely(rq->idle_balance) && !only_update) return false; /* @@ -9266,6 +9320,9 @@ static inline bool nohz_kick_needed(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return false; + if (only_update) + return time_after_eq(now, nohz.next_update); + if (time_before(now, nohz.next_balance)) return false; @@ -9313,8 +9370,11 @@ static inline bool nohz_kick_needed(struct rq *rq) rcu_read_unlock(); return kick; } + #else +static inline bool in_nohz_stats_kick(int cpu) { return false; } static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } +static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { return false; } #endif /* @@ -9336,7 +9396,12 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) * and abort nohz_idle_balance altogether if we pull some load. 
*/ nohz_idle_balance(this_rq, idle); - rebalance_domains(this_rq, idle); + update_blocked_averages(this_rq->cpu); + if (!in_nohz_stats_kick(this_rq->cpu)) + rebalance_domains(this_rq, idle); +#ifdef CONFIG_NO_HZ_COMMON + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)); +#endif } /* @@ -9351,8 +9416,8 @@ void trigger_load_balance(struct rq *rq) if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); + if (nohz_kick_needed(rq, false)) + nohz_balancer_kick(false); #endif } @@ -9927,6 +9992,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; + nohz.next_update = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif #endif /* SMP */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a212de..a7e48b6ee68c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2014,6 +2014,7 @@ extern void cfs_bandwidth_usage_dec(void); enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, + NOHZ_STATS_KICK }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)