Message ID | 20210428232821.2506201-2-swood@redhat.com |
---|---|
State | New |
Series | newidle_balance() PREEMPT_RT latency mitigations |
On Thu, 29 Apr 2021 at 01:28, Scott Wood <swood@redhat.com> wrote:
>
> This is required in order to be able to enable interrupts in the next
> patch. This is limited to PREEMPT_RT to avoid adding potentially
> measurable overhead to the non-RT case (requiring a double switch when
> pulling a task onto a newly idle cpu).

IIUC, only the newidle_balance is a problem and not the idle load
balance that runs softirq. In this case, why not skipping
newidle_balance entirely in case of preempt_rt and kick an idle load
balance instead as you switch to idle thread context anyway

>
> update_misfit_status() is factored out for the PREEMPT_RT case, to ensure
> that the misfit status is kept consistent before dropping the lock.
>
> Signed-off-by: Scott Wood <swood@redhat.com>
> ---
> v2: Use a balance callback, and limit to PREEMPT_RT
>
>  kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
>  1 file changed, 30 insertions(+), 4 deletions(-)
>
> [...]
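For context, the "idle load balance that runs softirq" mentioned here is the SCHED_SOFTIRQ path whose handler is registered in kernel/sched/fair.c. A simplified excerpt of that registration is shown below (the NOHZ bookkeeping done in the same function is omitted):

```c
/*
 * Simplified excerpt of init_sched_fair_class() in kernel/sched/fair.c,
 * shown only to illustrate which path Vincent is referring to:
 * run_rebalance_domains() executes in softirq context, not in the middle
 * of pick_next_task() with the newly idle CPU's rq lock held.
 */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif
}
```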
On 05/05/21 14:13, Vincent Guittot wrote:
> On Thu, 29 Apr 2021 at 01:28, Scott Wood <swood@redhat.com> wrote:
>>
>> This is required in order to be able to enable interrupts in the next
>> patch. This is limited to PREEMPT_RT to avoid adding potentially
>> measurable overhead to the non-RT case (requiring a double switch when
>> pulling a task onto a newly idle cpu).
>
> IIUC, only the newidle_balance is a problem and not the idle load
> balance that runs softirq. In this case, why not skipping
> newidle_balance entirely in case of preempt_rt and kick an idle load
> balance instead as you switch to idle thread context anyway
>

So if I follow you, that would be along the lines of having PREEMPT_RT
turn newidle_balance() into:

  rq->idle_balance = CPU_IDLE;
  rq->next_balance = jiffies;
  trigger_load_balance(rq);

which I'm thinking isn't too crazy.
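Spelled out a little further, that alternative could look something like the sketch below. This is only an illustration, not a posted or tested patch: it reuses do_newidle_balance() from the patch under review, writes rq->idle_balance as 1 rather than the CPU_IDLE enum used in the fragment above, and the return-value handling is an assumption.

```c
/* Illustrative sketch of the "skip newidle balance on PREEMPT_RT and
 * kick the softirq idle balance instead" idea discussed above. */
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
		this_rq->idle_balance = 1;		/* this CPU is about to go idle */
		this_rq->next_balance = jiffies;	/* make a periodic balance due now */
		trigger_load_balance(this_rq);		/* raise SCHED_SOFTIRQ / kick nohz */
		return 0;				/* nothing pulled; switch to idle */
	}

	return do_newidle_balance(this_rq, rf);
}
```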
```diff
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..ff369c38a5b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5660,6 +5660,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+static const bool newidle_balance_in_callback = IS_ENABLED(CONFIG_PREEMPT_RT);
+static DEFINE_PER_CPU(struct callback_head, rebalance_head);
+
 /* Working cpumask for: load_balance, load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
@@ -10549,7 +10552,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
  *     0 - failed, no new tasks
  *   > 0 - success, new (fair) tasks present
  */
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int do_newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
 	unsigned long next_balance = jiffies + HZ;
 	int this_cpu = this_rq->cpu;
@@ -10557,7 +10560,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
-	update_misfit_status(NULL, this_rq);
+	if (!newidle_balance_in_callback)
+		update_misfit_status(NULL, this_rq);
+
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 	 * measure the duration of idle_balance() as idle time.
@@ -10576,7 +10581,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	 * further scheduler activity on it and we're being very careful to
 	 * re-start the picking loop.
 	 */
-	rq_unpin_lock(this_rq, rf);
+	if (!newidle_balance_in_callback)
+		rq_unpin_lock(this_rq, rf);
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 	    !READ_ONCE(this_rq->rd->overload)) {
@@ -10655,11 +10661,31 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	if (pulled_task)
 		this_rq->idle_stamp = 0;
 
-	rq_repin_lock(this_rq, rf);
+	if (!newidle_balance_in_callback)
+		rq_repin_lock(this_rq, rf);
 
 	return pulled_task;
 }
 
+static void newidle_balance_cb(struct rq *this_rq)
+{
+	update_rq_clock(this_rq);
+	do_newidle_balance(this_rq, NULL);
+}
+
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+	if (newidle_balance_in_callback) {
+		update_misfit_status(NULL, this_rq);
+		queue_balance_callback(this_rq,
+				       &per_cpu(rebalance_head, this_rq->cpu),
+				       newidle_balance_cb);
+		return 0;
+	}
+
+	return do_newidle_balance(this_rq, rf);
+}
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
```
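For readers who have not run into balance callbacks before: the patch queues the real work with queue_balance_callback() and lets the scheduler core invoke it later, at a point where the callback may safely drop the rq lock (which is what allows the follow-up patch to enable interrupts). The standalone program below is a minimal model of that queue-then-drain pattern; all toy_* names are invented for illustration, and none of this is the kernel's implementation, which lives under kernel/sched/.

```c
#include <stdio.h>
#include <stddef.h>

/* Minimal stand-ins for the kernel structures involved. */
struct toy_rq;

struct callback_head {
	struct callback_head *next;
	void (*func)(struct toy_rq *rq);
};

struct toy_rq {
	int cpu;
	struct callback_head *balance_callback;	/* pending deferred work, LIFO */
};

/* In the kernel, queueing happens while the rq lock is held. */
static void toy_queue_balance_callback(struct toy_rq *rq,
				       struct callback_head *head,
				       void (*func)(struct toy_rq *rq))
{
	head->func = func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

/*
 * In the kernel, the queued callbacks are run by the scheduler core at a
 * point where it is safe for them to drop the rq lock, i.e. no longer in
 * the middle of picking the next task.
 */
static void toy_run_balance_callbacks(struct toy_rq *rq)
{
	struct callback_head *head = rq->balance_callback;

	rq->balance_callback = NULL;
	while (head) {
		struct callback_head *next = head->next;

		head->next = NULL;
		head->func(rq);		/* e.g. newidle_balance_cb() in the patch */
		head = next;
	}
}

/* Plays the role of newidle_balance_cb()/do_newidle_balance(). */
static void toy_newidle_balance_cb(struct toy_rq *rq)
{
	printf("cpu%d: running deferred newidle balance\n", rq->cpu);
}

int main(void)
{
	struct toy_rq rq = { .cpu = 0, .balance_callback = NULL };
	struct callback_head cb = { NULL, NULL };

	/* pick_next_task path: defer the balance instead of doing it inline. */
	toy_queue_balance_callback(&rq, &cb, toy_newidle_balance_cb);

	/* Later, once it is safe: drain whatever was queued. */
	toy_run_balance_callbacks(&rq);
	return 0;
}
```

The split also shows why the patch calls update_misfit_status() before queueing: anything that must still happen under the rq lock at the original call site cannot be pushed into the deferred callback.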
This is required in order to be able to enable interrupts in the next
patch. This is limited to PREEMPT_RT to avoid adding potentially
measurable overhead to the non-RT case (requiring a double switch when
pulling a task onto a newly idle cpu).

update_misfit_status() is factored out for the PREEMPT_RT case, to ensure
that the misfit status is kept consistent before dropping the lock.

Signed-off-by: Scott Wood <swood@redhat.com>
---
v2: Use a balance callback, and limit to PREEMPT_RT

 kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)