Message ID: 20160204163049.GE29586@e106622-lin
State: New
On 04/02/16 12:31, Steven Rostedt wrote:
> On Thu, 4 Feb 2016 16:30:49 +0000
> Juri Lelli <juri.lelli@arm.com> wrote:
>
> > I've actually changed this approach a bit, and things seem better
> > here. Could you please give this a try? (You can also fetch the same
> > branch).
>
> It appears to fix the one issue I pointed out, but it doesn't fix the
> issue with cpusets.
>
>  # burn&
>  # TASK=$!
>  # schedtool -E -t 2000000:20000000 $TASK
>  # grep dl /proc/sched_debug
>  dl_rq[0]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[1]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[2]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[3]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[4]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[5]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[6]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>  dl_rq[7]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 104857
>
>  # mkdir /sys/fs/cgroup/cpuset/my_cpuset
>  # echo 1 > /sys/fs/cgroup/cpuset/my_cpuset/cpuset.cpus
>  # grep dl /proc/sched_debug
>  dl_rq[0]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[1]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[2]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[3]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[4]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[5]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[6]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>  dl_rq[7]:
>    .dl_nr_running   : 0
>    .dl_bw->bw       : 996147
>    .dl_bw->total_bw : 209714
>
> It appears to add double the bandwidth.
>

Mmm.. IIUC that's because we don't destroy any root_domain in this case,
as sched_load_balance of the parent is still set, so we add the bandwidth
again to the existing one. I could fix that with a flag indicating when we
actually destroy root_domain(s), but I fear it would make this solution
even uglier than it already is :/. More thinking required.

Thanks for testing.

Best,

- Juri
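For reference, the figures in the dumps above follow the scheduler's Q20
fixed-point bandwidth encoding: .dl_bw->bw = 996147 is the default 95%
admission cap (950000/1000000, shifted left by 20), a single
2000000:20000000 reservation accounts for 104857 (a 0.1 ratio), and
209714 is exactly that one task counted twice. Below is a minimal
userspace sketch mirroring the kernel's to_ratio() arithmetic; the
standalone program itself is illustrative, only the helper's logic and
the default rt runtime/period values come from mainline.

#include <stdio.h>
#include <stdint.h>

/* Bandwidth is a Q20 fixed-point fraction of one CPU, as computed by
 * to_ratio() in kernel/sched/core.c. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << 20) / period;
}

int main(void)
{
	/* Default admission cap: 950000us runtime per 1000000us period. */
	printf("dl_bw->bw      : %llu\n",
	       (unsigned long long)to_ratio(1000000, 950000));  /* 996147 */

	/* The test task: 2000000 runtime per 20000000 period. */
	uint64_t task_bw = to_ratio(20000000, 2000000);
	printf("one task       : %llu\n",
	       (unsigned long long)task_bw);                    /* 104857 */

	/* The broken case: the same task accounted twice. */
	printf("double-counted : %llu\n",
	       (unsigned long long)(2 * task_bw));              /* 209714 */
	return 0;
}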
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..5f9eeb4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2241,6 +2241,8 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 				     const struct cpumask *trial);
 extern int task_can_attach(struct task_struct *p,
 			   const struct cpumask *cs_cpus_allowed);
+void sched_restore_dl_bw(struct task_struct *task,
+			 const struct cpumask *new_mask);
 #ifdef CONFIG_SMP
 extern void do_set_cpus_allowed(struct task_struct *p,
 				const struct cpumask *new_mask);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e945fc..57078f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -785,6 +785,44 @@ done:
 	return ndoms;
 }
 
+/**
+ * update_tasks_rd - Update tasks' root_domains status.
+ * @cs: the cpuset to which each task's root_domain belongs
+ *
+ * Iterate through each task of @cs updating state of its related
+ * root_domain.
+ */
+static void update_tasks_rd(struct cpuset *cs)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	css_task_iter_start(&cs->css, &it);
+	while ((task = css_task_iter_next(&it)))
+		sched_restore_dl_bw(task, cs->effective_cpus);
+	css_task_iter_end(&it);
+}
+
+static void cpuset_update_rd(void)
+{
+	struct cpuset *cs;
+	struct cgroup_subsys_state *pos_css;
+
+	lockdep_assert_held(&cpuset_mutex);
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+		if (!css_tryget_online(&cs->css))
+			continue;
+		rcu_read_unlock();
+
+		update_tasks_rd(cs);
+
+		rcu_read_lock();
+		css_put(&cs->css);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Rebuild scheduler domains.
  *
@@ -818,6 +856,7 @@ static void rebuild_sched_domains_locked(void)
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
+	cpuset_update_rd();
 out:
 	put_online_cpus();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ce7a8..f9558f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,6 +2277,23 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
+void sched_restore_dl_bw(struct task_struct *task,
+			 const struct cpumask *new_mask)
+{
+	struct dl_bw *dl_b;
+	unsigned long flags;
+
+	if (!task_has_dl_policy(task))
+		return;
+
+	rcu_read_lock_sched();
+	dl_b = dl_bw_of(cpumask_any(new_mask));
+	raw_spin_lock_irqsave(&dl_b->lock, flags);
+	dl_b->total_bw += task->dl.dl_bw;
+	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	rcu_read_unlock_sched();
+}
+
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
@@ -5636,6 +5653,17 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
+		if (old_rd == &def_root_domain &&
+		    cpumask_empty(old_rd->span)) {
+			/*
+			 * def_root_domain is never freed, so we have to clean
+			 * it when it becomes empty.
+			 */
+			raw_spin_lock(&old_rd->dl_bw.lock);
+			old_rd->dl_bw.total_bw = 0;
+			raw_spin_unlock(&old_rd->dl_bw.lock);
+		}
+
 		/*
 		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
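To make the flag idea mentioned above concrete, here is a toy userspace
model of the accounting (not kernel code; rebuild(), restore() and the
destroyed/use_flag parameters are invented for illustration): re-running
the restore pass over a root_domain that survived the rebuild
double-counts the task, while gating the restore on "was the old
root_domain actually destroyed?" keeps total_bw stable.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct root_domain { uint64_t total_bw; };

/* One DL task worth 104857 in Q20 (runtime/period = 0.1). */
static const uint64_t task_bw = 104857;

/* The restore pass re-adds each task's bandwidth after a rebuild. */
static void restore(struct root_domain *rd)
{
	rd->total_bw += task_bw;
}

static void rebuild(struct root_domain *rd, bool destroyed, bool use_flag)
{
	if (destroyed)
		rd->total_bw = 0;	/* a recreated root_domain starts empty */
	if (!use_flag || destroyed)
		restore(rd);		/* with the flag, restore only what was lost */
}

int main(void)
{
	struct root_domain rd = { .total_bw = task_bw };

	/* Unguarded: the root_domain survives, so the task is added again. */
	rebuild(&rd, false, false);
	printf("unguarded: %llu\n", (unsigned long long)rd.total_bw); /* 209714 */

	/* Guarded: nothing was destroyed, so there is nothing to restore. */
	rd.total_bw = task_bw;
	rebuild(&rd, false, true);
	printf("guarded:   %llu\n", (unsigned long long)rd.total_bw); /* 104857 */
	return 0;
}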