Message ID: b051b42f0c4f36d7177978e090c6a85df17922c6.1594707424.git.viresh.kumar@linaro.org
State: New
Series: cpufreq_cooling: Get effective CPU utilization from scheduler
On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote: > Several parts of the kernel are already using the effective CPU > utilization to get the current load on the CPU, do the same here instead > of depending on the idle time of the CPU, which isn't that accurate > comparatively. > > Note that, this (and CPU frequency scaling in general) doesn't work that > well with idle injection as that is done from rt threads and is counted > as load while it tries to do quite the opposite. That should be solved > separately though. > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > --- > drivers/thermal/cpufreq_cooling.c | 65 +++++++------------------------ > 1 file changed, 15 insertions(+), 50 deletions(-) > > diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c > index 6c0e1b053126..74340b2b0da7 100644 > --- a/drivers/thermal/cpufreq_cooling.c > +++ b/drivers/thermal/cpufreq_cooling.c > @@ -23,6 +23,7 @@ > #include <linux/thermal.h> > > #include <trace/events/thermal.h> > +#include "../../kernel/sched/sched.h" Hard NAK on that. Just writing it should've been a clue.
On Tue, Jul 14, 2020 at 8:37 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > Several parts of the kernel are already using the effective CPU > utilization to get the current load on the CPU, do the same here instead > of depending on the idle time of the CPU, which isn't that accurate > comparatively. > > Note that, this (and CPU frequency scaling in general) doesn't work that > well with idle injection as that is done from rt threads and is counted > as load while it tries to do quite the opposite. That should be solved > separately though. > > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> > --- > drivers/thermal/cpufreq_cooling.c | 65 +++++++------------------------ > 1 file changed, 15 insertions(+), 50 deletions(-) > > diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c > index 6c0e1b053126..74340b2b0da7 100644 > --- a/drivers/thermal/cpufreq_cooling.c > +++ b/drivers/thermal/cpufreq_cooling.c > @@ -23,6 +23,7 @@ > #include <linux/thermal.h> > > #include <trace/events/thermal.h> > +#include "../../kernel/sched/sched.h" > > /* > * Cooling state <-> CPUFreq frequency > @@ -38,16 +39,6 @@ > * ... > */ > > -/** > - * struct time_in_idle - Idle time stats > - * @time: previous reading of the absolute time that this cpu was idle > - * @timestamp: wall time of the last invocation of get_cpu_idle_time_us() > - */ > -struct time_in_idle { > - u64 time; > - u64 timestamp; > -}; > - > /** > * struct cpufreq_cooling_device - data for cooling device with cpufreq > * @id: unique integer value corresponding to each cpufreq_cooling_device > @@ -62,7 +53,6 @@ struct time_in_idle { > * registered cooling device. > * @policy: cpufreq policy. > * @node: list_head to link all cpufreq_cooling_device together. > - * @idle_time: idle time stats > * @qos_req: PM QoS contraint to apply > * > * This structure is required for keeping information of each registered > @@ -76,7 +66,6 @@ struct cpufreq_cooling_device { > struct em_perf_domain *em; > struct cpufreq_policy *policy; > struct list_head node; > - struct time_in_idle *idle_time; > struct freq_qos_request qos_req; > }; > > @@ -132,34 +121,21 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, > } > > /** > - * get_load() - get load for a cpu since last updated > + * get_load() - get current load for a cpu > * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > * @cpu: cpu number > - * @cpu_idx: index of the cpu in time_in_idle* > + * @cpu_idx: index of the cpu > * > - * Return: The average load of cpu @cpu in percentage since this > - * function was last called. > + * Return: The current load of cpu @cpu in percentage. > */ > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > int cpu_idx) > { > - u32 load; > - u64 now, now_idle, delta_time, delta_idle; > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > - > - now_idle = get_cpu_idle_time(cpu, &now, 0); > - delta_idle = now_idle - idle_time->time; > - delta_time = now - idle_time->timestamp; > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > + unsigned long max = arch_scale_cpu_capacity(cpu); > > - if (delta_time <= delta_idle) > - load = 0; > - else > - load = div64_u64(100 * (delta_time - delta_idle), delta_time); > - > - idle_time->time = now_idle; > - idle_time->timestamp = now; > - > - return load; > + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); Hmm. 
It doesn't look like cpufreq_cdev and cpu_idx are needed any more in this function, so maybe drop them from the arg list? And then there won't be anything specific to CPU cooling in this function, so maybe move it to sched and export it from there properly? Also it looks like max could be passed to it along with the CPU number instead of being always taken as arch_scale_cpu_capacity(cpu). > + return (util * 100) / max; > } > > /**
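To make the suggestion concrete, here is a minimal sketch of get_load() with the unused parameters dropped. This is illustrative only; where such a function should live, and whether effective_cpu_util() may legitimately be called from the cooling driver at all, is the open question in this thread.

/*
 * Sketch only: get_load() without the unused cpufreq_cdev and cpu_idx
 * arguments. It still assumes cpu_rq(), cpu_util_cfs() and
 * effective_cpu_util() are visible here, which is exactly the layering
 * problem the sched.h include was NAKed for above.
 */
static u32 get_load(int cpu)
{
	unsigned long util = cpu_util_cfs(cpu_rq(cpu));
	unsigned long max = arch_scale_cpu_capacity(cpu);

	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);
	return (util * 100) / max;
}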
On 14-07-20, 15:05, Rafael J. Wysocki wrote: > On Tue, Jul 14, 2020 at 8:37 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > > int cpu_idx) > > { > > - u32 load; > > - u64 now, now_idle, delta_time, delta_idle; > > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > > - > > - now_idle = get_cpu_idle_time(cpu, &now, 0); > > - delta_idle = now_idle - idle_time->time; > > - delta_time = now - idle_time->timestamp; > > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > > + unsigned long max = arch_scale_cpu_capacity(cpu); > > > > - if (delta_time <= delta_idle) > > - load = 0; > > - else > > - load = div64_u64(100 * (delta_time - delta_idle), delta_time); > > - > > - idle_time->time = now_idle; > > - idle_time->timestamp = now; > > - > > - return load; > > + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); > > Hmm. > > It doesn't look like cpufreq_cdev and cpu_idx are needed any more in > this function, so maybe drop them from the arg list? Right. > And then there > won't be anything specific to CPU cooling in this function, so maybe > move it to sched and export it from there properly? There isn't a lot happening in this routine right now TBH and I am not sure if it is really worth it to have a separate routine for this (unless we can get rid of something for all the callers, like avoiding a call to arch_scale_cpu_capacity() and then naming it effective_cpu_load()). > Also it looks like max could be passed to it along with the CPU number > instead of being always taken as arch_scale_cpu_capacity(cpu). I am not sure what you are suggesting here. What will be the value of max if not arch_scale_cpu_capacity()? -- viresh
On Wed, Jul 15, 2020 at 9:32 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 14-07-20, 15:05, Rafael J. Wysocki wrote: > > On Tue, Jul 14, 2020 at 8:37 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > > > int cpu_idx) > > > { > > > - u32 load; > > > - u64 now, now_idle, delta_time, delta_idle; > > > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > > > - > > > - now_idle = get_cpu_idle_time(cpu, &now, 0); > > > - delta_idle = now_idle - idle_time->time; > > > - delta_time = now - idle_time->timestamp; > > > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > > > + unsigned long max = arch_scale_cpu_capacity(cpu); > > > > > > - if (delta_time <= delta_idle) > > > - load = 0; > > > - else > > > - load = div64_u64(100 * (delta_time - delta_idle), delta_time); > > > - > > > - idle_time->time = now_idle; > > > - idle_time->timestamp = now; > > > - > > > - return load; > > > + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); > > > > Hmm. > > > > It doesn't look like cpufreq_cdev and cpu_idx are needed any more in > > this function, so maybe drop them from the arg list? > > Right. > > > And then there > > won't be anything specific to CPU cooling in this function, so maybe > > move it to sched and export it from there properly? > > There isn't a lot happening in this routine right now TBH and I am not > sure if it is really worth it to have a separate routine for this > (unless we can get rid of something for all the callers, like avoiding > a call to arch_scale_cpu_capacity() and then naming it > effective_cpu_load()). Maybe yes. Or sched_cpu_load() to stand for "the effective CPU load as seen by the scheduler". But I'm not sure if percent is the best unit to return from it. Maybe make it return something like (util << SCHED_CAPACITY_SHIFT) / arch_scale_cpu_capacity(cpu). > > Also it looks like max could be passed to it along with the CPU number > > instead of being always taken as arch_scale_cpu_capacity(cpu). > > I am not sure what you are suggesting here. What will be the value of > max if not arch_scale_cpu_capacity()? I was thinking about a value supplied by the caller, e.g. sched_cpu_load(cpu, max), but if all callers would pass arch_scale_cpu_capacity(cpu) as max anyway, then it's better to simply call it from there.
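A sketch of the helper being floated here; the name sched_cpu_load() and its return unit follow the suggestion above and are hypothetical, nothing like it exists in the tree at this point.

/*
 * Hypothetical scheduler-side helper: the effective CPU load as seen
 * by the scheduler, scaled to [0..SCHED_CAPACITY_SCALE] rather than a
 * percentage. arch_scale_cpu_capacity() is called internally, since
 * every caller would pass the same value for max anyway.
 */
unsigned long sched_cpu_load(int cpu)
{
	unsigned long util = cpu_util_cfs(cpu_rq(cpu));
	unsigned long max = arch_scale_cpu_capacity(cpu);

	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);
	return (util << SCHED_CAPACITY_SHIFT) / max;
}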
On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote: > /** > + * get_load() - get current load for a cpu > * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > * @cpu: cpu number > + * @cpu_idx: index of the cpu > * > + * Return: The current load of cpu @cpu in percentage. > */ > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > int cpu_idx) > { > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > + unsigned long max = arch_scale_cpu_capacity(cpu); > > + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); > + return (util * 100) / max; > } So there's a number of things... let me recap a bunch of things that got mentioned on IRC earlier this week and then continue from there.. So IPA* (or any other thermal governor) needs energy estimates for the various managed devices, cpufreq_cooling, being the driver for the CPU device, needs to provide that and in return receives feedback on how much energy it is allowed to consume, cpufreq_cooling then dynamically enables/disables OPP states. There are actually two methods the thermal governor will use: get_real_power() and get_requested_power(). The first isn't used anywhere in mainline, but could be implemented on hardware that has energy counters (like say x86 RAPL). The second attempts to guesstimate power, and is the subject of this patch. Currently cpufreq_cooling appears to estimate the CPU energy usage by calculating the percentage of idle time using the per-cpu cpustat stuff, which is pretty horrific. This patch then attempts to improve upon that by using the scheduler's cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state and improves upon avg idle. This should be a big improvement as higher frequency consumes more energy, but should we not also consider that: E = C V^2 f The EAS energy model has tables for the OPPs that contain this, but in this case we seem to be assuming a linear energy/frequency curve, which is just not the case. I suppose we could do something like **: 100 * util^3 / max^3 which assumes V~f. Another point is that cpu_util() vs turbo is a bit iffy, and to that, things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also have the benefit of giving you values that match your own sampling interval (100ms), where the sched stuff is PELT (64,32.. based). So what I've been thinking is that cpufreq drivers ought to be able to supply this method, and only when they lack, can the cpufreq-governor (schedutil) install a fallback. And then cpufreq-cooling can use whatever is provided (through the cpufreq interfaces). That way, we: 1) don't have to export anything 2) get arch drivers to provide something 'better' Does that sound like something sensible? [*] I always want a beer when I see that name :-) [**] I despise code that uses percentages, computers suck at /100 and there is no reason not to use any other random fraction, so why pick a bad one.
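To make the cubic suggestion concrete, a hedged sketch of the arithmetic (illustration only; whether f(x) = x^3 is the right curve, as opposed to a polynomial fitted to real EM data, is exactly what the rest of the thread debates):

#include <linux/math64.h>

/*
 * The "100 * util^3 / max^3" idea, which assumes V ~ f so that
 * P = C V^2 f ~ f^3. With util and max bounded by
 * SCHED_CAPACITY_SCALE (1024), 100 * util^3 fits comfortably in
 * 64 bits but not in 32, hence the u64 arithmetic.
 */
static u32 cubic_load_estimate(unsigned long util, unsigned long max)
{
	u64 num = 100ULL * util * util * util;
	u64 den = (u64)max * max * max;

	return div64_u64(num, den);
}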
Hi Peter, Thank you for summarizing this. I've put my comments below. On 7/16/20 12:56 PM, Peter Zijlstra wrote: > On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote: >> /** >> + * get_load() - get current load for a cpu >> * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu >> * @cpu: cpu number >> + * @cpu_idx: index of the cpu >> * >> + * Return: The current load of cpu @cpu in percentage. >> */ >> static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, >> int cpu_idx) >> { >> + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); >> + unsigned long max = arch_scale_cpu_capacity(cpu); >> >> + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); >> + return (util * 100) / max; >> } > > So there's a number of things... let me recap a bunch of things that > got mentioned on IRC earlier this week and then continue from there.. > > So IPA* (or any other thermal governor) needs energy estimates for the > various managed devices, cpufreq_cooling, being the driver for the CPU > device, needs to provide that and in return receives feedback on how > much energy it is allowed to consume, cpufreq_cooling then dynamically > enables/disables OPP states. Currently, only IPA uses the power estimation; other governors don't use these API functions in cpufreq_cooling. > > There are actually two methods the thermal governor will use: > get_real_power() and get_requested_power(). > > The first isn't used anywhere in mainline, but could be implemented on > hardware that has energy counters (like say x86 RAPL). The first is only present as a callback for registered devfreq cooling, which is registered by the devfreq driver. If that driver provides the get_real_power(), it will be called from get_requested_power(). Thus, it's likely that IPA would get a real power value from HW. I was planning to add it also to the cpufreq_cooling callbacks years ago... > > The second attempts to guesstimate power, and is the subject of this > patch. > > Currently cpufreq_cooling appears to estimate the CPU energy usage by > calculating the percentage of idle time using the per-cpu cpustat stuff, > which is pretty horrific. Even worse, it then *samples* the *current* CPU frequency at that particular point in time and assumes that when the CPU wasn't idle during that period - it had *this* frequency... > > This patch then attempts to improve upon that by using the scheduler's > cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state > and improves upon avg idle. This should be a big improvement as higher IMHO this patch set doesn't address the real problem: the 'sampling freq problem' described above. There was no issue with getting the idle period. The avg freq was the problem, in that period when the CPUs were running. The model implemented in the alg was also a problem. The whole period (e.g. CPU freqs which were used or idle state):

 ^(CPU freq)
 |
 |                          sampling the current freq
 |                _______             |
 |               |       |            |
 |________       |       |            |
 |        |      |       |            |
 |        | idle |       |   |________v________...
 |_ _____|_______|__________________________> (time)
 start of period                          end
 |<------- (typically 100ms)-->|

> frequency consumes more energy, but should we not also consider that: > > E = C V^2 f > > The EAS energy model has tables for the OPPs that contain this, but in > this case we seem to be assuming a linear energy/frequency curve, which > is just not the case. I am not sure if I got your point. To understand your point better I think some drawing would be required.
I will skip this patch and old mainline code and focus on your proposed solution (because this patch set does not address the 'sampling freq problem'). > > I suppose we could do something like **: > > 100 * util^3 / max^3 > > which assumes V~f. In EM we keep power values in the array and these values grow exponentially. Each OPP has its corresponding P_x = C (V_x)^2 f_x, where x is the OPP id, thus the corresponding V,f. So we have discrete power values, growing like:

 ^(power)
 |
 |
 |                       *
 |
 |
 |                 *
 |                       |
 |            *          |
 |                       | <----- power estimation function
 |       *               |        should not use linear 'util/max_util'
 |  *                    |        relation here *
 |_______________________|_____________> (freq)
   opp0   opp1   opp2   opp3   opp4

What is the problem? First: We need to pick the right Power from the array. I would suggest to pick the max allowed frequency for that whole period, because we don't know if the CPUs were using it (it's likely). Second: Then we have the utilization, which can be considered as: 'idle period & running period with various freq inside', let's call it avg performance in that whole period. Third: Try to estimate the power used in that whole period having the avg performance and max performance. What you are suggesting is to travel that [*] line in non-linear fashion, but in (util^3)/(max_util^3). Which means it goes down faster when the utilization drops. I think it is too aggressive, e.g. 500^3 / 1024^3 = 0.116 <--- very little, ~12% 200^3 / 300^3 = 0.296 Peter, could you confirm if I understood you correctly? This is quite an important bit for me. > > Another point is that cpu_util() vs turbo is a bit iffy, and to that, > things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also > have the benefit of giving you values that match your own sampling > interval (100ms), where the sched stuff is PELT (64,32.. based). > > So what I've been thinking is that cpufreq drivers ought to be able to > supply this method, and only when they lack, can the cpufreq-governor > (schedutil) install a fallback. And then cpufreq-cooling can use > whatever is provided (through the cpufreq interfaces). > > That way, we: > > 1) don't have to export anything > 2) get arch drivers to provide something 'better' > > > Does that sound like something sensible? > Yes, makes sense. Please also keep in mind that this utilization somehow must be mapped into power in a proper way. I am currently working on addressing all of these problems (including this correlation). Thank you for your time spent on it and your suggestions. Regards, Lukasz > > > > [*] I always want a beer when I see that name :-) > > [**] I despise code that uses percentages, computers suck at > /100 and there is no reason not to use any other random fraction, so why pick a bad one. >
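A rough sketch of the three steps above against the EM table (hedged: the em_cap_state field names follow the v5.8 layout, which later kernels rename to em_perf_state, and the linear scaling in the last step is only a placeholder, since the right curve is precisely what is being debated here):

#include <linux/energy_model.h>
#include <linux/math64.h>

/*
 * Sketch, not real kernel code:
 *  1) pick the power of the highest OPP allowed during the period,
 *  2) treat utilization as the avg performance over that period,
 *  3) scale the OPP power by avg/max performance.
 * Step 3 is the contested part: linear is too optimistic at low
 * util, util^3/max^3 may be too aggressive.
 */
static u64 period_power_estimate(struct em_perf_domain *em,
				 unsigned long max_allowed_freq,
				 unsigned long util, unsigned long max_util)
{
	struct em_cap_state *cs = &em->table[0];
	int i;

	/* Step 1: highest perf state not above the allowed frequency. */
	for (i = 0; i < em->nr_cap_states; i++) {
		if (em->table[i].frequency > max_allowed_freq)
			break;
		cs = &em->table[i];
	}

	/* Steps 2+3: scale that OPP's power by the avg performance. */
	return div64_u64((u64)cs->power * util, max_util);
}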
On Thu, Jul 16, 2020 at 03:24:37PM +0100, Lukasz Luba wrote: > On 7/16/20 12:56 PM, Peter Zijlstra wrote: > > The second attempts to guesstimate power, and is the subject of this > > patch. > > > > Currently cpufreq_cooling appears to estimate the CPU energy usage by > > calculating the percentage of idle time using the per-cpu cpustat stuff, > > which is pretty horrific. > > Even worse, it then *samples* the *current* CPU frequency at that > particular point in time and assumes that when the CPU wasn't idle > during that period - it had *this* frequency... *whee* :-) ... > In EM we keep power values in the array and these values grow > exponentially. Each OPP has it corresponding > > P_x = C (V_x)^2 f_x , where x is the OPP id thus corresponding V,f > > so we have discrete power values, growing like: > > ^(power) > | > | > | * > | > | > | * > | | > | * | > | | <----- power estimation function > | * | should not use linear 'util/max_util' > | * | relation here * > |_______________________|_____________> (freq) > opp0 opp1 opp2 opp3 opp4 > > What is the problem > First: > We need to pick the right Power from the array. I would suggest > to pick the max allowed frequency for that whole period, because > we don't know if the CPUs were using it (it's likely). > Second: > Then we have the utilization, which can be considered as: > 'idle period & running period with various freq inside', lets > call it avg performance in that whole period. > Third: > Try to estimate the power used in that whole period having > the avg performance and max performance. > > What you are suggesting is to travel that [*] line in > non-linear fashion, but in (util^3)/(max_util^3). Which means > it goes down faster when the utilization drops. > I think it is too aggressive, e.g. > 500^3 / 1024^3 = 0.116 <--- very little, ~12% > 200^3 / 300^3 = 0.296 > > Peter could you confirm if I understood you correct? Correct, with the caveat that we might try and regression fit a 3rd order polynomial to a bunch of EM data to see if there's a 'better' function to be had than a raw 'f(x) := x^3'. > This is quite important bit for me. So, if we assume schedutil + EM, we can actually have schedutil calculate a running power sum. That is, something like: \Int P_x dt. Because we know the points where OPP changes. Although, thinking more, I suspect we need tighter integration with cpuidle, because we don't actually have idle times here, but that should be doable. But for anything other than schedutil + EM, things become more interesting, because then we need to guesstimate power usage without the benefit of having actual power numbers. We can of course still do that running power sum, with whatever P(u) or P(f) end up with, I suppose. > > Another point is that cpu_util() vs turbo is a bit iffy, and to that, > > things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also > > have the benefit of giving you values that match your own sampling > > interval (100ms), where the sched stuff is PELT (64,32.. based). > > > > So what I've been thinking is that cpufreq drivers ought to be able to > > supply this method, and only when they lack, can the cpufreq-governor > > (schedutil) install a fallback. And then cpufreq-cooling can use > > whatever is provided (through the cpufreq interfaces). > > > > That way, we: > > > > 1) don't have to export anything > > 2) get arch drivers to provide something 'better' > > > > > > Does that sounds like something sensible? > > > > Yes, make sense. 
Please also keep in mind that this > utilization somehow must be mapped into power in a proper way. > I am currently working on addressing all of these problems > (including this correlation). Right, so that mapping util to power was what I was missing and suggesting we do. So for 'simple' hardware we have cpufreq events for frequency change, and cpuidle events for idle, and with EM we can simply sum the relevant power numbers. For hardware lacking EM, or hardware managed DVFS, we'll have to fudge things a little. How best to do that is up in the air a little, but virtual power curves seem a useful tool to me. The next problem for IPA is having all the devices report power in the same virtual unit I suppose, but I'll leave that to others ;-)
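A hedged sketch of the running power sum: close one segment of the \Int P dt integral at every point where power changes (an OPP switch and, with the cpuidle integration mentioned above, idle entry/exit). All names here are hypothetical; nothing like this exists in the tree:

#include <linux/ktime.h>
#include <linux/math64.h>

/* Hypothetical per-policy accumulator. */
struct energy_acct {
	u64 energy_uj;		/* accumulated energy in uJ (uW * s)   */
	u64 last_update_ns;	/* when cur_power_uw last changed      */
	u64 cur_power_uw;	/* power of the current OPP/idle state */
};

/*
 * Called at every power-level change: OPP switch, idle entry, idle
 * exit. Closes the previous segment of the running \Int P dt sum and
 * opens a new one at the new power level.
 */
static void energy_acct_update(struct energy_acct *ea, u64 new_power_uw)
{
	u64 now = ktime_get_ns();

	ea->energy_uj += div64_u64(ea->cur_power_uw *
				   (now - ea->last_update_ns), NSEC_PER_SEC);
	ea->last_update_ns = now;
	ea->cur_power_uw = new_power_uw;
}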
On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote: > > Hi Peter, > > Thank you for summarizing this. I've put my comments below. > > On 7/16/20 12:56 PM, Peter Zijlstra wrote: > > On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote: > >> /** > >> + * get_load() - get current load for a cpu > >> * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > >> * @cpu: cpu number > >> + * @cpu_idx: index of the cpu > >> * > >> + * Return: The current load of cpu @cpu in percentage. > >> */ > >> static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > >> int cpu_idx) > >> { > >> + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > >> + unsigned long max = arch_scale_cpu_capacity(cpu); > >> > >> + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); > >> + return (util * 100) / max; > >> } > > > > So there's a number of things... let me recap a bunch of things that > > got mentioned on IRC earlier this week and then continue from there.. > > > > So IPA* (or any other thermal governor) needs energy estimates for the > > various managed devices, cpufreq_cooling, being the driver for the CPU > > device, needs to provide that and in return receives feedback on how > > much energy it is allowed to consume, cpufreq_cooling then dynamically > > enables/disables OPP states. > > Currently, only IPA uses the power estimation, other governors don't > use these API functions in cpufreq_cooling. > > > > > There are actually two methods the thermal governor will use: > > get_real_power() and get_requested_power(). > > > > The first isn't used anywhere in mainline, but could be implemented on > > hardware that has energy counters (like say x86 RAPL). > > The first is only present as callback for registered devfreq cooling, > which is registered by devfreq driver. If that driver provides the > get_real_power(), it will be called from get_requested_power(). > Thus, it's likely that IPA would get real power value from HW. > > I was planning to add it also to cpufreq_cooling callbacks years > ago... > > > > > The second attempts to guesstimate power, and is the subject of this > > patch. > > > > Currently cpufreq_cooling appears to estimate the CPU energy usage by > > calculating the percentage of idle time using the per-cpu cpustat stuff, > > which is pretty horrific. > > Even worse, it then *samples* the *current* CPU frequency at that > particular point in time and assumes that when the CPU wasn't idle > during that period - it had *this* frequency... So there is 2 problems in the power calculation of cpufreq cooling device : - How to get an accurate utilization level of the cpu which is what this patch is trying to fix because using idle time is just wrong whereas scheduler utilization is frequency invariant - How to get power estimate from this utilization level. And as you pointed out, using the current freq which is not accurate. > > > > > This patch then attempts to improve upon that by using the scheduler's > > cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state > > and improves upon avg idle. This should be a big improvement as higher > > IMHO this patch set doesn't address the real problem: 'sampling > freq problem' described above. There was no issue with getting idle > period. 
The avg freq was the problem, in that period when the Not sure that you can say that avg freq is a bigger problem than getting the load because there is a real issue with tracking idle period for estimating load because running slower reduces the idle time and increases artificially the load. That's why we implemented frequency invariance in PELT. At the opposite when the thermal mitigation happens, the frequency will be most probably capped by cpu cooling device and will most probably stay at the capped value > CPUs were running. The model implemented in alg was also a problem. > > The whole period (e.g. CPU freqs which were used or idle state) > > ^(CPU freq) > | > | sampling the current freq > | _______ | > | | | | > |________ | | | > | | | | | > | | idle | |________v________... > |_ _____|_______|__________________________> (time) > start of period end > |<------- (typically 100ms)-->| > > > > > frequency consumes more energy, but should we not also consider that: > > > > E = C V^2 f > > > > The EAS energy model has tables for the OPPs that contain this, but in > > this case we seem to be assuming a linear enery/frequency curve, which > > is just not the case. > > I am not sure if I got your point. To understand your point better > I think some drawing would be required. I will skip this patch > and old mainline code and focus on your proposed solution > (because this patch set does not address 'sampling freq problem'). > > > > > I suppose we could do something like **: > > > > 100 * util^3 / max^3 > > > > which assumes V~f. > > In EM we keep power values in the array and these values grow > exponentially. Each OPP has it corresponding > > P_x = C (V_x)^2 f_x , where x is the OPP id thus corresponding V,f > > so we have discrete power values, growing like: > > ^(power) > | > | > | * > | > | > | * > | | > | * | > | | <----- power estimation function > | * | should not use linear 'util/max_util' > | * | relation here * > |_______________________|_____________> (freq) > opp0 opp1 opp2 opp3 opp4 > > What is the problem > First: > We need to pick the right Power from the array. I would suggest > to pick the max allowed frequency for that whole period, because > we don't know if the CPUs were using it (it's likely). > Second: > Then we have the utilization, which can be considered as: > 'idle period & running period with various freq inside', lets > call it avg performance in that whole period. > Third: > Try to estimate the power used in that whole period having > the avg performance and max performance. We already have a function that is doing such kind of computation based of the utilization of the CPU : em_pd_energy(). And we could reuse some of this function if not exactly this one > > What you are suggesting is to travel that [*] line in > non-linear fashion, but in (util^3)/(max_util^3). Which means > it goes down faster when the utilization drops. > I think it is too aggressive, e.g. > 500^3 / 1024^3 = 0.116 <--- very little, ~12% > 200^3 / 300^3 = 0.296 > > Peter could you confirm if I understood you correct? > This is quite important bit for me. > > > > > Another point is that cpu_util() vs turbo is a bit iffy, and to that, > > things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also > > have the benefit of giving you values that match your own sampling > > interval (100ms), where the sched stuff is PELT (64,32.. based). 
> > > > So what I've been thinking is that cpufreq drivers ought to be able to > > supply this method, and only when they lack, can the cpufreq-governor > > (schedutil) install a fallback. And then cpufreq-cooling can use > > whatever is provided (through the cpufreq interfaces). > > > > That way, we: > > > > 1) don't have to export anything > > 2) get arch drivers to provide something 'better' > > > > > > Does that sounds like something sensible? > > > > Yes, make sense. Please also keep in mind that this > utilization somehow must be mapped into power in a proper way. > I am currently working on addressing all of these problems > (including this correlation). > > Thank you for your time spending on it and your suggestions. > > Regards, > Lukasz > > > > > > > > > [*] I always want a beer when I see that name :-) > > > > [**] I despise code that uses percentages, computers suck at > > /100 and there is no reason not to use any other random fraction, so why > > pick a bad one. > >
On 7/16/20 4:43 PM, Peter Zijlstra wrote: > On Thu, Jul 16, 2020 at 03:24:37PM +0100, Lukasz Luba wrote: >> On 7/16/20 12:56 PM, Peter Zijlstra wrote: > >>> The second attempts to guesstimate power, and is the subject of this >>> patch. >>> >>> Currently cpufreq_cooling appears to estimate the CPU energy usage by >>> calculating the percentage of idle time using the per-cpu cpustat stuff, >>> which is pretty horrific. >> >> Even worse, it then *samples* the *current* CPU frequency at that >> particular point in time and assumes that when the CPU wasn't idle >> during that period - it had *this* frequency... > > *whee* :-) > > ... > >> In EM we keep power values in the array and these values grow >> exponentially. Each OPP has it corresponding >> >> P_x = C (V_x)^2 f_x , where x is the OPP id thus corresponding V,f >> >> so we have discrete power values, growing like: >> >> ^(power) >> | >> | >> | * >> | >> | >> | * >> | | >> | * | >> | | <----- power estimation function >> | * | should not use linear 'util/max_util' >> | * | relation here * >> |_______________________|_____________> (freq) >> opp0 opp1 opp2 opp3 opp4 >> >> What is the problem >> First: >> We need to pick the right Power from the array. I would suggest >> to pick the max allowed frequency for that whole period, because >> we don't know if the CPUs were using it (it's likely). >> Second: >> Then we have the utilization, which can be considered as: >> 'idle period & running period with various freq inside', lets >> call it avg performance in that whole period. >> Third: >> Try to estimate the power used in that whole period having >> the avg performance and max performance. >> >> What you are suggesting is to travel that [*] line in >> non-linear fashion, but in (util^3)/(max_util^3). Which means >> it goes down faster when the utilization drops. >> I think it is too aggressive, e.g. >> 500^3 / 1024^3 = 0.116 <--- very little, ~12% >> 200^3 / 300^3 = 0.296 >> >> Peter could you confirm if I understood you correct? > > Correct, with the caveat that we might try and regression fit a 3rd > order polynomial to a bunch of EM data to see if there's a 'better' > function to be had than a raw 'f(x) := x^3'. I agree, I think we are on the same wavelength now. > >> This is quite important bit for me. > > So, if we assume schedutil + EM, we can actually have schedutil > calculate a running power sum. That is, something like: \Int P_x dt. > Because we know the points where OPP changes. Yes, that's why I was thinking about having this information stored as a copy inside the EM, then just read it in other subsystem like: thermal, powercap, etc. > > Although, thinking more, I suspect we need tighter integration with > cpuidle, because we don't actually have idle times here, but that should > be doable. I am scratching my head for while because of that idle issue. It opens more dimensions to tackle. > > But for anything other than schedutil + EM, things become more > interesting, because then we need to guesstimate power usage without the > benefit of having actual power numbers. Yes, from the engineering/research perspective, platforms which do not have EM in Linux (like Intel) are also interesting. > > We can of course still do that running power sum, with whatever P(u) or > P(f) end up with, I suppose. > >>> Another point is that cpu_util() vs turbo is a bit iffy, and to that, >>> things like x86-APERF/MPERF and ARM-AMU got mentioned. 
Those might also >>> have the benefit of giving you values that match your own sampling >>> interval (100ms), where the sched stuff is PELT (64,32.. based). >>> >>> So what I've been thinking is that cpufreq drivers ought to be able to >>> supply this method, and only when they lack, can the cpufreq-governor >>> (schedutil) install a fallback. And then cpufreq-cooling can use >>> whatever is provided (through the cpufreq interfaces). >>> >>> That way, we: >>> >>> 1) don't have to export anything >>> 2) get arch drivers to provide something 'better' >>> >>> >>> Does that sounds like something sensible? >>> >> >> Yes, make sense. Please also keep in mind that this >> utilization somehow must be mapped into power in a proper way. >> I am currently working on addressing all of these problems >> (including this correlation). > > Right, so that mapping util to power was what I was missing and > suggesting we do. So for 'simple' hardware we have cpufreq events for > frequency change, and cpuidle events for idle, and with EM we can simply > sum the relevant power numbers. > > For hardware lacking EM, or hardware managed DVFS, we'll have to fudge > things a little. How best to do that is up in the air a little, but > virtual power curves seem a useful tool to me. > > The next problem for IPA is having all the devices report power in the > same virtual unit I suppose, but I'll leave that to others ;-) > True, there is more issues. There is also another movement with powercap driven by Daniel Lezcano, which I am going to support. Maybe he would be interested as well in having a copy of calculated energy stored in EM. I must gather some requirements and align with him. Thank you for your support! Regards, Lukasz
On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote: > /** > - * get_load() - get load for a cpu since last updated > + * get_load() - get current load for a cpu > * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > * @cpu: cpu number > - * @cpu_idx: index of the cpu in time_in_idle* > + * @cpu_idx: index of the cpu > * > - * Return: The average load of cpu @cpu in percentage since this > - * function was last called. > + * Return: The current load of cpu @cpu in percentage. > */ > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > int cpu_idx) > { > - u32 load; > - u64 now, now_idle, delta_time, delta_idle; > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > - > - now_idle = get_cpu_idle_time(cpu, &now, 0); > - delta_idle = now_idle - idle_time->time; > - delta_time = now - idle_time->timestamp; > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > + unsigned long max = arch_scale_cpu_capacity(cpu); Should we subtract the thermal PELT signal from max? I'm worried that if we don't do that, the load computed in this function will look artificially small when IPA is capping the max freq, and that we'll enter a weird oscillating state due to the cyclic dependency here. Thoughts? > > - if (delta_time <= delta_idle) > - load = 0; > - else > - load = div64_u64(100 * (delta_time - delta_idle), delta_time); > - > - idle_time->time = now_idle; > - idle_time->timestamp = now; > - > - return load; > + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); > + return (util * 100) / max; > } Thanks, Quentin
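For reference, a sketch of the adjustment being asked about, assuming the instantaneous arch_scale_thermal_pressure() signal is what gets subtracted (whether that, or the PELT-averaged thermal signal, is the right choice is part of the question):

/*
 * Sketch of the suggestion above: account for the capacity already
 * removed by thermal capping so the computed load does not look
 * artificially small while IPA is capping the max frequency.
 * Assumes thermal pressure stays strictly below the raw capacity.
 */
static u32 get_load(int cpu)
{
	unsigned long util = cpu_util_cfs(cpu_rq(cpu));
	unsigned long max = arch_scale_cpu_capacity(cpu);

	/* Subtract the thermal pressure from the CPU's raw capacity. */
	max -= arch_scale_thermal_pressure(cpu);

	util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL);
	return (util * 100) / max;
}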
On 7/17/20 10:46 AM, Vincent Guittot wrote: > On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote: >> >> Hi Peter, >> >> Thank you for summarizing this. I've put my comments below. >> >> On 7/16/20 12:56 PM, Peter Zijlstra wrote: >>> On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote: >>>> /** >>>> + * get_load() - get current load for a cpu >>>> * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu >>>> * @cpu: cpu number >>>> + * @cpu_idx: index of the cpu >>>> * >>>> + * Return: The current load of cpu @cpu in percentage. >>>> */ >>>> static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, >>>> int cpu_idx) >>>> { >>>> + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); >>>> + unsigned long max = arch_scale_cpu_capacity(cpu); >>>> >>>> + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); >>>> + return (util * 100) / max; >>>> } >>> >>> So there's a number of things... let me recap a bunch of things that >>> got mentioned on IRC earlier this week and then continue from there.. >>> >>> So IPA* (or any other thermal governor) needs energy estimates for the >>> various managed devices, cpufreq_cooling, being the driver for the CPU >>> device, needs to provide that and in return receives feedback on how >>> much energy it is allowed to consume, cpufreq_cooling then dynamically >>> enables/disables OPP states. >> >> Currently, only IPA uses the power estimation, other governors don't >> use these API functions in cpufreq_cooling. >> >>> >>> There are actually two methods the thermal governor will use: >>> get_real_power() and get_requested_power(). >>> >>> The first isn't used anywhere in mainline, but could be implemented on >>> hardware that has energy counters (like say x86 RAPL). >> >> The first is only present as callback for registered devfreq cooling, >> which is registered by devfreq driver. If that driver provides the >> get_real_power(), it will be called from get_requested_power(). >> Thus, it's likely that IPA would get real power value from HW. >> >> I was planning to add it also to cpufreq_cooling callbacks years >> ago... >> >>> >>> The second attempts to guesstimate power, and is the subject of this >>> patch. >>> >>> Currently cpufreq_cooling appears to estimate the CPU energy usage by >>> calculating the percentage of idle time using the per-cpu cpustat stuff, >>> which is pretty horrific. >> >> Even worse, it then *samples* the *current* CPU frequency at that >> particular point in time and assumes that when the CPU wasn't idle >> during that period - it had *this* frequency... > > So there is 2 problems in the power calculation of cpufreq cooling device : > - How to get an accurate utilization level of the cpu which is what > this patch is trying to fix because using idle time is just wrong > whereas scheduler utilization is frequency invariant > - How to get power estimate from this utilization level. And as you > pointed out, using the current freq which is not accurate. > >> >>> >>> This patch then attempts to improve upon that by using the scheduler's >>> cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state >>> and improves upon avg idle. This should be a big improvement as higher >> >> IMHO this patch set doesn't address the real problem: 'sampling >> freq problem' described above. There was no issue with getting idle >> period. 
The avg freq was the problem, in that period when the > > Not sure that you can say that avg freq is a bigger problem than > getting the load because there is a real issue with tracking idle > period for estimating load because running slower reduces the idle > time and increases artificially the load. That's why we implemented > frequency invariance in PELT. If you take a closer look into the code, you see that wrong freq picks the wrong power value from the EM, the line: freq = cpufreq_quick_get(policy->cpu) then check the function cpu_freq_to_power(). The power is calculated by: (raw_cpu_power * cpufreq_cdev->last_load) / 100 The load estimation error is also an issue, but the comprehensive solution should address possibly all existing issues. I don't know if you were approached also by the same vendor, complaining on IPA issues. Do you have some requirements? Or deadlines, for which kernel version you have to fix it? We can discuss this out offline if you like. > > At the opposite when the thermal mitigation happens, the frequency > will be most probably capped by cpu cooling device and will most > probably stay at the capped value I don't think that we can rely on such assumption. The whole cpufreq_get_requested_power() should be changed. > >> CPUs were running. The model implemented in alg was also a problem. >> >> The whole period (e.g. CPU freqs which were used or idle state) >> >> ^(CPU freq) >> | >> | sampling the current freq >> | _______ | >> | | | | >> |________ | | | >> | | | | | >> | | idle | |________v________... >> |_ _____|_______|__________________________> (time) >> start of period end >> |<------- (typically 100ms)-->| >> >> >> >>> frequency consumes more energy, but should we not also consider that: >>> >>> E = C V^2 f >>> >>> The EAS energy model has tables for the OPPs that contain this, but in >>> this case we seem to be assuming a linear enery/frequency curve, which >>> is just not the case. >> >> I am not sure if I got your point. To understand your point better >> I think some drawing would be required. I will skip this patch >> and old mainline code and focus on your proposed solution >> (because this patch set does not address 'sampling freq problem'). >> >>> >>> I suppose we could do something like **: >>> >>> 100 * util^3 / max^3 >>> >>> which assumes V~f. >> >> In EM we keep power values in the array and these values grow >> exponentially. Each OPP has it corresponding >> >> P_x = C (V_x)^2 f_x , where x is the OPP id thus corresponding V,f >> >> so we have discrete power values, growing like: >> >> ^(power) >> | >> | >> | * >> | >> | >> | * >> | | >> | * | >> | | <----- power estimation function >> | * | should not use linear 'util/max_util' >> | * | relation here * >> |_______________________|_____________> (freq) >> opp0 opp1 opp2 opp3 opp4 >> >> What is the problem >> First: >> We need to pick the right Power from the array. I would suggest >> to pick the max allowed frequency for that whole period, because >> we don't know if the CPUs were using it (it's likely). >> Second: >> Then we have the utilization, which can be considered as: >> 'idle period & running period with various freq inside', lets >> call it avg performance in that whole period. >> Third: >> Try to estimate the power used in that whole period having >> the avg performance and max performance. > > We already have a function that is doing such kind of computation > based of the utilization of the CPU : em_pd_energy(). 
And we could > reuse some of this function if not exactly this one Yes, and I think we should use this information. I am going to talk with Daniel about EM evolution (this is one of the topics from my side). Next, there is going to be an LPC event, where we can also discuss this with a broader audience. Regards, Lukasz
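To make the em_pd_energy() reference concrete, a simplified paraphrase of what that helper computes in v5.8 (field names per that release; this is a sketch, not the exact kernel code):

#include <linux/energy_model.h>
#include <linux/sched/cpufreq.h>

/*
 * Paraphrase of v5.8's em_pd_energy(): estimate the energy of a
 * performance domain from the max utilization of any CPU in it and
 * the sum of all the CPUs' utilizations.
 *  1) map max_util to the frequency the domain would run at,
 *  2) find the first OPP at or above that frequency,
 *  3) return cost * sum_util / scale_cpu, where cost is the
 *     precomputed power * max_freq / freq for that OPP.
 */
static unsigned long pd_energy_sketch(struct em_perf_domain *pd,
				      unsigned long max_util,
				      unsigned long sum_util)
{
	int cpu = cpumask_first(to_cpumask(pd->cpus));
	unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
	struct em_cap_state *cs = &pd->table[pd->nr_cap_states - 1];
	unsigned long freq = map_util_freq(max_util, cs->frequency, scale_cpu);
	int i;

	for (i = 0; i < pd->nr_cap_states; i++) {
		cs = &pd->table[i];
		if (cs->frequency >= freq)
			break;
	}

	return cs->cost * sum_util / scale_cpu;
}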
On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote: > On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote: > > /** > > - * get_load() - get load for a cpu since last updated > > + * get_load() - get current load for a cpu > > * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > > * @cpu: cpu number > > - * @cpu_idx: index of the cpu in time_in_idle* > > + * @cpu_idx: index of the cpu > > * > > - * Return: The average load of cpu @cpu in percentage since this > > - * function was last called. > > + * Return: The current load of cpu @cpu in percentage. > > */ > > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > > int cpu_idx) > > { > > - u32 load; > > - u64 now, now_idle, delta_time, delta_idle; > > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > > - > > - now_idle = get_cpu_idle_time(cpu, &now, 0); > > - delta_idle = now_idle - idle_time->time; > > - delta_time = now - idle_time->timestamp; > > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > > + unsigned long max = arch_scale_cpu_capacity(cpu); > > Should we subtract the thermal PELT signal from max? Actually, that or add it to the ENERGY_UTIL case in effective_cpu_util() as this appears to be missing for EAS too ...
On Friday 17 Jul 2020 at 11:33:05 (+0100), Quentin Perret wrote: > On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote: > > On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote: > > > /** > > > - * get_load() - get load for a cpu since last updated > > > + * get_load() - get current load for a cpu > > > * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > > > * @cpu: cpu number > > > - * @cpu_idx: index of the cpu in time_in_idle* > > > + * @cpu_idx: index of the cpu > > > * > > > - * Return: The average load of cpu @cpu in percentage since this > > > - * function was last called. > > > + * Return: The current load of cpu @cpu in percentage. > > > */ > > > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > > > int cpu_idx) > > > { > > > - u32 load; > > > - u64 now, now_idle, delta_time, delta_idle; > > > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > > > - > > > - now_idle = get_cpu_idle_time(cpu, &now, 0); > > > - delta_idle = now_idle - idle_time->time; > > > - delta_time = now - idle_time->timestamp; > > > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > > > + unsigned long max = arch_scale_cpu_capacity(cpu); > > > > Should we subtract the thermal PELT signal from max? > > Actually, that or add it the ENERGY_UTIL case in effective_cpu_util() as > this appears to be missing for EAS too ... Scratch that. I do think there is something missing in the EAS path, but not sure that would fix it. I'll have a think and stop spamming everybody in the meantime ... The first question is still valid, though :) Sorry for the noise, Quentin
On Fri, 17 Jul 2020 at 12:30, Lukasz Luba <lukasz.luba@arm.com> wrote: > > > > On 7/17/20 10:46 AM, Vincent Guittot wrote: > > On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote: > >> > >> Hi Peter, > >> > >> Thank you for summarizing this. I've put my comments below. > >> > >> On 7/16/20 12:56 PM, Peter Zijlstra wrote: > >>> On Tue, Jul 14, 2020 at 12:06:53PM +0530, Viresh Kumar wrote: > >>>> /** > >>>> + * get_load() - get current load for a cpu > >>>> * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > >>>> * @cpu: cpu number > >>>> + * @cpu_idx: index of the cpu > >>>> * > >>>> + * Return: The current load of cpu @cpu in percentage. > >>>> */ > >>>> static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > >>>> int cpu_idx) > >>>> { > >>>> + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > >>>> + unsigned long max = arch_scale_cpu_capacity(cpu); > >>>> > >>>> + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); > >>>> + return (util * 100) / max; > >>>> } > >>> > >>> So there's a number of things... let me recap a bunch of things that > >>> got mentioned on IRC earlier this week and then continue from there.. > >>> > >>> So IPA* (or any other thermal governor) needs energy estimates for the > >>> various managed devices, cpufreq_cooling, being the driver for the CPU > >>> device, needs to provide that and in return receives feedback on how > >>> much energy it is allowed to consume, cpufreq_cooling then dynamically > >>> enables/disables OPP states. > >> > >> Currently, only IPA uses the power estimation, other governors don't > >> use these API functions in cpufreq_cooling. > >> > >>> > >>> There are actually two methods the thermal governor will use: > >>> get_real_power() and get_requested_power(). > >>> > >>> The first isn't used anywhere in mainline, but could be implemented on > >>> hardware that has energy counters (like say x86 RAPL). > >> > >> The first is only present as callback for registered devfreq cooling, > >> which is registered by devfreq driver. If that driver provides the > >> get_real_power(), it will be called from get_requested_power(). > >> Thus, it's likely that IPA would get real power value from HW. > >> > >> I was planning to add it also to cpufreq_cooling callbacks years > >> ago... > >> > >>> > >>> The second attempts to guesstimate power, and is the subject of this > >>> patch. > >>> > >>> Currently cpufreq_cooling appears to estimate the CPU energy usage by > >>> calculating the percentage of idle time using the per-cpu cpustat stuff, > >>> which is pretty horrific. > >> > >> Even worse, it then *samples* the *current* CPU frequency at that > >> particular point in time and assumes that when the CPU wasn't idle > >> during that period - it had *this* frequency... > > > > So there is 2 problems in the power calculation of cpufreq cooling device : > > - How to get an accurate utilization level of the cpu which is what > > this patch is trying to fix because using idle time is just wrong > > whereas scheduler utilization is frequency invariant > > - How to get power estimate from this utilization level. And as you > > pointed out, using the current freq which is not accurate. > > > > > > > >> > >>> > >>> This patch then attempts to improve upon that by using the scheduler's > >>> cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state > >>> and improves upon avg idle. 
This should be a big improvement as higher > >> > >> IMHO this patch set doesn't address the real problem: 'sampling > >> freq problem' described above. There was no issue with getting idle > >> period. The avg freq was the problem, in that period when the > > > > Not sure that you can say that avg freq is a bigger problem than > > getting the load because there is a real issue with tracking idle > > period for estimating load because running slower reduces the idle > > time and increases artificially the load. That's why we implemented > > frequency invariance in PELT. > > If you take a closer look into the code, you see that wrong > freq picks the wrong power value from the EM, the line: > freq = cpufreq_quick_get(policy->cpu) > then check the function cpu_freq_to_power(). > The power is calculated by: > (raw_cpu_power * cpufreq_cdev->last_load) / 100 > > The load estimation error is also an issue, but the comprehensive > solution should address possibly all existing issues. > > I don't know if you were approached also by the same vendor, > complaining on IPA issues. Do you have some requirements? Or deadlines, > for which kernel version you have to fix it? No, I don't have vendors complaining about IPA. This patch is just there because tracking idle time is known to be wrong, whereas sched_util gives a better estimation of the current/next utilization and, as a result, a better estimation of the power that will be consumed. > We can discuss this out offline if you like. > > > > > > At the opposite when the thermal mitigation happens, the frequency > > will be most probably capped by cpu cooling device and will most > > probably stay at the capped value > > I don't think that we can rely on such assumption. The whole I'm not sure that it's that bad, although I fully agree that it's not perfect, or maybe just good enough. The scheduler's utilization is already used to select the cpu frequency. In an ideal world, it is not expected to change according to current information, so using the scheduler utilization and the current freq, which has also been set based on the current knowledge of the utilization of the cpu, should not be that bad. More than what happened during the last period, we try to estimate what will happen during the next one in this case. Trying to accurately track the energy/power consumed over the last period (with idle events and freq changes) is not that useful in this case, especially because task migration and other scheduler activities will make previous-period tracking obsolete, whereas scheduler utilization takes that into account. Such accurate power/energy tracking is useful if you want to cap the power consumption of devices in a framework like powercap, when the HW can't do it by itself, but in that case you are working at a different time scale. > cpufreq_get_requested_power() should be changed. > > > > >> CPUs were running. The model implemented in alg was also a problem. > >> > >> The whole period (e.g. CPU freqs which were used or idle state) > >> > >> ^(CPU freq) > >> | > >> | sampling the current freq > >> | _______ | > >> | | | | > >> |________ | | | > >> | | | | | > >> | | idle | |________v________...
> >> |_ _____|_______|__________________________> (time) > >> start of period end > >> |<------- (typically 100ms)-->| > >> > >> > >> > >>> frequency consumes more energy, but should we not also consider that: > >>> > >>> E = C V^2 f > >>> > >>> The EAS energy model has tables for the OPPs that contain this, but in > >>> this case we seem to be assuming a linear enery/frequency curve, which > >>> is just not the case. > >> > >> I am not sure if I got your point. To understand your point better > >> I think some drawing would be required. I will skip this patch > >> and old mainline code and focus on your proposed solution > >> (because this patch set does not address 'sampling freq problem'). > >> > >>> > >>> I suppose we could do something like **: > >>> > >>> 100 * util^3 / max^3 > >>> > >>> which assumes V~f. > >> > >> In EM we keep power values in the array and these values grow > >> exponentially. Each OPP has it corresponding > >> > >> P_x = C (V_x)^2 f_x , where x is the OPP id thus corresponding V,f > >> > >> so we have discrete power values, growing like: > >> > >> ^(power) > >> | > >> | > >> | * > >> | > >> | > >> | * > >> | | > >> | * | > >> | | <----- power estimation function > >> | * | should not use linear 'util/max_util' > >> | * | relation here * > >> |_______________________|_____________> (freq) > >> opp0 opp1 opp2 opp3 opp4 > >> > >> What is the problem > >> First: > >> We need to pick the right Power from the array. I would suggest > >> to pick the max allowed frequency for that whole period, because > >> we don't know if the CPUs were using it (it's likely). > >> Second: > >> Then we have the utilization, which can be considered as: > >> 'idle period & running period with various freq inside', lets > >> call it avg performance in that whole period. > >> Third: > >> Try to estimate the power used in that whole period having > >> the avg performance and max performance. > > > > We already have a function that is doing such kind of computation > > based of the utilization of the CPU : em_pd_energy(). And we could > > reuse some of this function if not exactly this one > > Yes and I think we should use this information. I am going to > talk with Daniel about EM evolution (this is one of the topics > from my side). Next, it is going to be a LPC event, where we > can also discuss with broader audience. > > Regards, > Lukasz >
On 17-07-20, 11:43, Quentin Perret wrote: > On Friday 17 Jul 2020 at 11:33:05 (+0100), Quentin Perret wrote: > > On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote: > > > On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote: > > > > /** > > > > - * get_load() - get load for a cpu since last updated > > > > + * get_load() - get current load for a cpu > > > > * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu > > > > * @cpu: cpu number > > > > - * @cpu_idx: index of the cpu in time_in_idle* > > > > + * @cpu_idx: index of the cpu > > > > * > > > > - * Return: The average load of cpu @cpu in percentage since this > > > > - * function was last called. > > > > + * Return: The current load of cpu @cpu in percentage. > > > > */ > > > > static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, > > > > int cpu_idx) > > > > { > > > > - u32 load; > > > > - u64 now, now_idle, delta_time, delta_idle; > > > > - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; > > > > - > > > > - now_idle = get_cpu_idle_time(cpu, &now, 0); > > > > - delta_idle = now_idle - idle_time->time; > > > > - delta_time = now - idle_time->timestamp; > > > > + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); > > > > + unsigned long max = arch_scale_cpu_capacity(cpu); > > > > > > Should we subtract the thermal PELT signal from max? Makes sense to me, but I am not really good with this util and load stuff and so would leave that to you guys to decide :) -- viresh
On 17-07-20, 11:46, Vincent Guittot wrote:
> On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:
> > On 7/16/20 12:56 PM, Peter Zijlstra wrote:
> > > Currently cpufreq_cooling appears to estimate the CPU energy usage by
> > > calculating the percentage of idle time using the per-cpu cpustat stuff,
> > > which is pretty horrific.
> >
> > Even worse, it then *samples* the *current* CPU frequency at that
> > particular point in time and assumes that when the CPU wasn't idle
> > during that period - it had *this* frequency...
>
> So there are 2 problems in the power calculation of the cpufreq cooling device:
> - How to get an accurate utilization level of the cpu, which is what
> this patch is trying to fix, because using idle time is just wrong
> whereas scheduler utilization is frequency invariant

Since this patch is targeted only towards fixing this particular
problem, should I change something in the patch to make it acceptable?

> - How to get a power estimate from this utilization level. And as you
> pointed out, using the current freq is not accurate.

This should be tackled separately, I believe.

--
viresh
Hi Viresh,

On 7/30/20 7:24 AM, Viresh Kumar wrote:
> On 17-07-20, 11:46, Vincent Guittot wrote:
>> On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:
>>> On 7/16/20 12:56 PM, Peter Zijlstra wrote:
>>>> Currently cpufreq_cooling appears to estimate the CPU energy usage by
>>>> calculating the percentage of idle time using the per-cpu cpustat stuff,
>>>> which is pretty horrific.
>>>
>>> Even worse, it then *samples* the *current* CPU frequency at that
>>> particular point in time and assumes that when the CPU wasn't idle
>>> during that period - it had *this* frequency...
>>
>> So there are 2 problems in the power calculation of the cpufreq cooling device:
>> - How to get an accurate utilization level of the cpu, which is what
>> this patch is trying to fix, because using idle time is just wrong
>> whereas scheduler utilization is frequency invariant
>
> Since this patch is targeted only towards fixing this particular
> problem, should I change something in the patch to make it acceptable?
>
>> - How to get a power estimate from this utilization level. And as you
>> pointed out, using the current freq is not accurate.
>
> This should be tackled separately, I believe.
>

I don't think that these two are separate. Furthermore, I think we
would need this kind of information also in the future in powercap.
I've discussed this possible scenario with Daniel.

We have a vendor who presented an issue with the IPA input power and
pointed out these issues. Unfortunately, I don't have this vendor's
phone, but I assume it can last a few minutes without changing the
max allowed OPP. Based on their plots, the frequency driven by the
governor is changing, and idle periods are also present during the
IPA period.

Please give me a few days, because I am also plumbing this stuff
and would like to present it: two interfaces, one involving the
cpufreq driver and one a fallback mode using utilization and the EM.

Regards,
Lukasz
On 30-07-20, 12:16, Lukasz Luba wrote:
> Hi Viresh,
>
> On 7/30/20 7:24 AM, Viresh Kumar wrote:
> > On 17-07-20, 11:46, Vincent Guittot wrote:
> > > On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:
> > > > On 7/16/20 12:56 PM, Peter Zijlstra wrote:
> > > > > Currently cpufreq_cooling appears to estimate the CPU energy usage by
> > > > > calculating the percentage of idle time using the per-cpu cpustat stuff,
> > > > > which is pretty horrific.
> > > >
> > > > Even worse, it then *samples* the *current* CPU frequency at that
> > > > particular point in time and assumes that when the CPU wasn't idle
> > > > during that period - it had *this* frequency...
> > >
> > > So there are 2 problems in the power calculation of the cpufreq cooling device:
> > > - How to get an accurate utilization level of the cpu, which is what
> > > this patch is trying to fix, because using idle time is just wrong
> > > whereas scheduler utilization is frequency invariant
> >
> > Since this patch is targeted only towards fixing this particular
> > problem, should I change something in the patch to make it acceptable?
> >
> > > - How to get a power estimate from this utilization level. And as you
> > > pointed out, using the current freq is not accurate.
> >
> > This should be tackled separately, I believe.
> >
>
> I don't think that these two are separate. Furthermore, I think we
> would need this kind of information also in the future in powercap.
> I've discussed this possible scenario with Daniel.
>
> We have a vendor who presented an issue with the IPA input power and
> pointed out these issues. Unfortunately, I don't have this vendor's
> phone, but I assume it can last a few minutes without changing the
> max allowed OPP. Based on their plots, the frequency driven by the
> governor is changing, and idle periods are also present during the
> IPA period.
>
> Please give me a few days, because I am also plumbing this stuff
> and would like to present it: two interfaces, one involving the
> cpufreq driver and one a fallback mode using utilization and the EM.

It's been almost 3 months; do we have any update on this? We really
would like to get this patchset merged in some form, as it provides a
simple update, and I think more work can be done on top of it by
anyone in the future.

--
viresh
On 10/19/20 8:40 AM, Viresh Kumar wrote:
> On 30-07-20, 12:16, Lukasz Luba wrote:
>> Hi Viresh,
>>
>> On 7/30/20 7:24 AM, Viresh Kumar wrote:
>>> On 17-07-20, 11:46, Vincent Guittot wrote:
>>>> On Thu, 16 Jul 2020 at 16:24, Lukasz Luba <lukasz.luba@arm.com> wrote:
>>>>> On 7/16/20 12:56 PM, Peter Zijlstra wrote:
>>>>>> Currently cpufreq_cooling appears to estimate the CPU energy usage by
>>>>>> calculating the percentage of idle time using the per-cpu cpustat stuff,
>>>>>> which is pretty horrific.
>>>>>
>>>>> Even worse, it then *samples* the *current* CPU frequency at that
>>>>> particular point in time and assumes that when the CPU wasn't idle
>>>>> during that period - it had *this* frequency...
>>>>
>>>> So there are 2 problems in the power calculation of the cpufreq cooling device:
>>>> - How to get an accurate utilization level of the cpu, which is what
>>>> this patch is trying to fix, because using idle time is just wrong
>>>> whereas scheduler utilization is frequency invariant
>>>
>>> Since this patch is targeted only towards fixing this particular
>>> problem, should I change something in the patch to make it acceptable?
>>>
>>>> - How to get a power estimate from this utilization level. And as you
>>>> pointed out, using the current freq is not accurate.
>>>
>>> This should be tackled separately, I believe.
>>>
>>
>> I don't think that these two are separate. Furthermore, I think we
>> would need this kind of information also in the future in powercap.
>> I've discussed this possible scenario with Daniel.
>>
>> We have a vendor who presented an issue with the IPA input power and
>> pointed out these issues. Unfortunately, I don't have this vendor's
>> phone, but I assume it can last a few minutes without changing the
>> max allowed OPP. Based on their plots, the frequency driven by the
>> governor is changing, and idle periods are also present during the
>> IPA period.
>>
>> Please give me a few days, because I am also plumbing this stuff
>> and would like to present it: two interfaces, one involving the
>> cpufreq driver and one a fallback mode using utilization and the EM.
>
> It's been almost 3 months; do we have any update on this? We really
> would like to get this patchset merged in some form, as it provides a
> simple update, and I think more work can be done on top of it by
> anyone in the future.
>

I made a few implementations to compare the results with reality
(power measured using a power meter on the cluster rails). This idea
of taking the utilization from schedutil_cpu_util() has some edge
cases with errors. The signal is good for comparison and short
prediction, but taking it as an approximation for an arbitrary past
period (e.g. 100ms) has issues. It is good for estimating the energy
cost during e.g. compute_energy().

What your renamed version of the old schedutil_cpu_util() does is
return the sum of the utilization of the runqueues (CFS, RT, DL,
(IRQ)) at that point in time. This utilization depends on the sum of
the utilization of the tasks currently there. These tasks could have
shuffled around in the past (especially when we deal with a ~100ms
period in IPA)...

I am currently working on a few different topics, not full time on
this one. Thus, I tend to agree that this provides a 'simple update
and ... more work can be done' in the future. Although I am a bit
concerned that it would require some exports from the scheduler and
some changes to schedutil, which I am not sure would pay off. If
Rafael and Peter allow you to change these sub-systems, then I don't
mind. What I am trying to implement is different from this idea.

Regards,
Lukasz
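As a rough illustration of the aggregation Lukasz describes, the following userspace sketch sums the per-class runqueue utilizations and clamps at CPU capacity. It deliberately ignores the IRQ scaling and DL bandwidth handling that the real effective_cpu_util() performs; all values are invented.

#include <stdio.h>

struct rq_util {
	unsigned long cfs, rt, dl, irq;
};

/* Simplified ENERGY_UTIL-style aggregation: sum the classes, clamp. */
static unsigned long energy_util(const struct rq_util *rq, unsigned long max)
{
	unsigned long util = rq->cfs + rq->rt + rq->dl + rq->irq;

	return util < max ? util : max; /* clamp to capacity */
}

int main(void)
{
	struct rq_util rq = { .cfs = 300, .rt = 50, .dl = 0, .irq = 10 };

	printf("util = %lu of 1024\n", energy_util(&rq, 1024));
	return 0;
}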
Hi Peter,

Since Lukasz asked me to hold on to this stuff so he can propose
something in its place, I stayed away from discussing this patchset
for some time. But now that he agrees [1] that we may take this
forward and he can work on top of it as and when he can, I am looking
for a way to get this stuff merged in some form.

On 16-07-20, 13:56, Peter Zijlstra wrote:
> So there's a number of things... let me recap a bunch of things that
> got mentioned on IRC earlier this week and then continue from there..
>
> So IPA* (or any other thermal governor) needs energy estimates for the
> various managed devices; cpufreq_cooling, being the driver for the CPU
> device, needs to provide that and in return receives feedback on how
> much energy it is allowed to consume; cpufreq_cooling then dynamically
> enables/disables OPP states.
>
> There are actually two methods the thermal governor will use:
> get_real_power() and get_requested_power().
>
> The first isn't used anywhere in mainline, but could be implemented on
> hardware that has energy counters (like, say, x86 RAPL).
>
> The second attempts to guesstimate power, and is the subject of this
> patch.
>
> Currently cpufreq_cooling appears to estimate the CPU energy usage by
> calculating the percentage of idle time using the per-cpu cpustat stuff,
> which is pretty horrific.
>
> This patch then attempts to improve upon that by using the scheduler's
> cpu_util(ENERGY_UTIL) estimate, which is also used to select OPP state
> and improves upon avg idle. This should be a big improvement as higher
> frequency consumes more energy

Exactly, and that's the motivation behind this change.

> , but should we not also consider that:
>
> E = C V^2 f
>
> The EAS energy model has tables for the OPPs that contain this, but in
> this case we seem to be assuming a linear energy/frequency curve, which
> is just not the case.
>
> I suppose we could do something like **:
>
> 100 * util^3 / max^3
>
> which assumes V~f.
>
> Another point is that cpu_util() vs turbo is a bit iffy, and to that,
> things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also
> have the benefit of giving you values that match your own sampling
> interval (100ms), where the sched stuff is PELT (64,32.. based).

I believe the above stuff is more about additional improvements that
we can do on top of this change, and probably Lukasz was looking to do
that.

> So what I've been thinking is that cpufreq drivers ought to be able to
> supply this method, and only when they lack one can the cpufreq-governor
> (schedutil) install a fallback.

One of the issues I see with this is that schedutil may not be
available in all configurations, and it is still absolutely fine to
use the suggested helper to get the energy numbers in such cases, so
we shouldn't really make it schedutil dependent.

> And then cpufreq-cooling can use
> whatever is provided (through the cpufreq interfaces).
>
> That way, we:
>
> 1) don't have to export anything

Yeah, I understand that the exports are annoying, and especially this
line :)

+#include "../../kernel/sched/sched.h"

But this was the best I could think of then, as we don't want to
export sched-specific stuff for anyone else to use. And there was
other core kernel stuff that was already doing it :)

> 2) get arch drivers to provide something 'better'
>
> Does that sound like something sensible?
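To make the proposed cubic approximation concrete, here is a small self-contained sketch comparing the linear util/max load fraction with the util^3/max^3 variant that assumes V~f. This is purely illustrative userspace code, not kernel code.

#include <stdio.h>

static unsigned long long pct_linear(unsigned long long util,
				     unsigned long long max)
{
	return 100 * util / max;
}

static unsigned long long pct_cubic(unsigned long long util,
				    unsigned long long max)
{
	/* With P ~ C*V^2*f and V ~ f, power grows like f^3. util and
	 * max are at most 1024, so 100 * util^3 fits easily in 64 bits. */
	return 100 * util * util * util / (max * max * max);
}

int main(void)
{
	unsigned long long util = 512, max = 1024;

	/* Half utilization: linear says 50%, cubic says 12 (integer 12.5%). */
	printf("linear: %llu%%, cubic: %llu%%\n",
	       pct_linear(util, max), pct_cubic(util, max));
	return 0;
}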
On Thu, Oct 22, 2020 at 02:02:55PM +0530, Viresh Kumar wrote:
> On 16-07-20, 13:56, Peter Zijlstra wrote:
> > Another point is that cpu_util() vs turbo is a bit iffy, and to that,
> > things like x86-APERF/MPERF and ARM-AMU got mentioned. Those might also
> > have the benefit of giving you values that match your own sampling
> > interval (100ms), where the sched stuff is PELT (64,32.. based).
>
> I believe the above stuff is more about additional improvements that
> we can do on top of this change, and probably Lukasz was looking to do
> that.
>
> > So what I've been thinking is that cpufreq drivers ought to be able to
> > supply this method, and only when they lack one can the cpufreq-governor
> > (schedutil) install a fallback.
>
> One of the issues I see with this is that schedutil may not be
> available in all configurations, and it is still absolutely fine to
> use the suggested helper to get the energy numbers in such cases, so
> we shouldn't really make it schedutil dependent.

The only constraint on schedutil is SMP I think; aside from that it
should/could always be available.

Given the trainwreck here:

  20201022071145.GM2628@hirez.programming.kicks-ass.net

(you're on Cc), I'm starting to lean more and more towards making it
unconditionally available (when SMP).

Anybody forcing it off either sets performance (in which case we don't
care about energy usage anyway) or selects one of the old (broken)
ondemand/conservative things, and I don't give a crap.
On 22-10-20, 11:05, Peter Zijlstra wrote:
> On Thu, Oct 22, 2020 at 02:02:55PM +0530, Viresh Kumar wrote:
> > One of the issues I see with this is that schedutil may not be
> > available in all configurations, and it is still absolutely fine to
> > use the suggested helper to get the energy numbers in such cases, so
> > we shouldn't really make it schedutil dependent.
>
> The only constraint on schedutil is SMP I think; aside from that it
> should/could always be available.
>
> Given the trainwreck here:
>
>   20201022071145.GM2628@hirez.programming.kicks-ass.net
>
> (you're on Cc), I'm starting to lean more and more towards making it
> unconditionally available (when SMP).
>
> Anybody forcing it off either sets performance (in which case we don't
> care about energy usage anyway)

I agree.

> or selects one of the old (broken)
> ondemand/conservative things, and I don't give a crap.

The other kernel layers, for example the cpufreq-cooling in question
here, don't really need to bother with the governor in use and should
be able to get the energy numbers anyway. So for me, the energy number
that the cpufreq-cooling stuff gets should be the same irrespective of
the governor in use, schedutil or ondemand.

Having said that, schedutil really doesn't need to install the
fallback (which you suggested earlier); rather, the scheduler core can
do that directly with the cpufreq core, and schedutil can also use the
same fallback mechanism, maybe? And we can avoid exporting stuff that
way.

--
viresh
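The shape of this fallback registration could look something like the sketch below. None of these names exist in mainline; they are hypothetical, meant only to illustrate the indirection through the cpufreq core that is being discussed.

#include <stdio.h>

struct cpufreq_power_ops {
	/* Return estimated power in mW for the given CPU. */
	unsigned long (*get_power)(int cpu);
};

static const struct cpufreq_power_ops *power_ops;

/* The scheduler core would install a default here; drivers with real
 * counters (e.g. RAPL-like hardware) could override it. Hypothetical API. */
void cpufreq_register_power_ops(const struct cpufreq_power_ops *ops)
{
	power_ops = ops;
}

/* Callers like cpufreq_cooling go through the core, never through a
 * specific governor. Hypothetical API. */
unsigned long cpufreq_get_power(int cpu)
{
	return power_ops ? power_ops->get_power(cpu) : 0;
}

static unsigned long dummy_power(int cpu) { return 150; } /* stub estimate */
static const struct cpufreq_power_ops dummy_ops = { .get_power = dummy_power };

int main(void)
{
	cpufreq_register_power_ops(&dummy_ops);
	printf("%lu mW\n", cpufreq_get_power(0));
	return 0;
}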
On Thu, Oct 22, 2020 at 1:07 PM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 22-10-20, 11:05, Peter Zijlstra wrote:
> > On Thu, Oct 22, 2020 at 02:02:55PM +0530, Viresh Kumar wrote:
> > > One of the issues I see with this is that schedutil may not be
> > > available in all configurations, and it is still absolutely fine to
> > > use the suggested helper to get the energy numbers in such cases, so
> > > we shouldn't really make it schedutil dependent.
> >
> > The only constraint on schedutil is SMP I think; aside from that it
> > should/could always be available.
> >
> > Given the trainwreck here:
> >
> >   20201022071145.GM2628@hirez.programming.kicks-ass.net
> >
> > (you're on Cc), I'm starting to lean more and more towards making it
> > unconditionally available (when SMP).
> >
> > Anybody forcing it off either sets performance (in which case we don't
> > care about energy usage anyway)
>
> I agree.

That's not really the case, though.

Many people use intel_pstate in the active mode with HWP enabled too.

Arguably, that doesn't need to compute the effective utilization, so I
guess it is not relevant for the discussion here, but it is not
negligible in general.

> > or selects one of the old (broken)
> > ondemand/conservative things, and I don't give a crap.
>
> The other kernel layers, for example the cpufreq-cooling in question
> here, don't really need to bother with the governor in use and should
> be able to get the energy numbers anyway. So for me, the energy number
> that the cpufreq-cooling stuff gets should be the same irrespective of
> the governor in use, schedutil or ondemand.
>
> Having said that, schedutil really doesn't need to install the
> fallback (which you suggested earlier); rather, the scheduler core can
> do that directly with the cpufreq core, and schedutil can also use the
> same fallback mechanism, maybe? And we can avoid exporting stuff that
> way.

I like this better than the fallback idea TBH.
On Thu, Oct 22, 2020 at 04:36:56PM +0530, Viresh Kumar wrote:
> On 22-10-20, 11:05, Peter Zijlstra wrote:
> > On Thu, Oct 22, 2020 at 02:02:55PM +0530, Viresh Kumar wrote:
> > > One of the issues I see with this is that schedutil may not be
> > > available in all configurations, and it is still absolutely fine to
> > > use the suggested helper to get the energy numbers in such cases, so
> > > we shouldn't really make it schedutil dependent.
> >
> > The only constraint on schedutil is SMP I think; aside from that it
> > should/could always be available.
> >
> > Given the trainwreck here:
> >
> >   20201022071145.GM2628@hirez.programming.kicks-ass.net
> >
> > (you're on Cc), I'm starting to lean more and more towards making it
> > unconditionally available (when SMP).
> >
> > Anybody forcing it off either sets performance (in which case we don't
> > care about energy usage anyway)
>
> I agree.
>
> > or selects one of the old (broken)
> > ondemand/conservative things, and I don't give a crap.
>
> The other kernel layers, for example the cpufreq-cooling in question
> here, don't really need to bother with the governor in use and should
> be able to get the energy numbers anyway. So for me, the energy number
> that the cpufreq-cooling stuff gets should be the same irrespective of
> the governor in use, schedutil or ondemand.
>
> Having said that, schedutil really doesn't need to install the
> fallback (which you suggested earlier); rather, the scheduler core can
> do that directly with the cpufreq core, and schedutil can also use the
> same fallback mechanism, maybe? And we can avoid exporting stuff that
> way.

I suppose that could work, yes. It's a bit weird to have two
interactions with cpufreq, once through a governor and once outside
it, but I suppose I can live with that.
On Thu, Oct 22, 2020 at 01:30:01PM +0200, Rafael J. Wysocki wrote:

> Many people use intel_pstate in the active mode with HWP enabled too.

We now have HWP-passive supported, afaict. So we should discourage
that. That is, I'll care less and less about people not using
schedutil as time goes on.

> Arguably, that doesn't need to compute the effective utilization, so I
> guess it is not relevant for the discussion here, but it is not
> negligible in general.

Why not? cpufreq-cooling should still be able to throttle the system
by setting HWP.Highest_Performance, no? In which case it still needs
an energy estimate.
On Thu, Oct 22, 2020 at 1:58 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Oct 22, 2020 at 01:30:01PM +0200, Rafael J. Wysocki wrote:
>
> > Many people use intel_pstate in the active mode with HWP enabled too.
>
> We now have HWP-passive supported, afaict. So we should discourage that.

Which is kind of hard, because plain HWP does better in some cases,
especially performance-focused ones. I am still not sure why this is
the case, given how the passive mode with HWP enabled is implemented,
but that's what Srinivas sees in his tests.

> That is, I'll care less and less about people not using schedutil as
> time goes on.
>
> > Arguably, that doesn't need to compute the effective utilization, so I
> > guess it is not relevant for the discussion here, but it is not
> > negligible in general.
>
> Why not? cpufreq-cooling should still be able to throttle the system
> by setting HWP.Highest_Performance, no?

Well, in theory, but it is not used on x86 AFAICS.

> In which case it still needs an energy estimate.
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 6c0e1b053126..74340b2b0da7 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -23,6 +23,7 @@ #include <linux/thermal.h> #include <trace/events/thermal.h> +#include "../../kernel/sched/sched.h" /* * Cooling state <-> CPUFreq frequency @@ -38,16 +39,6 @@ * ... */ -/** - * struct time_in_idle - Idle time stats - * @time: previous reading of the absolute time that this cpu was idle - * @timestamp: wall time of the last invocation of get_cpu_idle_time_us() - */ -struct time_in_idle { - u64 time; - u64 timestamp; -}; - /** * struct cpufreq_cooling_device - data for cooling device with cpufreq * @id: unique integer value corresponding to each cpufreq_cooling_device @@ -62,7 +53,6 @@ struct time_in_idle { * registered cooling device. * @policy: cpufreq policy. * @node: list_head to link all cpufreq_cooling_device together. - * @idle_time: idle time stats * @qos_req: PM QoS contraint to apply * * This structure is required for keeping information of each registered @@ -76,7 +66,6 @@ struct cpufreq_cooling_device { struct em_perf_domain *em; struct cpufreq_policy *policy; struct list_head node; - struct time_in_idle *idle_time; struct freq_qos_request qos_req; }; @@ -132,34 +121,21 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, } /** - * get_load() - get load for a cpu since last updated + * get_load() - get current load for a cpu * @cpufreq_cdev: &struct cpufreq_cooling_device for this cpu * @cpu: cpu number - * @cpu_idx: index of the cpu in time_in_idle* + * @cpu_idx: index of the cpu * - * Return: The average load of cpu @cpu in percentage since this - * function was last called. + * Return: The current load of cpu @cpu in percentage. */ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu, int cpu_idx) { - u32 load; - u64 now, now_idle, delta_time, delta_idle; - struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx]; - - now_idle = get_cpu_idle_time(cpu, &now, 0); - delta_idle = now_idle - idle_time->time; - delta_time = now - idle_time->timestamp; + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); + unsigned long max = arch_scale_cpu_capacity(cpu); - if (delta_time <= delta_idle) - load = 0; - else - load = div64_u64(100 * (delta_time - delta_idle), delta_time); - - idle_time->time = now_idle; - idle_time->timestamp = now; - - return load; + util = effective_cpu_util(cpu, util, max, ENERGY_UTIL, NULL); + return (util * 100) / max; } /** @@ -192,13 +168,12 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev, * Instead, we calculate the current power on the assumption that the * immediate future will look like the immediate past. * - * We use the current frequency and the average load since this - * function was last called. In reality, there could have been - * multiple opps since this function was last called and that affects - * the load calculation. While it's not perfectly accurate, this - * simplification is good enough and works. REVISIT this, as more - * complex code may be needed if experiments show that it's not - * accurate enough. + * We use the current frequency and the current load. In reality, + * there could have been multiple opps since this function was last + * called and that affects the load calculation. While it's not + * perfectly accurate, this simplification is good enough and works. 
+ * REVISIT this, as more complex code may be needed if experiments show + * that it's not accurate enough. * * Return: 0 on success, -E* if getting the static power failed. */ @@ -523,13 +498,6 @@ __cpufreq_cooling_register(struct device_node *np, cpufreq_cdev->policy = policy; num_cpus = cpumask_weight(policy->related_cpus); - cpufreq_cdev->idle_time = kcalloc(num_cpus, - sizeof(*cpufreq_cdev->idle_time), - GFP_KERNEL); - if (!cpufreq_cdev->idle_time) { - cdev = ERR_PTR(-ENOMEM); - goto free_cdev; - } /* max_level is an index, not a counter */ cpufreq_cdev->max_level = i - 1; @@ -537,7 +505,7 @@ __cpufreq_cooling_register(struct device_node *np, ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL); if (ret < 0) { cdev = ERR_PTR(ret); - goto free_idle_time; + goto free_cdev; } cpufreq_cdev->id = ret; @@ -586,8 +554,6 @@ __cpufreq_cooling_register(struct device_node *np, freq_qos_remove_request(&cpufreq_cdev->qos_req); remove_ida: ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); -free_idle_time: - kfree(cpufreq_cdev->idle_time); free_cdev: kfree(cpufreq_cdev); return cdev; @@ -680,7 +646,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) thermal_cooling_device_unregister(cdev); freq_qos_remove_request(&cpufreq_cdev->qos_req); ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); - kfree(cpufreq_cdev->idle_time); kfree(cpufreq_cdev); } EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
Several parts of the kernel are already using the effective CPU utilization to get the current load on the CPU, do the same here instead of depending on the idle time of the CPU, which isn't that accurate comparatively. Note that, this (and CPU frequency scaling in general) doesn't work that well with idle injection as that is done from rt threads and is counted as load while it tries to do quite the opposite. That should be solved separately though. Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> --- drivers/thermal/cpufreq_cooling.c | 65 +++++++------------------------ 1 file changed, 15 insertions(+), 50 deletions(-) -- 2.25.0.rc1.19.g042ed3e048af