Message ID | 20240116115921.804185-1-daniel.lezcano@linaro.org |
---|---|
State | New |
Headers | show |
Series | arm64: dts: qcom: sdm845: Fix wild reboot during Antutu test | expand |
On Tue, 16 Jan 2024 at 17:29, Daniel Lezcano <daniel.lezcano@linaro.org> wrote: > > Running an Antutu benchmark makes the board to do a hard reboot. > > Cause: it appears the gpu-bottom and gpu-top temperature sensors are showing > too high temperatures, above 115°C. > > Out of tree configuratons show the gpu thermal zone is configured to > be mitigated at 85°C with devfreq. > > Add the DT snippet to enable the thermal mitigation on the sdm845 > based board. Smoke tested on Dragonboard 845c running AOSP with a custom/generic thermal HAL. Tested-by: Amit Pundir <amit.pundir@linaro.org> PS: Probably good to mention the OS (AOSP) and board (DB845c), on which the Antutu crash was reported, in the commit message as well . > > Fixes: c79800103eb18 ("arm64: dts: sdm845: Add gpu and gmu device nodes") > Cc: Amit Pundir <amit.pundir@linaro.org> > Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> > --- > arch/arm64/boot/dts/qcom/sdm845.dtsi | 32 ++++++++++++++++++++++++++-- > 1 file changed, 30 insertions(+), 2 deletions(-) > > diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi > index c2244824355a..20fefd6af0f8 100644 > --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi > +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi > @@ -4764,6 +4764,8 @@ gpu: gpu@5000000 { > interconnects = <&mem_noc MASTER_GFX3D 0 &mem_noc SLAVE_EBI1 0>; > interconnect-names = "gfx-mem"; > > + #cooling-cells = <2>; > + > status = "disabled"; > > gpu_opp_table: opp-table { > @@ -5603,12 +5605,25 @@ gpu-top-thermal { > thermal-sensors = <&tsens0 11>; > > trips { > - gpu1_alert0: trip-point0 { > + gpu1_alert0: trip-point0 { > + temperature = <85000>; > + hysteresis = <2000>; > + type = "passive"; > + }; > + > + gpu1_alert1: trip-point1 { > temperature = <90000>; > hysteresis = <2000>; > type = "hot"; > }; > }; > + > + cooling-maps { > + map0 { > + trip = <&gpu1_alert0>; > + cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; > + }; > + }; > }; > > gpu-bottom-thermal { > @@ -5618,12 +5633,25 @@ gpu-bottom-thermal { > thermal-sensors = <&tsens0 12>; > > trips { > - gpu2_alert0: trip-point0 { > + gpu2_alert0: trip-point0 { > + temperature = <85000>; > + hysteresis = <2000>; > + type = "passive"; > + }; > + > + gpu2_alert1: trip-point1 { > temperature = <90000>; > hysteresis = <2000>; > type = "hot"; > }; > }; > + > + cooling-maps { > + map0 { > + trip = <&gpu2_alert0>; > + cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; > + }; > + }; > }; > > aoss1-thermal { > -- > 2.34.1 >
On Tue Jan 16, 2024 at 12:59 PM CET, Daniel Lezcano wrote: > Running an Antutu benchmark makes the board to do a hard reboot. > > Cause: it appears the gpu-bottom and gpu-top temperature sensors are showing > too high temperatures, above 115°C. > > Out of tree configuratons show the gpu thermal zone is configured to > be mitigated at 85°C with devfreq. > > Add the DT snippet to enable the thermal mitigation on the sdm845 > based board. > > Fixes: c79800103eb18 ("arm64: dts: sdm845: Add gpu and gmu device nodes") > Cc: Amit Pundir <amit.pundir@linaro.org> > Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> A part of this is already included with this patch: https://lore.kernel.org/linux-arm-msm/20240102-topic-gpu_cooling-v1-4-fda30c57e353@linaro.org/ Maybe rebase on top of that one and add the 85degC trip point or something? > --- > arch/arm64/boot/dts/qcom/sdm845.dtsi | 32 ++++++++++++++++++++++++++-- > 1 file changed, 30 insertions(+), 2 deletions(-) > > diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi > index c2244824355a..20fefd6af0f8 100644 > --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi > +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi > @@ -4764,6 +4764,8 @@ gpu: gpu@5000000 { > interconnects = <&mem_noc MASTER_GFX3D 0 &mem_noc SLAVE_EBI1 0>; > interconnect-names = "gfx-mem"; > > + #cooling-cells = <2>; > + > status = "disabled"; > > gpu_opp_table: opp-table { > @@ -5603,12 +5605,25 @@ gpu-top-thermal { > thermal-sensors = <&tsens0 11>; > > trips { > - gpu1_alert0: trip-point0 { > + gpu1_alert0: trip-point0 { > + temperature = <85000>; > + hysteresis = <2000>; > + type = "passive"; > + }; > + The indentation here should use tabs not spaces. > + gpu1_alert1: trip-point1 { > temperature = <90000>; > hysteresis = <2000>; > type = "hot"; > }; > }; > + > + cooling-maps { > + map0 { > + trip = <&gpu1_alert0>; > + cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; > + }; > + }; > }; > > gpu-bottom-thermal { > @@ -5618,12 +5633,25 @@ gpu-bottom-thermal { > thermal-sensors = <&tsens0 12>; > > trips { > - gpu2_alert0: trip-point0 { > + gpu2_alert0: trip-point0 { > + temperature = <85000>; > + hysteresis = <2000>; > + type = "passive"; > + }; > + > + gpu2_alert1: trip-point1 { > temperature = <90000>; > hysteresis = <2000>; > type = "hot"; > }; > }; > + > + cooling-maps { > + map0 { > + trip = <&gpu2_alert0>; > + cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; > + }; > + }; > }; > > aoss1-thermal {
On Tue Jan 16, 2024 at 1:51 PM CET, Daniel Lezcano wrote: > On 16/01/2024 13:37, Luca Weiss wrote: > > On Tue Jan 16, 2024 at 12:59 PM CET, Daniel Lezcano wrote: > >> Running an Antutu benchmark makes the board to do a hard reboot. > >> > >> Cause: it appears the gpu-bottom and gpu-top temperature sensors are showing > >> too high temperatures, above 115°C. > >> > >> Out of tree configuratons show the gpu thermal zone is configured to > >> be mitigated at 85°C with devfreq. > >> > >> Add the DT snippet to enable the thermal mitigation on the sdm845 > >> based board. > >> > >> Fixes: c79800103eb18 ("arm64: dts: sdm845: Add gpu and gmu device nodes") > >> Cc: Amit Pundir <amit.pundir@linaro.org> > >> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> > > > > A part of this is already included with this patch: > > https://lore.kernel.org/linux-arm-msm/20240102-topic-gpu_cooling-v1-4-fda30c57e353@linaro.org/ > > > > Maybe rebase on top of that one and add the 85degC trip point or > > something? > > Actually, I think the patch is wrong. I recommend telling Konrad in that patch then, not me :) > > The cooling effect does not operate on 'hot' trip point type as it is > considered as a critical trip point. The governor is not invoked, so no > mitigation happen. The 'hot' trip point type results in sending a > notification to userspace to give the last chance to do something before > 'critical' is reached where the system is shut down. > > I suggest to revert it and pick the one I proposed. It hasn't been applied yet so it can be fixed in v2 there. Regards Luca
On 16/01/2024 15:03, Luca Weiss wrote: > On Tue Jan 16, 2024 at 1:51 PM CET, Daniel Lezcano wrote: >> On 16/01/2024 13:37, Luca Weiss wrote: >>> On Tue Jan 16, 2024 at 12:59 PM CET, Daniel Lezcano wrote: >>>> Running an Antutu benchmark makes the board to do a hard reboot. >>>> >>>> Cause: it appears the gpu-bottom and gpu-top temperature sensors are showing >>>> too high temperatures, above 115°C. >>>> >>>> Out of tree configuratons show the gpu thermal zone is configured to >>>> be mitigated at 85°C with devfreq. >>>> >>>> Add the DT snippet to enable the thermal mitigation on the sdm845 >>>> based board. >>>> >>>> Fixes: c79800103eb18 ("arm64: dts: sdm845: Add gpu and gmu device nodes") >>>> Cc: Amit Pundir <amit.pundir@linaro.org> >>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> >>> >>> A part of this is already included with this patch: >>> https://lore.kernel.org/linux-arm-msm/20240102-topic-gpu_cooling-v1-4-fda30c57e353@linaro.org/ >>> >>> Maybe rebase on top of that one and add the 85degC trip point or >>> something? >> >> Actually, I think the patch is wrong. > > I recommend telling Konrad in that patch then, not me :) That's good Konrad is in the recipient list :) >> The cooling effect does not operate on 'hot' trip point type as it is >> considered as a critical trip point. The governor is not invoked, so no >> mitigation happen. The 'hot' trip point type results in sending a >> notification to userspace to give the last chance to do something before >> 'critical' is reached where the system is shut down. >> >> I suggest to revert it and pick the one I proposed. > > It hasn't been applied yet so it can be fixed in v2 there. The patch was submitted without testing AFAICT. So it is preferable to pick the one I sent which was tested by Amit and me.
diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi index c2244824355a..20fefd6af0f8 100644 --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi @@ -4764,6 +4764,8 @@ gpu: gpu@5000000 { interconnects = <&mem_noc MASTER_GFX3D 0 &mem_noc SLAVE_EBI1 0>; interconnect-names = "gfx-mem"; + #cooling-cells = <2>; + status = "disabled"; gpu_opp_table: opp-table { @@ -5603,12 +5605,25 @@ gpu-top-thermal { thermal-sensors = <&tsens0 11>; trips { - gpu1_alert0: trip-point0 { + gpu1_alert0: trip-point0 { + temperature = <85000>; + hysteresis = <2000>; + type = "passive"; + }; + + gpu1_alert1: trip-point1 { temperature = <90000>; hysteresis = <2000>; type = "hot"; }; }; + + cooling-maps { + map0 { + trip = <&gpu1_alert0>; + cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; + }; + }; }; gpu-bottom-thermal { @@ -5618,12 +5633,25 @@ gpu-bottom-thermal { thermal-sensors = <&tsens0 12>; trips { - gpu2_alert0: trip-point0 { + gpu2_alert0: trip-point0 { + temperature = <85000>; + hysteresis = <2000>; + type = "passive"; + }; + + gpu2_alert1: trip-point1 { temperature = <90000>; hysteresis = <2000>; type = "hot"; }; }; + + cooling-maps { + map0 { + trip = <&gpu2_alert0>; + cooling-device = <&gpu THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; + }; + }; }; aoss1-thermal {
Running an Antutu benchmark makes the board to do a hard reboot. Cause: it appears the gpu-bottom and gpu-top temperature sensors are showing too high temperatures, above 115°C. Out of tree configuratons show the gpu thermal zone is configured to be mitigated at 85°C with devfreq. Add the DT snippet to enable the thermal mitigation on the sdm845 based board. Fixes: c79800103eb18 ("arm64: dts: sdm845: Add gpu and gmu device nodes") Cc: Amit Pundir <amit.pundir@linaro.org> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> --- arch/arm64/boot/dts/qcom/sdm845.dtsi | 32 ++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-)