diff mbox series

[RFC] base: power: replace generic_pm_domain spinlock by raw spinlock

Message ID 20220615203605.1068453-1-athierry@redhat.com
State New
Headers show
Series [RFC] base: power: replace generic_pm_domain spinlock by raw spinlock | expand

Commit Message

Adrien Thierry June 15, 2022, 8:36 p.m. UTC
We've been encountering a BUG: scheduling while atomic while running the
5.18.0-rt11 kernel on a Qualcomm SoC (see stacktrace below).

It seems to occur because a spinlock is taken in the PSCI idle code path
in the idle loop. With the RT patchset applied and CONFIG_PREEMPT_RT
enabled, spinlocks can sleep, thus triggering the bug.

In order to prevent this, replace the generic_pm_domain spinlock by a
raw spinlock.

[    2.994433] BUG: scheduling while atomic: swapper/6/0/0x00000002
[    2.994439] Modules linked in:
[    2.994447] [<ffff80000810b0ec>] migrate_enable+0x3c/0x160
[    2.994461] CPU: 6 PID: 0 Comm: swapper/6 Not tainted 5.18.0-rt11+ #1
[    2.994464] Hardware name: Qualcomm SA8295P ADP (DT)
[    2.994466] Call trace:
[    2.994467]  dump_backtrace+0xb0/0x120
[    2.994473]  show_stack+0x1c/0x6c
[    2.994477]  dump_stack_lvl+0x64/0x7c
[    2.994483]  dump_stack+0x14/0x2c
[    2.994487]  __schedule_bug+0xa8/0xc0
[    2.994489]  schedule_debug.constprop.0+0x154/0x170
[    2.994492]  __schedule+0x58/0x520
[    2.994496]  schedule_rtlock+0x20/0x44
[    2.994499]  rtlock_slowlock_locked+0x110/0x260
[    2.994503]  rt_spin_lock+0x74/0x94
[    2.994505]  genpd_lock_nested_spin+0x20/0x30
[    2.994509]  genpd_power_off.part.0.isra.0+0x248/0x2cc
[    2.994512]  genpd_runtime_suspend+0x1a0/0x300
[    2.994515]  __rpm_callback+0x4c/0x16c
[    2.994518]  rpm_callback+0x6c/0x80
[    2.994520]  rpm_suspend+0x10c/0x63c
[    2.994523]  __pm_runtime_suspend+0x54/0xa4
[    2.994526]  __psci_enter_domain_idle_state.constprop.0+0x64/0x10c
[    2.994532]  psci_enter_domain_idle_state+0x1c/0x24
[    2.994534]  cpuidle_enter_state+0x8c/0x3f0
[    2.994539]  cpuidle_enter+0x3c/0x50
[    2.994543]  cpuidle_idle_call+0x158/0x1d4
[    2.994548]  do_idle+0xa8/0xfc
[    2.994551]  cpu_startup_entry+0x28/0x30
[    2.994556]  secondary_start_kernel+0xe4/0x140
[    2.994563]  __secondary_switched+0x54/0x58

Signed-off-by: Adrien Thierry <athierry@redhat.com>
---

This patch fixes the bug but I'm not sure if this is the proper way to do
so. Suggestions for other ways to fix this are very welcome. 

 drivers/base/power/domain.c | 10 +++++-----
 include/linux/pm_domain.h   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)


base-commit: 979086f5e0066b4eff66e1eee123da228489985c

Comments

Ulf Hansson June 15, 2022, 9:44 p.m. UTC | #1
On Wed, 15 Jun 2022 at 13:36, Adrien Thierry <athierry@redhat.com> wrote:
>
> We've been encountering a BUG: scheduling while atomic while running the
> 5.18.0-rt11 kernel on a Qualcomm SoC (see stacktrace below).
>
> It seems to occur because a spinlock is taken in the PSCI idle code path
> in the idle loop. With the RT patchset applied and CONFIG_PREEMPT_RT
> enabled, spinlocks can sleep, thus triggering the bug.
>
> In order to prevent this, replace the generic_pm_domain spinlock by a
> raw spinlock.
>
> [    2.994433] BUG: scheduling while atomic: swapper/6/0/0x00000002
> [    2.994439] Modules linked in:
> [    2.994447] [<ffff80000810b0ec>] migrate_enable+0x3c/0x160
> [    2.994461] CPU: 6 PID: 0 Comm: swapper/6 Not tainted 5.18.0-rt11+ #1
> [    2.994464] Hardware name: Qualcomm SA8295P ADP (DT)
> [    2.994466] Call trace:
> [    2.994467]  dump_backtrace+0xb0/0x120
> [    2.994473]  show_stack+0x1c/0x6c
> [    2.994477]  dump_stack_lvl+0x64/0x7c
> [    2.994483]  dump_stack+0x14/0x2c
> [    2.994487]  __schedule_bug+0xa8/0xc0
> [    2.994489]  schedule_debug.constprop.0+0x154/0x170
> [    2.994492]  __schedule+0x58/0x520
> [    2.994496]  schedule_rtlock+0x20/0x44
> [    2.994499]  rtlock_slowlock_locked+0x110/0x260
> [    2.994503]  rt_spin_lock+0x74/0x94
> [    2.994505]  genpd_lock_nested_spin+0x20/0x30
> [    2.994509]  genpd_power_off.part.0.isra.0+0x248/0x2cc
> [    2.994512]  genpd_runtime_suspend+0x1a0/0x300
> [    2.994515]  __rpm_callback+0x4c/0x16c
> [    2.994518]  rpm_callback+0x6c/0x80
> [    2.994520]  rpm_suspend+0x10c/0x63c
> [    2.994523]  __pm_runtime_suspend+0x54/0xa4
> [    2.994526]  __psci_enter_domain_idle_state.constprop.0+0x64/0x10c
> [    2.994532]  psci_enter_domain_idle_state+0x1c/0x24
> [    2.994534]  cpuidle_enter_state+0x8c/0x3f0
> [    2.994539]  cpuidle_enter+0x3c/0x50
> [    2.994543]  cpuidle_idle_call+0x158/0x1d4
> [    2.994548]  do_idle+0xa8/0xfc
> [    2.994551]  cpu_startup_entry+0x28/0x30
> [    2.994556]  secondary_start_kernel+0xe4/0x140
> [    2.994563]  __secondary_switched+0x54/0x58
>
> Signed-off-by: Adrien Thierry <athierry@redhat.com>
> ---
>
> This patch fixes the bug but I'm not sure if this is the proper way to do
> so. Suggestions for other ways to fix this are very welcome.

Honestly, I am not so sure about this either.

Turning the lock into spinlock_t into a raw_spinlock_t, may have the
effect of spreading into constraints on the genpd providers. Thus
those may need to be converted to use raw_spinlock_t too (assuming
they use a spinlock_t and GENPD_FLAG_IRQ_SAFE). On the other hand,
there are a limited number of genpd providers that this can become a
problem for, if any, so maybe it would not be a big problem after all.

Kind regards
Uffe

>
>  drivers/base/power/domain.c | 10 +++++-----
>  include/linux/pm_domain.h   |  2 +-
>  2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
> index 739e52cd4aba..9378decb58cf 100644
> --- a/drivers/base/power/domain.c
> +++ b/drivers/base/power/domain.c
> @@ -82,7 +82,7 @@ static void genpd_lock_spin(struct generic_pm_domain *genpd)
>  {
>         unsigned long flags;
>
> -       spin_lock_irqsave(&genpd->slock, flags);
> +       raw_spin_lock_irqsave(&genpd->slock, flags);
>         genpd->lock_flags = flags;
>  }
>
> @@ -92,7 +92,7 @@ static void genpd_lock_nested_spin(struct generic_pm_domain *genpd,
>  {
>         unsigned long flags;
>
> -       spin_lock_irqsave_nested(&genpd->slock, flags, depth);
> +       raw_spin_lock_irqsave_nested(&genpd->slock, flags, depth);
>         genpd->lock_flags = flags;
>  }
>
> @@ -101,7 +101,7 @@ static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
>  {
>         unsigned long flags;
>
> -       spin_lock_irqsave(&genpd->slock, flags);
> +       raw_spin_lock_irqsave(&genpd->slock, flags);
>         genpd->lock_flags = flags;
>         return 0;
>  }
> @@ -109,7 +109,7 @@ static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
>  static void genpd_unlock_spin(struct generic_pm_domain *genpd)
>         __releases(&genpd->slock)
>  {
> -       spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
> +       raw_spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
>  }
>
>  static const struct genpd_lock_ops genpd_spin_ops = {
> @@ -2022,7 +2022,7 @@ static void genpd_free_data(struct generic_pm_domain *genpd)
>  static void genpd_lock_init(struct generic_pm_domain *genpd)
>  {
>         if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
> -               spin_lock_init(&genpd->slock);
> +               raw_spin_lock_init(&genpd->slock);
>                 genpd->lock_ops = &genpd_spin_ops;
>         } else {
>                 mutex_init(&genpd->mlock);
> diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
> index ebc351698090..80166a915b0d 100644
> --- a/include/linux/pm_domain.h
> +++ b/include/linux/pm_domain.h
> @@ -159,7 +159,7 @@ struct generic_pm_domain {
>         union {
>                 struct mutex mlock;
>                 struct {
> -                       spinlock_t slock;
> +                       raw_spinlock_t slock;
>                         unsigned long lock_flags;
>                 };
>         };
>
> base-commit: 979086f5e0066b4eff66e1eee123da228489985c
> --
> 2.35.3
>
Ulf Hansson June 15, 2022, 9:53 p.m. UTC | #2
On Wed, 15 Jun 2022 at 14:44, Ulf Hansson <ulf.hansson@linaro.org> wrote:
>
> On Wed, 15 Jun 2022 at 13:36, Adrien Thierry <athierry@redhat.com> wrote:
> >
> > We've been encountering a BUG: scheduling while atomic while running the
> > 5.18.0-rt11 kernel on a Qualcomm SoC (see stacktrace below).
> >
> > It seems to occur because a spinlock is taken in the PSCI idle code path
> > in the idle loop. With the RT patchset applied and CONFIG_PREEMPT_RT
> > enabled, spinlocks can sleep, thus triggering the bug.
> >
> > In order to prevent this, replace the generic_pm_domain spinlock by a
> > raw spinlock.
> >
> > [    2.994433] BUG: scheduling while atomic: swapper/6/0/0x00000002
> > [    2.994439] Modules linked in:
> > [    2.994447] [<ffff80000810b0ec>] migrate_enable+0x3c/0x160
> > [    2.994461] CPU: 6 PID: 0 Comm: swapper/6 Not tainted 5.18.0-rt11+ #1
> > [    2.994464] Hardware name: Qualcomm SA8295P ADP (DT)
> > [    2.994466] Call trace:
> > [    2.994467]  dump_backtrace+0xb0/0x120
> > [    2.994473]  show_stack+0x1c/0x6c
> > [    2.994477]  dump_stack_lvl+0x64/0x7c
> > [    2.994483]  dump_stack+0x14/0x2c
> > [    2.994487]  __schedule_bug+0xa8/0xc0
> > [    2.994489]  schedule_debug.constprop.0+0x154/0x170
> > [    2.994492]  __schedule+0x58/0x520
> > [    2.994496]  schedule_rtlock+0x20/0x44
> > [    2.994499]  rtlock_slowlock_locked+0x110/0x260
> > [    2.994503]  rt_spin_lock+0x74/0x94
> > [    2.994505]  genpd_lock_nested_spin+0x20/0x30
> > [    2.994509]  genpd_power_off.part.0.isra.0+0x248/0x2cc
> > [    2.994512]  genpd_runtime_suspend+0x1a0/0x300
> > [    2.994515]  __rpm_callback+0x4c/0x16c
> > [    2.994518]  rpm_callback+0x6c/0x80
> > [    2.994520]  rpm_suspend+0x10c/0x63c
> > [    2.994523]  __pm_runtime_suspend+0x54/0xa4
> > [    2.994526]  __psci_enter_domain_idle_state.constprop.0+0x64/0x10c
> > [    2.994532]  psci_enter_domain_idle_state+0x1c/0x24
> > [    2.994534]  cpuidle_enter_state+0x8c/0x3f0
> > [    2.994539]  cpuidle_enter+0x3c/0x50
> > [    2.994543]  cpuidle_idle_call+0x158/0x1d4
> > [    2.994548]  do_idle+0xa8/0xfc
> > [    2.994551]  cpu_startup_entry+0x28/0x30
> > [    2.994556]  secondary_start_kernel+0xe4/0x140
> > [    2.994563]  __secondary_switched+0x54/0x58
> >
> > Signed-off-by: Adrien Thierry <athierry@redhat.com>
> > ---
> >
> > This patch fixes the bug but I'm not sure if this is the proper way to do
> > so. Suggestions for other ways to fix this are very welcome.
>
> Honestly, I am not so sure about this either.
>
> Turning the lock into spinlock_t into a raw_spinlock_t, may have the
> effect of spreading into constraints on the genpd providers. Thus
> those may need to be converted to use raw_spinlock_t too (assuming
> they use a spinlock_t and GENPD_FLAG_IRQ_SAFE). On the other hand,
> there are a limited number of genpd providers that this can become a
> problem for, if any, so maybe it would not be a big problem after all.

Ohh, one more thing. For cpuidle-psci in the idle path, we call
pm_runtime_get|put_sync(). Those calls end up using another spinlock_t
(dev->power.lock). That seems like a similar problem, but may be
harder to solve.

[...]

Kind regards
Uffe
Greg KH June 16, 2022, 6:17 a.m. UTC | #3
On Wed, Jun 15, 2022 at 04:36:05PM -0400, Adrien Thierry wrote:
> We've been encountering a BUG: scheduling while atomic while running the
> 5.18.0-rt11 kernel on a Qualcomm SoC (see stacktrace below).
> 
> It seems to occur because a spinlock is taken in the PSCI idle code path
> in the idle loop. With the RT patchset applied and CONFIG_PREEMPT_RT
> enabled, spinlocks can sleep, thus triggering the bug.
> 
> In order to prevent this, replace the generic_pm_domain spinlock by a
> raw spinlock.

Ick.  I'll leave this to the RT people, but having a spinlock sleep
feels totally wrong overall and I didn't think that was something that
actually happened.

For mainline, I don't think this is a valid change at this point in time
untill you all work out the details more.

thanks,

greg k-h
Krzysztof Kozlowski Dec. 16, 2022, 10:20 a.m. UTC | #4
On 15/06/2022 22:36, Adrien Thierry wrote:
> We've been encountering a BUG: scheduling while atomic while running the
> 5.18.0-rt11 kernel on a Qualcomm SoC (see stacktrace below).
> 
> It seems to occur because a spinlock is taken in the PSCI idle code path
> in the idle loop. With the RT patchset applied and CONFIG_PREEMPT_RT
> enabled, spinlocks can sleep, thus triggering the bug.
> 
> In order to prevent this, replace the generic_pm_domain spinlock by a
> raw spinlock.
> 
> [    2.994433] BUG: scheduling while atomic: swapper/6/0/0x00000002
> [    2.994439] Modules linked in:
> [    2.994447] [<ffff80000810b0ec>] migrate_enable+0x3c/0x160
> [    2.994461] CPU: 6 PID: 0 Comm: swapper/6 Not tainted 5.18.0-rt11+ #1
> [    2.994464] Hardware name: Qualcomm SA8295P ADP (DT)
> [    2.994466] Call trace:
> [    2.994467]  dump_backtrace+0xb0/0x120
> [    2.994473]  show_stack+0x1c/0x6c
> [    2.994477]  dump_stack_lvl+0x64/0x7c
> [    2.994483]  dump_stack+0x14/0x2c
> [    2.994487]  __schedule_bug+0xa8/0xc0
> [    2.994489]  schedule_debug.constprop.0+0x154/0x170
> [    2.994492]  __schedule+0x58/0x520
> [    2.994496]  schedule_rtlock+0x20/0x44
> [    2.994499]  rtlock_slowlock_locked+0x110/0x260
> [    2.994503]  rt_spin_lock+0x74/0x94
> [    2.994505]  genpd_lock_nested_spin+0x20/0x30

Hi Adrian,

I also hit it now on v6.1 RT kernel. I see no more discussions happened
here. Do you have any progress on your side for this issue?

Best regards,
Krzysztof
Adrien Thierry Dec. 16, 2022, 2:13 p.m. UTC | #5
Hi Krzysztof,

> I also hit it now on v6.1 RT kernel. I see no more discussions happened
> here. Do you have any progress on your side for this issue?

Unfortunately I haven't investigated more since June. I was hoping RT
people could chime in.

Best,

Adrien
Krzysztof Kozlowski Dec. 16, 2022, 2:42 p.m. UTC | #6
On 16/12/2022 15:13, Adrien Thierry wrote:
> Hi Krzysztof,
> 
>> I also hit it now on v6.1 RT kernel. I see no more discussions happened
>> here. Do you have any progress on your side for this issue?
> 
> Unfortunately I haven't investigated more since June. I was hoping RT
> people could chime in.
> 

OK. I have a fix (change) for this which might be acceptable by Ulf and
others, so let me send a patch soon.

As Ulf said, even after fixing this, we have PSCI
psci_enter_domain_idle_state problem....

Best regards,
Krzysztof
diff mbox series

Patch

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 739e52cd4aba..9378decb58cf 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -82,7 +82,7 @@  static void genpd_lock_spin(struct generic_pm_domain *genpd)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&genpd->slock, flags);
+	raw_spin_lock_irqsave(&genpd->slock, flags);
 	genpd->lock_flags = flags;
 }
 
@@ -92,7 +92,7 @@  static void genpd_lock_nested_spin(struct generic_pm_domain *genpd,
 {
 	unsigned long flags;
 
-	spin_lock_irqsave_nested(&genpd->slock, flags, depth);
+	raw_spin_lock_irqsave_nested(&genpd->slock, flags, depth);
 	genpd->lock_flags = flags;
 }
 
@@ -101,7 +101,7 @@  static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&genpd->slock, flags);
+	raw_spin_lock_irqsave(&genpd->slock, flags);
 	genpd->lock_flags = flags;
 	return 0;
 }
@@ -109,7 +109,7 @@  static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
 static void genpd_unlock_spin(struct generic_pm_domain *genpd)
 	__releases(&genpd->slock)
 {
-	spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
+	raw_spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
 }
 
 static const struct genpd_lock_ops genpd_spin_ops = {
@@ -2022,7 +2022,7 @@  static void genpd_free_data(struct generic_pm_domain *genpd)
 static void genpd_lock_init(struct generic_pm_domain *genpd)
 {
 	if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
-		spin_lock_init(&genpd->slock);
+		raw_spin_lock_init(&genpd->slock);
 		genpd->lock_ops = &genpd_spin_ops;
 	} else {
 		mutex_init(&genpd->mlock);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index ebc351698090..80166a915b0d 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -159,7 +159,7 @@  struct generic_pm_domain {
 	union {
 		struct mutex mlock;
 		struct {
-			spinlock_t slock;
+			raw_spinlock_t slock;
 			unsigned long lock_flags;
 		};
 	};