diff mbox series

[1/2] wifi: ath9k: work around AR_CFG 0xdeadbeef chip hang

Message ID 20241104171627.3789199-1-ih@simonwunderlich.de
State Superseded
Headers show
Series [1/2] wifi: ath9k: work around AR_CFG 0xdeadbeef chip hang | expand

Commit Message

Issam Hamdi Nov. 4, 2024, 5:16 p.m. UTC
From: Simon Wunderlich <simon.wunderlich@open-mesh.com>

QCA 802.11n chips (especially AR9330/AR9340) sometimes end up in a state in
which a read of AR_CFG always returns 0xdeadbeef. This should not happen
when the power_mode of the device is ATH9K_PM_AWAKE.

This problem is not yet detected by any other workaround in ath9k. No way
is known to reproduce the problem easily.

This patch was originally developed by "Simon Wunderlich <simon.wunderlich@open-mesh.com>"
and "Sven Eckelmann <sven.eckelmann@open-mesh.com>"

Co-developed-by: Simon Wunderlich <sw@simonwunderlich.de>
Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>
---
 drivers/net/wireless/ath/ath9k/ath9k.h |  3 +++
 drivers/net/wireless/ath/ath9k/debug.c |  1 +
 drivers/net/wireless/ath/ath9k/debug.h |  1 +
 drivers/net/wireless/ath/ath9k/init.c  |  1 +
 drivers/net/wireless/ath/ath9k/link.c  | 31 ++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath9k/main.c  |  4 ++++
 6 files changed, 41 insertions(+)


base-commit: 2b94751626a6d49bbe42a19cc1503bd391016bd5

Comments

Kalle Valo Nov. 5, 2024, 10:49 a.m. UTC | #1
Issam Hamdi <ih@simonwunderlich.de> writes:

> From: Simon Wunderlich <simon.wunderlich@open-mesh.com>
>
> QCA 802.11n chips (especially AR9330/AR9340) sometimes end up in a state in
> which a read of AR_CFG always returns 0xdeadbeef. This should not happen
> when when the power_mode of the device is ATH9K_PM_AWAKE.
>
> This problem is not yet detected by any other workaround in ath9k. No way
> is known to reproduce the problem easily.
>
> This patch originally developed by "Simon Wunderlich <simon.wunderlich@open-mesh.com>"
> and "Sven Eckelmann <sven.eckelmann@open-mesh.com>"
>
> Co-developed-by: Simon Wunderlich <sw@simonwunderlich.de>
> Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
> Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>

s-o-b missing from Simon and Sven, more info:

https://docs.kernel.org/process/submitting-patches.html#when-to-use-acked-by-cc-and-co-developed-by
Toke Høiland-Jørgensen Nov. 5, 2024, 12:31 p.m. UTC | #2
Issam Hamdi <ih@simonwunderlich.de> writes:

> From: Simon Wunderlich <simon.wunderlich@open-mesh.com>
>
> QCA 802.11n chips (especially AR9330/AR9340) sometimes end up in a state in
> which a read of AR_CFG always returns 0xdeadbeef. This should not happen
> when when the power_mode of the device is ATH9K_PM_AWAKE.
>
> This problem is not yet detected by any other workaround in ath9k. No way
> is known to reproduce the problem easily.
>
> This patch originally developed by "Simon Wunderlich <simon.wunderlich@open-mesh.com>"
> and "Sven Eckelmann <sven.eckelmann@open-mesh.com>"
>
> Co-developed-by: Simon Wunderlich <sw@simonwunderlich.de>
> Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
> Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>
> ---
>  drivers/net/wireless/ath/ath9k/ath9k.h |  3 +++
>  drivers/net/wireless/ath/ath9k/debug.c |  1 +
>  drivers/net/wireless/ath/ath9k/debug.h |  1 +
>  drivers/net/wireless/ath/ath9k/init.c  |  1 +
>  drivers/net/wireless/ath/ath9k/link.c  | 31 ++++++++++++++++++++++++++
>  drivers/net/wireless/ath/ath9k/main.c  |  4 ++++
>  6 files changed, 41 insertions(+)
>
> diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
> index 29ca65a732a6..c1ce081445a9 100644
> --- a/drivers/net/wireless/ath/ath9k/ath9k.h
> +++ b/drivers/net/wireless/ath/ath9k/ath9k.h
> @@ -739,11 +739,13 @@ void ath9k_csa_update(struct ath_softc *sc);
>  #define ATH_ANI_MAX_SKIP_COUNT    10
>  #define ATH_PAPRD_TIMEOUT         100 /* msecs */
>  #define ATH_PLL_WORK_INTERVAL     100
> +#define ATH_HANG_WORK_INTERVAL    4000
>  
>  void ath_hw_check_work(struct work_struct *work);
>  void ath_reset_work(struct work_struct *work);
>  bool ath_hw_check(struct ath_softc *sc);
>  void ath_hw_pll_work(struct work_struct *work);
> +void ath_hw_hang_work(struct work_struct *work);
>  void ath_paprd_calibrate(struct work_struct *work);
>  void ath_ani_calibrate(struct timer_list *t);
>  void ath_start_ani(struct ath_softc *sc);
> @@ -1044,6 +1046,7 @@ struct ath_softc {
>  #endif
>  	struct delayed_work hw_check_work;
>  	struct delayed_work hw_pll_work;
> +	struct delayed_work hw_hang_work;
>  	struct timer_list sleep_timer;
>  
>  #ifdef CONFIG_ATH9K_BTCOEX_SUPPORT
> diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
> index eff894958a73..6b2469a01f17 100644
> --- a/drivers/net/wireless/ath/ath9k/debug.c
> +++ b/drivers/net/wireless/ath/ath9k/debug.c
> @@ -750,6 +750,7 @@ static int read_file_reset(struct seq_file *file, void *data)
>  		[RESET_TYPE_CALIBRATION] = "Calibration error",
>  		[RESET_TX_DMA_ERROR] = "Tx DMA stop error",
>  		[RESET_RX_DMA_ERROR] = "Rx DMA stop error",
> +		[RESET_TYPE_DEADBEEF] = "deadbeef hang",
>  	};
>  	int i;
>  
> diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
> index 389459c04d14..6ebb6053a8c1 100644
> --- a/drivers/net/wireless/ath/ath9k/debug.h
> +++ b/drivers/net/wireless/ath/ath9k/debug.h
> @@ -53,6 +53,7 @@ enum ath_reset_type {
>  	RESET_TYPE_CALIBRATION,
>  	RESET_TX_DMA_ERROR,
>  	RESET_RX_DMA_ERROR,
> +	RESET_TYPE_DEADBEEF,
>  	__RESET_TYPE_MAX
>  };
>  
> diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
> index f9e77c4624d9..833474d7281f 100644
> --- a/drivers/net/wireless/ath/ath9k/init.c
> +++ b/drivers/net/wireless/ath/ath9k/init.c
> @@ -740,6 +740,7 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
>  	INIT_WORK(&sc->paprd_work, ath_paprd_calibrate);
>  	INIT_DELAYED_WORK(&sc->hw_pll_work, ath_hw_pll_work);
>  	INIT_DELAYED_WORK(&sc->hw_check_work, ath_hw_check_work);
> +	INIT_DELAYED_WORK(&sc->hw_hang_work, ath_hw_hang_work);
>  
>  	ath9k_init_channel_context(sc);
>  
> diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
> index d1e5767aab3c..37438960c278 100644
> --- a/drivers/net/wireless/ath/ath9k/link.c
> +++ b/drivers/net/wireless/ath/ath9k/link.c
> @@ -142,6 +142,37 @@ void ath_hw_pll_work(struct work_struct *work)
>  				     msecs_to_jiffies(ATH_PLL_WORK_INTERVAL));
>  }
>  
> +static bool ath_hw_hang_deadbeef(struct ath_softc *sc)
> +{
> +	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
> +	u32 reg;
> +
> +	/* check for stucked MAC */
> +	ath9k_ps_wakeup(sc);
> +	reg = REG_READ(sc->sc_ah, AR_CFG);
> +	ath9k_ps_restore(sc);
> +
> +	if (reg != 0xdeadbeef)
> +		return false;

ath9k_hw_check_alive() already does this exact check...

-Toke
Simon Wunderlich Nov. 5, 2024, 1:34 p.m. UTC | #3
On Tuesday, November 5, 2024 2:02:31 PM CET Toke Høiland-Jørgensen wrote:
> Relying on the debugfs counters for this seems like an odd roundabout
> way of going about things. Why not just record the last time an RX
> interrupt was received directly in the interrupt handler code, and then
> have the watchdog check if that time was too far in the past?
> 
> Recording both TX and RX times may even help distinguish between 'deaf'
> and 'idle' (cf the comment above): if we transmitted something, but got
> no RX, that's a good indication of the deaf state; but if nothing
> happened in either direction, it's probably just the network that's
> idle. I think? 
> 
> -Toke

Forgot to comment here: On the AR934x hardware we worked on in the very 
beginning, we actually still had a few interrupts coming even if the hardware 
was 'deaf'. This is why we did not implement it with a timer, but counted the 
number of interrupts for a given time and compared it to a minimum expected 
ratio, as done in this patch.

I understand your argument for the TX part, but I think it actually breaks the 
AP mode and prevents the recovery: if we can't hear any clients, they will not 
use the Internet and the AP has not much to TX either. So an already deaf AP 
has nothing to transmit just as an idle AP, but for a different reason ...

Cheers,
       Simon
Sven Eckelmann Nov. 6, 2024, 10:06 a.m. UTC | #4
On Wednesday, 6 November 2024 10:04:38 CET Issam Hamdi wrote:
> +static bool ath_hw_hang_deadbeef(struct ath_softc *sc)
> +{
> +       struct ath_common *common = ath9k_hw_common(sc->sc_ah);
> +       u32 reg;
> +
> +       /* check for stucked MAC */
> +       ath9k_ps_wakeup(sc);
> +       reg = REG_READ(sc->sc_ah, AR_CFG);
> +       ath9k_ps_restore(sc);
> +
> +       if (reg != 0xdeadbeef)
> +               return false;
> +
> +       ath_dbg(common, RESET,
> +               "0xdeadbeef hang is detected. Schedule chip reset\n");
> +       ath9k_queue_reset(sc, RESET_TYPE_DEADBEEF);
> +
> +       return true;
> +}

I don't really get why this was proposed again. Can you please explain why the 
reworked version of this patch [1] (by Felix) is not good enough?

If there are any deficits with Felix's version then it would be nice to be 
informed about them (in the commit message) and then work on the deficits. 
Having two different checks at the same time for the same thing in upstream 
Linux doesn't seem to be helpful.

I have more problems with the submission style - see next mail.

Kind regards,
	Sven

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a34d0a0da1abae46a5f6ebd06fb0ec484ca099d9
diff mbox series

Patch

diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index 29ca65a732a6..c1ce081445a9 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -739,11 +739,13 @@  void ath9k_csa_update(struct ath_softc *sc);
 #define ATH_ANI_MAX_SKIP_COUNT    10
 #define ATH_PAPRD_TIMEOUT         100 /* msecs */
 #define ATH_PLL_WORK_INTERVAL     100
+#define ATH_HANG_WORK_INTERVAL    4000
 
 void ath_hw_check_work(struct work_struct *work);
 void ath_reset_work(struct work_struct *work);
 bool ath_hw_check(struct ath_softc *sc);
 void ath_hw_pll_work(struct work_struct *work);
+void ath_hw_hang_work(struct work_struct *work);
 void ath_paprd_calibrate(struct work_struct *work);
 void ath_ani_calibrate(struct timer_list *t);
 void ath_start_ani(struct ath_softc *sc);
@@ -1044,6 +1046,7 @@  struct ath_softc {
 #endif
 	struct delayed_work hw_check_work;
 	struct delayed_work hw_pll_work;
+	struct delayed_work hw_hang_work;
 	struct timer_list sleep_timer;
 
 #ifdef CONFIG_ATH9K_BTCOEX_SUPPORT
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index eff894958a73..6b2469a01f17 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -750,6 +750,7 @@  static int read_file_reset(struct seq_file *file, void *data)
 		[RESET_TYPE_CALIBRATION] = "Calibration error",
 		[RESET_TX_DMA_ERROR] = "Tx DMA stop error",
 		[RESET_RX_DMA_ERROR] = "Rx DMA stop error",
+		[RESET_TYPE_DEADBEEF] = "deadbeef hang",
 	};
 	int i;
 
diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
index 389459c04d14..6ebb6053a8c1 100644
--- a/drivers/net/wireless/ath/ath9k/debug.h
+++ b/drivers/net/wireless/ath/ath9k/debug.h
@@ -53,6 +53,7 @@  enum ath_reset_type {
 	RESET_TYPE_CALIBRATION,
 	RESET_TX_DMA_ERROR,
 	RESET_RX_DMA_ERROR,
+	RESET_TYPE_DEADBEEF,
 	__RESET_TYPE_MAX
 };
 
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index f9e77c4624d9..833474d7281f 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -740,6 +740,7 @@  static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
 	INIT_WORK(&sc->paprd_work, ath_paprd_calibrate);
 	INIT_DELAYED_WORK(&sc->hw_pll_work, ath_hw_pll_work);
 	INIT_DELAYED_WORK(&sc->hw_check_work, ath_hw_check_work);
+	INIT_DELAYED_WORK(&sc->hw_hang_work, ath_hw_hang_work);
 
 	ath9k_init_channel_context(sc);
 
diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
index d1e5767aab3c..37438960c278 100644
--- a/drivers/net/wireless/ath/ath9k/link.c
+++ b/drivers/net/wireless/ath/ath9k/link.c
@@ -142,6 +142,37 @@  void ath_hw_pll_work(struct work_struct *work)
 				     msecs_to_jiffies(ATH_PLL_WORK_INTERVAL));
 }
 
+static bool ath_hw_hang_deadbeef(struct ath_softc *sc)
+{
+	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	u32 reg;
+
+	/* check for stuck MAC: a hung chip reads AR_CFG as 0xdeadbeef */
+	ath9k_ps_wakeup(sc);
+	reg = REG_READ(sc->sc_ah, AR_CFG);
+	ath9k_ps_restore(sc);
+
+	if (reg != 0xdeadbeef)
+		return false;
+
+	ath_dbg(common, RESET,
+		"0xdeadbeef hang is detected. Schedule chip reset\n");
+	ath9k_queue_reset(sc, RESET_TYPE_DEADBEEF);
+
+	return true;
+}
+
+void ath_hw_hang_work(struct work_struct *work)
+{
+	struct ath_softc *sc = container_of(work, struct ath_softc,
+					    hw_hang_work.work);
+
+	ath_hw_hang_deadbeef(sc); /* queues a chip reset on 0xdeadbeef */
+
+	ieee80211_queue_delayed_work(sc->hw, &sc->hw_hang_work,
+				     msecs_to_jiffies(ATH_HANG_WORK_INTERVAL));
+}
+
 /*
  * PA Pre-distortion.
  */
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index b92c89dad8de..024028ce8417 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -186,6 +186,7 @@  static void __ath_cancel_work(struct ath_softc *sc)
 	cancel_work_sync(&sc->paprd_work);
 	cancel_delayed_work_sync(&sc->hw_check_work);
 	cancel_delayed_work_sync(&sc->hw_pll_work);
+	cancel_delayed_work_sync(&sc->hw_hang_work);
 
 #ifdef CONFIG_ATH9K_BTCOEX_SUPPORT
 	if (ath9k_hw_mci_is_enabled(sc->sc_ah))
@@ -208,6 +209,9 @@  void ath_restart_work(struct ath_softc *sc)
 		ieee80211_queue_delayed_work(sc->hw, &sc->hw_pll_work,
 				     msecs_to_jiffies(ATH_PLL_WORK_INTERVAL));
 
+	ieee80211_queue_delayed_work(sc->hw, &sc->hw_hang_work,
+				     msecs_to_jiffies(ATH_HANG_WORK_INTERVAL));
+
 	ath_start_ani(sc);
 }