diff mbox series

[2/2] wifi: ath9k: Reset chip on potential deaf state

Message ID 20241104171627.3789199-2-ih@simonwunderlich.de
State Superseded
Headers show
Series [1/2] wifi: ath9k: work around AR_CFG 0xdeadbeef chip hang | expand

Commit Message

Issam Hamdi Nov. 4, 2024, 5:16 p.m. UTC
From: Simon Wunderlich <simon.wunderlich@open-mesh.com>

The chip is switching seemingly random into a state which can be described
as "deaf". No or nearly no interrupts are generated anymore for incoming
packets. Existing links either break down after a while and new links will
not be established.

The driver doesn't know if there is no other device available or if it
ended up in an "deaf" state. Resetting the chip proactively avoids
permanent problems in case the chip really was in its "deaf" state but
maybe causes unnecessary resets in case it wasn't "deaf".

This patch originally developed by "Simon Wunderlich <simon.wunderlich@open-mesh.com>"
and "Sven Eckelmann <sven.eckelmann@open-mesh.com>"

Co-developed-by: Simon Wunderlich <sw@simonwunderlich.de>
Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>
---
 drivers/net/wireless/ath/ath9k/ath9k.h |  3 ++
 drivers/net/wireless/ath/ath9k/debug.c |  1 +
 drivers/net/wireless/ath/ath9k/debug.h |  1 +
 drivers/net/wireless/ath/ath9k/link.c  | 48 +++++++++++++++++++++++++-
 4 files changed, 52 insertions(+), 1 deletion(-)

Comments

Kalle Valo Nov. 5, 2024, 10:53 a.m. UTC | #1
Issam Hamdi <ih@simonwunderlich.de> writes:

> From: Simon Wunderlich <simon.wunderlich@open-mesh.com>
>
> The chip is switching seemingly random into a state which can be described
> as "deaf". No or nearly no interrupts are generated anymore for incoming
> packets. Existing links either break down after a while and new links will
> not be established.
>
> The driver doesn't know if there is no other device available or if it
> ended up in an "deaf" state. Resetting the chip proactively avoids
> permanent problems in case the chip really was in its "deaf" state but
> maybe causes unnecessary resets in case it wasn't "deaf".
>
> This patch originally developed by "Simon Wunderlich <simon.wunderlich@open-mesh.com>"
> and "Sven Eckelmann <sven.eckelmann@open-mesh.com>"
>
> Co-developed-by: Simon Wunderlich <sw@simonwunderlich.de>
> Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
> Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>

s-o-b missing here as well.

> --- a/drivers/net/wireless/ath/ath9k/link.c
> +++ b/drivers/net/wireless/ath/ath9k/link.c
> @@ -162,13 +162,59 @@ static bool ath_hw_hang_deadbeef(struct ath_softc *sc)
>  	return true;
>  }
>  
> +static bool ath_hw_hang_deaf(struct ath_softc *sc)
> +{
> +#if !defined(CPTCFG_ATH9K_DEBUGFS) || defined(CPTCFG_ATH9K_TX99)
> +	return false;
> +#else

We don't use CPTCFG_ in upstream.

Try to avoid '#if' if possible, what about IS_ENABLED():

if (!IS_ENABLED(CONFIG_ATH9K_DEBUGFS) || !IS_ENABLED(CONFIG_ATH9K_TX99)
	return false;
Toke Høiland-Jørgensen Nov. 5, 2024, 1:02 p.m. UTC | #2
Issam Hamdi <ih@simonwunderlich.de> writes:

> From: Simon Wunderlich <simon.wunderlich@open-mesh.com>
>
> The chip is switching seemingly random into a state which can be described
> as "deaf". No or nearly no interrupts are generated anymore for incoming
> packets. Existing links either break down after a while and new links will
> not be established.
>
> The driver doesn't know if there is no other device available or if it
> ended up in an "deaf" state. Resetting the chip proactively avoids
> permanent problems in case the chip really was in its "deaf" state but
> maybe causes unnecessary resets in case it wasn't "deaf".

Proactively resetting the device if there is no traffic on the network
for four seconds seems like a tad aggressive. Do you have any
information on under which conditions this actually happens in practice?
I assume this is a patch that has been lying around in openwrt for a
while, or something?

As for the code itself, see below:

> This patch originally developed by "Simon Wunderlich <simon.wunderlich@open-mesh.com>"
> and "Sven Eckelmann <sven.eckelmann@open-mesh.com>"
>
> Co-developed-by: Simon Wunderlich <sw@simonwunderlich.de>
> Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
> Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>
> ---
>  drivers/net/wireless/ath/ath9k/ath9k.h |  3 ++
>  drivers/net/wireless/ath/ath9k/debug.c |  1 +
>  drivers/net/wireless/ath/ath9k/debug.h |  1 +
>  drivers/net/wireless/ath/ath9k/link.c  | 48 +++++++++++++++++++++++++-
>  4 files changed, 52 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
> index c1ce081445a9..2b98c69fa37f 100644
> --- a/drivers/net/wireless/ath/ath9k/ath9k.h
> +++ b/drivers/net/wireless/ath/ath9k/ath9k.h
> @@ -1026,6 +1026,9 @@ struct ath_softc {
>  	short nbcnvifs;
>  	unsigned long ps_usecount;
>  
> +	unsigned long last_check_time;
> +	u32 last_check_interrupts;
> +
>  	struct ath_rx rx;
>  	struct ath_tx tx;
>  	struct ath_beacon beacon;
> diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
> index 6b2469a01f17..4128cf691166 100644
> --- a/drivers/net/wireless/ath/ath9k/debug.c
> +++ b/drivers/net/wireless/ath/ath9k/debug.c
> @@ -751,6 +751,7 @@ static int read_file_reset(struct seq_file *file, void *data)
>  		[RESET_TX_DMA_ERROR] = "Tx DMA stop error",
>  		[RESET_RX_DMA_ERROR] = "Rx DMA stop error",
>  		[RESET_TYPE_DEADBEEF] = "deadbeef hang",
> +		[RESET_TYPE_DEAF] = "deaf hang",
>  	};
>  	int i;
>  
> diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
> index 6ebb6053a8c1..76e27860455c 100644
> --- a/drivers/net/wireless/ath/ath9k/debug.h
> +++ b/drivers/net/wireless/ath/ath9k/debug.h
> @@ -54,6 +54,7 @@ enum ath_reset_type {
>  	RESET_TX_DMA_ERROR,
>  	RESET_RX_DMA_ERROR,
>  	RESET_TYPE_DEADBEEF,
> +	RESET_TYPE_DEAF,
>  	__RESET_TYPE_MAX
>  };
>  
> diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
> index 37438960c278..d1762cc3129d 100644
> --- a/drivers/net/wireless/ath/ath9k/link.c
> +++ b/drivers/net/wireless/ath/ath9k/link.c
> @@ -162,13 +162,59 @@ static bool ath_hw_hang_deadbeef(struct ath_softc *sc)
>  	return true;
>  }
>  
> +static bool ath_hw_hang_deaf(struct ath_softc *sc)
> +{
> +#if !defined(CPTCFG_ATH9K_DEBUGFS) || defined(CPTCFG_ATH9K_TX99)
> +	return false;
> +#else
> +	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
> +	u32 interrupts, interrupt_per_s;
> +	unsigned int interval;
> +
> +	/* get historic data */
> +	interval = jiffies_to_msecs(jiffies - sc->last_check_time);
> +	if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA)
> +		interrupts = sc->debug.stats.istats.rxlp;
> +	else
> +		interrupts = sc->debug.stats.istats.rxok;
> +
> +	interrupts -= sc->last_check_interrupts;

Relying on the debugfs counters for this seems like an odd roundabout
way of going about things. Why not just record the last time an RX
interrupt was received directly in the interrupt handler code, and then
have the watchdog check if that time was too far in the past?

Recording both TX and RX times may even help distinguish between 'deaf'
and 'idle' (cf the comment above): if we transmitted something, but got
no RX, that's a good indication of the deaf state; but if nothing
happened in either direction, it's probably just the network that's
idle. I think? :)

-Toke
Simon Wunderlich Nov. 5, 2024, 1:30 p.m. UTC | #3
On Tuesday, November 5, 2024 2:02:31 PM CET Toke Høiland-Jørgensen wrote:
> Issam Hamdi <ih@simonwunderlich.de> writes:
> > From: Simon Wunderlich <simon.wunderlich@open-mesh.com>
> > 
> > The chip is switching seemingly random into a state which can be described
> > as "deaf". No or nearly no interrupts are generated anymore for incoming
> > packets. Existing links either break down after a while and new links will
> > not be established.
> > 
> > The driver doesn't know if there is no other device available or if it
> > ended up in an "deaf" state. Resetting the chip proactively avoids
> > permanent problems in case the chip really was in its "deaf" state but
> > maybe causes unnecessary resets in case it wasn't "deaf".
> 
> Proactively resetting the device if there is no traffic on the network
> for four seconds seems like a tad aggressive. Do you have any
> information on under which conditions this actually happens in practice?
> I assume this is a patch that has been lying around in openwrt for a
> while, or something?

Hi Toke,

this patch has been around for a long time (8 years or so?), and it has been 
integrated in various vendor firmwares (at least three I know of) as well as 
mesh community firmwares [1]. The circumstances leading to this "deafness" is 
still unclear, but we see that some particular chips (especially 2-stream 11n 
SoCs, but also others) can go 'deaf' when running AP or mesh (or both) after 
some time. It's probably a hardware issue, and doing a channel scan to trigger 
a chip reset (which one normally can't do on an AP interface) recovers the 
hardware. This patch provides a workaround within the kernel.

We submitted it only as RFC back then [2], and since we had colleagues 
suffering the same problem again we though we give it another shot to finally 
get it integrated upstream. :)

The idea is that if the radio is idle anyway, a quick reset (which takes a few 
tens of ms maximum) doesn't hurt much, and it helps to recover non-functional 
APs or mesh points.

Cheers,
      Simon

[1] https://github.com/freifunk-gluon/gluon/pull/2114
[2] https://patchwork.kernel.org/project/linux-wireless/patch/
20161117083614.19188-2-sven.eckelmann@open-mesh.com/
Toke Høiland-Jørgensen Nov. 5, 2024, 3:10 p.m. UTC | #4
Simon Wunderlich <sw@simonwunderlich.de> writes:

> On Tuesday, November 5, 2024 2:02:31 PM CET Toke Høiland-Jørgensen wrote:
>> Issam Hamdi <ih@simonwunderlich.de> writes:
>> > From: Simon Wunderlich <simon.wunderlich@open-mesh.com>
>> > 
>> > The chip is switching seemingly random into a state which can be described
>> > as "deaf". No or nearly no interrupts are generated anymore for incoming
>> > packets. Existing links either break down after a while and new links will
>> > not be established.
>> > 
>> > The driver doesn't know if there is no other device available or if it
>> > ended up in an "deaf" state. Resetting the chip proactively avoids
>> > permanent problems in case the chip really was in its "deaf" state but
>> > maybe causes unnecessary resets in case it wasn't "deaf".
>> 
>> Proactively resetting the device if there is no traffic on the network
>> for four seconds seems like a tad aggressive. Do you have any
>> information on under which conditions this actually happens in practice?
>> I assume this is a patch that has been lying around in openwrt for a
>> while, or something?
>
> Hi Toke,
>
> this patch has been around for a long time (8 years or so?), and it has been 
> integrated in various vendor firmwares (at least three I know of) as well as 
> mesh community firmwares [1]. The circumstances leading to this "deafness" is 
> still unclear, but we see that some particular chips (especially 2-stream 11n 
> SoCs, but also others) can go 'deaf' when running AP or mesh (or both) after 
> some time. It's probably a hardware issue, and doing a channel scan to trigger 
> a chip reset (which one normally can't do on an AP interface) recovers the 
> hardware. This patch provides a workaround within the kernel.
>
> We submitted it only as RFC back then [2], and since we had colleagues 
> suffering the same problem again we though we give it another shot to finally 
> get it integrated upstream. :)
>
> The idea is that if the radio is idle anyway, a quick reset (which takes a few 
> tens of ms maximum) doesn't hurt much, and it helps to recover non-functional 
> APs or mesh points.

Alright, thanks for the extra context (would have been nice if this was
part of the commit message in the initial submission ;)).

(including both your emails in one):

> Forgot to comment here: On the AR934x hardware we worked on in the very 
> beginning, we actually still had a few interrupts coming even if the hardware 
> was 'deaf'. This why we did not implement it with a timer, but counted the 
> number of interrupts for a given time and compared it to a minimum expected 
> ratio, as done in this patch.
>
> I understand your argument for the TX part, but I think it actually breaks the 
> AP mode and prevents the recovery: if we can't hear any clients, they will not 
> use the Internet and the AP has not much to TX either. So an already deaf AP 
> has nothing to transmit just as an idle AP, but for a different reason ...

Right, okay. I guess that is also why you prefer this one to Felix'
similar patch that was also linked from that gluon issue[0]?

However, I still don't like tying this to the debugfs: if this is
something that the driver needs to react to, it should not depend on
debug features. Even if OpenWrt and derivatives always compile-in the
debugfs, not everyone does, as we discovered back when we accidentally
broke the driver when it wasn't there :)

So how about something like the patch below - it keeps the "average per
time interval" behaviour, but uses the same approach as Felix' patch to
avoid relying on debugfs. WDYT?

-Toke


[0] https://github.com/freifunk-gluon/gluon/commit/fa0740cca4683f90bbf05157fd80109d2c40aa84



diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index 29ca65a732a6..bcfc8df0efe5 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -1018,6 +1018,8 @@ struct ath_softc {
 
 	u8 gtt_cnt;
 	u32 intrstatus;
+	u32 rx_active_check_time;
+	u32 rx_active_count;
 	u16 ps_flags; /* PS_* */
 	bool ps_enabled;
 	bool ps_idle;
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index 51abc470125b..a1e3bcf796f2 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -750,6 +750,7 @@ static int read_file_reset(struct seq_file *file, void *data)
 		[RESET_TYPE_CALIBRATION] = "Calibration error",
 		[RESET_TX_DMA_ERROR] = "Tx DMA stop error",
 		[RESET_RX_DMA_ERROR] = "Rx DMA stop error",
+		[RESET_TYPE_RX_INACTIVE] = "Rx path inactive",
 	};
 	int i;
 
diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
index 389459c04d14..cb3e75969875 100644
--- a/drivers/net/wireless/ath/ath9k/debug.h
+++ b/drivers/net/wireless/ath/ath9k/debug.h
@@ -53,6 +53,7 @@ enum ath_reset_type {
 	RESET_TYPE_CALIBRATION,
 	RESET_TX_DMA_ERROR,
 	RESET_RX_DMA_ERROR,
+	RESET_TYPE_RX_INACTIVE,
 	__RESET_TYPE_MAX
 };
 
diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
index d1e5767aab3c..054c0781287e 100644
--- a/drivers/net/wireless/ath/ath9k/link.c
+++ b/drivers/net/wireless/ath/ath9k/link.c
@@ -50,7 +50,36 @@ static bool ath_tx_complete_check(struct ath_softc *sc)
 		"tx hung, resetting the chip\n");
 	ath9k_queue_reset(sc, RESET_TYPE_TX_HANG);
 	return false;
+}
+
+#define RX_INACTIVE_CHECK_INTERVAL (4 * MSEC_PER_SEC)
+
+static bool ath_hw_rx_inactive_check(struct ath_softc *sc)
+{
+	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	u32 interval, count;
+
+	interval = jiffies_to_msecs(jiffies - sc->rx_active_check_time);
+	count = sc->rx_active_count;
+
+	if (interval < RX_INACTIVE_CHECK_INTERVAL)
+		return true; /* too soon to check */
 
+	sc->rx_active_count = 0;
+	sc->rx_active_check_time = jiffies;
+
+	/* Need at least one interrupt per second, and we should only react if
+	 * we are within a factor two of the expected interval
+	 */
+	if (interval > RX_INACTIVE_CHECK_INTERVAL * 2 ||
+	    count >= interval / MSEC_PER_SEC)
+		return true;
+
+	ath_dbg(common, RESET,
+		"RX inactivity is detected. Schedule chip reset\n");
+	ath9k_queue_reset(sc, RESET_TYPE_RX_INACTIVE);
+
+	return false;
 }
 
 void ath_hw_check_work(struct work_struct *work)
@@ -58,8 +87,8 @@ void ath_hw_check_work(struct work_struct *work)
 	struct ath_softc *sc = container_of(work, struct ath_softc,
 					    hw_check_work.work);
 
-	if (!ath_hw_check(sc) ||
-	    !ath_tx_complete_check(sc))
+	if (!ath_hw_check(sc) || !ath_tx_complete_check(sc) ||
+	    !ath_hw_rx_inactive_check(sc))
 		return;
 
 	ieee80211_queue_delayed_work(sc->hw, &sc->hw_check_work,
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index b92c89dad8de..998f717a1a86 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -453,6 +453,7 @@ void ath9k_tasklet(struct tasklet_struct *t)
 			ath_rx_tasklet(sc, 0, true);
 
 		ath_rx_tasklet(sc, 0, false);
+		sc->rx_active_count++;
 	}
 
 	if (status & ATH9K_INT_TX) {
Issam Hamdi Nov. 6, 2024, 10:05 a.m. UTC | #5
On 11/5/24 16:10, Toke Høiland-Jørgensen wrote:
> Right, okay. I guess that is also why you prefer this one to Felix'
> similar patch that was also linked from that gluon issue[0]?
>
> However, I still don't like tying this to the debugfs: if this is
> something that the driver needs to react to, it should not depend on
> debug features. Even if OpenWrt and derivatives always compile-in the
> debugfs, not everyone does, as we discovered back when we accidentally
> broke the driver when it wasn't there :)
>
> So how about something like the patch below - it keeps the "average per
> time interval" behaviour, but uses the same approach as Felix' patch to
> avoid relying on debugfs. WDYT?
Yes, you are right, I will update the patch to not depend on debugfs.

Regards,

Issam
diff mbox series

Patch

diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index c1ce081445a9..2b98c69fa37f 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -1026,6 +1026,9 @@  struct ath_softc {
 	short nbcnvifs;
 	unsigned long ps_usecount;
 
+	unsigned long last_check_time;
+	u32 last_check_interrupts;
+
 	struct ath_rx rx;
 	struct ath_tx tx;
 	struct ath_beacon beacon;
diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c
index 6b2469a01f17..4128cf691166 100644
--- a/drivers/net/wireless/ath/ath9k/debug.c
+++ b/drivers/net/wireless/ath/ath9k/debug.c
@@ -751,6 +751,7 @@  static int read_file_reset(struct seq_file *file, void *data)
 		[RESET_TX_DMA_ERROR] = "Tx DMA stop error",
 		[RESET_RX_DMA_ERROR] = "Rx DMA stop error",
 		[RESET_TYPE_DEADBEEF] = "deadbeef hang",
+		[RESET_TYPE_DEAF] = "deaf hang",
 	};
 	int i;
 
diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h
index 6ebb6053a8c1..76e27860455c 100644
--- a/drivers/net/wireless/ath/ath9k/debug.h
+++ b/drivers/net/wireless/ath/ath9k/debug.h
@@ -54,6 +54,7 @@  enum ath_reset_type {
 	RESET_TX_DMA_ERROR,
 	RESET_RX_DMA_ERROR,
 	RESET_TYPE_DEADBEEF,
+	RESET_TYPE_DEAF,
 	__RESET_TYPE_MAX
 };
 
diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c
index 37438960c278..d1762cc3129d 100644
--- a/drivers/net/wireless/ath/ath9k/link.c
+++ b/drivers/net/wireless/ath/ath9k/link.c
@@ -162,13 +162,59 @@  static bool ath_hw_hang_deadbeef(struct ath_softc *sc)
 	return true;
 }
 
+static bool ath_hw_hang_deaf(struct ath_softc *sc)
+{
+#if !defined(CPTCFG_ATH9K_DEBUGFS) || defined(CPTCFG_ATH9K_TX99)
+	return false;
+#else
+	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
+	u32 interrupts, interrupt_per_s;
+	unsigned int interval;
+
+	/* get historic data */
+	interval = jiffies_to_msecs(jiffies - sc->last_check_time);
+	if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA)
+		interrupts = sc->debug.stats.istats.rxlp;
+	else
+		interrupts = sc->debug.stats.istats.rxok;
+
+	interrupts -= sc->last_check_interrupts;
+
+	/* save current data */
+	sc->last_check_time = jiffies;
+	if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA)
+		sc->last_check_interrupts = sc->debug.stats.istats.rxlp;
+	else
+		sc->last_check_interrupts = sc->debug.stats.istats.rxok;
+
+	/* sanity check, should be 4 seconds */
+	if (interval > 10000 || interval < 1000)
+		return false;
+
+	/* should be at least one interrupt per second */
+	interrupt_per_s = interrupts / (interval / 1000);
+	if (interrupt_per_s >= 1)
+		return false;
+
+	ath_dbg(common, RESET,
+		"RX deaf hang is detected. Schedule chip reset\n");
+	ath9k_queue_reset(sc, RESET_TYPE_DEAF);
+
+	return true;
+#endif
+}
+
 void ath_hw_hang_work(struct work_struct *work)
 {
 	struct ath_softc *sc = container_of(work, struct ath_softc,
 					    hw_hang_work.work);
 
-	ath_hw_hang_deadbeef(sc);
+	if (ath_hw_hang_deadbeef(sc))
+		goto requeue_worker;
 
+	ath_hw_hang_deaf(sc);
+
+requeue_worker:
 	ieee80211_queue_delayed_work(sc->hw, &sc->hw_hang_work,
 				     msecs_to_jiffies(ATH_HANG_WORK_INTERVAL));
 }