Message ID | 20221104150847.525173-3-robdclark@gmail.com
---|---
State | Superseded
Series | drm/msm: Improved hang detection
On Fri, Nov 4, 2022 at 8:08 AM Rob Clark <robdclark@gmail.com> wrote:
>
> From: Rob Clark <robdclark@chromium.org>
>
> If the hangcheck timer expires, check if the fw's position in the
> cmdstream has advanced (changed) since last timer expiration, and
> allow it up to three additional "extensions" to its allotted time.
> The intention is to continue to catch "shader stuck in a loop" type
> hangs quickly, but allow more time for things that are actually
> making forward progress.
>
> Because we need to sample the CP state twice to detect if there has
> not been progress, this also cuts the timer's duration in half.
>
> v2: Fix typo (REG_A6XX_CP_CSQ_IB2_STAT), add comment
> v3: Only halve hangcheck timer duration for generations which
>     support progress detection (hdanton); removed unused a5xx
>     progress (without knowing how to adjust for data buffered
>     in ROQ it is too likely to report a false negative)
>
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 34 +++++++++++++++++++++++++++
>  drivers/gpu/drm/msm/msm_drv.c         |  1 -
>  drivers/gpu/drm/msm/msm_drv.h         |  8 ++++++-
>  drivers/gpu/drm/msm/msm_gpu.c         | 31 +++++++++++++++++++++++-
>  drivers/gpu/drm/msm/msm_gpu.h         |  3 +++
>  drivers/gpu/drm/msm/msm_ringbuffer.h  | 24 +++++++++++++++++++
>  6 files changed, 98 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 1ff605c18ee6..7fe60c65a1eb 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1843,6 +1843,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
>         return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
>  }
>
> +static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
> +{
> +       struct msm_cp_state cp_state = {
> +               .ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
> +               .ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
> +               .ib1_rem  = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
> +               .ib2_rem  = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
> +       };
> +       bool progress;
> +
> +       /*
> +        * Adjust the remaining data to account for what has already been
> +        * fetched from memory, but not yet consumed by the SQE.
> +        *
> +        * This is not *technically* correct, the amount buffered could
> +        * exceed the IB size due to hw prefetching ahead, but:
> +        *
> +        * (1) We aren't trying to find the exact position, just whether
> +        *     progress has been made
> +        * (2) The CP_REG_TO_MEM at the end of a submit should be enough
> +        *     to prevent prefetching into an unrelated submit.  (And
> +        *     either way, at some point the ROQ will be full.)
> +        */
> +       cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
> +       cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;
> +
> +       progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));
> +
> +       ring->last_cp_state = cp_state;
> +
> +       return progress;
> +}
> +
>  static u32 a618_get_speed_bin(u32 fuse)
>  {
>         if (fuse == 0)
> @@ -1961,6 +1994,7 @@ static const struct adreno_gpu_funcs funcs = {
>                 .create_address_space = a6xx_create_address_space,
>                 .create_private_address_space = a6xx_create_private_address_space,
>                 .get_rptr = a6xx_get_rptr,
> +               .progress = a6xx_progress,
>         },
>         .get_timestamp = a6xx_get_timestamp,
>  };
> diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
> index 670651cdfa79..c3b77b44b2aa 100644
> --- a/drivers/gpu/drm/msm/msm_drv.c
> +++ b/drivers/gpu/drm/msm/msm_drv.c
> @@ -419,7 +419,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
>         priv->dev = ddev;
>
>         priv->wq = alloc_ordered_workqueue("msm", 0);
> -       priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
>
>         INIT_LIST_HEAD(&priv->objects);
>         mutex_init(&priv->obj_lock);
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 0609daf4fa4c..876d8d5eec2f 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -225,7 +225,13 @@ struct msm_drm_private {
>
>         struct drm_atomic_state *pm_state;
>
> -       /* For hang detection, in ms */
> +       /**
> +        * hangcheck_period: For hang detection, in ms
> +        *
> +        * Note that in practice, a submit/job will get at least two hangcheck
> +        * periods, due to checking for progress being implemented as simply
> +        * "have the CP position registers changed since last time?"
> +        */
>         unsigned int hangcheck_period;
>
>         /**
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 3dffee54a951..bfef659d3a5c 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -500,6 +500,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
>                         round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
>  }
>
> +static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
> +{
> +       if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
> +               return false;
> +
> +       if (!gpu->funcs->progress)
> +               return false;
> +
> +       if (!gpu->funcs->progress(gpu, ring))
> +               return false;
> +
> +       ring->hangcheck_progress_retries++;
> +       return true;
> +}
> +
>  static void hangcheck_handler(struct timer_list *t)
>  {
>         struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
> @@ -511,9 +526,12 @@ static void hangcheck_handler(struct timer_list *t)
>         if (fence != ring->hangcheck_fence) {
>                 /* some progress has been made.. ya! */
>                 ring->hangcheck_fence = fence;
> -       } else if (fence_before(fence, ring->fctx->last_fence)) {
> +               ring->hangcheck_progress_retries = 0;
> +       } else if (fence_before(fence, ring->fctx->last_fence) &&
> +                       !made_progress(gpu, ring)) {
>                 /* no progress and not done.. hung! */
>                 ring->hangcheck_fence = fence;
> +               ring->hangcheck_progress_retries = 0;
>                 DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
>                                 gpu->name, ring->id);
>                 DRM_DEV_ERROR(dev->dev, "%s:     completed fence: %u\n",
> @@ -845,6 +863,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>                 struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
>                 const char *name, struct msm_gpu_config *config)
>  {
> +       struct msm_drm_private *priv = drm->dev_private;
>         int i, ret, nr_rings = config->nr_rings;
>         void *memptrs;
>         uint64_t memptrs_iova;
> @@ -872,6 +891,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>         kthread_init_work(&gpu->recover_work, recover_worker);
>         kthread_init_work(&gpu->fault_work, fault_worker);
>
> +       priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
> +
> +       /*
> +        * If progress detection is supported, halve the hangcheck timer
> +        * duration, as it takes two iterations of the hangcheck handler
> +        * to detect a hang.
> +        */
> +       if (funcs->progress)
> +               priv->hangcheck_period /= 2;
> +
>         timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
>
>         spin_lock_init(&gpu->perf_lock);
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 585fd9c8d45a..f0fbf6063baa 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -78,6 +78,8 @@ struct msm_gpu_funcs {
>         struct msm_gem_address_space *(*create_private_address_space)
>                 (struct msm_gpu *gpu);
>         uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
> +
> +       bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
>  };
>
>  /* Additional state for iommu faults: */
> @@ -237,6 +239,7 @@ struct msm_gpu {
>  #define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */
>
>  #define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
> +#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
>         struct timer_list hangcheck_timer;
>
>         /* Fault info for most recent iova fault: */
> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
> index 2a5045abe46e..e3d33bae3380 100644
> --- a/drivers/gpu/drm/msm/msm_ringbuffer.h
> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
> @@ -35,6 +35,11 @@ struct msm_rbmemptrs {
>         volatile u64 ttbr0;
>  };
>
> +struct msm_cp_state {
> +       uint64_t ib1_base, ib2_base;
> +       uint32_t ib1_rem, ib2_rem;
> +};
> +
>  struct msm_ringbuffer {
>         struct msm_gpu *gpu;
>         int id;
> @@ -64,6 +69,25 @@ struct msm_ringbuffer {
>         uint64_t memptrs_iova;
>         struct msm_fence_context *fctx;
>
> +       /**
> +        * hangcheck_progress_retries:
> +        *
> +        * The number of extra hangcheck duration cycles that we have given
> +        * due to it appearing that the GPU is making forward progress.
> +        *
> +        * If the GPU appears to be making progress (ie. the CP has advanced
> +        * in the command stream, we'll allow up to DRM_MSM_HANGCHECK_PROGRESS_RETRIES
> +        * expirations of the hangcheck timer before killing the job.  In other
> +        * words we'll let the submit run for up to
> +        * DRM_MSM_HANGCHECK_DEFAULT_PERIOD * DRM_MSM_HANGCHECK_PROGRESS_RETRIES

Rather than 500*3ms, the effective timeout is 250*4ms if I read
made_progress correctly.  That is, the formula is

  (DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2) * (DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1)

Are you targeting 1500ms or 1000ms?

Either way, series is

Reviewed-by: Chia-I Wu <olvaffe@gmail.com>
Tested-by: Chia-I Wu <olvaffe@gmail.com> (dEQP-GLES2.functional.flush_finish.wait)

> +        */
> +       int hangcheck_progress_retries;
> +
> +       /**
> +        * last_cp_state: The state of the CP at the last call to gpu->progress()
> +        */
> +       struct msm_cp_state last_cp_state;
> +
>         /*
>          * preempt_lock protects preemption and serializes wptr updates against
>          * preemption.  Can be aquired from irq context.
> --
> 2.38.1
>
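To make the arithmetic in the question above concrete, here is a small stand-alone sketch, not part of the patch, that simply replays the constants from msm_gpu.h and the v3 rule of halving the period when a ->progress() callback is present; the main() wrapper and the printed wording are purely illustrative:

/*
 * Stand-alone sketch (illustrative only): effective hang timeout with the
 * progress detection in this series.  Assumes the constants from msm_gpu.h
 * and that the period is halved because ->progress() is implemented.
 */
#include <stdio.h>

#define DRM_MSM_HANGCHECK_DEFAULT_PERIOD   500 /* ms */
#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3

int main(void)
{
	unsigned int period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2;	/* 250ms */

	/*
	 * A submit whose CP keeps advancing but whose fence never signals is
	 * forgiven for PROGRESS_RETRIES timer expirations and killed on the
	 * next one, so it gets (retries + 1) periods in total.
	 */
	unsigned int expirations = DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1;

	printf("effective timeout: %u ms\n", period * expirations);	/* 1000 */
	return 0;
}

That works out to 250ms * 4 = 1000ms, matching the (DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2) * (DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1) reading in the review comment, versus 1500ms for the naive 500ms * 3.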
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 1ff605c18ee6..7fe60c65a1eb 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1843,6 +1843,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
 	return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
 }
 
+static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	struct msm_cp_state cp_state = {
+		.ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
+		.ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
+		.ib1_rem  = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
+		.ib2_rem  = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
+	};
+	bool progress;
+
+	/*
+	 * Adjust the remaining data to account for what has already been
+	 * fetched from memory, but not yet consumed by the SQE.
+	 *
+	 * This is not *technically* correct, the amount buffered could
+	 * exceed the IB size due to hw prefetching ahead, but:
+	 *
+	 * (1) We aren't trying to find the exact position, just whether
+	 *     progress has been made
+	 * (2) The CP_REG_TO_MEM at the end of a submit should be enough
+	 *     to prevent prefetching into an unrelated submit.  (And
+	 *     either way, at some point the ROQ will be full.)
+	 */
+	cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
+	cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;
+
+	progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));
+
+	ring->last_cp_state = cp_state;
+
+	return progress;
+}
+
 static u32 a618_get_speed_bin(u32 fuse)
 {
 	if (fuse == 0)
@@ -1961,6 +1994,7 @@ static const struct adreno_gpu_funcs funcs = {
 		.create_address_space = a6xx_create_address_space,
 		.create_private_address_space = a6xx_create_private_address_space,
 		.get_rptr = a6xx_get_rptr,
+		.progress = a6xx_progress,
 	},
 	.get_timestamp = a6xx_get_timestamp,
 };
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 670651cdfa79..c3b77b44b2aa 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -419,7 +419,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
 	priv->dev = ddev;
 
 	priv->wq = alloc_ordered_workqueue("msm", 0);
-	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
 
 	INIT_LIST_HEAD(&priv->objects);
 	mutex_init(&priv->obj_lock);
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 0609daf4fa4c..876d8d5eec2f 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -225,7 +225,13 @@ struct msm_drm_private {
 
 	struct drm_atomic_state *pm_state;
 
-	/* For hang detection, in ms */
+	/**
+	 * hangcheck_period: For hang detection, in ms
+	 *
+	 * Note that in practice, a submit/job will get at least two hangcheck
+	 * periods, due to checking for progress being implemented as simply
+	 * "have the CP position registers changed since last time?"
+	 */
 	unsigned int hangcheck_period;
 
 	/**
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 3dffee54a951..bfef659d3a5c 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -500,6 +500,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
 			round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
 }
 
+static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
+		return false;
+
+	if (!gpu->funcs->progress)
+		return false;
+
+	if (!gpu->funcs->progress(gpu, ring))
+		return false;
+
+	ring->hangcheck_progress_retries++;
+	return true;
+}
+
 static void hangcheck_handler(struct timer_list *t)
 {
 	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
@@ -511,9 +526,12 @@ static void hangcheck_handler(struct timer_list *t)
 	if (fence != ring->hangcheck_fence) {
 		/* some progress has been made.. ya! */
 		ring->hangcheck_fence = fence;
-	} else if (fence_before(fence, ring->fctx->last_fence)) {
+		ring->hangcheck_progress_retries = 0;
+	} else if (fence_before(fence, ring->fctx->last_fence) &&
+			!made_progress(gpu, ring)) {
 		/* no progress and not done.. hung! */
 		ring->hangcheck_fence = fence;
+		ring->hangcheck_progress_retries = 0;
 		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
 				gpu->name, ring->id);
 		DRM_DEV_ERROR(dev->dev, "%s:     completed fence: %u\n",
@@ -845,6 +863,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
 		const char *name, struct msm_gpu_config *config)
 {
+	struct msm_drm_private *priv = drm->dev_private;
 	int i, ret, nr_rings = config->nr_rings;
 	void *memptrs;
 	uint64_t memptrs_iova;
@@ -872,6 +891,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	kthread_init_work(&gpu->recover_work, recover_worker);
 	kthread_init_work(&gpu->fault_work, fault_worker);
 
+	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
+
+	/*
+	 * If progress detection is supported, halve the hangcheck timer
+	 * duration, as it takes two iterations of the hangcheck handler
+	 * to detect a hang.
+	 */
+	if (funcs->progress)
+		priv->hangcheck_period /= 2;
+
 	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
 
 	spin_lock_init(&gpu->perf_lock);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 585fd9c8d45a..f0fbf6063baa 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -78,6 +78,8 @@ struct msm_gpu_funcs {
 	struct msm_gem_address_space *(*create_private_address_space)
 		(struct msm_gpu *gpu);
 	uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
+
+	bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
 };
 
 /* Additional state for iommu faults: */
@@ -237,6 +239,7 @@ struct msm_gpu {
 #define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */
 
 #define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
+#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
 	struct timer_list hangcheck_timer;
 
 	/* Fault info for most recent iova fault: */
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
index 2a5045abe46e..e3d33bae3380 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.h
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
@@ -35,6 +35,11 @@ struct msm_rbmemptrs {
 	volatile u64 ttbr0;
 };
 
+struct msm_cp_state {
+	uint64_t ib1_base, ib2_base;
+	uint32_t ib1_rem, ib2_rem;
+};
+
 struct msm_ringbuffer {
 	struct msm_gpu *gpu;
 	int id;
@@ -64,6 +69,25 @@ struct msm_ringbuffer {
 	uint64_t memptrs_iova;
 	struct msm_fence_context *fctx;
 
+	/**
+	 * hangcheck_progress_retries:
+	 *
+	 * The number of extra hangcheck duration cycles that we have given
+	 * due to it appearing that the GPU is making forward progress.
+	 *
+	 * If the GPU appears to be making progress (ie. the CP has advanced
+	 * in the command stream, we'll allow up to DRM_MSM_HANGCHECK_PROGRESS_RETRIES
+	 * expirations of the hangcheck timer before killing the job.  In other
+	 * words we'll let the submit run for up to
+	 * DRM_MSM_HANGCHECK_DEFAULT_PERIOD * DRM_MSM_HANGCHECK_PROGRESS_RETRIES
+	 */
+	int hangcheck_progress_retries;
+
+	/**
+	 * last_cp_state: The state of the CP at the last call to gpu->progress()
+	 */
+	struct msm_cp_state last_cp_state;
+
 	/*
 	 * preempt_lock protects preemption and serializes wptr updates against
 	 * preemption.  Can be aquired from irq context.
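For readers following along, a minimal user-space model of the retry logic added in msm_gpu.c is sketched below. fake_ring, fake_progress() and the sampled CP positions are made-up stand-ins (the real code compares the IB1/IB2 base and remaining-size snapshot taken by a6xx_progress()), while made_progress() mirrors the kernel helper:

/*
 * Minimal user-space model of the hangcheck retry flow (illustrative only).
 * "cp_state" is a fake counter standing in for the IB1/IB2 snapshot that
 * a6xx_progress() compares; made_progress() mirrors the helper in msm_gpu.c.
 */
#include <stdbool.h>
#include <stdio.h>

#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3

struct fake_ring {
	unsigned int last_cp_state;
	int hangcheck_progress_retries;
};

/* Stands in for gpu->funcs->progress(): has the CP position changed? */
static bool fake_progress(struct fake_ring *ring, unsigned int cp_state)
{
	bool progress = (cp_state != ring->last_cp_state);

	ring->last_cp_state = cp_state;
	return progress;
}

static bool made_progress(struct fake_ring *ring, unsigned int cp_state)
{
	if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
		return false;

	if (!fake_progress(ring, cp_state))
		return false;

	ring->hangcheck_progress_retries++;
	return true;
}

int main(void)
{
	struct fake_ring ring = { 0, 0 };
	/* CP keeps advancing, but the submit's fence never signals: */
	unsigned int samples[] = { 10, 20, 30, 40 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		if (made_progress(&ring, samples[i])) {
			printf("expiration %u: forgiven (retries=%d)\n",
			       i + 1, ring.hangcheck_progress_retries);
		} else {
			printf("expiration %u: hangcheck declares a hang\n", i + 1);
			break;
		}
	}
	return 0;
}

With a CP position that keeps changing while the fence never signals, the first DRM_MSM_HANGCHECK_PROGRESS_RETRIES expirations are forgiven and the hang is declared on the fourth, which is the behaviour the commit message describes.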