Message ID | 20240522150839.27578-4-Smita.KoralahalliChannabasappa@amd.com |
---|---|
State | New |
Headers | show |
Series | acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors | expand |
On 5/22/24 8:08 AM, Smita Koralahalli wrote: > When PCIe AER is in FW-First, OS should process CXL Protocol errors from > CPER records. > > Reuse the existing work queue cxl_cper_work registered with GHES to notify > the CXL subsystem on a Protocol error. > > The defined trace events cxl_aer_uncorrectable_error and > cxl_aer_correctable_error currently trace native CXL AER errors. Reuse > them to trace FW-First Protocol Errors. > > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > --- > drivers/acpi/apei/ghes.c | 14 ++++++++++++++ > drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ > drivers/cxl/cxlpci.h | 3 +++ > drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- > include/linux/cxl-event.h | 1 + > 5 files changed, 74 insertions(+), 2 deletions(-) > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index 1a58032770ee..a31bd91e9475 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata) > > if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) > return; > + > + guard(spinlock_irqsave)(&cxl_cper_work_lock); > + > + if (!cxl_cper_work) > + return; > + > + wd.event_type = CXL_CPER_EVENT_PROT_ERR; > + > + if (!kfifo_put(&cxl_cper_fifo, wd)) { > + pr_err_ratelimited("CXL CPER kfifo overflow\n"); > + return; > + } > + > + schedule_work(cxl_cper_work); > } > > int cxl_cper_register_work(struct work_struct *work) > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 0df09bd79408..ef9438cb1dd6 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port) > } > EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); > > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, > + struct cxl_cper_prot_err *p_err) > +{ > + u32 status, fe; > + > + if (p_err->severity == CXL_AER_CORRECTABLE) { > + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask; > + > + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); > + } else { > + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask; > + > + if (hweight32(status) > 1) > + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, > + p_err->cxl_ras.cap_control)); > + else > + fe = status; > + > + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, > + p_err->cxl_ras.header_log); > + } > +} > +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL); > + > static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, > void __iomem *ras_base) > { > diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h > index 93992a1c8eec..0ba3215786e1 100644 > --- a/drivers/cxl/cxlpci.h > +++ b/drivers/cxl/cxlpci.h > @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port); > void cxl_cor_error_detected(struct pci_dev *pdev); > pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > pci_channel_state_t state); > +struct cxl_cper_prot_err; > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, > + struct cxl_cper_prot_err *p_err); > #endif /* __CXL_PCI_H__ */ > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > index 74876c9835e8..3e3c36983686 100644 > --- a/drivers/cxl/pci.c > +++ b/drivers/cxl/pci.c > @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type, > &uuid_null, &rec->event); > } > > +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err) > +{ > + struct pci_dev *pdev __free(pci_dev_put) = NULL; > + struct cxl_dev_state *cxlds; > + unsigned int devfn; > + > + devfn = PCI_DEVFN(p_err->device, p_err->function); > + pdev = pci_get_domain_bus_and_slot(p_err->segment, > + p_err->bus, devfn); > + if (!pdev) > + return; > + > + guard(device)(&pdev->dev); > + if (pdev->driver != &cxl_pci_driver) > + return; > + > + cxlds = pci_get_drvdata(pdev); > + if (!cxlds) > + return; > + > + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial) > + pr_warn("CPER-reported device serial number does not match expected value\n"); Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served. DJ > + > + cxl_trace_prot_err(cxlds, p_err); > +} > + > static void cxl_cper_work_fn(struct work_struct *work) > { > struct cxl_cper_work_data wd; > > - while (cxl_cper_kfifo_get(&wd)) > - cxl_handle_cper_event(wd.event_type, &wd.rec); > + while (cxl_cper_kfifo_get(&wd)) { > + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR) > + cxl_handle_prot_err(&wd.p_err); > + else > + cxl_handle_cper_event(wd.event_type, &wd.rec); > + } > } > static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn); > > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h > index 9c7b69e076a0..5562844df850 100644 > --- a/include/linux/cxl-event.h > +++ b/include/linux/cxl-event.h > @@ -122,6 +122,7 @@ struct cxl_event_record_raw { > } __packed; > > enum cxl_event_type { > + CXL_CPER_EVENT_PROT_ERR, > CXL_CPER_EVENT_GENERIC, > CXL_CPER_EVENT_GEN_MEDIA, > CXL_CPER_EVENT_DRAM,
On Wed, May 22, 2024 at 03:08:38PM +0000, Smita Koralahalli wrote: > When PCIe AER is in FW-First, OS should process CXL Protocol errors from > CPER records. > > Reuse the existing work queue cxl_cper_work registered with GHES to notify > the CXL subsystem on a Protocol error. > > The defined trace events cxl_aer_uncorrectable_error and > cxl_aer_correctable_error currently trace native CXL AER errors. Reuse > them to trace FW-First Protocol Errors. Will the trace log differentiate between errors reported in FW-First versus Native mode? Wondering if that bit of info needs to be logged or is discoverable elsewhere. Otherwise, LGTM, Reviewed-by: Alison Schofield <alison.schofield@intel.com> > > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > --- > drivers/acpi/apei/ghes.c | 14 ++++++++++++++ > drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ > drivers/cxl/cxlpci.h | 3 +++ > drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- > include/linux/cxl-event.h | 1 + > 5 files changed, 74 insertions(+), 2 deletions(-) snip
On Wed, May 22, 2024 at 11:05:49AM -0700, Dave Jiang wrote: > > > On 5/22/24 8:08 AM, Smita Koralahalli wrote: > > When PCIe AER is in FW-First, OS should process CXL Protocol errors from > > CPER records. > > > > Reuse the existing work queue cxl_cper_work registered with GHES to notify > > the CXL subsystem on a Protocol error. > > > > The defined trace events cxl_aer_uncorrectable_error and > > cxl_aer_correctable_error currently trace native CXL AER errors. Reuse > > them to trace FW-First Protocol Errors. > > > > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > > --- > > drivers/acpi/apei/ghes.c | 14 ++++++++++++++ > > drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ > > drivers/cxl/cxlpci.h | 3 +++ > > drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- > > include/linux/cxl-event.h | 1 + > > 5 files changed, 74 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > > index 1a58032770ee..a31bd91e9475 100644 > > --- a/drivers/acpi/apei/ghes.c > > +++ b/drivers/acpi/apei/ghes.c > > @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata) > > > > if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) > > return; > > + > > + guard(spinlock_irqsave)(&cxl_cper_work_lock); > > + > > + if (!cxl_cper_work) > > + return; > > + > > + wd.event_type = CXL_CPER_EVENT_PROT_ERR; > > + > > + if (!kfifo_put(&cxl_cper_fifo, wd)) { > > + pr_err_ratelimited("CXL CPER kfifo overflow\n"); > > + return; > > + } > > + > > + schedule_work(cxl_cper_work); > > } > > > > int cxl_cper_register_work(struct work_struct *work) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > > index 0df09bd79408..ef9438cb1dd6 100644 > > --- a/drivers/cxl/core/pci.c > > +++ b/drivers/cxl/core/pci.c > > @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port) > > } > > EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); > > > > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, > > + struct cxl_cper_prot_err *p_err) > > +{ > > + u32 status, fe; > > + > > + if (p_err->severity == CXL_AER_CORRECTABLE) { > > + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask; > > + > > + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); > > + } else { > > + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask; > > + > > + if (hweight32(status) > 1) > > + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, > > + p_err->cxl_ras.cap_control)); > > + else > > + fe = status; > > + > > + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, > > + p_err->cxl_ras.header_log); > > + } > > +} > > +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL); > > + > > static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, > > void __iomem *ras_base) > > { > > diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h > > index 93992a1c8eec..0ba3215786e1 100644 > > --- a/drivers/cxl/cxlpci.h > > +++ b/drivers/cxl/cxlpci.h > > @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port); > > void cxl_cor_error_detected(struct pci_dev *pdev); > > pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > > pci_channel_state_t state); > > +struct cxl_cper_prot_err; > > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, > > + struct cxl_cper_prot_err *p_err); > > #endif /* __CXL_PCI_H__ */ > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > > index 74876c9835e8..3e3c36983686 100644 > > --- a/drivers/cxl/pci.c > > +++ b/drivers/cxl/pci.c > > @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type, > > &uuid_null, &rec->event); > > } > > > > +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err) > > +{ > > + struct pci_dev *pdev __free(pci_dev_put) = NULL; > > + struct cxl_dev_state *cxlds; > > + unsigned int devfn; > > + > > + devfn = PCI_DEVFN(p_err->device, p_err->function); > > + pdev = pci_get_domain_bus_and_slot(p_err->segment, > > + p_err->bus, devfn); > > + if (!pdev) > > + return; > > + > > + guard(device)(&pdev->dev); > > + if (pdev->driver != &cxl_pci_driver) > > + return; > > + > > + cxlds = pci_get_drvdata(pdev); > > + if (!cxlds) > > + return; > > + > > + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial) > > + pr_warn("CPER-reported device serial number does not match expected value\n"); > > Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served. > > DJ Good point. Providing the dev lets the user look up the serial number, meaning this message doesn't need to include an 'expected' but not found value. -- Alison > > + > > + cxl_trace_prot_err(cxlds, p_err); > > +} > > + > > static void cxl_cper_work_fn(struct work_struct *work) > > { > > struct cxl_cper_work_data wd; > > > > - while (cxl_cper_kfifo_get(&wd)) > > - cxl_handle_cper_event(wd.event_type, &wd.rec); > > + while (cxl_cper_kfifo_get(&wd)) { > > + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR) > > + cxl_handle_prot_err(&wd.p_err); > > + else > > + cxl_handle_cper_event(wd.event_type, &wd.rec); > > + } > > } > > static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn); > > > > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h > > index 9c7b69e076a0..5562844df850 100644 > > --- a/include/linux/cxl-event.h > > +++ b/include/linux/cxl-event.h > > @@ -122,6 +122,7 @@ struct cxl_event_record_raw { > > } __packed; > > > > enum cxl_event_type { > > + CXL_CPER_EVENT_PROT_ERR, > > CXL_CPER_EVENT_GENERIC, > > CXL_CPER_EVENT_GEN_MEDIA, > > CXL_CPER_EVENT_DRAM,
On 5/22/2024 11:05 AM, Dave Jiang wrote: > > > On 5/22/24 8:08 AM, Smita Koralahalli wrote: >> When PCIe AER is in FW-First, OS should process CXL Protocol errors from >> CPER records. >> >> Reuse the existing work queue cxl_cper_work registered with GHES to notify >> the CXL subsystem on a Protocol error. >> >> The defined trace events cxl_aer_uncorrectable_error and >> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse >> them to trace FW-First Protocol Errors. >> >> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> >> --- >> drivers/acpi/apei/ghes.c | 14 ++++++++++++++ >> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ >> drivers/cxl/cxlpci.h | 3 +++ >> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- >> include/linux/cxl-event.h | 1 + >> 5 files changed, 74 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index 1a58032770ee..a31bd91e9475 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata) >> >> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) >> return; >> + >> + guard(spinlock_irqsave)(&cxl_cper_work_lock); >> + >> + if (!cxl_cper_work) >> + return; >> + >> + wd.event_type = CXL_CPER_EVENT_PROT_ERR; >> + >> + if (!kfifo_put(&cxl_cper_fifo, wd)) { >> + pr_err_ratelimited("CXL CPER kfifo overflow\n"); >> + return; >> + } >> + >> + schedule_work(cxl_cper_work); >> } >> >> int cxl_cper_register_work(struct work_struct *work) >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c >> index 0df09bd79408..ef9438cb1dd6 100644 >> --- a/drivers/cxl/core/pci.c >> +++ b/drivers/cxl/core/pci.c >> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port) >> } >> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); >> >> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, >> + struct cxl_cper_prot_err *p_err) >> +{ >> + u32 status, fe; >> + >> + if (p_err->severity == CXL_AER_CORRECTABLE) { >> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask; >> + >> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); >> + } else { >> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask; >> + >> + if (hweight32(status) > 1) >> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, >> + p_err->cxl_ras.cap_control)); >> + else >> + fe = status; >> + >> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, >> + p_err->cxl_ras.header_log); >> + } >> +} >> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL); >> + >> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, >> void __iomem *ras_base) >> { >> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h >> index 93992a1c8eec..0ba3215786e1 100644 >> --- a/drivers/cxl/cxlpci.h >> +++ b/drivers/cxl/cxlpci.h >> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port); >> void cxl_cor_error_detected(struct pci_dev *pdev); >> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, >> pci_channel_state_t state); >> +struct cxl_cper_prot_err; >> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, >> + struct cxl_cper_prot_err *p_err); >> #endif /* __CXL_PCI_H__ */ >> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c >> index 74876c9835e8..3e3c36983686 100644 >> --- a/drivers/cxl/pci.c >> +++ b/drivers/cxl/pci.c >> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type, >> &uuid_null, &rec->event); >> } >> >> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err) >> +{ >> + struct pci_dev *pdev __free(pci_dev_put) = NULL; >> + struct cxl_dev_state *cxlds; >> + unsigned int devfn; >> + >> + devfn = PCI_DEVFN(p_err->device, p_err->function); >> + pdev = pci_get_domain_bus_and_slot(p_err->segment, >> + p_err->bus, devfn); >> + if (!pdev) >> + return; >> + >> + guard(device)(&pdev->dev); >> + if (pdev->driver != &cxl_pci_driver) >> + return; >> + >> + cxlds = pci_get_drvdata(pdev); >> + if (!cxlds) >> + return; >> + >> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial) >> + pr_warn("CPER-reported device serial number does not match expected value\n"); > > Given that we are operating on a device, perhaps dev_warn(&pdev->dev, ...) may be better served. Will fix. Thanks, Smita [snip]
On 5/22/2024 5:22 PM, Alison Schofield wrote: > On Wed, May 22, 2024 at 03:08:38PM +0000, Smita Koralahalli wrote: >> When PCIe AER is in FW-First, OS should process CXL Protocol errors from >> CPER records. >> >> Reuse the existing work queue cxl_cper_work registered with GHES to notify >> the CXL subsystem on a Protocol error. >> >> The defined trace events cxl_aer_uncorrectable_error and >> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse >> them to trace FW-First Protocol Errors. > > Will the trace log differentiate between errors reported in FW-First > versus Native mode? Wondering if that bit of info needs to be logged > or is discoverable elsewhere. No, the trace log won't differentiate currently. But just a side note, FW-First also logs errors in dmesg. I'm not sure if going forward, we would still continue to log errors in dmesg. But I feel it might be needed so that we don't miss errors from RCH Downstream Port or hexdump of unrecognized agent types. Thanks Smita > > Otherwise, LGTM, > Reviewed-by: Alison Schofield <alison.schofield@intel.com> > > >> >> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> >> --- >> drivers/acpi/apei/ghes.c | 14 ++++++++++++++ >> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ >> drivers/cxl/cxlpci.h | 3 +++ >> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- >> include/linux/cxl-event.h | 1 + >> 5 files changed, 74 insertions(+), 2 deletions(-) > > snip > >
Smita Koralahalli wrote: > When PCIe AER is in FW-First, OS should process CXL Protocol errors from > CPER records. > > Reuse the existing work queue cxl_cper_work registered with GHES to notify > the CXL subsystem on a Protocol error. > > The defined trace events cxl_aer_uncorrectable_error and > cxl_aer_correctable_error currently trace native CXL AER errors. Reuse > them to trace FW-First Protocol Errors. > > Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> > --- > drivers/acpi/apei/ghes.c | 14 ++++++++++++++ > drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ > drivers/cxl/cxlpci.h | 3 +++ > drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- > include/linux/cxl-event.h | 1 + > 5 files changed, 74 insertions(+), 2 deletions(-) > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c > index 1a58032770ee..a31bd91e9475 100644 > --- a/drivers/acpi/apei/ghes.c > +++ b/drivers/acpi/apei/ghes.c > @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata) > > if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) > return; > + > + guard(spinlock_irqsave)(&cxl_cper_work_lock); > + > + if (!cxl_cper_work) > + return; > + > + wd.event_type = CXL_CPER_EVENT_PROT_ERR; > + > + if (!kfifo_put(&cxl_cper_fifo, wd)) { > + pr_err_ratelimited("CXL CPER kfifo overflow\n"); > + return; > + } > + > + schedule_work(cxl_cper_work); This seems wrong to unconditionally schedule the cxl_pci driver to look at potentially "non-device" errors. With Terry's upcoming CXL switch port error handling there will be a native path for those errors, but until that arrives, I see no point in this code trying to convey root/switch port errors to the endpoint driver. > } > > int cxl_cper_register_work(struct work_struct *work) > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 0df09bd79408..ef9438cb1dd6 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port) > } > EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); > > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, > + struct cxl_cper_prot_err *p_err) > +{ > + u32 status, fe; > + > + if (p_err->severity == CXL_AER_CORRECTABLE) { > + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask; > + > + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); > + } else { > + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask; > + > + if (hweight32(status) > 1) > + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, > + p_err->cxl_ras.cap_control)); > + else > + fe = status; > + > + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, > + p_err->cxl_ras.header_log); > + } > +} > +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL); > + > static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, > void __iomem *ras_base) > { > diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h > index 93992a1c8eec..0ba3215786e1 100644 > --- a/drivers/cxl/cxlpci.h > +++ b/drivers/cxl/cxlpci.h > @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port); > void cxl_cor_error_detected(struct pci_dev *pdev); > pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > pci_channel_state_t state); > +struct cxl_cper_prot_err; > +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, > + struct cxl_cper_prot_err *p_err); > #endif /* __CXL_PCI_H__ */ > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c > index 74876c9835e8..3e3c36983686 100644 > --- a/drivers/cxl/pci.c > +++ b/drivers/cxl/pci.c > @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type, > &uuid_null, &rec->event); > } > > +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err) Can we call this variable @rec instead of @p_err? The data being passed is CPER data which is a "record" structure. > +{ > + struct pci_dev *pdev __free(pci_dev_put) = NULL; > + struct cxl_dev_state *cxlds; > + unsigned int devfn; > + > + devfn = PCI_DEVFN(p_err->device, p_err->function); > + pdev = pci_get_domain_bus_and_slot(p_err->segment, > + p_err->bus, devfn); > + if (!pdev) > + return; > + > + guard(device)(&pdev->dev); > + if (pdev->driver != &cxl_pci_driver) > + return; > + > + cxlds = pci_get_drvdata(pdev); > + if (!cxlds) > + return; > + > + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial) > + pr_warn("CPER-reported device serial number does not match expected value\n"); Not much the end user can do about this warning, I would say skip this message, or make it a pci_dbg() because matching by BDF should be sufficient. > + > + cxl_trace_prot_err(cxlds, p_err); > +} > + > static void cxl_cper_work_fn(struct work_struct *work) > { > struct cxl_cper_work_data wd; > > - while (cxl_cper_kfifo_get(&wd)) > - cxl_handle_cper_event(wd.event_type, &wd.rec); > + while (cxl_cper_kfifo_get(&wd)) { > + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR) > + cxl_handle_prot_err(&wd.p_err); > + else > + cxl_handle_cper_event(wd.event_type, &wd.rec); > + } > } > static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn); > > diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h > index 9c7b69e076a0..5562844df850 100644 > --- a/include/linux/cxl-event.h > +++ b/include/linux/cxl-event.h > @@ -122,6 +122,7 @@ struct cxl_event_record_raw { > } __packed; > > enum cxl_event_type { > + CXL_CPER_EVENT_PROT_ERR, > CXL_CPER_EVENT_GENERIC, > CXL_CPER_EVENT_GEN_MEDIA, > CXL_CPER_EVENT_DRAM, > -- > 2.17.1 >
Hi Dan, On 6/11/2024 5:07 PM, Dan Williams wrote: > Smita Koralahalli wrote: >> When PCIe AER is in FW-First, OS should process CXL Protocol errors from >> CPER records. >> >> Reuse the existing work queue cxl_cper_work registered with GHES to notify >> the CXL subsystem on a Protocol error. >> >> The defined trace events cxl_aer_uncorrectable_error and >> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse >> them to trace FW-First Protocol Errors. >> >> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> >> --- >> drivers/acpi/apei/ghes.c | 14 ++++++++++++++ >> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ >> drivers/cxl/cxlpci.h | 3 +++ >> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- >> include/linux/cxl-event.h | 1 + >> 5 files changed, 74 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >> index 1a58032770ee..a31bd91e9475 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata) >> >> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) >> return; >> + >> + guard(spinlock_irqsave)(&cxl_cper_work_lock); >> + >> + if (!cxl_cper_work) >> + return; >> + >> + wd.event_type = CXL_CPER_EVENT_PROT_ERR; >> + >> + if (!kfifo_put(&cxl_cper_fifo, wd)) { >> + pr_err_ratelimited("CXL CPER kfifo overflow\n"); >> + return; >> + } >> + >> + schedule_work(cxl_cper_work); > > This seems wrong to unconditionally schedule the cxl_pci driver to look > at potentially "non-device" errors. With Terry's upcoming CXL switch > port error handling there will be a native path for those errors, but > until that arrives, I see no point in this code trying to convey > root/switch port errors to the endpoint driver. I see okay. What are your recommendations on this? Just confine it to CXL RCD, CXL SLD and CXL LD? And then extend it to ports once Terry sends patches? Also, I'm not sure about FMLD. Should we just drop it as of now? > >> } >> >> int cxl_cper_register_work(struct work_struct *work) >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c >> index 0df09bd79408..ef9438cb1dd6 100644 >> --- a/drivers/cxl/core/pci.c >> +++ b/drivers/cxl/core/pci.c >> @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port) >> } >> EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); >> >> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, >> + struct cxl_cper_prot_err *p_err) >> +{ >> + u32 status, fe; >> + >> + if (p_err->severity == CXL_AER_CORRECTABLE) { >> + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask; >> + >> + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); >> + } else { >> + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask; >> + >> + if (hweight32(status) > 1) >> + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, >> + p_err->cxl_ras.cap_control)); >> + else >> + fe = status; >> + >> + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, >> + p_err->cxl_ras.header_log); >> + } >> +} >> +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL); >> + >> static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, >> void __iomem *ras_base) >> { >> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h >> index 93992a1c8eec..0ba3215786e1 100644 >> --- a/drivers/cxl/cxlpci.h >> +++ b/drivers/cxl/cxlpci.h >> @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port); >> void cxl_cor_error_detected(struct pci_dev *pdev); >> pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, >> pci_channel_state_t state); >> +struct cxl_cper_prot_err; >> +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, >> + struct cxl_cper_prot_err *p_err); >> #endif /* __CXL_PCI_H__ */ >> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c >> index 74876c9835e8..3e3c36983686 100644 >> --- a/drivers/cxl/pci.c >> +++ b/drivers/cxl/pci.c >> @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type, >> &uuid_null, &rec->event); >> } >> >> +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err) > > Can we call this variable @rec instead of @p_err? The data being passed > is CPER data which is a "record" structure. Will change. > >> +{ >> + struct pci_dev *pdev __free(pci_dev_put) = NULL; >> + struct cxl_dev_state *cxlds; >> + unsigned int devfn; >> + >> + devfn = PCI_DEVFN(p_err->device, p_err->function); >> + pdev = pci_get_domain_bus_and_slot(p_err->segment, >> + p_err->bus, devfn); >> + if (!pdev) >> + return; >> + >> + guard(device)(&pdev->dev); >> + if (pdev->driver != &cxl_pci_driver) >> + return; >> + >> + cxlds = pci_get_drvdata(pdev); >> + if (!cxlds) >> + return; >> + >> + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial) >> + pr_warn("CPER-reported device serial number does not match expected value\n"); > > Not much the end user can do about this warning, I would say skip this > message, or make it a pci_dbg() because matching by BDF should be > sufficient. Will skip this message. Thanks Smita > >> + >> + cxl_trace_prot_err(cxlds, p_err); >> +} >> + >> static void cxl_cper_work_fn(struct work_struct *work) >> { >> struct cxl_cper_work_data wd; >> >> - while (cxl_cper_kfifo_get(&wd)) >> - cxl_handle_cper_event(wd.event_type, &wd.rec); >> + while (cxl_cper_kfifo_get(&wd)) { >> + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR) >> + cxl_handle_prot_err(&wd.p_err); >> + else >> + cxl_handle_cper_event(wd.event_type, &wd.rec); >> + } >> } >> static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn); >> >> diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h >> index 9c7b69e076a0..5562844df850 100644 >> --- a/include/linux/cxl-event.h >> +++ b/include/linux/cxl-event.h >> @@ -122,6 +122,7 @@ struct cxl_event_record_raw { >> } __packed; >> >> enum cxl_event_type { >> + CXL_CPER_EVENT_PROT_ERR, >> CXL_CPER_EVENT_GENERIC, >> CXL_CPER_EVENT_GEN_MEDIA, >> CXL_CPER_EVENT_DRAM, >> -- >> 2.17.1 >> > >
Hi Dan, On 6/13/2024 10:47 AM, Smita Koralahalli wrote: > Hi Dan, > > On 6/11/2024 5:07 PM, Dan Williams wrote: >> Smita Koralahalli wrote: >>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from >>> CPER records. >>> >>> Reuse the existing work queue cxl_cper_work registered with GHES to >>> notify >>> the CXL subsystem on a Protocol error. >>> >>> The defined trace events cxl_aer_uncorrectable_error and >>> cxl_aer_correctable_error currently trace native CXL AER errors. Reuse >>> them to trace FW-First Protocol Errors. >>> >>> Signed-off-by: Smita Koralahalli >>> <Smita.KoralahalliChannabasappa@amd.com> >>> --- >>> drivers/acpi/apei/ghes.c | 14 ++++++++++++++ >>> drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ >>> drivers/cxl/cxlpci.h | 3 +++ >>> drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- >>> include/linux/cxl-event.h | 1 + >>> 5 files changed, 74 insertions(+), 2 deletions(-) >>> >>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c >>> index 1a58032770ee..a31bd91e9475 100644 >>> --- a/drivers/acpi/apei/ghes.c >>> +++ b/drivers/acpi/apei/ghes.c >>> @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct >>> acpi_hest_generic_data *gdata) >>> if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) >>> return; >>> + >>> + guard(spinlock_irqsave)(&cxl_cper_work_lock); >>> + >>> + if (!cxl_cper_work) >>> + return; >>> + >>> + wd.event_type = CXL_CPER_EVENT_PROT_ERR; >>> + >>> + if (!kfifo_put(&cxl_cper_fifo, wd)) { >>> + pr_err_ratelimited("CXL CPER kfifo overflow\n"); >>> + return; >>> + } >>> + >>> + schedule_work(cxl_cper_work); >> >> This seems wrong to unconditionally schedule the cxl_pci driver to look >> at potentially "non-device" errors. With Terry's upcoming CXL switch >> port error handling there will be a native path for those errors, but >> until that arrives, I see no point in this code trying to convey >> root/switch port errors to the endpoint driver. > > I see okay. What are your recommendations on this? Just confine it to > CXL RCD, CXL SLD and CXL LD? And then extend it to ports once Terry > sends patches? > > Also, I'm not sure about FMLD. Should we just drop it as of now? > Since, Terry sent his port error handling patches, shall I keep the above check as is? That is schedule cxl_pci driver on all device and port errors with mention to be rebased on Terry's. I'm slightly doubtful on FMLD though. Thanks, Smita [snip]
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 1a58032770ee..a31bd91e9475 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -723,6 +723,20 @@ static void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata) if (cxl_cper_handle_prot_err_info(gdata, &wd.p_err)) return; + + guard(spinlock_irqsave)(&cxl_cper_work_lock); + + if (!cxl_cper_work) + return; + + wd.event_type = CXL_CPER_EVENT_PROT_ERR; + + if (!kfifo_put(&cxl_cper_fifo, wd)) { + pr_err_ratelimited("CXL CPER kfifo overflow\n"); + return; + } + + schedule_work(cxl_cper_work); } int cxl_cper_register_work(struct work_struct *work) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 0df09bd79408..ef9438cb1dd6 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -686,6 +686,30 @@ void read_cdat_data(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, + struct cxl_cper_prot_err *p_err) +{ + u32 status, fe; + + if (p_err->severity == CXL_AER_CORRECTABLE) { + status = p_err->cxl_ras.cor_status & ~p_err->cxl_ras.cor_mask; + + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + } else { + status = p_err->cxl_ras.uncor_status & ~p_err->cxl_ras.uncor_mask; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + p_err->cxl_ras.cap_control)); + else + fe = status; + + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, + p_err->cxl_ras.header_log); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_trace_prot_err, CXL); + static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 93992a1c8eec..0ba3215786e1 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -130,4 +130,7 @@ void read_cdat_data(struct cxl_port *port); void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); +struct cxl_cper_prot_err; +void cxl_trace_prot_err(struct cxl_dev_state *cxlds, + struct cxl_cper_prot_err *p_err); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 74876c9835e8..3e3c36983686 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1011,12 +1011,42 @@ static void cxl_handle_cper_event(enum cxl_event_type ev_type, &uuid_null, &rec->event); } +static void cxl_handle_prot_err(struct cxl_cper_prot_err *p_err) +{ + struct pci_dev *pdev __free(pci_dev_put) = NULL; + struct cxl_dev_state *cxlds; + unsigned int devfn; + + devfn = PCI_DEVFN(p_err->device, p_err->function); + pdev = pci_get_domain_bus_and_slot(p_err->segment, + p_err->bus, devfn); + if (!pdev) + return; + + guard(device)(&pdev->dev); + if (pdev->driver != &cxl_pci_driver) + return; + + cxlds = pci_get_drvdata(pdev); + if (!cxlds) + return; + + if (((u64)p_err->upper_dw << 32 | p_err->lower_dw) != cxlds->serial) + pr_warn("CPER-reported device serial number does not match expected value\n"); + + cxl_trace_prot_err(cxlds, p_err); +} + static void cxl_cper_work_fn(struct work_struct *work) { struct cxl_cper_work_data wd; - while (cxl_cper_kfifo_get(&wd)) - cxl_handle_cper_event(wd.event_type, &wd.rec); + while (cxl_cper_kfifo_get(&wd)) { + if (wd.event_type == CXL_CPER_EVENT_PROT_ERR) + cxl_handle_prot_err(&wd.p_err); + else + cxl_handle_cper_event(wd.event_type, &wd.rec); + } } static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 9c7b69e076a0..5562844df850 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -122,6 +122,7 @@ struct cxl_event_record_raw { } __packed; enum cxl_event_type { + CXL_CPER_EVENT_PROT_ERR, CXL_CPER_EVENT_GENERIC, CXL_CPER_EVENT_GEN_MEDIA, CXL_CPER_EVENT_DRAM,
When PCIe AER is in FW-First, OS should process CXL Protocol errors from CPER records. Reuse the existing work queue cxl_cper_work registered with GHES to notify the CXL subsystem on a Protocol error. The defined trace events cxl_aer_uncorrectable_error and cxl_aer_correctable_error currently trace native CXL AER errors. Reuse them to trace FW-First Protocol Errors. Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> --- drivers/acpi/apei/ghes.c | 14 ++++++++++++++ drivers/cxl/core/pci.c | 24 ++++++++++++++++++++++++ drivers/cxl/cxlpci.h | 3 +++ drivers/cxl/pci.c | 34 ++++++++++++++++++++++++++++++++-- include/linux/cxl-event.h | 1 + 5 files changed, 74 insertions(+), 2 deletions(-)