Message ID | 20250123084421.127697-7-Smita.KoralahalliChannabasappa@amd.com |
---|---|
State | New |
Headers | show |
Series | acpi/ghes, cper, cxl: Process CXL CPER Protocol errors | expand |
Smita Koralahalli wrote:
> The CXL drivers use kernel trace functions for logging endpoint and
> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
> Upstream Switch Ports.
>
> Introduce trace logging functions for both RAS correctable and
> uncorrectable errors specific to CXL PCIe Ports. Use them to trace
> FW-First Protocol errors.
>
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Reviewed-by: Ira Weiny <ira.weiny@intel.com>
On Thu, Jan 23, 2025 at 08:44:21AM +0000, Smita Koralahalli wrote:
> The CXL drivers use kernel trace functions for logging endpoint and
> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
> Upstream Switch Ports.
>
> Introduce trace logging functions for both RAS correctable and
> uncorrectable errors specific to CXL PCIe Ports. Use them to trace
> FW-First Protocol errors.
>
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

Reviewed-by: Gregory Price <gourry@gourry.net>

> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 9d4b5f39b21a..766447c169c8 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1168,6 +1168,7 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
> data->prot_err.agent_addr.bus,
> devfn);
> + int port_type;
>
> if (!pdev)
> return;
> @@ -1176,6 +1177,18 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> if (pdev->driver != &cxl_pci_driver)
> return;
>
> + port_type = pci_pcie_type(pdev);
> + if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
> + port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> + port_type == PCI_EXP_TYPE_UPSTREAM) {

Almost wish this was a macro for the sake of style, but not worth it. "corr_prot_err" and "port_prot_err" kind of blend at first glance. Not worth holding anything up.

> + if (data->severity == AER_CORRECTABLE)
> + cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
> + else
> + cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
> +
> + return;
> + }
> +
> if (data->severity == AER_CORRECTABLE)
> cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> else
> --
> 2.17.1
>
Smita Koralahalli wrote:
> The CXL drivers use kernel trace functions for logging endpoint and
> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
> Upstream Switch Ports.
>
> Introduce trace logging functions for both RAS correctable and
> uncorrectable errors specific to CXL PCIe Ports. Use them to trace
> FW-First Protocol errors.
>
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

I think this functionality moves to a central / non-cxl_pci location once we have a formal CXL AER path established. So, for this series you can add my Reviewed-by: to patches 1-4, but I am not yet convinced cxl_pci should play a role in emitting protocol errors compared to a centralized place in the core.
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 5840056bb9a3..b535da901dec 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -686,6 +686,32 @@ void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, } EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, "CXL"); +void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; + + trace_cxl_port_aer_correctable_error(&pdev->dev, status); +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_port_prot_err, "CXL"); + +void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; + u32 fe; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + ras_cap.cap_control)); + else + fe = status; + + trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe, + ras_cap.header_log); +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_port_prot_err, "CXL"); + static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index 8389a94adb1a..f684a2ae14e8 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -48,6 +48,34 @@ { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ ) +TRACE_EVENT(cxl_port_aer_uncorrectable_error, + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), + TP_STRUCT__entry( + __string(devname, dev_name(dev)) + __string(parent, dev_name(dev->parent)) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(devname); + __assign_str(parent); + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. 
+ */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", + __get_str(devname), __get_str(parent), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + TRACE_EVENT(cxl_aer_uncorrectable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), TP_ARGS(cxlmd, status, fe, hl), @@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ ) +TRACE_EVENT(cxl_port_aer_correctable_error, + TP_PROTO(struct device *dev, u32 status), + TP_ARGS(dev, status), + TP_STRUCT__entry( + __string(devname, dev_name(dev)) + __string(parent, dev_name(dev->parent)) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(devname); + __assign_str(parent); + __entry->status = status; + ), + TP_printk("device=%s host=%s status='%s'", + __get_str(devname), __get_str(parent), + show_ce_errs(__entry->status) + ) +); + TRACE_EVENT(cxl_aer_correctable_error, TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), TP_ARGS(cxlmd, status), diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index e457616373ed..23f2b1c9bd13 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -134,4 +134,8 @@ void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, struct cxl_ras_capability_regs ras_cap); void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, struct cxl_ras_capability_regs ras_cap); +void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap); +void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 9d4b5f39b21a..766447c169c8 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1168,6 +1168,7 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) 
pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, data->prot_err.agent_addr.bus, devfn); + int port_type; if (!pdev) return; @@ -1176,6 +1177,18 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) if (pdev->driver != &cxl_pci_driver) return; + port_type = pci_pcie_type(pdev); + if (port_type == PCI_EXP_TYPE_ROOT_PORT || + port_type == PCI_EXP_TYPE_DOWNSTREAM || + port_type == PCI_EXP_TYPE_UPSTREAM) { + if (data->severity == AER_CORRECTABLE) + cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap); + else + cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap); + + return; + } + if (data->severity == AER_CORRECTABLE) cxl_cper_trace_corr_prot_err(pdev, data->ras_cap); else