diff mbox series

[v6,5/6] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors

Message ID 20250123084421.127697-6-Smita.KoralahalliChannabasappa@amd.com
State New
Headers show
Series acpi/ghes, cper, cxl: Process CXL CPER Protocol errors | expand

Commit Message

Koralahalli Channabasappa, Smita Jan. 23, 2025, 8:44 a.m. UTC
When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records. Introduce support for handling and logging CXL Protocol
errors.

The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
to trace FW-First Protocol errors.

Since the CXL code is required to be called from process context and
GHES is in interrupt context, use workqueues for processing.

Similar to CXL CPER event handling, use kfifo to handle errors as it
simplifies queue processing by providing lock free fifo operations.

Add the ability for the CXL sub-system to register a workqueue to
process CXL CPER protocol errors.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/acpi/apei/ghes.c | 49 ++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/pci.c   | 36 +++++++++++++++++++++++++++++
 drivers/cxl/cxlpci.h     |  5 ++++
 drivers/cxl/pci.c        | 46 ++++++++++++++++++++++++++++++++++++-
 include/cxl/event.h      | 15 ++++++++++++
 5 files changed, 150 insertions(+), 1 deletion(-)

Comments

Luck, Tony Feb. 3, 2025, 7:03 p.m. UTC | #1
On Thu, Jan 23, 2025 at 08:44:20AM +0000, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
> 
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
> to trace FW-First Protocol errors.
> 
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
> 
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
> 
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> ---
>  drivers/acpi/apei/ghes.c | 49 ++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/pci.c   | 36 +++++++++++++++++++++++++++++
>  drivers/cxl/cxlpci.h     |  5 ++++
>  drivers/cxl/pci.c        | 46 ++++++++++++++++++++++++++++++++++++-
>  include/cxl/event.h      | 15 ++++++++++++
>  5 files changed, 150 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 4d725d988c43..289e365f84b2 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -674,6 +674,15 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
>  	schedule_work(&entry->work);
>  }
>  
> +/* Room for 8 entries */

Any science behind the choice of "8" here? This comment is merely
stating what the #define is used for, not why 8 was chosen.

> +#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
> +static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
> +		    CXL_CPER_PROT_ERR_FIFO_DEPTH);
> +
> +/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
> +static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
> +struct work_struct *cxl_cper_prot_err_work;
> +
>  static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
>  				   int severity)
>  {
> @@ -700,6 +709,11 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
>  	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
>  		pr_warn(FW_WARN "CXL CPER no device serial number\n");
>  
> +	guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
> +
> +	if (!cxl_cper_prot_err_work)
> +		return;
> +
>  	switch (prot_err->agent_type) {
>  	case RCD:
>  	case DEVICE:
> @@ -721,9 +735,44 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
>  				   prot_err->agent_type);
>  		return;
>  	}
> +
> +	if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
> +		pr_err_ratelimited("CXL CPER kfifo overflow\n");
> +		return;
> +	}
> +
> +	schedule_work(cxl_cper_prot_err_work);
>  #endif
>  }
>  
> +int cxl_cper_register_prot_err_work(struct work_struct *work)
> +{
> +	if (cxl_cper_prot_err_work)
> +		return -EINVAL;
> +
> +	guard(spinlock)(&cxl_cper_prot_err_work_lock);
> +	cxl_cper_prot_err_work = work;
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
> +
> +int cxl_cper_unregister_prot_err_work(struct work_struct *work)
> +{
> +	if (cxl_cper_prot_err_work != work)
> +		return -EINVAL;
> +
> +	guard(spinlock)(&cxl_cper_prot_err_work_lock);
> +	cxl_cper_prot_err_work = NULL;
> +	return 0;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
> +
> +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
> +{
> +	return kfifo_get(&cxl_cper_prot_err_fifo, wd);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
> +
>  /* Room for 8 entries for each of the 4 event log queues */
>  #define CXL_CPER_FIFO_DEPTH 32
>  DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 9d58ab9d33c5..5840056bb9a3 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -650,6 +650,42 @@ void read_cdat_data(struct cxl_port *port)
>  }
>  EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
>  
> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> +				  struct cxl_ras_capability_regs ras_cap)
> +{
> +	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
> +	struct cxl_dev_state *cxlds;
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, "CXL");
> +
> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> +				    struct cxl_ras_capability_regs ras_cap)
> +{
> +	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> +	struct cxl_dev_state *cxlds;
> +	u32 fe;
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	if (hweight32(status) > 1)
> +		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> +				   ras_cap.cap_control));
> +	else
> +		fe = status;
> +
> +	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> +					  ras_cap.header_log);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, "CXL");
> +
>  static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
>  				 void __iomem *ras_base)
>  {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 4da07727ab9c..e457616373ed 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -129,4 +129,9 @@ void read_cdat_data(struct cxl_port *port);
>  void cxl_cor_error_detected(struct pci_dev *pdev);
>  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>  				    pci_channel_state_t state);
> +struct cxl_ras_capability_regs;
> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> +				  struct cxl_ras_capability_regs ras_cap);
> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> +				    struct cxl_ras_capability_regs ras_cap);
>  #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 6d94ff4a4f1a..9d4b5f39b21a 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1160,6 +1160,37 @@ static void cxl_cper_work_fn(struct work_struct *work)
>  }
>  static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>  
> +static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> +{
> +	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> +				       data->prot_err.agent_addr.function);
> +	struct pci_dev *pdev __free(pci_dev_put) =
> +		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
> +					    data->prot_err.agent_addr.bus,
> +					    devfn);
> +
> +	if (!pdev)
> +		return;
> +
> +	guard(device)(&pdev->dev);
> +	if (pdev->driver != &cxl_pci_driver)
> +		return;
> +
> +	if (data->severity == AER_CORRECTABLE)
> +		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
> +	else
> +		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
> +}
> +
> +static void cxl_cper_prot_err_work_fn(struct work_struct *work)
> +{
> +	struct cxl_cper_prot_err_work_data wd;
> +
> +	while (cxl_cper_prot_err_kfifo_get(&wd))
> +		cxl_cper_handle_prot_err(&wd);
> +}
> +static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
> +
>  static int __init cxl_pci_driver_init(void)
>  {
>  	int rc;
> @@ -1170,7 +1201,18 @@ static int __init cxl_pci_driver_init(void)
>  
>  	rc = cxl_cper_register_work(&cxl_cper_work);
>  	if (rc)
> -		pci_unregister_driver(&cxl_pci_driver);
> +		goto err_unreg;
> +
> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	if (rc)
> +		goto err_unregister_work;
> +
> +	return 0;
> +
> +err_unregister_work:
> +	cxl_cper_unregister_work(&cxl_cper_work);
> +err_unreg:
> +	pci_unregister_driver(&cxl_pci_driver);
>  
>  	return rc;
>  }
> @@ -1178,7 +1220,9 @@ static int __init cxl_pci_driver_init(void)
>  static void __exit cxl_pci_driver_exit(void)
>  {
>  	cxl_cper_unregister_work(&cxl_cper_work);
> +	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
>  	cancel_work_sync(&cxl_cper_work);
> +	cancel_work_sync(&cxl_cper_prot_err_work);
>  	pci_unregister_driver(&cxl_pci_driver);
>  }
>  
> diff --git a/include/cxl/event.h b/include/cxl/event.h
> index ee1c3dec62fa..359a8f44a2e0 100644
> --- a/include/cxl/event.h
> +++ b/include/cxl/event.h
> @@ -242,6 +242,9 @@ struct cxl_cper_prot_err_work_data {
>  int cxl_cper_register_work(struct work_struct *work);
>  int cxl_cper_unregister_work(struct work_struct *work);
>  int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
> +int cxl_cper_register_prot_err_work(struct work_struct *work);
> +int cxl_cper_unregister_prot_err_work(struct work_struct *work);
> +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
>  #else
>  static inline int cxl_cper_register_work(struct work_struct *work)
>  {
> @@ -256,6 +259,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
>  {
>  	return 0;
>  }
> +static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
> +{
> +	return 0;
> +}
> +static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
> +{
> +	return 0;
> +}
> +static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
> +{
> +	return 0;
> +}
>  #endif
>  
>  #endif /* _LINUX_CXL_EVENT_H */
> -- 
> 2.17.1
> 

Nitpick about the "8" value, but patch can go in like this
and the value changed if there is later evidence to justify
some other value.

Reviewed-by: Tony Luck <tony.luck@intel.com>

-Tony
Gregory Price Feb. 5, 2025, 7:50 p.m. UTC | #2
On Thu, Jan 23, 2025 at 08:44:20AM +0000, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
> 
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
> to trace FW-First Protocol errors.
> 
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
> 
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
> 
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>

Reviewed-by: Gregory Price <gourry@gourry.net>
Dan Williams Feb. 5, 2025, 10:58 p.m. UTC | #3
Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
> 
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
> to trace FW-First Protocol errors.
> 
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
> 
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
> 
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> ---

This patch confuses me. The plumbing to route CXL component error
records back to the cxl_pci driver was motivated by the driver having a
significant amount of context about component state and code to handle
OS first reporting of component errors from the device mailbox.

Protocol errors are different. They implicate various ports where the
cxl_pci driver may not have any additional information to add.

I feel like this patch makes more sense after CXL protocol errors become
a first class citizen in the core, and that generic CXL protocol error
tracing lives in the core, not a cxl_pci driver callback.

So, similar to how aer_recover_queue() traces all PCIe protocol errors
and optionally lets endpoint drivers recover the link via
pcie_do_recovery(), a cxl_recover_queue() is needed. That would be the
place to land general CXL protocol error prints and optionally call back
into drivers to add more device specific color if necessary.

I am ok with the CXL core centralizing all protocol error processing
like the built-in PCI core, but the generic CXL memory expander driver,
cxl_pci, is the wrong place to handle system wide protocol errors across
all device types.

I expect this is new infrastructure that we will get from Terry's
patches, but please do challenge me if you think I am missing something.
Koralahalli Channabasappa, Smita Feb. 12, 2025, 8:57 p.m. UTC | #4
Hi Dan,

On 2/5/2025 2:58 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records. Introduce support for handling and logging CXL Protocol
>> errors.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
>> to trace FW-First Protocol errors.
>>
>> Since the CXL code is required to be called from process context and
>> GHES is in interrupt context, use workqueues for processing.
>>
>> Similar to CXL CPER event handling, use kfifo to handle errors as it
>> simplifies queue processing by providing lock free fifo operations.
>>
>> Add the ability for the CXL sub-system to register a workqueue to
>> process CXL CPER protocol errors.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
>> ---
> 
> This patch confuses me. The plumbing to route CXL component error
> records back to the cxl_pci driver was motivated by the driver having a
> significant amount of context about component state and code to handle
> OS first reporting of component errors from the device mailbox.
> 
> Protocol errors are different. They implicate various ports where the
> cxl_pci driver may not have any additional information to add.
> 
> I feel like this patch makes more sense after CXL protocol errors become
> a first class citizen in the core, and that generic CXL protocol error
> tracing lives in the core, not a cxl_pci driver callback.
> 
> So, similar to how aer_recover_queue() traces all PCIe protocol errors
> and optionally lets endpoint drivers recover the link via
> pcie_do_recovery(), a cxl_recover_queue() is needed. That would be the
> place to land general CXL protocol error prints and optionally call back
> into drivers to add more device specific color if necessary.
> 
> I am ok with the CXL core centralizing all protocol error processing
> like the built-in PCI core, but the generic CXL memory expander driver,
> cxl_pci, is the wrong place to handle system wide protocol errors across
> all device types.
> 
> I expect this is new infrastructure that we will get from Terry's
> patches, but please do challenge me if you think I am missing something.

I agree. I don't have any specific reason to place all of this in 
cxl_pci driver. Given that CXL protocol errors affect multiple ports and 
should be handled centrally, should their processing be placed in 
pcie/aer.c or pcie/err.c?

Or would it be better to introduce a new pcie/cxl_err.c file to handle 
all CXL protocol errors separately?

Additionally, since the patch involves calling CXL tracing routines, 
some integration with the CXL driver is required. Would you suggest 
handling this plumbing within the PCIe error handling core, or should 
there be a specific callback mechanism into the CXL driver?

Thanks
Smita
Koralahalli Channabasappa, Smita Feb. 12, 2025, 9:04 p.m. UTC | #5
On 2/3/2025 11:03 AM, Luck, Tony wrote:
> On Thu, Jan 23, 2025 at 08:44:20AM +0000, Smita Koralahalli wrote:
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records. Introduce support for handling and logging CXL Protocol
>> errors.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
>> to trace FW-First Protocol errors.
>>
>> Since the CXL code is required to be called from process context and
>> GHES is in interrupt context, use workqueues for processing.
>>
>> Similar to CXL CPER event handling, use kfifo to handle errors as it
>> simplifies queue processing by providing lock free fifo operations.
>>
>> Add the ability for the CXL sub-system to register a workqueue to
>> process CXL CPER protocol errors.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
>> ---
>>   drivers/acpi/apei/ghes.c | 49 ++++++++++++++++++++++++++++++++++++++++
>>   drivers/cxl/core/pci.c   | 36 +++++++++++++++++++++++++++++
>>   drivers/cxl/cxlpci.h     |  5 ++++
>>   drivers/cxl/pci.c        | 46 ++++++++++++++++++++++++++++++++++++-
>>   include/cxl/event.h      | 15 ++++++++++++
>>   5 files changed, 150 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 4d725d988c43..289e365f84b2 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -674,6 +674,15 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
>>   	schedule_work(&entry->work);
>>   }
>>   
>> +/* Room for 8 entries */
> 
> Any science behind the choice of "8" here? This comment is merely
> stating what the #define is used for, not why 8 was chosen.
> 

The choice of "8" was arbitrary and not based on a specific rationale. 
If there are better heuristics or considerations for determining the 
optimal number of entries, I’d appreciate any suggestions.

Thanks
Smita
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 4d725d988c43..289e365f84b2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -674,6 +674,15 @@  static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 	schedule_work(&entry->work);
 }
 
+/* Room for 8 entries */
+#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
+static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
+		    CXL_CPER_PROT_ERR_FIFO_DEPTH);
+
+/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
+static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
+struct work_struct *cxl_cper_prot_err_work;
+
 static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
 				   int severity)
 {
@@ -700,6 +709,11 @@  static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
 	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
 		pr_warn(FW_WARN "CXL CPER no device serial number\n");
 
+	guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
+
+	if (!cxl_cper_prot_err_work)
+		return;
+
 	switch (prot_err->agent_type) {
 	case RCD:
 	case DEVICE:
@@ -721,9 +735,44 @@  static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
 				   prot_err->agent_type);
 		return;
 	}
+
+	if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
+		pr_err_ratelimited("CXL CPER kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_cper_prot_err_work);
 #endif
 }
 
+int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+	if (cxl_cper_prot_err_work)
+		return -EINVAL;
+
+	guard(spinlock)(&cxl_cper_prot_err_work_lock);
+	cxl_cper_prot_err_work = work;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
+
+int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+	if (cxl_cper_prot_err_work != work)
+		return -EINVAL;
+
+	guard(spinlock)(&cxl_cper_prot_err_work_lock);
+	cxl_cper_prot_err_work = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
+
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+	return kfifo_get(&cxl_cper_prot_err_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
+
 /* Room for 8 entries for each of the 4 event log queues */
 #define CXL_CPER_FIFO_DEPTH 32
 DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 9d58ab9d33c5..5840056bb9a3 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -650,6 +650,42 @@  void read_cdat_data(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
 
+void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
+				  struct cxl_ras_capability_regs ras_cap)
+{
+	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+	struct cxl_dev_state *cxlds;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, "CXL");
+
+void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
+				    struct cxl_ras_capability_regs ras_cap)
+{
+	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+	struct cxl_dev_state *cxlds;
+	u32 fe;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	if (hweight32(status) > 1)
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+				   ras_cap.cap_control));
+	else
+		fe = status;
+
+	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+					  ras_cap.header_log);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, "CXL");
+
 static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
 				 void __iomem *ras_base)
 {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 4da07727ab9c..e457616373ed 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -129,4 +129,9 @@  void read_cdat_data(struct cxl_port *port);
 void cxl_cor_error_detected(struct pci_dev *pdev);
 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state);
+struct cxl_ras_capability_regs;
+void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
+				  struct cxl_ras_capability_regs ras_cap);
+void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
+				    struct cxl_ras_capability_regs ras_cap);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 6d94ff4a4f1a..9d4b5f39b21a 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1160,6 +1160,37 @@  static void cxl_cper_work_fn(struct work_struct *work)
 }
 static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
 
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+				       data->prot_err.agent_addr.function);
+	struct pci_dev *pdev __free(pci_dev_put) =
+		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
+					    data->prot_err.agent_addr.bus,
+					    devfn);
+
+	if (!pdev)
+		return;
+
+	guard(device)(&pdev->dev);
+	if (pdev->driver != &cxl_pci_driver)
+		return;
+
+	if (data->severity == AER_CORRECTABLE)
+		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+	else
+		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+	struct cxl_cper_prot_err_work_data wd;
+
+	while (cxl_cper_prot_err_kfifo_get(&wd))
+		cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
 static int __init cxl_pci_driver_init(void)
 {
 	int rc;
@@ -1170,7 +1201,18 @@  static int __init cxl_pci_driver_init(void)
 
 	rc = cxl_cper_register_work(&cxl_cper_work);
 	if (rc)
-		pci_unregister_driver(&cxl_pci_driver);
+		goto err_unreg;
+
+	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+	if (rc)
+		goto err_unregister_work;
+
+	return 0;
+
+err_unregister_work:
+	cxl_cper_unregister_work(&cxl_cper_work);
+err_unreg:
+	pci_unregister_driver(&cxl_pci_driver);
 
 	return rc;
 }
@@ -1178,7 +1220,9 @@  static int __init cxl_pci_driver_init(void)
 static void __exit cxl_pci_driver_exit(void)
 {
 	cxl_cper_unregister_work(&cxl_cper_work);
+	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
 	cancel_work_sync(&cxl_cper_work);
+	cancel_work_sync(&cxl_cper_prot_err_work);
 	pci_unregister_driver(&cxl_pci_driver);
 }
 
diff --git a/include/cxl/event.h b/include/cxl/event.h
index ee1c3dec62fa..359a8f44a2e0 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -242,6 +242,9 @@  struct cxl_cper_prot_err_work_data {
 int cxl_cper_register_work(struct work_struct *work);
 int cxl_cper_unregister_work(struct work_struct *work);
 int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+int cxl_cper_register_prot_err_work(struct work_struct *work);
+int cxl_cper_unregister_prot_err_work(struct work_struct *work);
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
 #else
 static inline int cxl_cper_register_work(struct work_struct *work)
 {
@@ -256,6 +259,18 @@  static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 {
 	return 0;
 }
+static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+	return 0;
+}
 #endif
 
 #endif /* _LINUX_CXL_EVENT_H */