diff mbox series

[v3,7/7] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors

Message ID 20241119003915.174386-8-Smita.KoralahalliChannabasappa@amd.com
State New
Headers show
Series acpi/ghes, cper, cxl: Process CXL CPER Protocol errors | expand

Commit Message

Koralahalli Channabasappa, Smita Nov. 19, 2024, 12:39 a.m. UTC
When PCIe AER is in FW-First mode, the OS should process CXL Protocol
errors from CPER records. Introduce support for handling and logging CXL
Protocol errors.

The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error trace native CXL AER endpoint errors, while
cxl_port_aer_correctable_error and cxl_port_aer_uncorrectable_error
trace native CXL AER port errors. Reuse both sets to trace FW-First
protocol errors.

Since the CXL code is required to be called from process context and
GHES is in interrupt context, use workqueues for processing.

Similar to CXL CPER event handling, use a kfifo to handle errors, as it
simplifies queue processing by providing lock-free FIFO operations.

Add the ability for the CXL sub-system to register a work item to
process CXL CPER protocol errors.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
 drivers/acpi/apei/ghes.c | 41 ++++++++++++++++++++++++++++++
 drivers/cxl/core/pci.c   | 50 ++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxlpci.h     |  6 +++++
 drivers/cxl/pci.c        | 55 ++++++++++++++++++++++++++++++++++++++++
 include/cxl/event.h      | 15 +++++++++++
 5 files changed, 167 insertions(+)

Comments

Jonathan Cameron Nov. 26, 2024, 4:05 p.m. UTC | #1
On Tue, 19 Nov 2024 00:39:15 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:

> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
> 
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors, while
> cxl_cper_trace_corr_prot_err and cxl_cper_trace_uncorr_prot_err
> trace native CXL AER port errors. Reuse both sets to trace FW-First
> protocol errors.
> 
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
> 
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
> 
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>

A few minor comments inline.

Thanks

Jonathan

> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 4ede038a7148..c992b34c290b 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -650,6 +650,56 @@ void read_cdat_data(struct cxl_port *port)
>  }
>  EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>  
> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
> +				  struct cxl_ras_capability_regs ras_cap)
> +{
> +	struct cxl_dev_state *cxlds;
> +	u32 status;
> +
> +	status = ras_cap.cor_status & ~ras_cap.cor_mask;
> +
> +	if (!flag) {

As below. Name of flag is not very helpful when reading the code.
Perhaps we can rename?

> +		trace_cxl_port_aer_correctable_error(&pdev->dev, status);
> +		return;
> +	}
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, CXL);
> +
> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
> +				    struct cxl_ras_capability_regs ras_cap)
> +{
> +	struct cxl_dev_state *cxlds;
> +	u32 status, fe;
> +
> +	status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> +
> +	if (hweight32(status) > 1)
> +		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> +				   ras_cap.cap_control));
> +	else
> +		fe = status;
> +
> +	if (!flag) {

Why does a bool named flag indicate it's a port error?

> +		trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
> +						       ras_cap.header_log);
> +		return;
> +	}
> +
> +	cxlds = pci_get_drvdata(pdev);
> +	if (!cxlds)
> +		return;
> +
> +	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> +					  ras_cap.header_log);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, CXL);
> +
>  static void __cxl_handle_cor_ras(struct device *dev,
>  				 void __iomem *ras_base)
>  {
> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
> index 4da07727ab9c..5e4aa8681937 100644
> --- a/drivers/cxl/cxlpci.h
> +++ b/drivers/cxl/cxlpci.h
> @@ -129,4 +129,10 @@ void read_cdat_data(struct cxl_port *port);
>  void cxl_cor_error_detected(struct pci_dev *pdev);
>  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>  				    pci_channel_state_t state);
> +
> +struct cxl_ras_capability_regs;
> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
> +				  struct cxl_ras_capability_regs ras_cap);
> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
> +				    struct cxl_ras_capability_regs ras_cap);
>  #endif /* __CXL_PCI_H__ */
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 88a14d7baa65..e261abe60e90 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1067,6 +1067,53 @@ static void cxl_cper_work_fn(struct work_struct *work)
>  }
>  static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>  
> +static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> +{
> +	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
> +				       data->prot_err.agent_addr.function);
> +	struct pci_dev *pdev __free(pci_dev_put) =
> +		pci_get_domain_bus_and_slot(
> +			data->prot_err.agent_addr.segment,
> +			data->prot_err.agent_addr.bus,
> +			devfn
> +		);
		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
					    data->prot_err.agent_addr.bus,
					    devfn);

> +	int port_type;
> +
> +	if (!pdev)
> +		return;
> +
> +	guard(device)(&pdev->dev);
> +	if (pdev->driver != &cxl_pci_driver)
> +		return;
> +
> +	port_type = pci_pcie_type(pdev);
> +	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
> +	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
> +	    port_type == PCI_EXP_TYPE_UPSTREAM) {
> +		if (data->severity == AER_CORRECTABLE)
> +			cxl_cper_trace_corr_prot_err(pdev, false, data->ras_cap);
> +		else
> +			cxl_cper_trace_uncorr_prot_err(pdev, false, data->ras_cap);
> +
> +		return;
> +	}
> +
> +	if (data->severity == AER_CORRECTABLE)
> +		cxl_cper_trace_corr_prot_err(pdev, true, data->ras_cap);
> +	else
> +		cxl_cper_trace_uncorr_prot_err(pdev, true, data->ras_cap);
> +
> +}

>  static int __init cxl_pci_driver_init(void)
>  {
>  	int rc;
> @@ -1079,13 +1126,21 @@ static int __init cxl_pci_driver_init(void)
>  	if (rc)
>  		pci_unregister_driver(&cxl_pci_driver);
>  
> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +	if (rc) {
> +		cxl_cper_unregister_event_work(&cxl_cper_work);
> +		pci_unregister_driver(&cxl_pci_driver);
I'd switch this to a goto style for error handling.


> +	}
> +
>  	return rc;

that is
	return 0;

err_unregister_event_work:
	cxl_cper_unregister_event_work(&cxl_cper_work);
err_unreg:
	pci_unregister_driver(&cxl_pci_driver);
	return rc;
>  }
Koralahalli Channabasappa, Smita Nov. 27, 2024, 8:35 p.m. UTC | #2
On 11/26/2024 8:05 AM, Jonathan Cameron wrote:
> On Tue, 19 Nov 2024 00:39:15 +0000
> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> 
>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>> CPER records. Introduce support for handling and logging CXL Protocol
>> errors.
>>
>> The defined trace events cxl_aer_uncorrectable_error and
>> cxl_aer_correctable_error trace native CXL AER endpoint errors, while
>> cxl_cper_trace_corr_prot_err and cxl_cper_trace_uncorr_prot_err
>> trace native CXL AER port errors. Reuse both sets to trace FW-First
>> protocol errors.
>>
>> Since the CXL code is required to be called from process context and
>> GHES is in interrupt context, use workqueues for processing.
>>
>> Similar to CXL CPER event handling, use kfifo to handle errors as it
>> simplifies queue processing by providing lock free fifo operations.
>>
>> Add the ability for the CXL sub-system to register a workqueue to
>> process CXL CPER protocol errors.
>>
>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> 
> A few minor comments inline.
> 
> Thanks
> 
> Jonathan
> 
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 4ede038a7148..c992b34c290b 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -650,6 +650,56 @@ void read_cdat_data(struct cxl_port *port)
>>   }
>>   EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>   
>> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
>> +				  struct cxl_ras_capability_regs ras_cap)
>> +{
>> +	struct cxl_dev_state *cxlds;
>> +	u32 status;
>> +
>> +	status = ras_cap.cor_status & ~ras_cap.cor_mask;
>> +
>> +	if (!flag) {
> 
> As below. Name of flag is not very helpful when reading the code.
> Perhaps we can rename?

Okay. Maybe flag -> is_device_error?
> 
>> +		trace_cxl_port_aer_correctable_error(&pdev->dev, status);
>> +		return;
>> +	}
>> +
>> +	cxlds = pci_get_drvdata(pdev);
>> +	if (!cxlds)
>> +		return;
>> +
>> +	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, CXL);
>> +
>> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
>> +				    struct cxl_ras_capability_regs ras_cap)
>> +{
>> +	struct cxl_dev_state *cxlds;
>> +	u32 status, fe;
>> +
>> +	status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
>> +
>> +	if (hweight32(status) > 1)
>> +		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>> +				   ras_cap.cap_control));
>> +	else
>> +		fe = status;
>> +
>> +	if (!flag) {
> 
> Why does  a bool named flag indicate it's a port error?

I will rename it.

Or maybe use an enum to explicitly define the error type
(CXL_ERROR_TYPE_DEVICE and CXL_ERROR_TYPE_PORT).

Or maybe split the function into two distinct ones, one for port errors
and one for device errors.

Let me know your preference or any other suggestions here. I will change 
it accordingly.

> 
>> +		trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
>> +						       ras_cap.header_log);
>> +		return;
>> +	}
>> +
>> +	cxlds = pci_get_drvdata(pdev);
>> +	if (!cxlds)
>> +		return;
>> +
>> +	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
>> +					  ras_cap.header_log);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, CXL);
>> +
>>   static void __cxl_handle_cor_ras(struct device *dev,
>>   				 void __iomem *ras_base)
>>   {
>> diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
>> index 4da07727ab9c..5e4aa8681937 100644
>> --- a/drivers/cxl/cxlpci.h
>> +++ b/drivers/cxl/cxlpci.h
>> @@ -129,4 +129,10 @@ void read_cdat_data(struct cxl_port *port);
>>   void cxl_cor_error_detected(struct pci_dev *pdev);
>>   pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>>   				    pci_channel_state_t state);
>> +
>> +struct cxl_ras_capability_regs;
>> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
>> +				  struct cxl_ras_capability_regs ras_cap);
>> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
>> +				    struct cxl_ras_capability_regs ras_cap);
>>   #endif /* __CXL_PCI_H__ */
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 88a14d7baa65..e261abe60e90 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -1067,6 +1067,53 @@ static void cxl_cper_work_fn(struct work_struct *work)
>>   }
>>   static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
>>   
>> +static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
>> +{
>> +	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
>> +				       data->prot_err.agent_addr.function);
>> +	struct pci_dev *pdev __free(pci_dev_put) =
>> +		pci_get_domain_bus_and_slot(
>> +			data->prot_err.agent_addr.segment,
>> +			data->prot_err.agent_addr.bus,
>> +			devfn
>> +		);
> 		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
> 					    data->prot_err.agent_addr.bus,
> 					    devfn);

Noted.

> 
>> +	int port_type;
>> +
>> +	if (!pdev)
>> +		return;
>> +
>> +	guard(device)(&pdev->dev);
>> +	if (pdev->driver != &cxl_pci_driver)
>> +		return;
>> +
>> +	port_type = pci_pcie_type(pdev);
>> +	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
>> +	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
>> +	    port_type == PCI_EXP_TYPE_UPSTREAM) {
>> +		if (data->severity == AER_CORRECTABLE)
>> +			cxl_cper_trace_corr_prot_err(pdev, false, data->ras_cap);
>> +		else
>> +			cxl_cper_trace_uncorr_prot_err(pdev, false, data->ras_cap);
>> +
>> +		return;
>> +	}
>> +
>> +	if (data->severity == AER_CORRECTABLE)
>> +		cxl_cper_trace_corr_prot_err(pdev, true, data->ras_cap);
>> +	else
>> +		cxl_cper_trace_uncorr_prot_err(pdev, true, data->ras_cap);
>> +
>> +}
> 
>>   static int __init cxl_pci_driver_init(void)
>>   {
>>   	int rc;
>> @@ -1079,13 +1126,21 @@ static int __init cxl_pci_driver_init(void)
>>   	if (rc)
>>   		pci_unregister_driver(&cxl_pci_driver);
>>   
>> +	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
>> +	if (rc) {
>> +		cxl_cper_unregister_event_work(&cxl_cper_work);
>> +		pci_unregister_driver(&cxl_pci_driver);
> I'd switch this to a goto style for error handling.
> 
> 
>> +	}
>> +
>>   	return rc;
> 
> that is
> 	return 0;
> 
> err_unregister_event_work:
> 	cxl_cper_unregister_event_work(&cxl_cper_work);
> err_unreg:
> 	pci_unregister_driver(&cxl_pci_driver);
> 	return rc;
>>   }

Noted.

Thanks
Smita
>
Ira Weiny Dec. 2, 2024, 6:48 p.m. UTC | #3
Smita Koralahalli wrote:
> On 11/26/2024 8:05 AM, Jonathan Cameron wrote:
> > On Tue, 19 Nov 2024 00:39:15 +0000
> > Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
> > 
> >> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> >> CPER records. Introduce support for handling and logging CXL Protocol
> >> errors.
> >>
> >> The defined trace events cxl_aer_uncorrectable_error and
> >> cxl_aer_correctable_error trace native CXL AER endpoint errors, while
> >> cxl_cper_trace_corr_prot_err and cxl_cper_trace_uncorr_prot_err
> >> trace native CXL AER port errors. Reuse both sets to trace FW-First
> >> protocol errors.
> >>
> >> Since the CXL code is required to be called from process context and
> >> GHES is in interrupt context, use workqueues for processing.
> >>
> >> Similar to CXL CPER event handling, use kfifo to handle errors as it
> >> simplifies queue processing by providing lock free fifo operations.
> >>
> >> Add the ability for the CXL sub-system to register a workqueue to
> >> process CXL CPER protocol errors.
> >>
> >> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> > 
> > A few minor comments inline.
> > 
> > Thanks
> > 
> > Jonathan
> > 
> >> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> >> index 4ede038a7148..c992b34c290b 100644
> >> --- a/drivers/cxl/core/pci.c
> >> +++ b/drivers/cxl/core/pci.c
> >> @@ -650,6 +650,56 @@ void read_cdat_data(struct cxl_port *port)
> >>   }
> >>   EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
> >>   
> >> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
> >> +				  struct cxl_ras_capability_regs ras_cap)
> >> +{
> >> +	struct cxl_dev_state *cxlds;
> >> +	u32 status;
> >> +
> >> +	status = ras_cap.cor_status & ~ras_cap.cor_mask;
> >> +
> >> +	if (!flag) {
> > 
> > As below. Name of flag is not very helpful when reading the code.
> > Perhaps we can rename?
> 
> Okay. May be flag -> is_device_error ?

I had the same question about 'flag'.

> > 
> >> +		trace_cxl_port_aer_correctable_error(&pdev->dev, status);
> >> +		return;
> >> +	}
> >> +
> >> +	cxlds = pci_get_drvdata(pdev);
> >> +	if (!cxlds)
> >> +		return;
> >> +
> >> +	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> >> +}
> >> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, CXL);
> >> +
> >> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
> >> +				    struct cxl_ras_capability_regs ras_cap)
> >> +{
> >> +	struct cxl_dev_state *cxlds;
> >> +	u32 status, fe;
> >> +
> >> +	status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
> >> +
> >> +	if (hweight32(status) > 1)
> >> +		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
> >> +				   ras_cap.cap_control));
> >> +	else
> >> +		fe = status;
> >> +
> >> +	if (!flag) {
> > 
> > Why does  a bool named flag indicate it's a port error?
> 
> I will rename it.
> 
> Or may be use an enum to explicitly define the error type 
> (CXL_ERROR_TYPE_DEVICE and CXL_ERROR_TYPE_PORT).
> 
> Or may be split the function into two distinct ones, one for port errors 
> and one for device errors.

I would vote for 2 functions.
Ira
Koralahalli Channabasappa, Smita Dec. 6, 2024, 4:29 p.m. UTC | #4
On 12/2/2024 10:48 AM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> On 11/26/2024 8:05 AM, Jonathan Cameron wrote:
>>> On Tue, 19 Nov 2024 00:39:15 +0000
>>> Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:
>>>
>>>> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
>>>> CPER records. Introduce support for handling and logging CXL Protocol
>>>> errors.
>>>>
>>>> The defined trace events cxl_aer_uncorrectable_error and
>>>> cxl_aer_correctable_error trace native CXL AER endpoint errors, while
>>>> cxl_cper_trace_corr_prot_err and cxl_cper_trace_uncorr_prot_err
>>>> trace native CXL AER port errors. Reuse both sets to trace FW-First
>>>> protocol errors.
>>>>
>>>> Since the CXL code is required to be called from process context and
>>>> GHES is in interrupt context, use workqueues for processing.
>>>>
>>>> Similar to CXL CPER event handling, use kfifo to handle errors as it
>>>> simplifies queue processing by providing lock free fifo operations.
>>>>
>>>> Add the ability for the CXL sub-system to register a workqueue to
>>>> process CXL CPER protocol errors.
>>>>
>>>> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
>>>
>>> A few minor comments inline.
>>>
>>> Thanks
>>>
>>> Jonathan
>>>
>>>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>>>> index 4ede038a7148..c992b34c290b 100644
>>>> --- a/drivers/cxl/core/pci.c
>>>> +++ b/drivers/cxl/core/pci.c
>>>> @@ -650,6 +650,56 @@ void read_cdat_data(struct cxl_port *port)
>>>>    }
>>>>    EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
>>>>    
>>>> +void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
>>>> +				  struct cxl_ras_capability_regs ras_cap)
>>>> +{
>>>> +	struct cxl_dev_state *cxlds;
>>>> +	u32 status;
>>>> +
>>>> +	status = ras_cap.cor_status & ~ras_cap.cor_mask;
>>>> +
>>>> +	if (!flag) {
>>>
>>> As below. Name of flag is not very helpful when reading the code.
>>> Perhaps we can rename?
>>
>> Okay. May be flag -> is_device_error ?
> 
> I had the same question about 'flag'.
> 
>>>
>>>> +		trace_cxl_port_aer_correctable_error(&pdev->dev, status);
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	cxlds = pci_get_drvdata(pdev);
>>>> +	if (!cxlds)
>>>> +		return;
>>>> +
>>>> +	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
>>>> +}
>>>> +EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, CXL);
>>>> +
>>>> +void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
>>>> +				    struct cxl_ras_capability_regs ras_cap)
>>>> +{
>>>> +	struct cxl_dev_state *cxlds;
>>>> +	u32 status, fe;
>>>> +
>>>> +	status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
>>>> +
>>>> +	if (hweight32(status) > 1)
>>>> +		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
>>>> +				   ras_cap.cap_control));
>>>> +	else
>>>> +		fe = status;
>>>> +
>>>> +	if (!flag) {
>>>
>>> Why does  a bool named flag indicate it's a port error?
>>
>> I will rename it.
>>
>> Or may be use an enum to explicitly define the error type
>> (CXL_ERROR_TYPE_DEVICE and CXL_ERROR_TYPE_PORT).
>>
>> Or may be split the function into two distinct ones, one for port errors
>> and one for device errors.
> 
> I would vote for 2 functions.
> Ira

Noted. Thanks!

Thanks
Smita
diff mbox series

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 6cd9d5375d7c..32062b6a9985 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -676,6 +676,15 @@  static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 	schedule_work(&entry->work);
 }
 
+/* Room for 8 entries */
+#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
+static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
+		    CXL_CPER_PROT_ERR_FIFO_DEPTH);
+
+/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
+static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
+struct work_struct *cxl_cper_prot_err_work;
+
 static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
 				   int severity)
 {
@@ -701,6 +710,11 @@  static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
 	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
 		pr_warn(FW_WARN "CXL CPER no device serial number\n");
 
+	guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
+
+	if (!cxl_cper_prot_err_work)
+		return;
+
 	switch (prot_err->agent_type) {
 	case RCD:
 	case DEVICE:
@@ -722,6 +736,13 @@  static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
 				   prot_err->agent_type);
 		return;
 	}
+
+	if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
+		pr_err_ratelimited("CXL CPER kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_cper_prot_err_work);
 }
 
 /* Room for 8 entries for each of the 4 event log queues */
@@ -809,6 +830,26 @@  int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, CXL);
 
+int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+	return cxl_cper_register_work(&cxl_cper_prot_err_work,
+				      &cxl_cper_prot_err_work_lock, work);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, CXL);
+
+int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+	return cxl_cper_unregister_work(&cxl_cper_prot_err_work,
+					&cxl_cper_prot_err_work_lock, work);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, CXL);
+
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+	return kfifo_get(&cxl_cper_prot_err_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, CXL);
+
 static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 4ede038a7148..c992b34c290b 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -650,6 +650,56 @@  void read_cdat_data(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
 
+void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
+				  struct cxl_ras_capability_regs ras_cap)
+{
+	struct cxl_dev_state *cxlds;
+	u32 status;
+
+	status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+	if (!flag) {
+		trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+		return;
+	}
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_corr_prot_err, CXL);
+
+void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
+				    struct cxl_ras_capability_regs ras_cap)
+{
+	struct cxl_dev_state *cxlds;
+	u32 status, fe;
+
+	status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+
+	if (hweight32(status) > 1)
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+				   ras_cap.cap_control));
+	else
+		fe = status;
+
+	if (!flag) {
+		trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+						       ras_cap.header_log);
+		return;
+	}
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+					  ras_cap.header_log);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_trace_uncorr_prot_err, CXL);
+
 static void __cxl_handle_cor_ras(struct device *dev,
 				 void __iomem *ras_base)
 {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 4da07727ab9c..5e4aa8681937 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -129,4 +129,10 @@  void read_cdat_data(struct cxl_port *port);
 void cxl_cor_error_detected(struct pci_dev *pdev);
 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state);
+
+struct cxl_ras_capability_regs;
+void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, bool flag,
+				  struct cxl_ras_capability_regs ras_cap);
+void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, bool flag,
+				    struct cxl_ras_capability_regs ras_cap);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 88a14d7baa65..e261abe60e90 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1067,6 +1067,53 @@  static void cxl_cper_work_fn(struct work_struct *work)
 }
 static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
 
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+				       data->prot_err.agent_addr.function);
+	struct pci_dev *pdev __free(pci_dev_put) =
+		pci_get_domain_bus_and_slot(
+			data->prot_err.agent_addr.segment,
+			data->prot_err.agent_addr.bus,
+			devfn
+		);
+	int port_type;
+
+	if (!pdev)
+		return;
+
+	guard(device)(&pdev->dev);
+	if (pdev->driver != &cxl_pci_driver)
+		return;
+
+	port_type = pci_pcie_type(pdev);
+	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+	    port_type == PCI_EXP_TYPE_UPSTREAM) {
+		if (data->severity == AER_CORRECTABLE)
+			cxl_cper_trace_corr_prot_err(pdev, false, data->ras_cap);
+		else
+			cxl_cper_trace_uncorr_prot_err(pdev, false, data->ras_cap);
+
+		return;
+	}
+
+	if (data->severity == AER_CORRECTABLE)
+		cxl_cper_trace_corr_prot_err(pdev, true, data->ras_cap);
+	else
+		cxl_cper_trace_uncorr_prot_err(pdev, true, data->ras_cap);
+
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+	struct cxl_cper_prot_err_work_data wd;
+
+	while (cxl_cper_prot_err_kfifo_get(&wd))
+		cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
 static int __init cxl_pci_driver_init(void)
 {
 	int rc;
@@ -1079,13 +1126,21 @@  static int __init cxl_pci_driver_init(void)
 	if (rc)
 		pci_unregister_driver(&cxl_pci_driver);
 
+	rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+	if (rc) {
+		cxl_cper_unregister_event_work(&cxl_cper_work);
+		pci_unregister_driver(&cxl_pci_driver);
+	}
+
 	return rc;
 }
 
 static void __exit cxl_pci_driver_exit(void)
 {
 	cxl_cper_unregister_event_work(&cxl_cper_work);
+	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
 	cancel_work_sync(&cxl_cper_work);
+	cancel_work_sync(&cxl_cper_prot_err_work);
 	pci_unregister_driver(&cxl_pci_driver);
 }
 
diff --git a/include/cxl/event.h b/include/cxl/event.h
index c9a38ebaf207..5f83c3bfc813 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -242,6 +242,9 @@  struct cxl_cper_prot_err_work_data {
 int cxl_cper_register_event_work(struct work_struct *work);
 int cxl_cper_unregister_event_work(struct work_struct *work);
 int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+int cxl_cper_register_prot_err_work(struct work_struct *work);
+int cxl_cper_unregister_prot_err_work(struct work_struct *work);
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
 #else
 static inline int cxl_cper_register_event_work(struct work_struct *work)
 {
@@ -256,6 +259,18 @@  static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 {
 	return 0;
 }
+static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+	return 0;
+}
 #endif
 
 #endif /* _LINUX_CXL_EVENT_H */