
[2/4] spi: spi-fsl-dspi: Use non-coherent memory for DMA

Message ID 20250609-james-nxp-spi-dma-v1-2-2b831e714be2@linaro.org
State Superseded
Series spi: spi-fsl-dspi: Target mode improvements

Commit Message

James Clark June 9, 2025, 3:32 p.m. UTC
Using coherent memory here isn't functionally necessary. Because the
change to use non-coherent memory isn't overly complex and only a few
synchronization points are required, we might as well do it while fixing
up some other DMA issues.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: James Clark <james.clark@linaro.org>
---
 drivers/spi/spi-fsl-dspi.c | 55 +++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 20 deletions(-)
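
For readers new to this API, the pattern the patch moves to looks roughly
like the sketch below (a minimal illustration of the streaming-DMA
ownership discipline with made-up names, not the driver's exact code):

	#include <linux/dma-mapping.h>
	#include <linux/string.h>

	/* One TX-direction buffer; illustrative globals for brevity. */
	static void *buf;
	static dma_addr_t phys;

	static int example_alloc(struct device *dev, size_t size)
	{
		/* Cacheable buffer; the caller now owns cache maintenance. */
		buf = dma_alloc_noncoherent(dev, size, &phys,
					    DMA_TO_DEVICE, GFP_KERNEL);
		return buf ? 0 : -ENOMEM;
	}

	static void example_tx(struct device *dev, size_t len)
	{
		/* The CPU fills the buffer through the cache... */
		memset(buf, 0xa5, len);
		/* ...then hands ownership to the device (cache clean). */
		dma_sync_single_for_device(dev, phys, len, DMA_TO_DEVICE);
		/* ... DMA runs here ... */
		/* Reclaim ownership for the CPU once the DMA completes. */
		dma_sync_single_for_cpu(dev, phys, len, DMA_TO_DEVICE);
	}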

Comments

James Clark June 10, 2025, 9:03 a.m. UTC | #1
On 10/06/2025 9:26 am, Arnd Bergmann wrote:
> On Mon, Jun 9, 2025, at 17:32, James Clark wrote:
>> Using coherent memory here isn't functionally necessary. Because the
>> change to use non-coherent memory isn't overly complex and only a few
>> synchronization points are required, we might as well do it while fixing
>> up some other DMA issues.
>>
>> Suggested-by: Arnd Bergmann <arnd@arndb.de>
>> Signed-off-by: James Clark <james.clark@linaro.org>
> 
> This version looks good to me,
> 
> Acked-by: Arnd Bergmann <arnd@arndb.de>
> 
> I had reviewed an internal version originally and had some comments
> on that, all of which are addressed now. You did not Cc me on the
> other patches, so I looked them up in the archive. Patch 3 also

Yes, sorry about that. I've just started using "b4 send" and I was under 
the impression that it would automatically CC all patches the same way, 
but apparently not. Maybe I'm holding it wrong.

> looks good to me and complements this one (i.e. you really want
> the combination). I did not understand the logic in patch 4,
> and it would be good if someone else can take a closer look
> at that in order to Ack that.
> 
>        Arnd
Frank Li June 10, 2025, 3:15 p.m. UTC | #2
On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote:
> Using coherent memory here isn't functionally necessary. Because the
> change to use non-coherent memory isn't overly complex and only a few
> synchronization points are required, we might as well do it while fixing
> up some other DMA issues.

Any benefit to using non-coherent memory here?

Frank

James Clark June 10, 2025, 3:46 p.m. UTC | #3
On 10/06/2025 4:15 pm, Frank Li wrote:
> On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote:
>> Using coherent memory here isn't functionally necessary. Because the
>> change to use non-coherent memory isn't overly complex and only a few
>> synchronization points are required, we might as well do it while fixing
>> up some other DMA issues.
> 
> Any benefit to using non-coherent memory here?
> 
> Frank
> 

Presumably less cache maintenance traffic?

Thanks
James

Arnd Bergmann June 10, 2025, 3:48 p.m. UTC | #4
On Tue, Jun 10, 2025, at 17:15, Frank Li wrote:
> On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote:
>> Using coherent memory here isn't functionally necessary. Because the
>> change to use non-coherent memory isn't overly complex and only a few
>> synchronization points are required, we might as well do it while fixing
>> up some other DMA issues.
>
> Any benefit to using non-coherent memory here?

The driver copies data in and out of a coherent buffer by default. This is
fine if the buffer is only a few bytes in size, but for large transfers
this is quite slow because it bypasses the cache for any DMA master
that is marked as not "dma-coherent" in devicetree.

Patch 3/4 changes the size from a few bytes to many pages of memory,
so it's better to access the buffer through the cache and manually
maintain coherency.

     Arnd
Frank Li June 10, 2025, 3:56 p.m. UTC | #5
On Tue, Jun 10, 2025 at 05:48:40PM +0200, Arnd Bergmann wrote:
> On Tue, Jun 10, 2025, at 17:15, Frank Li wrote:
> > On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote:
> >> Using coherent memory here isn't functionally necessary. Because the
> >> change to use non-coherent memory isn't overly complex and only a few
> >> synchronization points are required, we might as well do it while fixing
> >> up some other DMA issues.
> >
> > Any benefit to using non-coherent memory here?
>
> The driver copies data in and out of a coherent buffer by default. This is
> fine if the buffer is only a few bytes in size, but for large transfers
> this is quite slow because it bypasses the cache for any DMA master
> that is marked as not "dma-coherent" in devicetree.

I see: non-coherent memory uses cacheable normal memory pages, but
coherent memory uses non-cacheable normal memory pages.

Can you add the performance benefit of using non-coherent memory to the
commit message, so reviewers can easily see your intention?

Frank

>
> Patch 3/4 changes the size from a few bytes to many pages of memory,
> so it's better to access the buffer through the cache and manually
> maintain coherency.
>
>      Arnd
Vladimir Oltean June 11, 2025, 9:01 a.m. UTC | #6
On Tue, Jun 10, 2025 at 11:56:34AM -0400, Frank Li wrote:
> Can you add the performance benefit of using non-coherent memory to the
> commit message, so reviewers can easily see your intention?

To expand on that, you can post the output of something like this
(before and after):
$ spidev_test --device /dev/spidev1.0 --bpw 8 --size 256 --cpha --iter 10000000 --speed 10000000
where /dev/spidev1.0 is an unconnected chip select with a dummy entry in
the device tree.
Vladimir Oltean June 12, 2025, 11:15 a.m. UTC | #7
On Thu, Jun 12, 2025 at 12:05:26PM +0100, James Clark wrote:
> (No idea why it goes faster when it's under load, but I hope that can be
> ignored for this test)

Might be because of dynamic CPU frequency scaling as done by the governor.
If the CPU utilization of spidev_test isn't high enough, the governor
will prefer lower CPU frequencies. You can try to repeat the test with
the "performance" governor and/or setting the min frequency equal to the
max one.

That's why I don't like the DMA mode in DSPI, it's still CPU-bound,
because the DMA buffers are very small (you can only provide one TX FIFO
worth of data per DMA transfer, rather than the whole buffer).

FWIW, the XSPI FIFO performance should be higher.
James Clark June 12, 2025, 2:14 p.m. UTC | #8
On 12/06/2025 12:15 pm, Vladimir Oltean wrote:
> On Thu, Jun 12, 2025 at 12:05:26PM +0100, James Clark wrote:
>> (No idea why it goes faster when it's under load, but I hope that can be
>> ignored for this test)
> 
> Might be because of dynamic CPU frequency scaling as done by the governor.
> If the CPU utilization of spidev_test isn't high enough, the governor
> will prefer lower CPU frequencies. You can try to repeat the test with
> the "performance" governor and/or setting the min frequency equal to the
> max one.
> 

That doesn't seem to make a difference; I get the same results with 
this. Even for the "fixed" DMA test results below there is a similar 
small performance increase when stressing the system:

   # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
   1300000
   ...

   # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
   1300000
   ...

   # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
   performance
   ...

> That's why I don't like the DMA mode in DSPI, it's still CPU-bound,
> because the DMA buffers are very small (you can only provide one TX FIFO
> worth of data per DMA transfer, rather than the whole buffer).

Is that right? The FIFO size isn't used in any of the DMA codepaths, it 
looks like the whole DMA buffer is filled before initiating the 
transfer. And we increase the buffer to 4k in this patchset to fully use 
the existing allocation.

> 
> FWIW, the XSPI FIFO performance should be higher.

This leads me to realise a mistake in my original figures. My head was 
stuck in target mode where we use DMA so I forgot to force DMA in host 
mode to run the performance tests. The previous figures were all XSPI 
mode and the small difference in performance could have been just down 
to the layout of the code changing?

Changing it to DMA mode gives figures that make much more sense:

Coherent (4096 byte transfers): 6534 kbps
Non-coherent:                   7347 kbps

Coherent (16 byte transfers):    447 kbps
Non-coherent:                    448 kbps


Just for comparison running the same test in XSPI mode:

4096 byte transfers:            2143 kbps
16 byte transfers:               637 kbps


So for small transfers XSPI is slightly better but for large ones DMA is 
much better, with non-coherent memory giving another 800kbps gain. 
Perhaps we could find the midpoint and then auto select the mode 
depending on the size, but maybe there is latency to consider too which 
could be important.
James Clark June 12, 2025, 2:28 p.m. UTC | #9
On 12/06/2025 3:23 pm, Vladimir Oltean wrote:
> On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote:
>>> That's why I don't like the DMA mode in DSPI, it's still CPU-bound,
>>> because the DMA buffers are very small (you can only provide one TX FIFO
>>> worth of data per DMA transfer, rather than the whole buffer).
>>
>> Is that right? The FIFO size isn't used in any of the DMA codepaths, it
>> looks like the whole DMA buffer is filled before initiating the transfer.
>> And we increase the buffer to 4k in this patchset to fully use the existing
>> allocation.
> 
> Uhm, yeah, no?
> 
> dspi_dma_xfer():
> 
> 	while (dspi->len) {
> 		dspi->words_in_flight = dspi->len / dspi->oper_word_size;
> 		if (dspi->words_in_flight > dspi->devtype_data->fifo_size)
> 			dspi->words_in_flight = dspi->devtype_data->fifo_size;
> 		dspi_next_xfer_dma_submit();
> 	}

Right but that's before the change in this patchset to use the whole 
page that was allocated, hence the next bit:

> And we increase the buffer to 4k in this patchset to fully use the
> existing allocation.

We were allocating for the size of the FIFO (multiplied by two to hold 
the control words), but dma_alloc_coherent() will be backed by a whole 
page anyway, even if you only ask for a few bytes.

After changing that to make use of the full allocation the FIFO length 
is no longer involved.
James Clark June 12, 2025, 2:35 p.m. UTC | #10
On 12/06/2025 3:31 pm, Vladimir Oltean wrote:
> On Thu, Jun 12, 2025 at 03:28:37PM +0100, James Clark wrote:
>>
>>
>> On 12/06/2025 3:23 pm, Vladimir Oltean wrote:
>>> On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote:
>>>>> That's why I don't like the DMA mode in DSPI, it's still CPU-bound,
>>>>> because the DMA buffers are very small (you can only provide one TX FIFO
>>>>> worth of data per DMA transfer, rather than the whole buffer).
>>>>
>>>> Is that right? The FIFO size isn't used in any of the DMA codepaths, it
>>>> looks like the whole DMA buffer is filled before initiating the transfer.
>>>> And we increase the buffer to 4k in this patchset to fully use the existing
>>>> allocation.
>>>
>>> Uhm, yeah, no?
>>>
>>> dspi_dma_xfer():
>>>
>>> 	while (dspi->len) {
>>> 		dspi->words_in_flight = dspi->len / dspi->oper_word_size;
>>> 		if (dspi->words_in_flight > dspi->devtype_data->fifo_size)
>>> 			dspi->words_in_flight = dspi->devtype_data->fifo_size;
>>> 		dspi_next_xfer_dma_submit();
>>> 	}
>>
>> Right but that's before the change in this patchset to use the whole page
>> that was allocated, hence the next bit:
>>
>>> And we increase the buffer to 4k in this patchset to fully use the
>>> existing allocation.
>>
>> We were allocating for the size of the FIFO (multiplied by two to hold the
>> control words), but dma_alloc_coherent() will be backed by a whole page
>> anyway, even if you only ask for a few bytes.
>>
>> After changing that to make use of the full allocation the FIFO length is no
>> longer involved.
> 
> Ok, I haven't walked through patch 3 yet, I didn't realize it would be
> changing that. I will want to test it on LS1028A.

Yeah, testing it somewhere else would be good. Maybe there is some
limitation there on the max size of the DMA transfer, but I didn't
see it.

I realise the tense in my original message may have been a bit 
confusing. I was mixing up the existing code and proposed changes...
Mark Brown June 12, 2025, 2:43 p.m. UTC | #11
On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote:
> On 12/06/2025 12:15 pm, Vladimir Oltean wrote:

> > FWIW, the XSPI FIFO performance should be higher.

> This leads me to realise a mistake in my original figures. My head was stuck
> in target mode where we use DMA so I forgot to force DMA in host mode to run
> the performance tests. The previous figures were all XSPI mode and the small
> difference in performance could have been just down to the layout of the
> code changing?

> Changing it to DMA mode gives figures that make much more sense:

If not having DMA mode is making this much of a difference, shouldn't the
driver just do that?  I'm not seeing runtime configuration in the driver
so I guess this is local hacking...

> So for small transfers XSPI is slightly better but for large ones DMA is
> much better, with non-coherent memory giving another 800kbps gain. Perhaps
> we could find the midpoint and then auto select the mode depending on the
> size, but maybe there is latency to consider too which could be important.

This is a fairly normal pattern, it's a big part of why the can_dma()
callback is per transfer - so you can do a copybreak and use PIO for
smaller transfers where the overhead of setting up DMA is often more
than the overhead of just doing PIO.
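
For reference, a copybreak along those lines could look something like the
sketch below. It is hypothetical: the hook is the generic can_dma()
callback from include/linux/spi/spi.h, but the threshold value and names
are made up, not taken from this driver:

	#include <linux/spi/spi.h>

	/* Hypothetical cutover point; a real driver would measure it. */
	#define EXAMPLE_DMA_COPYBREAK	256

	static bool example_can_dma(struct spi_controller *ctlr,
				    struct spi_device *spi,
				    struct spi_transfer *xfer)
	{
		/* Use DMA only when the transfer is large enough that
		 * the setup cost is amortized; fall back to PIO (or
		 * XSPI mode, in the DSPI case) for short transfers.
		 */
		return xfer->len >= EXAMPLE_DMA_COPYBREAK;
	}

Since the core consults this callback per transfer, the driver could mix
modes within a single message.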
James Clark June 12, 2025, 3:37 p.m. UTC | #12
On 12/06/2025 4:36 pm, James Clark wrote:
> 
> 
> On 12/06/2025 3:36 pm, Vladimir Oltean wrote:
>> On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote:
>>>> FWIW, the XSPI FIFO performance should be higher.
>>>
>>> This leads me to realise a mistake in my original figures. My head was
>>> stuck in target mode where we use DMA so I forgot to force DMA in host
>>> mode to run the performance tests. The previous figures were all XSPI
>>> mode and the small difference in performance could have been just down
>>> to the layout of the code changing?
>>>
>>> Changing it to DMA mode gives figures that make much more sense:
>>>
>>> Coherent (4096 byte transfers): 6534 kbps
>>> Non-coherent:                   7347 kbps
>>>
>>> Coherent (16 byte transfers):    447 kbps
>>> Non-coherent:                    448 kbps
>>>
>>>
>>> Just for comparison running the same test in XSPI mode:
>>>
>>> 4096 byte transfers:            2143 kbps
>>> 16 byte transfers:               637 kbps
>>
>> So to be clear, the 'non-coherent' test was done just with patch 2
>> applied, or also with 3?
> 
> The whole set, and then the non-coherent patch reverted.
> 

And with DMA forced in host mode as a hack.
James Clark June 12, 2025, 3:47 p.m. UTC | #13
On 12/06/2025 3:43 pm, Mark Brown wrote:
> On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote:
>> On 12/06/2025 12:15 pm, Vladimir Oltean wrote:
> 
>>> FWIW, the XSPI FIFO performance should be higher.
> 
>> This leads me to realise a mistake in my original figures. My head was stuck
>> in target mode where we use DMA so I forgot to force DMA in host mode to run
>> the performance tests. The previous figures were all XSPI mode and the small
>> difference in performance could have been just down to the layout of the
>> code changing?
> 
>> Changing it to DMA mode gives figures that make much more sense:
> 
> If not having DMA mode is making this much of a difference shouldn't the
> driver just do that?  I'm not seeing runtime configuration in the driver
> so I guess this is local hacking...
> 

Yes, just changed locally.

>> So for small transfers XSPI is slightly better but for large ones DMA is
>> much better, with non-coherent memory giving another 800kbps gain. Perhaps
>> we could find the midpoint and then auto select the mode depending on the
>> size, but maybe there is latency to consider too which could be important.
> 
> This is a fairly normal pattern, it's a big part of why the can_dma()
> callback is per transfer - so you can do a copybreak and use PIO for
> smaller transfers where the overhead of setting up DMA is often more
> than the overhead of just doing PIO.

Makes sense. Although for some reason two devices already use DMA in
host mode, and it's not clear to me why.

Patch

diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c
index 386a17871e79..567632042f8f 100644
--- a/drivers/spi/spi-fsl-dspi.c
+++ b/drivers/spi/spi-fsl-dspi.c
@@ -247,6 +247,11 @@  struct fsl_dspi {
 	void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata);
 };
 
+static int dspi_dma_transfer_size(struct fsl_dspi *dspi)
+{
+	return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES;
+}
+
 static void dspi_native_host_to_dev(struct fsl_dspi *dspi, u32 *txdata)
 {
 	switch (dspi->oper_word_size) {
@@ -361,7 +366,10 @@  static void dspi_tx_dma_callback(void *arg)
 {
 	struct fsl_dspi *dspi = arg;
 	struct fsl_dspi_dma *dma = dspi->dma;
+	struct device *dev = &dspi->pdev->dev;
 
+	dma_sync_single_for_cpu(dev, dma->tx_dma_phys,
+				dspi_dma_transfer_size(dspi), DMA_TO_DEVICE);
 	complete(&dma->cmd_tx_complete);
 }
 
@@ -369,9 +377,13 @@  static void dspi_rx_dma_callback(void *arg)
 {
 	struct fsl_dspi *dspi = arg;
 	struct fsl_dspi_dma *dma = dspi->dma;
+	struct device *dev = &dspi->pdev->dev;
 	int i;
 
 	if (dspi->rx) {
+		dma_sync_single_for_cpu(dev, dma->rx_dma_phys,
+					dspi_dma_transfer_size(dspi),
+					DMA_FROM_DEVICE);
 		for (i = 0; i < dspi->words_in_flight; i++)
 			dspi_push_rx(dspi, dspi->dma->rx_dma_buf[i]);
 	}
@@ -381,6 +393,7 @@  static void dspi_rx_dma_callback(void *arg)
 
 static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi)
 {
+	size_t size = dspi_dma_transfer_size(dspi);
 	struct device *dev = &dspi->pdev->dev;
 	struct fsl_dspi_dma *dma = dspi->dma;
 	int time_left;
@@ -389,10 +402,9 @@  static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi)
 	for (i = 0; i < dspi->words_in_flight; i++)
 		dspi->dma->tx_dma_buf[i] = dspi_pop_tx_pushr(dspi);
 
+	dma_sync_single_for_device(dev, dma->tx_dma_phys, size, DMA_TO_DEVICE);
 	dma->tx_desc = dmaengine_prep_slave_single(dma->chan_tx,
-					dma->tx_dma_phys,
-					dspi->words_in_flight *
-					DMA_SLAVE_BUSWIDTH_4_BYTES,
+					dma->tx_dma_phys, size,
 					DMA_MEM_TO_DEV,
 					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!dma->tx_desc) {
@@ -407,10 +419,10 @@  static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi)
 		return -EINVAL;
 	}
 
+	dma_sync_single_for_device(dev, dma->rx_dma_phys, size,
+				   DMA_FROM_DEVICE);
 	dma->rx_desc = dmaengine_prep_slave_single(dma->chan_rx,
-					dma->rx_dma_phys,
-					dspi->words_in_flight *
-					DMA_SLAVE_BUSWIDTH_4_BYTES,
+					dma->rx_dma_phys, size,
 					DMA_DEV_TO_MEM,
 					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!dma->rx_desc) {
@@ -512,17 +524,17 @@  static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr)
 		goto err_tx_channel;
 	}
 
-	dma->tx_dma_buf = dma_alloc_coherent(dma->chan_tx->device->dev,
-					     dma_bufsize, &dma->tx_dma_phys,
-					     GFP_KERNEL);
+	dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev,
+						dma_bufsize, &dma->tx_dma_phys,
+						DMA_TO_DEVICE, GFP_KERNEL);
 	if (!dma->tx_dma_buf) {
 		ret = -ENOMEM;
 		goto err_tx_dma_buf;
 	}
 
-	dma->rx_dma_buf = dma_alloc_coherent(dma->chan_rx->device->dev,
-					     dma_bufsize, &dma->rx_dma_phys,
-					     GFP_KERNEL);
+	dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev,
+						dma_bufsize, &dma->rx_dma_phys,
+						DMA_FROM_DEVICE, GFP_KERNEL);
 	if (!dma->rx_dma_buf) {
 		ret = -ENOMEM;
 		goto err_rx_dma_buf;
@@ -557,11 +569,12 @@  static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr)
 	return 0;
 
 err_slave_config:
-	dma_free_coherent(dma->chan_rx->device->dev,
-			  dma_bufsize, dma->rx_dma_buf, dma->rx_dma_phys);
+	dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize,
+			     dma->rx_dma_buf, dma->rx_dma_phys,
+			     DMA_FROM_DEVICE);
 err_rx_dma_buf:
-	dma_free_coherent(dma->chan_tx->device->dev,
-			  dma_bufsize, dma->tx_dma_buf, dma->tx_dma_phys);
+	dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize,
+			     dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE);
 err_tx_dma_buf:
 	dma_release_channel(dma->chan_tx);
 err_tx_channel:
@@ -582,14 +595,16 @@  static void dspi_release_dma(struct fsl_dspi *dspi)
 		return;
 
 	if (dma->chan_tx) {
-		dma_free_coherent(dma->chan_tx->device->dev, dma_bufsize,
-				  dma->tx_dma_buf, dma->tx_dma_phys);
+		dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize,
+				     dma->tx_dma_buf, dma->tx_dma_phys,
+				     DMA_TO_DEVICE);
 		dma_release_channel(dma->chan_tx);
 	}
 
 	if (dma->chan_rx) {
-		dma_free_coherent(dma->chan_rx->device->dev, dma_bufsize,
-				  dma->rx_dma_buf, dma->rx_dma_phys);
+		dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize,
+				     dma->rx_dma_buf, dma->rx_dma_phys,
+				     DMA_FROM_DEVICE);
 		dma_release_channel(dma->chan_rx);
 	}
 }