Message ID | 20250609-james-nxp-spi-dma-v1-2-2b831e714be2@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | spi: spi-fsl-dspi: Target mode improvements | expand |
On 10/06/2025 9:26 am, Arnd Bergmann wrote: > On Mon, Jun 9, 2025, at 17:32, James Clark wrote: >> Using coherent memory here isn't functionally necessary. Because the >> change to use non-coherent memory isn't overly complex and only a few >> synchronization points are required, we might as well do it while fixing >> up some other DMA issues. >> >> Suggested-by: Arnd Bergmann <arnd@arndb.de> >> Signed-off-by: James Clark <james.clark@linaro.org> > > This version looks good to me, > > Acked-by: Arnd Bergmann <arnd@arndb.de> > > I had reviewed an internal version originally and had some comment > on that, all of which are addressed now. You did not Cc me on the > other patches, so I looked them up in the archive, Patch 3 also Yes sorry about that. I've just started using "b4 send" and I was under the impression that it would automatically CC all patches the same way, but apparently not. Maybe I'm holding it wrong. > looks good to me and complements this one (i.e. you really want > the combination). I did not understand the logic in patch 4, > and it would be good if someone else can take a closer look > at that in order to Ack that. > > Arnd
On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote: > Using coherent memory here isn't functionally necessary. > Because the > change to use non-coherent memory isn't overly complex and only a few > synchronization points are required, we might as well do it while fixing > up some other DMA issues. Any beanfit by use on-coherent memory here? Frank > > Suggested-by: Arnd Bergmann <arnd@arndb.de> > Signed-off-by: James Clark <james.clark@linaro.org> > --- > drivers/spi/spi-fsl-dspi.c | 55 +++++++++++++++++++++++++++++----------------- > 1 file changed, 35 insertions(+), 20 deletions(-) > > diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c > index 386a17871e79..567632042f8f 100644 > --- a/drivers/spi/spi-fsl-dspi.c > +++ b/drivers/spi/spi-fsl-dspi.c > @@ -247,6 +247,11 @@ struct fsl_dspi { > void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata); > }; > > +static int dspi_dma_transfer_size(struct fsl_dspi *dspi) > +{ > + return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES; > +} > + > static void dspi_native_host_to_dev(struct fsl_dspi *dspi, u32 *txdata) > { > switch (dspi->oper_word_size) { > @@ -361,7 +366,10 @@ static void dspi_tx_dma_callback(void *arg) > { > struct fsl_dspi *dspi = arg; > struct fsl_dspi_dma *dma = dspi->dma; > + struct device *dev = &dspi->pdev->dev; > > + dma_sync_single_for_cpu(dev, dma->tx_dma_phys, > + dspi_dma_transfer_size(dspi), DMA_TO_DEVICE); > complete(&dma->cmd_tx_complete); > } > > @@ -369,9 +377,13 @@ static void dspi_rx_dma_callback(void *arg) > { > struct fsl_dspi *dspi = arg; > struct fsl_dspi_dma *dma = dspi->dma; > + struct device *dev = &dspi->pdev->dev; > int i; > > if (dspi->rx) { > + dma_sync_single_for_cpu(dev, dma->rx_dma_phys, > + dspi_dma_transfer_size(dspi), > + DMA_FROM_DEVICE); > for (i = 0; i < dspi->words_in_flight; i++) > dspi_push_rx(dspi, dspi->dma->rx_dma_buf[i]); > } > @@ -381,6 +393,7 @@ static void dspi_rx_dma_callback(void *arg) > > static int 
dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) > { > + size_t size = dspi_dma_transfer_size(dspi); > struct device *dev = &dspi->pdev->dev; > struct fsl_dspi_dma *dma = dspi->dma; > int time_left; > @@ -389,10 +402,9 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) > for (i = 0; i < dspi->words_in_flight; i++) > dspi->dma->tx_dma_buf[i] = dspi_pop_tx_pushr(dspi); > > + dma_sync_single_for_device(dev, dma->tx_dma_phys, size, DMA_TO_DEVICE); > dma->tx_desc = dmaengine_prep_slave_single(dma->chan_tx, > - dma->tx_dma_phys, > - dspi->words_in_flight * > - DMA_SLAVE_BUSWIDTH_4_BYTES, > + dma->tx_dma_phys, size, > DMA_MEM_TO_DEV, > DMA_PREP_INTERRUPT | DMA_CTRL_ACK); > if (!dma->tx_desc) { > @@ -407,10 +419,10 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) > return -EINVAL; > } > > + dma_sync_single_for_device(dev, dma->rx_dma_phys, size, > + DMA_FROM_DEVICE); > dma->rx_desc = dmaengine_prep_slave_single(dma->chan_rx, > - dma->rx_dma_phys, > - dspi->words_in_flight * > - DMA_SLAVE_BUSWIDTH_4_BYTES, > + dma->rx_dma_phys, size, > DMA_DEV_TO_MEM, > DMA_PREP_INTERRUPT | DMA_CTRL_ACK); > if (!dma->rx_desc) { > @@ -512,17 +524,17 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) > goto err_tx_channel; > } > > - dma->tx_dma_buf = dma_alloc_coherent(dma->chan_tx->device->dev, > - dma_bufsize, &dma->tx_dma_phys, > - GFP_KERNEL); > + dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev, > + dma_bufsize, &dma->tx_dma_phys, > + DMA_TO_DEVICE, GFP_KERNEL); > if (!dma->tx_dma_buf) { > ret = -ENOMEM; > goto err_tx_dma_buf; > } > > - dma->rx_dma_buf = dma_alloc_coherent(dma->chan_rx->device->dev, > - dma_bufsize, &dma->rx_dma_phys, > - GFP_KERNEL); > + dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev, > + dma_bufsize, &dma->rx_dma_phys, > + DMA_FROM_DEVICE, GFP_KERNEL); > if (!dma->rx_dma_buf) { > ret = -ENOMEM; > goto err_rx_dma_buf; > @@ -557,11 +569,12 @@ static int dspi_request_dma(struct fsl_dspi 
*dspi, phys_addr_t phy_addr) > return 0; > > err_slave_config: > - dma_free_coherent(dma->chan_rx->device->dev, > - dma_bufsize, dma->rx_dma_buf, dma->rx_dma_phys); > + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, > + dma->rx_dma_buf, dma->rx_dma_phys, > + DMA_FROM_DEVICE); > err_rx_dma_buf: > - dma_free_coherent(dma->chan_tx->device->dev, > - dma_bufsize, dma->tx_dma_buf, dma->tx_dma_phys); > + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, > + dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE); > err_tx_dma_buf: > dma_release_channel(dma->chan_tx); > err_tx_channel: > @@ -582,14 +595,16 @@ static void dspi_release_dma(struct fsl_dspi *dspi) > return; > > if (dma->chan_tx) { > - dma_free_coherent(dma->chan_tx->device->dev, dma_bufsize, > - dma->tx_dma_buf, dma->tx_dma_phys); > + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, > + dma->tx_dma_buf, dma->tx_dma_phys, > + DMA_TO_DEVICE); > dma_release_channel(dma->chan_tx); > } > > if (dma->chan_rx) { > - dma_free_coherent(dma->chan_rx->device->dev, dma_bufsize, > - dma->rx_dma_buf, dma->rx_dma_phys); > + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, > + dma->rx_dma_buf, dma->rx_dma_phys, > + DMA_FROM_DEVICE); > dma_release_channel(dma->chan_rx); > } > } > > -- > 2.34.1 >
On 10/06/2025 4:15 pm, Frank Li wrote: > On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote: >> Using coherent memory here isn't functionally necessary. >> Because the >> change to use non-coherent memory isn't overly complex and only a few >> synchronization points are required, we might as well do it while fixing >> up some other DMA issues. > > Any beanfit by use on-coherent memory here? > > Frank > Presumably less cache maintenance traffic? Thanks James >> >> Suggested-by: Arnd Bergmann <arnd@arndb.de> >> Signed-off-by: James Clark <james.clark@linaro.org> >> --- >> drivers/spi/spi-fsl-dspi.c | 55 +++++++++++++++++++++++++++++----------------- >> 1 file changed, 35 insertions(+), 20 deletions(-) >> >> diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c >> index 386a17871e79..567632042f8f 100644 >> --- a/drivers/spi/spi-fsl-dspi.c >> +++ b/drivers/spi/spi-fsl-dspi.c >> @@ -247,6 +247,11 @@ struct fsl_dspi { >> void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata); >> }; >> >> +static int dspi_dma_transfer_size(struct fsl_dspi *dspi) >> +{ >> + return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES; >> +} >> + >> static void dspi_native_host_to_dev(struct fsl_dspi *dspi, u32 *txdata) >> { >> switch (dspi->oper_word_size) { >> @@ -361,7 +366,10 @@ static void dspi_tx_dma_callback(void *arg) >> { >> struct fsl_dspi *dspi = arg; >> struct fsl_dspi_dma *dma = dspi->dma; >> + struct device *dev = &dspi->pdev->dev; >> >> + dma_sync_single_for_cpu(dev, dma->tx_dma_phys, >> + dspi_dma_transfer_size(dspi), DMA_TO_DEVICE); >> complete(&dma->cmd_tx_complete); >> } >> >> @@ -369,9 +377,13 @@ static void dspi_rx_dma_callback(void *arg) >> { >> struct fsl_dspi *dspi = arg; >> struct fsl_dspi_dma *dma = dspi->dma; >> + struct device *dev = &dspi->pdev->dev; >> int i; >> >> if (dspi->rx) { >> + dma_sync_single_for_cpu(dev, dma->rx_dma_phys, >> + dspi_dma_transfer_size(dspi), >> + DMA_FROM_DEVICE); >> for (i = 0; i < dspi->words_in_flight; i++) >> 
dspi_push_rx(dspi, dspi->dma->rx_dma_buf[i]); >> } >> @@ -381,6 +393,7 @@ static void dspi_rx_dma_callback(void *arg) >> >> static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) >> { >> + size_t size = dspi_dma_transfer_size(dspi); >> struct device *dev = &dspi->pdev->dev; >> struct fsl_dspi_dma *dma = dspi->dma; >> int time_left; >> @@ -389,10 +402,9 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) >> for (i = 0; i < dspi->words_in_flight; i++) >> dspi->dma->tx_dma_buf[i] = dspi_pop_tx_pushr(dspi); >> >> + dma_sync_single_for_device(dev, dma->tx_dma_phys, size, DMA_TO_DEVICE); >> dma->tx_desc = dmaengine_prep_slave_single(dma->chan_tx, >> - dma->tx_dma_phys, >> - dspi->words_in_flight * >> - DMA_SLAVE_BUSWIDTH_4_BYTES, >> + dma->tx_dma_phys, size, >> DMA_MEM_TO_DEV, >> DMA_PREP_INTERRUPT | DMA_CTRL_ACK); >> if (!dma->tx_desc) { >> @@ -407,10 +419,10 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) >> return -EINVAL; >> } >> >> + dma_sync_single_for_device(dev, dma->rx_dma_phys, size, >> + DMA_FROM_DEVICE); >> dma->rx_desc = dmaengine_prep_slave_single(dma->chan_rx, >> - dma->rx_dma_phys, >> - dspi->words_in_flight * >> - DMA_SLAVE_BUSWIDTH_4_BYTES, >> + dma->rx_dma_phys, size, >> DMA_DEV_TO_MEM, >> DMA_PREP_INTERRUPT | DMA_CTRL_ACK); >> if (!dma->rx_desc) { >> @@ -512,17 +524,17 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) >> goto err_tx_channel; >> } >> >> - dma->tx_dma_buf = dma_alloc_coherent(dma->chan_tx->device->dev, >> - dma_bufsize, &dma->tx_dma_phys, >> - GFP_KERNEL); >> + dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev, >> + dma_bufsize, &dma->tx_dma_phys, >> + DMA_TO_DEVICE, GFP_KERNEL); >> if (!dma->tx_dma_buf) { >> ret = -ENOMEM; >> goto err_tx_dma_buf; >> } >> >> - dma->rx_dma_buf = dma_alloc_coherent(dma->chan_rx->device->dev, >> - dma_bufsize, &dma->rx_dma_phys, >> - GFP_KERNEL); >> + dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev, >> + dma_bufsize, 
&dma->rx_dma_phys, >> + DMA_FROM_DEVICE, GFP_KERNEL); >> if (!dma->rx_dma_buf) { >> ret = -ENOMEM; >> goto err_rx_dma_buf; >> @@ -557,11 +569,12 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) >> return 0; >> >> err_slave_config: >> - dma_free_coherent(dma->chan_rx->device->dev, >> - dma_bufsize, dma->rx_dma_buf, dma->rx_dma_phys); >> + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, >> + dma->rx_dma_buf, dma->rx_dma_phys, >> + DMA_FROM_DEVICE); >> err_rx_dma_buf: >> - dma_free_coherent(dma->chan_tx->device->dev, >> - dma_bufsize, dma->tx_dma_buf, dma->tx_dma_phys); >> + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, >> + dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE); >> err_tx_dma_buf: >> dma_release_channel(dma->chan_tx); >> err_tx_channel: >> @@ -582,14 +595,16 @@ static void dspi_release_dma(struct fsl_dspi *dspi) >> return; >> >> if (dma->chan_tx) { >> - dma_free_coherent(dma->chan_tx->device->dev, dma_bufsize, >> - dma->tx_dma_buf, dma->tx_dma_phys); >> + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, >> + dma->tx_dma_buf, dma->tx_dma_phys, >> + DMA_TO_DEVICE); >> dma_release_channel(dma->chan_tx); >> } >> >> if (dma->chan_rx) { >> - dma_free_coherent(dma->chan_rx->device->dev, dma_bufsize, >> - dma->rx_dma_buf, dma->rx_dma_phys); >> + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, >> + dma->rx_dma_buf, dma->rx_dma_phys, >> + DMA_FROM_DEVICE); >> dma_release_channel(dma->chan_rx); >> } >> } >> >> -- >> 2.34.1 >>
On Tue, Jun 10, 2025, at 17:15, Frank Li wrote: > On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote: >> Using coherent memory here isn't functionally necessary. >> Because the >> change to use non-coherent memory isn't overly complex and only a few >> synchronization points are required, we might as well do it while fixing >> up some other DMA issues. > > Any beanfit by use on-coherent memory here? The driver copies data in and out of a coherent buffer by default. This is fine if the buffer is only a few bytes in size, but for large transfers this is quite slow because this bypasses the cache for any DMA master that is marked as not "dma-coherent" in devicetree. Patch 3/4 changes the size from a few bytes to many pages of memory, so it's better to access the buffer in cache first and manually maintain coherency. Arnd
On Tue, Jun 10, 2025 at 05:48:40PM +0200, Arnd Bergmann wrote: > On Tue, Jun 10, 2025, at 17:15, Frank Li wrote: > > On Mon, Jun 09, 2025 at 04:32:39PM +0100, James Clark wrote: > >> Using coherent memory here isn't functionally necessary. > >> Because the > >> change to use non-coherent memory isn't overly complex and only a few > >> synchronization points are required, we might as well do it while fixing > >> up some other DMA issues. > > > > Any beanfit by use on-coherent memory here? > > The driver copies data in and out of a coherent buffer by default. This is > fine if the buffer is only a few bytes in size, but for large transfers > this is quite slow because this bypasses the cache for any DMA master > that is marked as not "dma-coherent" in devicetree. I see, non-coherent memory uses cacheable normal memory pages, but coherent memory uses non-cacheable normal memory pages. Can you add performance benefit information about using non-coherent memory to the commit message, to let reviewers easily know your intention. Frank > > Patch 3/4 changes the size from a few bytes to many pages of memory, > so it's access the buffer in cache first and manually maintain > coherency. > > Arnd
On Tue, Jun 10, 2025 at 11:56:34AM -0400, Frank Li wrote: > Can you add performance beneafit information after use non-coherent memory > in commit message to let reviewer easily know your intention. To expand on that, you can post the output of something like this (before and after): $ spidev_test --device /dev/spidev1.0 --bpw 8 --size 256 --cpha --iter 10000000 --speed 10000000 where /dev/spidev1.0 is an unconnected chip select with a dummy entry in the device tree.
On Thu, Jun 12, 2025 at 12:05:26PM +0100, James Clark wrote: > (No idea why it goes faster when it's under load, but I hope that can be > ignored for this test) Might be because of dynamic CPU frequency scaling as done by the governor. If the CPU utilization of spidev_test isn't high enough, the governor will prefer lower CPU frequencies. You can try to repeat the test with the "performance" governor and/or setting the min frequency equal to the max one. That's why I don't like the DMA mode in DSPI, it's still CPU-bound, because the DMA buffers are very small (you can only provide one TX FIFO worth of data per DMA transfer, rather than the whole buffer). FWIW, the XSPI FIFO performance should be higher.
On 12/06/2025 12:15 pm, Vladimir Oltean wrote: > On Thu, Jun 12, 2025 at 12:05:26PM +0100, James Clark wrote: >> (No idea why it goes faster when it's under load, but I hope that can be >> ignored for this test) > > Might be because of dynamic CPU frequency scaling as done by the governor. > If the CPU utilization of spidev_test isn't high enough, the governor > will prefer lower CPU frequencies. You can try to repeat the test with > the "performance" governor and/or setting the min frequency equal to the > max one. > That doesn't seem to make a difference, I get the same results with this. Even for the "fixed" DMA test results below there is a similar small performance increase when stressing the system: # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq 1300000 ... # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq 1300000 ... # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor performance ... > That's why I don't like the DMA mode in DSPI, it's still CPU-bound, > because the DMA buffers are very small (you can only provide one TX FIFO > worth of data per DMA transfer, rather than the whole buffer). Is that right? The FIFO size isn't used in any of the DMA codepaths, it looks like the whole DMA buffer is filled before initiating the transfer. And we increase the buffer to 4k in this patchset to fully use the existing allocation. > > FWIW, the XSPI FIFO performance should be higher. This leads me to realise a mistake in my original figures. My head was stuck in target mode where we use DMA so I forgot to force DMA in host mode to run the performance tests. The previous figures were all XSPI mode and the small difference in performance could have been just down to the layout of the code changing? 
Changing it to DMA mode gives figures that make much more sense: Coherent (4096 byte transfers): 6534 kbps Non-coherent: 7347 kbps Coherent (16 byte transfers): 447 kbps Non-coherent: 448 kbps Just for comparison running the same test in XSPI mode: 4096 byte transfers: 2143 kbps 16 byte transfers: 637 kbps So for small transfers XSPI is slightly better but for large ones DMA is much better, with non-coherent memory giving another 800kbps gain. Perhaps we could find the midpoint and then auto select the mode depending on the size, but maybe there is latency to consider too which could be important.
On 12/06/2025 3:23 pm, Vladimir Oltean wrote: > On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote: >>> That's why I don't like the DMA mode in DSPI, it's still CPU-bound, >>> because the DMA buffers are very small (you can only provide one TX FIFO >>> worth of data per DMA transfer, rather than the whole buffer). >> >> Is that right? The FIFO size isn't used in any of the DMA codepaths, it >> looks like the whole DMA buffer is filled before initiating the transfer. >> And we increase the buffer to 4k in this patchset to fully use the existing >> allocation. > > Uhm, yeah, no? > > dspi_dma_xfer(): > > while (dspi->len) { > dspi->words_in_flight = dspi->len / dspi->oper_word_size; > if (dspi->words_in_flight > dspi->devtype_data->fifo_size) > dspi->words_in_flight = dspi->devtype_data->fifo_size; > dspi_next_xfer_dma_submit(); > } Right but that's before the change in this patchset to use the whole page that was allocated, hence the next bit: > And we increase the buffer to 4k in this patchset to fully use the existing allocation. We were allocating for the size of the FIFO (multiplied by two to hold the control words), but dma_alloc_coherent() will be backed by a whole page anyway, even if you only ask for a few bytes. After changing that to make use of the full allocation the FIFO length is no longer involved.
On 12/06/2025 3:31 pm, Vladimir Oltean wrote: > On Thu, Jun 12, 2025 at 03:28:37PM +0100, James Clark wrote: >> >> >> On 12/06/2025 3:23 pm, Vladimir Oltean wrote: >>> On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote: >>>>> That's why I don't like the DMA mode in DSPI, it's still CPU-bound, >>>>> because the DMA buffers are very small (you can only provide one TX FIFO >>>>> worth of data per DMA transfer, rather than the whole buffer). >>>> >>>> Is that right? The FIFO size isn't used in any of the DMA codepaths, it >>>> looks like the whole DMA buffer is filled before initiating the transfer. >>>> And we increase the buffer to 4k in this patchset to fully use the existing >>>> allocation. >>> >>> Uhm, yeah, no? >>> >>> dspi_dma_xfer(): >>> >>> while (dspi->len) { >>> dspi->words_in_flight = dspi->len / dspi->oper_word_size; >>> if (dspi->words_in_flight > dspi->devtype_data->fifo_size) >>> dspi->words_in_flight = dspi->devtype_data->fifo_size; >>> dspi_next_xfer_dma_submit(); >>> } >> >> Right but that's before the change in this patchset to use the whole page >> that was allocated, hence the next bit: >> >>> And we increase the buffer to 4k in this patchset to fully use the >> existing allocation. >> >> We were allocating for the size of the FIFO (multiplied by two to hold the >> control words), but dma_alloc_coherent() will be backed by a whole page >> anyway, even if you only ask for a few bytes. >> >> After changing that to make use of the full allocation the FIFO length is no >> longer involved. > > Ok, I haven't walked through patch 3 yet, I didn't realize it would be > changing that. I will want to test it on LS1028A. Yeah testing it somewhere else would be good. Maybe there is some limitation there about the max size of the DMA transfer, but I didn't see it. I realise the tense in my original message may have been a bit confusing. I was mixing up the existing code and proposed changes...
On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote: > On 12/06/2025 12:15 pm, Vladimir Oltean wrote: > > FWIW, the XSPI FIFO performance should be higher. > This leads me to realise a mistake in my original figures. My head was stuck > in target mode where we use DMA so I forgot to force DMA in host mode to run > the performance tests. The previous figures were all XSPI mode and the small > difference in performance could have been just down to the layout of the > code changing? > Changing it to DMA mode gives figures that make much more sense: If not having DMA mode is making this much of a difference shouldn't the driver just do that? I'm not seeing runtime configuration in the driver so I guess this is local hacking... > So for small transfers XSPI is slightly better but for large ones DMA is > much better, with non-coherent memory giving another 800kbps gain. Perhaps > we could find the midpoint and then auto select the mode depending on the > size, but maybe there is latency to consider too which could be important. This is a fairly normal pattern, it's a big part of why the can_dma() callback is per transfer - so you can do a copybreak and use PIO for smaller transfers where the overhead of setting up DMA is often more than the overhead of just doing PIO.
On 12/06/2025 4:36 pm, James Clark wrote: > > > On 12/06/2025 3:36 pm, Vladimir Oltean wrote: >> On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote: >>>> FWIW, the XSPI FIFO performance should be higher. >>> >>> This leads me to realise a mistake in my original figures. My head >>> was stuck >>> in target mode where we use DMA so I forgot to force DMA in host mode >>> to run >>> the performance tests. The previous figures were all XSPI mode and >>> the small >>> difference in performance could have been just down to the layout of the >>> code changing? >>> >>> Changing it to DMA mode gives figures that make much more sense: >>> >>> Coherent (4096 byte transfers): 6534 kbps >>> Non-coherent: 7347 kbps >>> >>> Coherent (16 byte transfers): 447 kbps >>> Non-coherent: 448 kbps >>> >>> >>> Just for comparison running the same test in XSPI mode: >>> >>> 4096 byte transfers: 2143 kbps >>> 16 byte transfers: 637 kbps >> >> So to be clear, the 'non-coherent' test was done just with patch 2 >> applied, or also with 3? > > The whole set, and then the non-coherent patch reverted. > And with DMA forced in host mode as a hack.
On 12/06/2025 3:43 pm, Mark Brown wrote: > On Thu, Jun 12, 2025 at 03:14:32PM +0100, James Clark wrote: >> On 12/06/2025 12:15 pm, Vladimir Oltean wrote: > >>> FWIW, the XSPI FIFO performance should be higher. > >> This leads me to realise a mistake in my original figures. My head was stuck >> in target mode where we use DMA so I forgot to force DMA in host mode to run >> the performance tests. The previous figures were all XSPI mode and the small >> difference in performance could have been just down to the layout of the >> code changing? > >> Changing it to DMA mode gives figures that make much more sense: > > If not having DMA mode is making this much of a difference shouldn't the > driver just do that? I'm not seeing runtime configuration in the driver > so I guess this is local hacking... > Yes just changed locally. >> So for small transfers XSPI is slightly better but for large ones DMA is >> much better, with non-coherent memory giving another 800kbps gain. Perhaps >> we could find the midpoint and then auto select the mode depending on the >> size, but maybe there is latency to consider too which could be important. > > This is a fairly normal pattern, it's a big part of why the can_dma() > callback is per transfer - so you can do a copybreak and use PIO for > smaller transfers where the overhead of setting up DMA is often more > than the overhead of just doing PIO. Makes sense. Although for some reason two devices already use DMA for host mode and it's not that clear to me what the reason is.
diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 386a17871e79..567632042f8f 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -247,6 +247,11 @@ struct fsl_dspi { void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata); }; +static int dspi_dma_transfer_size(struct fsl_dspi *dspi) +{ + return dspi->words_in_flight * DMA_SLAVE_BUSWIDTH_4_BYTES; +} + static void dspi_native_host_to_dev(struct fsl_dspi *dspi, u32 *txdata) { switch (dspi->oper_word_size) { @@ -361,7 +366,10 @@ static void dspi_tx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; struct fsl_dspi_dma *dma = dspi->dma; + struct device *dev = &dspi->pdev->dev; + dma_sync_single_for_cpu(dev, dma->tx_dma_phys, + dspi_dma_transfer_size(dspi), DMA_TO_DEVICE); complete(&dma->cmd_tx_complete); } @@ -369,9 +377,13 @@ static void dspi_rx_dma_callback(void *arg) { struct fsl_dspi *dspi = arg; struct fsl_dspi_dma *dma = dspi->dma; + struct device *dev = &dspi->pdev->dev; int i; if (dspi->rx) { + dma_sync_single_for_cpu(dev, dma->rx_dma_phys, + dspi_dma_transfer_size(dspi), + DMA_FROM_DEVICE); for (i = 0; i < dspi->words_in_flight; i++) dspi_push_rx(dspi, dspi->dma->rx_dma_buf[i]); } @@ -381,6 +393,7 @@ static void dspi_rx_dma_callback(void *arg) static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) { + size_t size = dspi_dma_transfer_size(dspi); struct device *dev = &dspi->pdev->dev; struct fsl_dspi_dma *dma = dspi->dma; int time_left; @@ -389,10 +402,9 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) for (i = 0; i < dspi->words_in_flight; i++) dspi->dma->tx_dma_buf[i] = dspi_pop_tx_pushr(dspi); + dma_sync_single_for_device(dev, dma->tx_dma_phys, size, DMA_TO_DEVICE); dma->tx_desc = dmaengine_prep_slave_single(dma->chan_tx, - dma->tx_dma_phys, - dspi->words_in_flight * - DMA_SLAVE_BUSWIDTH_4_BYTES, + dma->tx_dma_phys, size, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!dma->tx_desc) { @@ -407,10 +419,10 @@ static int 
dspi_next_xfer_dma_submit(struct fsl_dspi *dspi) return -EINVAL; } + dma_sync_single_for_device(dev, dma->rx_dma_phys, size, + DMA_FROM_DEVICE); dma->rx_desc = dmaengine_prep_slave_single(dma->chan_rx, - dma->rx_dma_phys, - dspi->words_in_flight * - DMA_SLAVE_BUSWIDTH_4_BYTES, + dma->rx_dma_phys, size, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!dma->rx_desc) { @@ -512,17 +524,17 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) goto err_tx_channel; } - dma->tx_dma_buf = dma_alloc_coherent(dma->chan_tx->device->dev, - dma_bufsize, &dma->tx_dma_phys, - GFP_KERNEL); + dma->tx_dma_buf = dma_alloc_noncoherent(dma->chan_tx->device->dev, + dma_bufsize, &dma->tx_dma_phys, + DMA_TO_DEVICE, GFP_KERNEL); if (!dma->tx_dma_buf) { ret = -ENOMEM; goto err_tx_dma_buf; } - dma->rx_dma_buf = dma_alloc_coherent(dma->chan_rx->device->dev, - dma_bufsize, &dma->rx_dma_phys, - GFP_KERNEL); + dma->rx_dma_buf = dma_alloc_noncoherent(dma->chan_rx->device->dev, + dma_bufsize, &dma->rx_dma_phys, + DMA_FROM_DEVICE, GFP_KERNEL); if (!dma->rx_dma_buf) { ret = -ENOMEM; goto err_rx_dma_buf; @@ -557,11 +569,12 @@ static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr) return 0; err_slave_config: - dma_free_coherent(dma->chan_rx->device->dev, - dma_bufsize, dma->rx_dma_buf, dma->rx_dma_phys); + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, + dma->rx_dma_buf, dma->rx_dma_phys, + DMA_FROM_DEVICE); err_rx_dma_buf: - dma_free_coherent(dma->chan_tx->device->dev, - dma_bufsize, dma->tx_dma_buf, dma->tx_dma_phys); + dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, + dma->tx_dma_buf, dma->tx_dma_phys, DMA_TO_DEVICE); err_tx_dma_buf: dma_release_channel(dma->chan_tx); err_tx_channel: @@ -582,14 +595,16 @@ static void dspi_release_dma(struct fsl_dspi *dspi) return; if (dma->chan_tx) { - dma_free_coherent(dma->chan_tx->device->dev, dma_bufsize, - dma->tx_dma_buf, dma->tx_dma_phys); + 
dma_free_noncoherent(dma->chan_tx->device->dev, dma_bufsize, + dma->tx_dma_buf, dma->tx_dma_phys, + DMA_TO_DEVICE); dma_release_channel(dma->chan_tx); } if (dma->chan_rx) { - dma_free_coherent(dma->chan_rx->device->dev, dma_bufsize, - dma->rx_dma_buf, dma->rx_dma_phys); + dma_free_noncoherent(dma->chan_rx->device->dev, dma_bufsize, + dma->rx_dma_buf, dma->rx_dma_phys, + DMA_FROM_DEVICE); dma_release_channel(dma->chan_rx); } }
Using coherent memory here isn't functionally necessary. Because the change to use non-coherent memory isn't overly complex and only a few synchronization points are required, we might as well do it while fixing up some other DMA issues. Suggested-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: James Clark <james.clark@linaro.org> --- drivers/spi/spi-fsl-dspi.c | 55 +++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 20 deletions(-)