| Message ID | 20250513092803.2096-1-tao.wangtao@honor.com |
|------------|---------------------------------------------|
| State      | Superseded                                  |
| Series     | [1/2] dmabuf: add DMA_BUF_IOCTL_RW_FILE     |
On 5/13/25 14:30, wangtao wrote:
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Tuesday, May 13, 2025 7:32 PM
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> On 5/13/25 11:28, wangtao wrote:
>>> Support direct file I/O operations for system_heap dma-buf objects.
>>> Implementation includes:
>>> 1. Convert sg_table to bio_vec
>>
>> That is usually illegal for DMA-bufs.
> [wangtao] The term 'convert' is misleading in this context. The appropriate phrasing should be: Construct bio_vec from sg_table.

Well, it doesn't matter what you call it. Touching the pages inside an sg_table of a DMA-buf is illegal; we even have code to actively prevent that.

Once more: this approach was already rejected multiple times! Please use udmabuf instead!

The hack you came up with here is simply not necessary.

Regards,
Christian.

> Appreciate your feedback.
>>
>> Regards,
>> Christian.
>>
>>> 2. Set IOCB_DIRECT when O_DIRECT is supported
>>> 3. Invoke vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O
>>> [...]
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Tuesday, May 13, 2025 9:18 PM
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/13/25 14:30, wangtao wrote:
> >>> On 5/13/25 11:28, wangtao wrote:
> >>>> Support direct file I/O operations for system_heap dma-buf objects.
> >>>> Implementation includes:
> >>>> 1. Convert sg_table to bio_vec
> >>>
> >>> That is usually illegal for DMA-bufs.
> >> [wangtao] The term 'convert' is misleading in this context. The appropriate phrasing should be: Construct bio_vec from sg_table.
> >
> > Well, it doesn't matter what you call it. Touching the pages inside an sg_table of a DMA-buf is illegal; we even have code to actively prevent that.

[wangtao] For a driver using a DMA-buf: don't touch the pages in the sg_table. But the system heap exporter (the sg_table owner) should be allowed to use them.
If a driver takes ownership via dma_buf_map_attachment() or similar calls, the exporter must stop using the sg_table.
User-space programs should call DMA_BUF_IOCTL_RW_FILE only while the DMA-buf is not attached.
The exporter must check ownership (e.g., ensure no map_dma_buf/vmap is active) and block new calls while the I/O is in flight.
I'll add these checks in patch v2.

> Once more: this approach was already rejected multiple times! Please use udmabuf instead!
>
> The hack you came up with here is simply not necessary.

[wangtao] Many people need DMA-buf direct I/O. I tried it two years ago. My method is simpler, uses less CPU and power, and performs better:
- Speed: 3418 MB/s vs. 2073 MB/s (udmabuf) at 1GHz CPU.
- udmabuf wastes half its CPU time in __get_user_pages.
- Creating 32x32MB DMA-bufs and reading a 1GB file takes 346 ms total, vs. 1145 ms with udmabuf (whose allocation alone is ~10x slower) and 1503 ms with a normal buffered DMA-buf read.
udmabuf direct I/O is somewhat faster than a buffered DMA-buf read, but not by enough. Switching to udmabuf is easy for small apps but hard in complex systems, without major benefits.

> Regards,
> Christian.
>
> > Appreciate your feedback.
> >> [...]
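A minimal sketch of the ownership check wangtao describes, written against the fields system_heap_buffer already has (attachments, vmap_cnt, lock); the rw_busy flag and both helper names are hypothetical additions, not part of the posted patch:

/* Hypothetical sketch, not part of the posted patch: refuse the file I/O
 * ioctl while any importer still has the buffer attached or vmapped, and
 * mark the buffer busy so new map_dma_buf()/vmap() calls can be rejected
 * until the I/O completes. "rw_busy" is an assumed new field; "attachments",
 * "vmap_cnt" and "lock" already exist in struct system_heap_buffer.
 */
static int system_heap_claim_for_rw(struct system_heap_buffer *buffer)
{
	mutex_lock(&buffer->lock);
	if (!list_empty(&buffer->attachments) || buffer->vmap_cnt > 0) {
		mutex_unlock(&buffer->lock);
		return -EBUSY;		/* an importer currently owns the memory */
	}
	buffer->rw_busy = true;		/* would be checked by map_dma_buf()/vmap() */
	mutex_unlock(&buffer->lock);
	return 0;
}

static void system_heap_release_after_rw(struct system_heap_buffer *buffer)
{
	mutex_lock(&buffer->lock);
	buffer->rw_busy = false;
	mutex_unlock(&buffer->lock);
}

For the exclusion to hold, map_dma_buf()/vmap() would have to test rw_busy under the same lock and fail while it is set.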
On 5/14/25 13:02, wangtao wrote:
>> -----Original Message-----
>> From: Christian König <christian.koenig@amd.com>
>> Sent: Tuesday, May 13, 2025 9:18 PM
>> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
>>
>> Well, it doesn't matter what you call it. Touching the pages inside an sg_table of a DMA-buf is illegal; we even have code to actively prevent that.
> [wangtao] For a driver using a DMA-buf: don't touch the pages in the sg_table. But the system heap exporter (the sg_table owner) should be allowed to use them.

Good point, that might be possible.

> If a driver takes ownership via dma_buf_map_attachment() or similar calls, the exporter must stop using the sg_table.
> User-space programs should call DMA_BUF_IOCTL_RW_FILE only while the DMA-buf is not attached.
> The exporter must check ownership (e.g., ensure no map_dma_buf/vmap is active) and block new calls while the I/O is in flight.
> I'll add these checks in patch v2.
>
>> Once more: this approach was already rejected multiple times! Please use udmabuf instead!
>>
>> The hack you came up with here is simply not necessary.
> [wangtao] Many people need DMA-buf direct I/O. I tried it two years ago. My method is simpler, uses less CPU and power, and performs better:

I don't think that this is a valid argument.

> - Speed: 3418 MB/s vs. 2073 MB/s (udmabuf) at 1GHz CPU.
> - udmabuf wastes half its CPU time in __get_user_pages.
> - Creating 32x32MB DMA-bufs and reading a 1GB file takes 346 ms total, vs. 1145 ms with udmabuf (whose allocation alone is ~10x slower) and 1503 ms with a normal buffered DMA-buf read.

Why would using udmabuf be slower here?

> udmabuf direct I/O is somewhat faster than a buffered DMA-buf read, but not by enough. Switching to udmabuf is easy for small apps but hard in complex systems, without major benefits.

Yeah, but your approach here is a rather clear hack. Using udmabuf is much cleaner and generally accepted by everybody now.

As far as I can see I have to reject your approach here.

Regards,
Christian.
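For reference, the udmabuf route suggested here looks roughly like the following from user space; this is a sketch with error and short-read handling omitted, and the file path and size are examples:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/udmabuf.h>

/* Sketch: back a dma-buf with memfd pages via /dev/udmabuf, then O_DIRECT
 * read the file straight into those pages. "size" must be page aligned
 * (both udmabuf and O_DIRECT need that). */
static int udmabuf_direct_read(const char *path, size_t size)
{
	int memfd = memfd_create("udmabuf-backing", MFD_ALLOW_SEALING);
	int dev, dmabuf_fd, data_fd;
	void *buf;

	ftruncate(memfd, size);
	fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);	/* udmabuf requires this seal */

	struct udmabuf_create create = {
		.memfd  = memfd,
		.flags  = UDMABUF_FLAGS_CLOEXEC,
		.offset = 0,
		.size   = size,
	};
	dev = open("/dev/udmabuf", O_RDWR);
	dmabuf_fd = ioctl(dev, UDMABUF_CREATE, &create);	/* returns the dma-buf fd */

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
	data_fd = open(path, O_RDONLY | O_DIRECT);
	read(data_fd, buf, size);	/* this is where GUP pins the shmem pages */

	close(data_fd);
	close(dev);
	close(memfd);
	return dmabuf_fd;
}

The read() into the mapped memfd is the path that shows up as pin_user_pages_fast/__get_user_pages in the profiles wangtao posts below.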
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Wednesday, May 14, 2025 8:00 PM
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/14/25 13:02, wangtao wrote:
> > [wangtao] For a driver using a DMA-buf: don't touch the pages in the sg_table. But the system heap exporter (the sg_table owner) should be allowed to use them.
>
> Good point, that might be possible.
>
> > [wangtao] Many people need DMA-buf direct I/O. I tried it two years ago. My method is simpler, uses less CPU and power, and performs better:
>
> I don't think that this is a valid argument.
>
> > - Speed: 3418 MB/s vs. 2073 MB/s (udmabuf) at 1GHz CPU.
> > - udmabuf wastes half its CPU time in __get_user_pages.
> > - Creating 32x32MB DMA-bufs and reading a 1GB file takes 346 ms total, vs. 1145 ms with udmabuf (whose allocation alone is ~10x slower) and 1503 ms with a normal buffered DMA-buf read.
>
> Why would using udmabuf be slower here?
>
> > udmabuf direct I/O is somewhat faster than a buffered DMA-buf read, but not by enough. Switching to udmabuf is easy for small apps but hard in complex systems, without major benefits.
>
> Yeah, but your approach here is a rather clear hack. Using udmabuf is much cleaner and generally accepted by everybody now.
>
> As far as I can see I have to reject your approach here.

[wangtao] My test configuration (CPU at 1GHz, 5-test average):

Allocation: 32x32MB buffer creation
- dmabuf 53 ms vs. udmabuf 694 ms (10x slower)
- Note: shmem shows excessive allocation time

Read 1024MB file:
- dmabuf direct 326 ms vs. udmabuf direct 461 ms (40% slower)
- Note: pin_user_pages_fast consumes the majority of CPU cycles

Key function call timing: see details below.

dmabuf direct I/O:
|- 12.39% DmaBufTest_PerfDmabufDirectIO_Test::TestBody()
|-|- 5.95% perf_dmabuf_alloc_and_io
|-|-|- 4.38% dmabuf_io_back
|-|-|-|- 3.47% ioctl
|-|-|-|-|- 3.47% __ioctl
|-|-|-|-|-|- 3.47% dma_buf_ioctl
|-|-|-|-|-|-|- 3.47% system_heap_dma_buf_rw_file
|-|-|-|-|-|-|-|- 3.46% system_heap_rw_file
|-|-|-|-|-|-|-|-|- 3.46% f2fs_file_read_iter
|-|-|-|-|-|-|-|-|-|- 3.46% __iomap_dio_rw
|-|-|- 1.33% ioctl
|-|-|-|- 1.33% __ioctl
|-|-|-|-|- 1.33% dma_heap_ioctl
|-|-|-|-|-|- 1.33% dma_heap_buffer_alloc
|-|-|-|-|-|-|- 1.33% system_heap_allocate
|-|-|-|-|-|-|-|- 1.33% system_heap_do_allocate
|-|-|-|-|-|-|-|-|- 1.28% __alloc_pages

udmabuf direct I/O:
|- 39.35% DmaBufTest_PerfDmabufUDirectIO_Test::TestBody()
|-|- 32.76% perf_dmabuf_alloc_and_io
|-|-|- 17.43% alloc_udmabuf
|-|-|-|- 13.34% ioctl
|-|-|-|-|- 13.34% udmabuf_ioctl
|-|-|-|-|-|- 13.32% udmabuf_create
|-|-|-|-|-|-|- 13.26% shmem_read_mapping_page_gfp
|-|-|-|-|-|-|-|- 11.94% shmem_read_folio_gfp
|-|-|-|-|-|-|-|-|- 11.28% shmem_get_folio_gfp
|-|-|- 10.81% dmabuf_io_back
|-|-|-|- 8.85% read
|-|-|-|-|- 8.85% __arm64_sys_read
|-|-|-|-|-|- 8.85% f2fs_file_read_iter
|-|-|-|-|-|-|- 8.84% __iomap_dio_rw
|-|-|-|-|-|-|-|- 7.85% iomap_dio_bio_iter
|-|-|-|-|-|-|-|-|- 5.61% bio_iov_iter_get_pages
|-|-|-|-|-|-|-|-|-|- 5.46% iov_iter_extract_pages
|-|-|-|-|-|-|-|-|-|-|- 5.46% pin_user_pages_fast
|-|-|-|-|-|-|-|-|-|-|-|- 5.46% internal_get_user_pages_fast
|-|-|-|-|-|-|-|-|-|-|-|-|- 5.46% __gup_longterm_locked
|-|-|-|-|-|-|-|-|-|-|-|-|-|- 5.36% __get_user_pages

> Regards,
> Christian.
>
> [...]
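For context, the dmabuf allocation being timed above goes through the dma-heap uapi, roughly as in this sketch (/dev/dma_heap/system is the node the system heap registers; error paths are minimal):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dma-heap.h>

/* Allocate one dma-buf of 'len' bytes from the system heap and return its fd.
 * This is the allocation path compared against udmabuf creation above. */
static int system_heap_alloc(size_t len)
{
	struct dma_heap_allocation_data data = {
		.len = len,
		.fd_flags = O_RDWR | O_CLOEXEC,
		.heap_flags = 0,
	};
	int heap = open("/dev/dma_heap/system", O_RDONLY | O_CLOEXEC);
	int ret;

	if (heap < 0)
		return -1;
	ret = ioctl(heap, DMA_HEAP_IOCTL_ALLOC, &data);
	close(heap);
	return ret < 0 ? -1 : (int)data.fd;
}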
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Friday, May 16, 2025 4:36 PM
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On 5/16/25 09:40, wangtao wrote:
> >> On 5/15/25 16:03, wangtao wrote:
> >>> [wangtao] My test configuration (CPU at 1GHz, 5-test average):
> >>> Allocation: 32x32MB buffer creation
> >>> - dmabuf 53 ms vs. udmabuf 694 ms (10x slower)
> >>> - Note: shmem shows excessive allocation time
> >>
> >> Yeah, that is something already noted by others as well. But that is orthogonal.
> >>
> >>> Read 1024MB file:
> >>> - dmabuf direct 326 ms vs. udmabuf direct 461 ms (40% slower)
> >>> - Note: pin_user_pages_fast consumes the majority of CPU cycles
> >>>
> >>> Key function call timing: see details below.
> >>
> >> Those aren't valid, you are comparing different functionalities here.
> >>
> >> Please try using udmabuf with sendfile() as confirmed to be working by T.J.
> > [wangtao] Using buffered I/O with dmabuf file read/write requires one memory copy.
> > Direct I/O removes this copy and enables zero-copy. The sendfile system call reduces memory copies from two (read/write) to one. However, with udmabuf, sendfile still keeps at least one copy, so it is not zero-copy.
>
> Then please work on fixing this.

[wangtao] What needs fixing? Does sendfile achieve zero-copy?
sendfile reduces memory copies (from two to one) for network sockets, but it still requires one copy and cannot reach zero copies.

> Regards,
> Christian.
>
> > If udmabuf sendfile uses buffered I/O (the file page cache), read latency matches dmabuf buffered reads, but allocation time is much longer.
> > With direct I/O, the default 16-page pipe size makes it slower than buffered I/O.
> >
> > Test data shows:
> > udmabuf direct read is much faster than udmabuf sendfile.
> > dmabuf direct read outperforms udmabuf direct read by a large margin.
> >
> > Issue: after a udmabuf is mapped via map_dma_buf, apps using the memfd or udmabuf for direct I/O might cause errors, but there are no safeguards to prevent this.
> >
> > Allocate 32x32MB buffers and read a 1024MB file:
> >
> > Metric                  | alloc (ms) | read (ms) | total (ms)
> > ------------------------|------------|-----------|-----------
> > udmabuf buffer read     |        539 |      2017 |       2555
> > udmabuf direct read     |        522 |       658 |       1179
> > udmabuf buffer sendfile |        505 |      1040 |       1546
> > udmabuf direct sendfile |        510 |      2269 |       2780
> > dmabuf buffer read      |         51 |      1068 |       1118
> > dmabuf direct read      |         52 |       297 |        349
> >
> > udmabuf sendfile test steps:
> > 1. Open the data file (1024MB), get back_fd
> > 2. Create a memfd (32MB)      # loop steps 2-6
> > 3. Allocate a udmabuf from the memfd
> > 4. Call sendfile(memfd, back_fd)
> > 5. Close the memfd after sendfile
> > 6. Close the udmabuf
> > 7. Close back_fd
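Steps 2-5 above reduce to a sendfile() loop along these lines (names are illustrative); as noted, sendfile() still copies the file data once into the memfd's shmem pages, so it is not zero-copy:

#include <sys/sendfile.h>
#include <unistd.h>

/* Sketch of steps 2-5: fill the memfd that backs the udmabuf from the data
 * file. sendfile() avoids a user-space bounce buffer, but one copy into the
 * memfd (shmem) pages still happens. */
static int fill_memfd_from_file(int memfd, int back_fd, off_t file_off, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = sendfile(memfd, back_fd, &file_off, len - done);

		if (n <= 0)
			return -1;	/* error or unexpected EOF */
		done += n;
	}
	return 0;
}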
> -----Original Message-----
> From: T.J. Mercier <tjmercier@google.com>
> Sent: Saturday, May 17, 2025 2:37 AM
> Subject: Re: [PATCH 2/2] dmabuf/heaps: implement DMA_BUF_IOCTL_RW_FILE for system_heap
>
> On Fri, May 16, 2025 at 1:36 AM Christian König <christian.koenig@amd.com> wrote:
> >
> > On 5/16/25 09:40, wangtao wrote:
> > > [...]
> > > Allocate 32x32MB buffers and read a 1024MB file:
> > > Metric                  | alloc (ms) | read (ms) | total (ms)
> > > ------------------------|------------|-----------|-----------
> > > udmabuf buffer read     |        539 |      2017 |       2555
> > > udmabuf direct read     |        522 |       658 |       1179
>
> I can't reproduce the part where udmabuf direct reads are faster than buffered reads. That's the opposite of what I'd expect. Something seems wrong with those buffered reads.
>
> > > udmabuf buffer sendfile |        505 |      1040 |       1546
> > > udmabuf direct sendfile |        510 |      2269 |       2780
>
> I can reproduce the 3.5x slower udmabuf direct sendfile compared to udmabuf direct read. It's a pretty disappointing result, so it seems like something could be improved there.
>
> 1G from ext4 on 6.12.17  | read/sendfile (ms)
> -------------------------|-------------------
> udmabuf buffer read      |                351
> udmabuf direct read      |                540
> udmabuf buffer sendfile  |                255
> udmabuf direct sendfile  |               1990
>

[wangtao] By the way, did you clear the file cache during testing?
Looking at your data again, buffered read and sendfile are faster than direct I/O, which suggests the file cache wasn't cleared. If the cache wasn't cleared, the test results are unfair and unreliable as a reference.
On embedded devices, it's nearly impossible to keep multi-GB files stably cached. If such files could be cached, we might as well cache the dmabufs directly and save the time spent creating dmabufs and reading file data.
You can call posix_fadvise(file_fd, 0, len, POSIX_FADV_DONTNEED) after opening the file or before closing it to drop the file cache, ensuring actual file I/O is tested.

> > > dmabuf buffer read      |         51 |      1068 |       1118
> > > dmabuf direct read      |         52 |       297 |        349
> > >
> > > udmabuf sendfile test steps:
> > > 1. Open the data file (1024MB), get back_fd
> > > 2. Create a memfd (32MB)      # loop steps 2-6
> > > 3. Allocate a udmabuf from the memfd
> > > 4. Call sendfile(memfd, back_fd)
> > > 5. Close the memfd after sendfile
> > > 6. Close the udmabuf
> > > 7. Close back_fd
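The cache-drop wangtao suggests is a one-liner around posix_fadvise(); a small sketch:

#include <fcntl.h>
#include <unistd.h>

/* Drop cached pages of the test file so every run actually hits storage.
 * Call it after opening the file (and after syncing freshly written data),
 * before the timed read. */
static void drop_file_cache(int fd, off_t len)
{
	fdatasync(fd);
	posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
}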
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index 26d5dc89ea16..f7b71b9843aa 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -20,6 +20,8 @@
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/bvec.h>
+#include <linux/uio.h>
 
 static struct dma_heap *sys_heap;
 
@@ -281,6 +283,121 @@ static void system_heap_vunmap(struct dma_buf *dmabuf, struct iosys_map *map)
 	iosys_map_clear(map);
 }
 
+static struct bio_vec *system_heap_init_bvec(struct system_heap_buffer *buffer,
+		size_t offset, size_t len, int *nr_segs)
+{
+	struct sg_table *sgt = &buffer->sg_table;
+	struct scatterlist *sg;
+	size_t length = 0;
+	unsigned int i, k = 0;
+	struct bio_vec *bvec;
+	size_t sg_left;
+	size_t sg_offset;
+	size_t sg_len;
+
+	bvec = kvcalloc(sgt->nents, sizeof(*bvec), GFP_KERNEL);
+	if (!bvec)
+		return NULL;
+
+	for_each_sg(sgt->sgl, sg, sgt->nents, i) {
+		length += sg->length;
+		if (length <= offset)
+			continue;
+
+		sg_left = length - offset;
+		sg_offset = sg->offset + sg->length - sg_left;
+		sg_len = min(sg_left, len);
+
+		bvec[k].bv_page = sg_page(sg);
+		bvec[k].bv_len = sg_len;
+		bvec[k].bv_offset = sg_offset;
+		k++;
+
+		offset += sg_len;
+		len -= sg_len;
+		if (len <= 0)
+			break;
+	}
+
+	*nr_segs = k;
+	return bvec;
+}
+
+static int system_heap_rw_file(struct system_heap_buffer *buffer, bool is_read,
+		bool direct_io, struct file *filp, loff_t file_offset,
+		size_t buf_offset, size_t len)
+{
+	struct bio_vec *bvec;
+	int nr_segs = 0;
+	struct iov_iter iter;
+	struct kiocb kiocb;
+	ssize_t ret = 0;
+
+	if (direct_io) {
+		if (!(filp->f_mode & FMODE_CAN_ODIRECT))
+			return -EINVAL;
+	}
+
+	bvec = system_heap_init_bvec(buffer, buf_offset, len, &nr_segs);
+	if (!bvec)
+		return -ENOMEM;
+
+	iov_iter_bvec(&iter, is_read ? ITER_DEST : ITER_SOURCE, bvec, nr_segs, len);
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = file_offset;
+	if (direct_io)
+		kiocb.ki_flags |= IOCB_DIRECT;
+
+	while (kiocb.ki_pos < file_offset + len) {
+		if (is_read)
+			ret = vfs_iocb_iter_read(filp, &kiocb, &iter);
+		else
+			ret = vfs_iocb_iter_write(filp, &kiocb, &iter);
+		if (ret <= 0)
+			break;
+	}
+
+	kvfree(bvec);
+	return ret < 0 ? ret : 0;
+}
+
+static int system_heap_dma_buf_rw_file(struct dma_buf *dmabuf,
+		struct dma_buf_rw_file *back)
+{
+	struct system_heap_buffer *buffer = dmabuf->priv;
+	int ret = 0;
+	__u32 op = back->flags & DMA_BUF_RW_FLAGS_OP_MASK;
+	bool direct_io = back->flags & DMA_BUF_RW_FLAGS_DIRECT;
+	struct file *filp;
+
+	if (op != DMA_BUF_RW_FLAGS_READ && op != DMA_BUF_RW_FLAGS_WRITE)
+		return -EINVAL;
+	if (direct_io) {
+		if (!PAGE_ALIGNED(back->file_offset) ||
+		    !PAGE_ALIGNED(back->buf_offset) ||
+		    !PAGE_ALIGNED(back->buf_len))
+			return -EINVAL;
+	}
+	if (!back->buf_len || back->buf_len > dmabuf->size ||
+	    back->buf_offset >= dmabuf->size ||
+	    back->buf_offset + back->buf_len > dmabuf->size)
+		return -EINVAL;
+	if (back->file_offset + back->buf_len < back->file_offset)
+		return -EINVAL;
+
+	filp = fget(back->fd);
+	if (!filp)
+		return -EBADF;
+
+	mutex_lock(&buffer->lock);
+	ret = system_heap_rw_file(buffer, op == DMA_BUF_RW_FLAGS_READ, direct_io,
+			filp, back->file_offset, back->buf_offset, back->buf_len);
+	mutex_unlock(&buffer->lock);
+
+	fput(filp);
+	return ret;
+}
+
 static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
 {
 	struct system_heap_buffer *buffer = dmabuf->priv;
@@ -308,6 +425,7 @@ static const struct dma_buf_ops system_heap_buf_ops = {
 	.mmap = system_heap_mmap,
 	.vmap = system_heap_vmap,
 	.vunmap = system_heap_vunmap,
+	.rw_file = system_heap_dma_buf_rw_file,
 	.release = system_heap_dma_buf_release,
 };
Support direct file I/O operations for system_heap dma-buf objects.
Implementation includes:
1. Convert sg_table to bio_vec
2. Set IOCB_DIRECT when O_DIRECT is supported
3. Invoke vfs_iocb_iter_read()/vfs_iocb_iter_write() for actual I/O

Performance metrics (UFS 4.0 device @4GB/s, Arm64 CPU @1GHz):

| Metric             |    1MB |    8MB |    64MB |   1024MB |   3072MB |
|--------------------|-------:|-------:|--------:|---------:|---------:|
| Buffer Read (us)   |   1658 |   9028 |   69295 |  1019783 |  2978179 |
| Direct Read (us)   |    707 |   2647 |   18689 |   299627 |   937758 |
| Buffer Rate (MB/s) |    603 |    886 |     924 |     1004 |     1032 |
| Direct Rate (MB/s) |   1414 |   3022 |    3425 |     3418 |     3276 |

Signed-off-by: wangtao <tao.wangtao@honor.com>
---
 drivers/dma-buf/heaps/system_heap.c | 118 ++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
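For completeness, a hypothetical caller of the proposed ioctl would look roughly like the sketch below. The struct layout, flag names, and ioctl number come from patch 1/2, which is not shown on this page, so the uapi used here is assumed from the field names this patch dereferences:

#include <sys/ioctl.h>
#include <linux/dma-buf.h>	/* assumed: uapi header as patched by patch 1/2 */

/* Hypothetical caller of the proposed ioctl. Field and flag names mirror what
 * system_heap_dma_buf_rw_file() dereferences above. */
static int dmabuf_direct_read_file(int dmabuf_fd, int file_fd, __u64 len)
{
	struct dma_buf_rw_file args = {
		.fd = file_fd,
		.file_offset = 0,
		.buf_offset = 0,
		.buf_len = len,	/* page aligned, as required with DIRECT */
		.flags = DMA_BUF_RW_FLAGS_READ | DMA_BUF_RW_FLAGS_DIRECT,
	};

	return ioctl(dmabuf_fd, DMA_BUF_IOCTL_RW_FILE, &args);
}

With DMA_BUF_RW_FLAGS_DIRECT set, file_offset, buf_offset, and buf_len must all be page-aligned, as enforced in system_heap_dma_buf_rw_file() above.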