Message ID | 20241025213645.3464331-5-kbusch@meta.com |
---|---|
State | New |
Headers | show |
Series | write hints with nvme fdp, scsi streams | expand |
On Fri, Oct 25, 2024 at 02:36:42PM -0700, Keith Busch wrote: > +static u16 blkdev_write_hint(struct kiocb *iocb, struct block_device *bdev) > +{ > + u16 hint = iocb->ki_write_hint; > + > + if (!hint) > + return file_inode(iocb->ki_filp)->i_write_hint; > + > + if (hint > bdev_max_write_hints(bdev)) > + return file_inode(iocb->ki_filp)->i_write_hint; > + > + if (bdev_is_partition(bdev) && > + !test_bit(hint - 1, bdev->write_hint_mask)) > + return file_inode(iocb->ki_filp)->i_write_hint; I would have expected an error when using an invalid stream identifier. That of course requires telling the application how many are available through e.g. statx as requested last time.
On Mon, Oct 28, 2024 at 12:59:32PM +0100, Christoph Hellwig wrote: > On Fri, Oct 25, 2024 at 02:36:42PM -0700, Keith Busch wrote: > > +static u16 blkdev_write_hint(struct kiocb *iocb, struct block_device *bdev) > > +{ > > + u16 hint = iocb->ki_write_hint; > > + > > + if (!hint) > > + return file_inode(iocb->ki_filp)->i_write_hint; > > + > > + if (hint > bdev_max_write_hints(bdev)) > > + return file_inode(iocb->ki_filp)->i_write_hint; > > + > > + if (bdev_is_partition(bdev) && > > + !test_bit(hint - 1, bdev->write_hint_mask)) > > + return file_inode(iocb->ki_filp)->i_write_hint; > > I would have expected an error when using an invalid stream identifier. It's a hint. fcntl doesn't error if you give an unusable hint, so neither should this. You get sane default behavior.
On Mon, Oct 28, 2024 at 08:38:05AM -0600, Keith Busch wrote: > > > + if (bdev_is_partition(bdev) && > > > + !test_bit(hint - 1, bdev->write_hint_mask)) > > > + return file_inode(iocb->ki_filp)->i_write_hint; > > > > I would have expected an error when using an invalid stream identifier. > > It's a hint. fcntl doesn't error if you give an unusable hint, so > neither should this. You get sane default behavior. Well, why does it have to be a hint? If I have a data placement aware application I really want to know into how many buckets I can sort and adjust my algorithm for it. And I'd rather have an error checked interface to pass that down as far as I can. Same for my file system use case. I guess you have a use case where a hint would be enough, at least with good enough knowledge of the underlying implementation. But would there be an actual downside in not having a hint? Because historically speaking everything we've done as a not error checked vaguely defined hint has not been all the useful, but it in hardware or software interfaces.
diff --git a/block/fops.c b/block/fops.c index 2d01c90076813..e3f3f1957d86d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -71,7 +71,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; - bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio.bi_write_hint = iocb->ki_write_hint; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; @@ -200,7 +200,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; - bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_hint = iocb->ki_write_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -316,7 +316,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; - bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_hint = iocb->ki_write_hint; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; @@ -362,6 +362,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, return -EIOCBQUEUED; } +static u16 blkdev_write_hint(struct kiocb *iocb, struct block_device *bdev) +{ + u16 hint = iocb->ki_write_hint; + + if (!hint) + return file_inode(iocb->ki_filp)->i_write_hint; + + if (hint > bdev_max_write_hints(bdev)) + return file_inode(iocb->ki_filp)->i_write_hint; + + if (bdev_is_partition(bdev) && + !test_bit(hint - 1, bdev->write_hint_mask)) + return file_inode(iocb->ki_filp)->i_write_hint; + + return hint; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); @@ -373,6 +390,9 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; + if (iov_iter_rw(iter) == WRITE) + iocb->ki_write_hint = blkdev_write_hint(iocb, bdev); + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b5cad44a1268..1a00accf412e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -370,6 +370,7 @@ struct kiocb { void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ + u16 ki_write_hint; union { /* * Only used for async buffered reads, where it denotes the