Message ID | 20241128112240.8867-7-anuj20.g@samsung.com |
---|---|
State | New |
Headers | show |
Series | Read/Write with meta/integrity | expand |
Anuj, > Add the ability to pass additional attributes along with read/write. > Application can prepare attibute specific information and pass its > address using the SQE field: > __u64 attr_ptr; > > Along with setting a mask indicating attributes being passed: > __u64 attr_type_mask; > > Overall 64 attributes are allowed and currently one attribute > 'IORING_RW_ATTR_FLAG_PI' is supported. I have things running on my end on top of Jens' tree (without error injection, that's to come). One question, though: How am I to determine that the kernel supports attr_ptr and IORING_RW_ATTR_FLAG_PI? Now that we no longer have separate IORING_OP_{READ,WRITE}_META commands I can't use IO_URING_OP_SUPPORTED to find out whether the running kernel supports PI passthrough.
On Mon, Dec 02, 2024 at 09:13:14PM -0500, Martin K. Petersen wrote: > > I have things running on my end on top of Jens' tree (without error > injection, that's to come). > > One question, though: How am I to determine that the kernel supports > attr_ptr and IORING_RW_ATTR_FLAG_PI? Now that we no longer have separate > IORING_OP_{READ,WRITE}_META commands I can't use IO_URING_OP_SUPPORTED > to find out whether the running kernel supports PI passthrough. Martin, right currently there is no way to probe whether the kernel supports read/write attributes or not. Jens, Pavel how about introducing a new IO_URING_OP_* flag (something like IO_URING_OP_RW_ATTR_SUPPORTED) to probe whether read/write attributes are supported or not. Something like this [*] [*] diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 38f0d6b10eaf..787a2df8037f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -723,6 +723,7 @@ struct io_uring_rsrc_update2 { #define IORING_REGISTER_FILES_SKIP (-2) #define IO_URING_OP_SUPPORTED (1U << 0) +#define IO_URING_OP_RW_ATTR_SUPPORTED (1U << 1) struct io_uring_probe_op { __u8 op; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3de75eca1c92..64e1e5d48dec 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,6 +67,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, + .rw_attr = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv, .issue = io_read, @@ -82,6 +83,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, + .rw_attr = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev, .issue = io_write, @@ -101,6 +103,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .rw_attr = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, .issue = io_read, @@ -115,6 +118,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .rw_attr = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, .issue = io_write, @@ -246,6 +250,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .rw_attr = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read, .issue = io_read, @@ -260,6 +265,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .rw_attr = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write, .issue = io_write, diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 14456436ff74..61460c762ea7 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -27,6 +27,8 @@ struct io_issue_def { unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* supports rw attributes */ + unsigned rw_attr : 1; /* size of async data needed, if any */ unsigned short async_size; diff --git a/io_uring/register.c b/io_uring/register.c index f1698c18c7cb..a54aeaec116c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -60,8 +60,11 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (io_uring_op_supported(i)) + if (io_uring_op_supported(i)) { p->ops[i].flags = IO_URING_OP_SUPPORTED; + if (io_issue_defs[i].rw_attr) + p->ops[i].flags |= IO_URING_OP_RW_ATTR_SUPPORTED; + } } p->ops_len = i;
On 12/3/24 06:56, Anuj Gupta wrote: > On Mon, Dec 02, 2024 at 09:13:14PM -0500, Martin K. Petersen wrote: >> >> I have things running on my end on top of Jens' tree (without error >> injection, that's to come). >> >> One question, though: How am I to determine that the kernel supports >> attr_ptr and IORING_RW_ATTR_FLAG_PI? Now that we no longer have separate >> IORING_OP_{READ,WRITE}_META commands I can't use IO_URING_OP_SUPPORTED >> to find out whether the running kernel supports PI passthrough. > > Martin, right currently there is no way to probe whether the kernel > supports read/write attributes or not. > > Jens, Pavel how about introducing a new IO_URING_OP_* flag (something > like IO_URING_OP_RW_ATTR_SUPPORTED) to probe whether read/write attributes > are supported or not. Something like this [*] An IORING_FEAT_ flag might be simpler for now. Or is it in plans to somehow support IORING_OP_READ_MULTISHOT as well? I have to say Adding a good probing infra is long overdue. For example a user might want to know which attributes are supported. And beyond io_uring it might be interesting to probe whether a particular file supports it. > [*] > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 38f0d6b10eaf..787a2df8037f 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -723,6 +723,7 @@ struct io_uring_rsrc_update2 { > #define IORING_REGISTER_FILES_SKIP (-2) > > #define IO_URING_OP_SUPPORTED (1U << 0) > +#define IO_URING_OP_RW_ATTR_SUPPORTED (1U << 1) > > struct io_uring_probe_op { > __u8 op; > diff --git a/io_uring/opdef.c b/io_uring/opdef.c > index 3de75eca1c92..64e1e5d48dec 100644 > --- a/io_uring/opdef.c > +++ b/io_uring/opdef.c > @@ -67,6 +67,7 @@ const struct io_issue_def io_issue_defs[] = { io_cold_defs would be better
On Thu, Nov 28, 2024 at 04:52:36PM +0530, Anuj Gupta wrote: > Add the ability to pass additional attributes along with read/write. > Application can prepare attibute specific information and pass its > address using the SQE field: > __u64 attr_ptr; > > Along with setting a mask indicating attributes being passed: > __u64 attr_type_mask; > > Overall 64 attributes are allowed and currently one attribute > 'IORING_RW_ATTR_FLAG_PI' is supported. > > With PI attribute, userspace can pass following information: > - flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG} > - len: length of PI/metadata buffer > - addr: address of metadata buffer > - seed: seed value for reftag remapping > - app_tag: application defined 16b value > > Process this information to prepare uio_meta_descriptor and pass it down > using kiocb->private. > > PI attribute is supported only for direct IO. > > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com> > --- > include/uapi/linux/io_uring.h | 16 +++++++ > io_uring/io_uring.c | 2 + > io_uring/rw.c | 83 ++++++++++++++++++++++++++++++++++- > io_uring/rw.h | 14 +++++- > 4 files changed, 112 insertions(+), 3 deletions(-) > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index aac9a4f8fa9a..38f0d6b10eaf 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -98,6 +98,10 @@ struct io_uring_sqe { > __u64 addr3; > __u64 __pad2[1]; > }; > + struct { > + __u64 attr_ptr; /* pointer to attribute information */ > + __u64 attr_type_mask; /* bit mask of attributes */ > + }; I can't say I'm a fan of how this turned out. I'm merging up the write hint stuff, and these new fields occupy where that 16-bit value was initially going. Okay, so I guess I need to just add a new attribute flag? That might work if I am only appending exactly one extra attribute per SQE, but what if I need both PI and Write Hint? Do the attributes need to appear in a strict order?
Keith, >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >> index aac9a4f8fa9a..38f0d6b10eaf 100644 >> --- a/include/uapi/linux/io_uring.h >> +++ b/include/uapi/linux/io_uring.h >> @@ -98,6 +98,10 @@ struct io_uring_sqe { >> __u64 addr3; >> __u64 __pad2[1]; >> }; >> + struct { >> + __u64 attr_ptr; /* pointer to attribute information */ >> + __u64 attr_type_mask; /* bit mask of attributes */ >> + }; > > I can't say I'm a fan of how this turned out. I'm merging up the write > hint stuff, and these new fields occupy where that 16-bit value was > initially going. Okay, so I guess I need to just add a new attribute > flag? That might work if I am only appending exactly one extra attribute > per SQE, but what if I need both PI and Write Hint? Do the attributes > need to appear in a strict order? We'll definitely need to be able to include multiple attributes. Not sure how the attr_type_mask was intended to work. I guess parsing attributes based on bit position in the mask would be one option. The SCSI approach would be for each attribute to have a 4-byte header with a type and a length so multiple attributes can be described by a single buffer. FWIW, I wasn't initially a big fan of having the attrs stored outside of the sqe but it actually made things a lot cleaner for my use case.
On 12/5/24 18:04, Keith Busch wrote: ... >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >> index aac9a4f8fa9a..38f0d6b10eaf 100644 >> --- a/include/uapi/linux/io_uring.h >> +++ b/include/uapi/linux/io_uring.h >> @@ -98,6 +98,10 @@ struct io_uring_sqe { >> __u64 addr3; >> __u64 __pad2[1]; >> }; >> + struct { >> + __u64 attr_ptr; /* pointer to attribute information */ >> + __u64 attr_type_mask; /* bit mask of attributes */ >> + }; > > I can't say I'm a fan of how this turned out. I'm merging up the write > hint stuff, and these new fields occupy where that 16-bit value was > initially going. Okay, so I guess I need to just add a new attribute > flag? That might work if I am only appending exactly one extra attribute > per SQE, but what if I need both PI and Write Hint? Do the attributes > need to appear in a strict order? Martin put it well, this version requires attributes to be placed in their id order, but FWIW without any holes, i.e. the following is fine: ATTR_PI = 1 ATTR_WRITE_STREAM = 2 strcut attr_stream attr = { ... }; sqe->attr_mask = ATTR_WRITE_HINT; sqe->attr_ptr = &attr; In case of multiple attributes: struct compound_attr { struct attr_pi a; struct attr_stream b; } attr = { ... }; sqe->attr_mask = ATTR_WRITE_HINT | ATTR_PI; sqe->attr_ptr = &attr; The other option for the uapi, and Martin mentioned it as well, is to add a type field to all attributes. FWIW, I think the space aliased with sqe->splice_fd_in is not used by read/write, but I haven't looked into streams to get an opinion on which way is more appropriate.
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index aac9a4f8fa9a..38f0d6b10eaf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -98,6 +98,10 @@ struct io_uring_sqe { __u64 addr3; __u64 __pad2[1]; }; + struct { + __u64 attr_ptr; /* pointer to attribute information */ + __u64 attr_type_mask; /* bit mask of attributes */ + }; __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then @@ -107,6 +111,18 @@ struct io_uring_sqe { }; }; +/* sqe->attr_type_mask flags */ +#define IORING_RW_ATTR_FLAG_PI (1U << 0) +/* PI attribute information */ +struct io_uring_attr_pi { + __u16 flags; + __u16 app_tag; + __u32 len; + __u64 addr; + __u64 seed; + __u64 rsvd; +}; + /* * If sqe->file_index is set to this for opcodes that instantiate a new * direct descriptor (like openat/openat2/accept), then io_uring will allocate diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..e4891f1ce52d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3893,6 +3893,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3c..04e4467ab0ee 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -257,11 +257,53 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; } +static inline void io_meta_save_state(struct io_async_rw *io) +{ + io->meta_state.seed = io->meta.seed; + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); +} + +static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + io->meta.seed = io->meta_state.seed; + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); + } +} + +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, + u64 attr_ptr, u64 attr_type_mask) +{ + struct io_uring_attr_pi pi_attr; + struct io_async_rw *io; + int ret; + + if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), + sizeof(pi_attr))) + return -EFAULT; + + if (pi_attr.rsvd) + return -EINVAL; + + io = req->async_data; + io->meta.flags = pi_attr.flags; + io->meta.app_tag = pi_attr.app_tag; + io->meta.seed = pi_attr.seed; + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), + pi_attr.len, &io->meta.iter); + if (unlikely(ret < 0)) + return ret; + rw->kiocb.ki_flags |= IOCB_HAS_METADATA; + io_meta_save_state(io); + return ret; +} + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; + u64 attr_type_mask; int ret; rw->kiocb.ki_pos = READ_ONCE(sqe->off); @@ -279,11 +321,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return io_prep_rw_setup(req, ddir, do_import); + ret = io_prep_rw_setup(req, ddir, do_import); + + if (unlikely(ret)) + return ret; + + attr_type_mask = READ_ONCE(sqe->attr_type_mask); + if (attr_type_mask) { + u64 attr_ptr; + + /* only PI attribute is supported currently */ + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) + return -EINVAL; + + attr_ptr = READ_ONCE(sqe->attr_ptr); + ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + } + return ret; } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -409,7 +468,9 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + io_meta_restore(io, &rw->kiocb); iov_iter_restore(&io->iter, &io->iter_state); } @@ -744,6 +805,10 @@ static bool io_rw_should_retry(struct io_kiocb *req) if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; + /* never retry for meta io */ + if (kiocb->ki_flags & IOCB_HAS_METADATA) + return false; + /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -794,7 +859,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags = file->f_iocb_flags; + kiocb->ki_flags |= file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -828,6 +893,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + struct io_async_rw *io = req->async_data; + + /* + * We have a union of meta fields with wpq used for buffered-io + * in io_async_rw, so fail it here. + */ + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->private = &io->meta; + } + return 0; } @@ -902,6 +979,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); do { /* @@ -1125,6 +1203,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; diff --git a/io_uring/rw.h b/io_uring/rw.h index 3f432dc75441..2d7656bd268d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,6 +2,11 @@ #include <linux/pagemap.h> +struct io_meta_state { + u32 seed; + struct iov_iter_state iter_meta; +}; + struct io_async_rw { size_t bytes_done; struct iov_iter iter; @@ -9,7 +14,14 @@ struct io_async_rw { struct iovec fast_iov; struct iovec *free_iovec; int free_iov_nr; - struct wait_page_queue wpq; + /* wpq is for buffered io, while meta fields are used with direct io */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; + }; }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);