diff mbox series

[v11,06/10] io_uring: introduce attributes for read/write and PI support

Message ID 20241128112240.8867-7-anuj20.g@samsung.com
State New
Headers show
Series Read/Write with meta/integrity | expand

Commit Message

Anuj Gupta Nov. 28, 2024, 11:22 a.m. UTC
Add the ability to pass additional attributes along with read/write.
Application can prepare attibute specific information and pass its
address using the SQE field:
	__u64	attr_ptr;

Along with setting a mask indicating attributes being passed:
	__u64	attr_type_mask;

Overall 64 attributes are allowed and currently one attribute
'IORING_RW_ATTR_FLAG_PI' is supported.

With PI attribute, userspace can pass following information:
- flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of PI/metadata buffer
- addr: address of metadata buffer
- seed: seed value for reftag remapping
- app_tag: application defined 16b value

Process this information to prepare uio_meta_descriptor and pass it down
using kiocb->private.

PI attribute is supported only for direct IO.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 include/uapi/linux/io_uring.h | 16 +++++++
 io_uring/io_uring.c           |  2 +
 io_uring/rw.c                 | 83 ++++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 | 14 +++++-
 4 files changed, 112 insertions(+), 3 deletions(-)

Comments

Martin K. Petersen Dec. 3, 2024, 2:13 a.m. UTC | #1
Anuj,

> Add the ability to pass additional attributes along with read/write.
> Application can prepare attibute specific information and pass its
> address using the SQE field:
> 	__u64	attr_ptr;
>
> Along with setting a mask indicating attributes being passed:
> 	__u64	attr_type_mask;
>
> Overall 64 attributes are allowed and currently one attribute
> 'IORING_RW_ATTR_FLAG_PI' is supported.

I have things running on my end on top of Jens' tree (without error
injection, that's to come).

One question, though: How am I to determine that the kernel supports
attr_ptr and IORING_RW_ATTR_FLAG_PI? Now that we no longer have separate
IORING_OP_{READ,WRITE}_META commands I can't use IO_URING_OP_SUPPORTED
to find out whether the running kernel supports PI passthrough.
Anuj Gupta Dec. 3, 2024, 6:56 a.m. UTC | #2
On Mon, Dec 02, 2024 at 09:13:14PM -0500, Martin K. Petersen wrote:
> 
> I have things running on my end on top of Jens' tree (without error
> injection, that's to come).
> 
> One question, though: How am I to determine that the kernel supports
> attr_ptr and IORING_RW_ATTR_FLAG_PI? Now that we no longer have separate
> IORING_OP_{READ,WRITE}_META commands I can't use IO_URING_OP_SUPPORTED
> to find out whether the running kernel supports PI passthrough.

Martin, right currently there is no way to probe whether the kernel
supports read/write attributes or not.

Jens, Pavel how about introducing a new IO_URING_OP_* flag (something
like IO_URING_OP_RW_ATTR_SUPPORTED) to probe whether read/write attributes
are supported or not. Something like this [*]

[*]
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 38f0d6b10eaf..787a2df8037f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -723,6 +723,7 @@ struct io_uring_rsrc_update2 {
 #define IORING_REGISTER_FILES_SKIP	(-2)
 
 #define IO_URING_OP_SUPPORTED	(1U << 0)
+#define IO_URING_OP_RW_ATTR_SUPPORTED	(1U << 1)
 
 struct io_uring_probe_op {
 	__u8 op;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3de75eca1c92..64e1e5d48dec 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -67,6 +67,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
 		.vectored		= 1,
+		.rw_attr		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_readv,
 		.issue			= io_read,
@@ -82,6 +83,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
 		.vectored		= 1,
+		.rw_attr		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_writev,
 		.issue			= io_write,
@@ -101,6 +103,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
+		.rw_attr		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_read_fixed,
 		.issue			= io_read,
@@ -115,6 +118,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
+		.rw_attr		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_write_fixed,
 		.issue			= io_write,
@@ -246,6 +250,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
+		.rw_attr		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_read,
 		.issue			= io_read,
@@ -260,6 +265,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
+		.rw_attr		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_write,
 		.issue			= io_write,
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 14456436ff74..61460c762ea7 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -27,6 +27,8 @@ struct io_issue_def {
 	unsigned		iopoll_queue : 1;
 	/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
 	unsigned		vectored : 1;
+	/* supports rw attributes */
+	unsigned		rw_attr : 1;
 
 	/* size of async data needed, if any */
 	unsigned short		async_size;
diff --git a/io_uring/register.c b/io_uring/register.c
index f1698c18c7cb..a54aeaec116c 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -60,8 +60,11 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
 
 	for (i = 0; i < nr_args; i++) {
 		p->ops[i].op = i;
-		if (io_uring_op_supported(i))
+		if (io_uring_op_supported(i)) {
 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
+			if (io_issue_defs[i].rw_attr)
+				p->ops[i].flags |= IO_URING_OP_RW_ATTR_SUPPORTED;
+		}
 	}
 	p->ops_len = i;
Pavel Begunkov Dec. 3, 2024, noon UTC | #3
On 12/3/24 06:56, Anuj Gupta wrote:
> On Mon, Dec 02, 2024 at 09:13:14PM -0500, Martin K. Petersen wrote:
>>
>> I have things running on my end on top of Jens' tree (without error
>> injection, that's to come).
>>
>> One question, though: How am I to determine that the kernel supports
>> attr_ptr and IORING_RW_ATTR_FLAG_PI? Now that we no longer have separate
>> IORING_OP_{READ,WRITE}_META commands I can't use IO_URING_OP_SUPPORTED
>> to find out whether the running kernel supports PI passthrough.
> 
> Martin, right currently there is no way to probe whether the kernel
> supports read/write attributes or not.
> 
> Jens, Pavel how about introducing a new IO_URING_OP_* flag (something
> like IO_URING_OP_RW_ATTR_SUPPORTED) to probe whether read/write attributes
> are supported or not. Something like this [*]

An IORING_FEAT_ flag might be simpler for now. Or is it in plans to
somehow support IORING_OP_READ_MULTISHOT as well?

I have to say Adding a good probing infra is long overdue. For example a
user might want to know which attributes are supported. And beyond
io_uring it might be interesting to probe whether a particular file
supports it.


> [*]
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 38f0d6b10eaf..787a2df8037f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -723,6 +723,7 @@ struct io_uring_rsrc_update2 {
>   #define IORING_REGISTER_FILES_SKIP	(-2)
>   
>   #define IO_URING_OP_SUPPORTED	(1U << 0)
> +#define IO_URING_OP_RW_ATTR_SUPPORTED	(1U << 1)
>   
>   struct io_uring_probe_op {
>   	__u8 op;
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 3de75eca1c92..64e1e5d48dec 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -67,6 +67,7 @@ const struct io_issue_def io_issue_defs[] = {

io_cold_defs would be better
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index aac9a4f8fa9a..38f0d6b10eaf 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -98,6 +98,10 @@  struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	attr_ptr; /* pointer to attribute information */
+			__u64	attr_type_mask; /* bit mask of attributes */
+		};
 		__u64	optval;
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +111,18 @@  struct io_uring_sqe {
 	};
 };
 
+/* sqe->attr_type_mask flags */
+#define IORING_RW_ATTR_FLAG_PI	(1U << 0)
+/* PI attribute information */
+struct io_uring_attr_pi {
+		__u16	flags;
+		__u16	app_tag;
+		__u32	len;
+		__u64	addr;
+		__u64	seed;
+		__u64	rsvd;
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 06ff41484e29..e4891f1ce52d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3893,6 +3893,8 @@  static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+	BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0bcb83e4ce3c..04e4467ab0ee 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,53 @@  static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
+{
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		io->meta.seed = io->meta_state.seed;
+		iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+	}
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
+			 u64 attr_ptr, u64 attr_type_mask)
+{
+	struct io_uring_attr_pi pi_attr;
+	struct io_async_rw *io;
+	int ret;
+
+	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
+	    sizeof(pi_attr)))
+		return -EFAULT;
+
+	if (pi_attr.rsvd)
+		return -EINVAL;
+
+	io = req->async_data;
+	io->meta.flags = pi_attr.flags;
+	io->meta.app_tag = pi_attr.app_tag;
+	io->meta.seed = pi_attr.seed;
+	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
+			  pi_attr.len, &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u64 attr_type_mask;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +321,28 @@  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	attr_type_mask = READ_ONCE(sqe->attr_type_mask);
+	if (attr_type_mask) {
+		u64 attr_ptr;
+
+		/* only PI attribute is supported currently */
+		if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
+			return -EINVAL;
+
+		attr_ptr = READ_ONCE(sqe->attr_ptr);
+		ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,7 +468,9 @@  static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	io_meta_restore(io, &rw->kiocb);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -744,6 +805,10 @@  static bool io_rw_should_retry(struct io_kiocb *req)
 	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
 		return false;
 
+	/* never retry for meta io */
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		return false;
+
 	/*
 	 * just use poll if we can, and don't attempt if the fs doesn't
 	 * support callback based unlocks
@@ -794,7 +859,7 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -828,6 +893,18 @@  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -902,6 +979,7 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	io_meta_restore(io, kiocb);
 
 	do {
 		/*
@@ -1125,6 +1203,7 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		io_meta_restore(io, kiocb);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@ 
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@  struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);