Message ID | 20220816132759.43248-1-jlayton@kernel.org |
---|---|
Headers | show |
Series | vfs: expose the inode change attribute via statx | expand |
On Tue, Aug 16, 2022 at 09:27:56AM -0400, Jeff Layton wrote: > From: Jeff Layton <jlayton@redhat.com> > > Claim one of the spare fields in struct statx to hold a 64-bit change > attribute. When statx requests this attribute, do an > inode_query_iversion and fill the result in the field. > > Also update the test-statx.c program to display the change attribute and > the mountid as well. > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/stat.c | 7 +++++++ > include/linux/stat.h | 1 + > include/uapi/linux/stat.h | 3 ++- > samples/vfs/test-statx.c | 8 ++++++-- > 4 files changed, 16 insertions(+), 3 deletions(-) > > diff --git a/fs/stat.c b/fs/stat.c > index 9ced8860e0f3..7c3d063c31ba 100644 > --- a/fs/stat.c > +++ b/fs/stat.c > @@ -17,6 +17,7 @@ > #include <linux/syscalls.h> > #include <linux/pagemap.h> > #include <linux/compat.h> > +#include <linux/iversion.h> > > #include <linux/uaccess.h> > #include <asm/unistd.h> > @@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, > stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | > STATX_ATTR_DAX); > > + if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) { > + stat->result_mask |= STATX_CHANGE_ATTR; > + stat->change_attr = inode_query_iversion(inode); > + } > + > mnt_userns = mnt_user_ns(path->mnt); > if (inode->i_op->getattr) > return inode->i_op->getattr(mnt_userns, path, stat, > @@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) > tmp.stx_dev_major = MAJOR(stat->dev); > tmp.stx_dev_minor = MINOR(stat->dev); > tmp.stx_mnt_id = stat->mnt_id; > + tmp.stx_change_attr = stat->change_attr; > > return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; > } > diff --git a/include/linux/stat.h b/include/linux/stat.h > index 7df06931f25d..7b444c2ad0ad 100644 > --- a/include/linux/stat.h > +++ b/include/linux/stat.h > @@ -50,6 +50,7 @@ struct kstat { > struct timespec64 btime; /* File creation time */ > u64 blocks; > u64 mnt_id; > + u64 change_attr; > }; > > #endif > diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h > index 1500a0f58041..fd839ec76aa4 100644 > --- a/include/uapi/linux/stat.h > +++ b/include/uapi/linux/stat.h > @@ -124,7 +124,7 @@ struct statx { > __u32 stx_dev_minor; > /* 0x90 */ > __u64 stx_mnt_id; > - __u64 __spare2; > + __u64 stx_change_attr; /* Inode change attribute */ > /* 0xa0 */ > __u64 __spare3[12]; /* Spare space for future expansion */ > /* 0x100 */ > @@ -152,6 +152,7 @@ struct statx { > #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ > #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ > #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */ I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag and field. Or I fail to understand what exact information this will expose and how userspace will consume it. To me the naming gives the impression that some set of generic attributes have changed but given that statx is about querying file attributes this becomes confusing. Wouldn't it make more sense this time to expose it as what it is and call this STATX_INO_VERSION and __u64 stx_ino_version?
On Tue, 2022-08-16 at 15:44 +0200, Christian Brauner wrote: > On Tue, Aug 16, 2022 at 09:27:56AM -0400, Jeff Layton wrote: > > From: Jeff Layton <jlayton@redhat.com> > > > > Claim one of the spare fields in struct statx to hold a 64-bit change > > attribute. When statx requests this attribute, do an > > inode_query_iversion and fill the result in the field. > > > > Also update the test-statx.c program to display the change attribute and > > the mountid as well. > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/stat.c | 7 +++++++ > > include/linux/stat.h | 1 + > > include/uapi/linux/stat.h | 3 ++- > > samples/vfs/test-statx.c | 8 ++++++-- > > 4 files changed, 16 insertions(+), 3 deletions(-) > > > > diff --git a/fs/stat.c b/fs/stat.c > > index 9ced8860e0f3..7c3d063c31ba 100644 > > --- a/fs/stat.c > > +++ b/fs/stat.c > > @@ -17,6 +17,7 @@ > > #include <linux/syscalls.h> > > #include <linux/pagemap.h> > > #include <linux/compat.h> > > +#include <linux/iversion.h> > > > > #include <linux/uaccess.h> > > #include <asm/unistd.h> > > @@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, > > stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | > > STATX_ATTR_DAX); > > > > + if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) { > > + stat->result_mask |= STATX_CHANGE_ATTR; > > + stat->change_attr = inode_query_iversion(inode); > > + } > > + > > mnt_userns = mnt_user_ns(path->mnt); > > if (inode->i_op->getattr) > > return inode->i_op->getattr(mnt_userns, path, stat, > > @@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) > > tmp.stx_dev_major = MAJOR(stat->dev); > > tmp.stx_dev_minor = MINOR(stat->dev); > > tmp.stx_mnt_id = stat->mnt_id; > > + tmp.stx_change_attr = stat->change_attr; > > > > return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; > > } > > diff --git a/include/linux/stat.h b/include/linux/stat.h > > index 7df06931f25d..7b444c2ad0ad 100644 > > --- a/include/linux/stat.h > > +++ b/include/linux/stat.h > > @@ -50,6 +50,7 @@ struct kstat { > > struct timespec64 btime; /* File creation time */ > > u64 blocks; > > u64 mnt_id; > > + u64 change_attr; > > }; > > > > #endif > > diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h > > index 1500a0f58041..fd839ec76aa4 100644 > > --- a/include/uapi/linux/stat.h > > +++ b/include/uapi/linux/stat.h > > @@ -124,7 +124,7 @@ struct statx { > > __u32 stx_dev_minor; > > /* 0x90 */ > > __u64 stx_mnt_id; > > - __u64 __spare2; > > + __u64 stx_change_attr; /* Inode change attribute */ > > /* 0xa0 */ > > __u64 __spare3[12]; /* Spare space for future expansion */ > > /* 0x100 */ > > @@ -152,6 +152,7 @@ struct statx { > > #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ > > #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ > > #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ > > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */ > > I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag > and field. Or I fail to understand what exact information this will > expose and how userspace will consume it. > To me the naming gives the impression that some set of generic > attributes have changed but given that statx is about querying file > attributes this becomes confusing. > > Wouldn't it make more sense this time to expose it as what it is and > call this STATX_INO_VERSION and __u64 stx_ino_version? "Let the great bikesheddening begin!" In all seriousness though, you do have a good point. The NFS RFCs call this the "change attribute", so I went forward with that parlance here. I'm not opposed to changing the naming -- STATX_INO_VERSION sounds fine to me. Let's see if anyone else has a better name before I make any changes though.
Christian Brauner <brauner@kernel.org> wrote: > > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */ > > I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag > and field. Or I fail to understand what exact information this will > expose and how userspace will consume it. > To me the naming gives the impression that some set of generic > attributes have changed but given that statx is about querying file > attributes this becomes confusing. > > Wouldn't it make more sense this time to expose it as what it is and > call this STATX_INO_VERSION and __u64 stx_ino_version? I'm not sure that STATX_INO_VERSION is better that might get confused with the version number that's used to uniquify inode slots (ie. deal with inode number reuse). The problem is that we need fsinfo() or similar to qualify what this means. On some filesystems, it's only changed when the data content changes, but on others it may get changed when, say, xattrs get changed; further, on some filesystems it might be monotonically incremented, but on others it's just supposed to be different between two consecutive changes (nfs, IIRC). David
On Tue, 2022-08-16 at 14:55 +0100, David Howells wrote: > Christian Brauner <brauner@kernel.org> wrote: > > > > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */ > > > > I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag > > and field. Or I fail to understand what exact information this will > > expose and how userspace will consume it. > > To me the naming gives the impression that some set of generic > > attributes have changed but given that statx is about querying file > > attributes this becomes confusing. > > > > Wouldn't it make more sense this time to expose it as what it is and > > call this STATX_INO_VERSION and __u64 stx_ino_version? > > I'm not sure that STATX_INO_VERSION is better that might get confused with the > version number that's used to uniquify inode slots (ie. deal with inode number > reuse). > > The problem is that we need fsinfo() or similar to qualify what this means. > On some filesystems, it's only changed when the data content changes, but on > others it may get changed when, say, xattrs get changed; further, on some > filesystems it might be monotonically incremented, but on others it's just > supposed to be different between two consecutive changes (nfs, IIRC). > I think we'll just have to ensure that before we expose this for any filesystem that it conforms to some minimum standards. i.e.: it must change if there are data or metadata changes to the inode, modulo atime changes due to reads on regular files or readdir on dirs. The local filesystems, ceph and NFS should all be fine. I guess that just leaves AFS. If it can't guarantee that, then we might want to avoid exposing the counter for it.
Jeff Layton <jlayton@kernel.org> wrote: > I think we'll just have to ensure that before we expose this for any > filesystem that it conforms to some minimum standards. i.e.: it must > change if there are data or metadata changes to the inode, modulo atime > changes due to reads on regular files or readdir on dirs. > > The local filesystems, ceph and NFS should all be fine. I guess that > just leaves AFS. If it can't guarantee that, then we might want to avoid > exposing the counter for it. AFS monotonically increments the counter on data changes; doesn't make any change for metadata changes (other than the file size). But you can't assume NFS works as per your suggestion as you don't know what's backing it (it could be AFS, for example - there's a converter for that). Further, for ordinary disk filesystems, two data changes may get elided and only increment the counter once. And then there's mmap... It might be better to reduce the scope of your definition and just say that it must change if there's a data change and may also be changed if there's a metadata change. David
On Tue, 2022-08-16 at 16:15 +0100, David Howells wrote: > Jeff Layton <jlayton@kernel.org> wrote: > > > I think we'll just have to ensure that before we expose this for any > > filesystem that it conforms to some minimum standards. i.e.: it must > > change if there are data or metadata changes to the inode, modulo atime > > changes due to reads on regular files or readdir on dirs. > > > > The local filesystems, ceph and NFS should all be fine. I guess that > > just leaves AFS. If it can't guarantee that, then we might want to avoid > > exposing the counter for it. > > AFS monotonically increments the counter on data changes; doesn't make any > change for metadata changes (other than the file size). > > But you can't assume NFS works as per your suggestion as you don't know what's > backing it (it could be AFS, for example - there's a converter for that). > In that case, the NFS server must synthesize a proper change attr. The NFS spec mandates that it change on most metadata changes. > Further, for ordinary disk filesystems, two data changes may get elided and > only increment the counter once. > Not a problem as long as nothing queried the counter in between the changes. > And then there's mmap... > Not sure how that matters here. > It might be better to reduce the scope of your definition and just say that it > must change if there's a data change and may also be changed if there's a > metadata change. > I'd prefer that we mandate that it change on metadata changes as well. That's what most of the in-kernel users want, and what most of the existing filesystems provide. If AFS can't give that guarantee then we can just omit exposing i_version on it.
On Tue, Aug 16, 2022 at 11:32:24AM -0400, Jeff Layton wrote: > On Tue, 2022-08-16 at 16:15 +0100, David Howells wrote: > > Jeff Layton <jlayton@kernel.org> wrote: > > > > > I think we'll just have to ensure that before we expose this for any > > > filesystem that it conforms to some minimum standards. i.e.: it must > > > change if there are data or metadata changes to the inode, modulo atime > > > changes due to reads on regular files or readdir on dirs. > > > > > > The local filesystems, ceph and NFS should all be fine. I guess that > > > just leaves AFS. If it can't guarantee that, then we might want to avoid > > > exposing the counter for it. > > > > AFS monotonically increments the counter on data changes; doesn't make any > > change for metadata changes (other than the file size). > > > > But you can't assume NFS works as per your suggestion as you don't know what's > > backing it (it could be AFS, for example - there's a converter for that). > > > > In that case, the NFS server must synthesize a proper change attr. The > NFS spec mandates that it change on most metadata changes. > > > Further, for ordinary disk filesystems, two data changes may get elided and > > only increment the counter once. > > > > Not a problem as long as nothing queried the counter in between the > changes. > > > And then there's mmap... > > > > Not sure how that matters here. > > > It might be better to reduce the scope of your definition and just say that it > > must change if there's a data change and may also be changed if there's a > > metadata change. > > > > I'd prefer that we mandate that it change on metadata changes as well. ...in that case, why not leave the i_version bump in xfs_trans_log_inode? That will capture all changes to file data, attribues, and metadata. ;) --D > That's what most of the in-kernel users want, and what most of the > existing filesystems provide. If AFS can't give that guarantee then we > can just omit exposing i_version on it. > -- > Jeff Layton <jlayton@kernel.org>
On Tue, 2022-08-16 at 08:51 -0700, Darrick J. Wong wrote: > On Tue, Aug 16, 2022 at 11:32:24AM -0400, Jeff Layton wrote: > > On Tue, 2022-08-16 at 16:15 +0100, David Howells wrote: > > > Jeff Layton <jlayton@kernel.org> wrote: > > > > > > > I think we'll just have to ensure that before we expose this for any > > > > filesystem that it conforms to some minimum standards. i.e.: it must > > > > change if there are data or metadata changes to the inode, modulo atime > > > > changes due to reads on regular files or readdir on dirs. > > > > > > > > The local filesystems, ceph and NFS should all be fine. I guess that > > > > just leaves AFS. If it can't guarantee that, then we might want to avoid > > > > exposing the counter for it. > > > > > > AFS monotonically increments the counter on data changes; doesn't make any > > > change for metadata changes (other than the file size). > > > > > > But you can't assume NFS works as per your suggestion as you don't know what's > > > backing it (it could be AFS, for example - there's a converter for that). > > > > > > > In that case, the NFS server must synthesize a proper change attr. The > > NFS spec mandates that it change on most metadata changes. > > > > > Further, for ordinary disk filesystems, two data changes may get elided and > > > only increment the counter once. > > > > > > > Not a problem as long as nothing queried the counter in between the > > changes. > > > > > And then there's mmap... > > > > > > > Not sure how that matters here. > > > > > It might be better to reduce the scope of your definition and just say that it > > > must change if there's a data change and may also be changed if there's a > > > metadata change. > > > > > > > I'd prefer that we mandate that it change on metadata changes as well. > > ...in that case, why not leave the i_version bump in > xfs_trans_log_inode? That will capture all changes to file data, > attribues, and metadata. ;) > > Because that includes changes to the atime due to reads which should be specifically omitted. We could still keep that callsite instead, if you can see some way to exclude those. In practice, we are using a change to i_version to mean that "something changed" in the inode, which usually implies a change to the ctime and mtime. Trond pointed out that the NFSv4 spec implies that time_access updates should be omitted from what we consider to be "metadata" here: https://mailarchive.ietf.org/arch/msg/nfsv4/yrRBMrVwWWDCrgHPAzq_yAEc7BU/ IMA (which is the only other in-kernel consumer of i_version) also wants the same behavior. > > That's what most of the in-kernel users want, and what most of the > > existing filesystems provide. If AFS can't give that guarantee then we > > can just omit exposing i_version on it.
On Tue, 2022-08-16 at 15:44 +0200, Christian Brauner wrote: > On Tue, Aug 16, 2022 at 09:27:56AM -0400, Jeff Layton wrote: > > From: Jeff Layton <jlayton@redhat.com> > > > > Claim one of the spare fields in struct statx to hold a 64-bit change > > attribute. When statx requests this attribute, do an > > inode_query_iversion and fill the result in the field. > > > > Also update the test-statx.c program to display the change attribute and > > the mountid as well. > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > > --- > > fs/stat.c | 7 +++++++ > > include/linux/stat.h | 1 + > > include/uapi/linux/stat.h | 3 ++- > > samples/vfs/test-statx.c | 8 ++++++-- > > 4 files changed, 16 insertions(+), 3 deletions(-) > > > > diff --git a/fs/stat.c b/fs/stat.c > > index 9ced8860e0f3..7c3d063c31ba 100644 > > --- a/fs/stat.c > > +++ b/fs/stat.c > > @@ -17,6 +17,7 @@ > > #include <linux/syscalls.h> > > #include <linux/pagemap.h> > > #include <linux/compat.h> > > +#include <linux/iversion.h> > > > > #include <linux/uaccess.h> > > #include <asm/unistd.h> > > @@ -118,6 +119,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, > > stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | > > STATX_ATTR_DAX); > > > > + if ((request_mask & STATX_CHANGE_ATTR) && IS_I_VERSION(inode)) { > > + stat->result_mask |= STATX_CHANGE_ATTR; > > + stat->change_attr = inode_query_iversion(inode); > > + } > > + > > mnt_userns = mnt_user_ns(path->mnt); > > if (inode->i_op->getattr) > > return inode->i_op->getattr(mnt_userns, path, stat, > > @@ -611,6 +617,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) > > tmp.stx_dev_major = MAJOR(stat->dev); > > tmp.stx_dev_minor = MINOR(stat->dev); > > tmp.stx_mnt_id = stat->mnt_id; > > + tmp.stx_change_attr = stat->change_attr; > > > > return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; > > } > > diff --git a/include/linux/stat.h b/include/linux/stat.h > > index 7df06931f25d..7b444c2ad0ad 100644 > > --- a/include/linux/stat.h > > +++ b/include/linux/stat.h > > @@ -50,6 +50,7 @@ struct kstat { > > struct timespec64 btime; /* File creation time */ > > u64 blocks; > > u64 mnt_id; > > + u64 change_attr; > > }; > > > > #endif > > diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h > > index 1500a0f58041..fd839ec76aa4 100644 > > --- a/include/uapi/linux/stat.h > > +++ b/include/uapi/linux/stat.h > > @@ -124,7 +124,7 @@ struct statx { > > __u32 stx_dev_minor; > > /* 0x90 */ > > __u64 stx_mnt_id; > > - __u64 __spare2; > > + __u64 stx_change_attr; /* Inode change attribute */ > > /* 0xa0 */ > > __u64 __spare3[12]; /* Spare space for future expansion */ > > /* 0x100 */ > > @@ -152,6 +152,7 @@ struct statx { > > #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ > > #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ > > #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ > > +#define STATX_CHANGE_ATTR 0x00002000U /* Want/got stx_change_attr */ > > I'm a bit worried that STATX_CHANGE_ATTR isn't a good name for the flag > and field. Or I fail to understand what exact information this will > expose and how userspace will consume it. > To me the naming gives the impression that some set of generic > attributes have changed but given that statx is about querying file > attributes this becomes confusing. > > Wouldn't it make more sense this time to expose it as what it is and > call this STATX_INO_VERSION and __u64 stx_ino_version? Ok, having thought about this some more, I think this is a reasonable name. It _does_ sort of imply that this value will increase over time. That's true of all of the existing implementations, but I think we ought to define such that there could be alternative implementations. I'll respin this patch and resend it with a wider audience. Thanks for the input so far!