Message ID | 20250318194111.19419-1-James.Bottomley@HansenPartnership.com |
---|---|
Headers | show |
Series | create simple libfs directory iterator and make efivarfs use it | expand |
Hi James, Thanks for persisting with this. On Tue, 18 Mar 2025 at 20:44, James Bottomley <James.Bottomley@hansenpartnership.com> wrote: > > No functional change. Preparatory to using the internal function to > iterate a directory with just a dentry not a file. > > Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com> > --- > fs/libfs.c | 41 +++++++++++++++++++++++++++++------------ > 1 file changed, 29 insertions(+), 12 deletions(-) > > diff --git a/fs/libfs.c b/fs/libfs.c > index 8444f5cc4064..816bfe6c0430 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -189,28 +189,21 @@ EXPORT_SYMBOL(dcache_dir_lseek); > * for ramfs-type trees they can't go away without unlink() or rmdir(), > * both impossible due to the lock on directory. > */ > - > -int dcache_readdir(struct file *file, struct dir_context *ctx) > +static void internal_readdir(struct dentry *dentry, struct dentry *cursor, It might make sense to make this __always_inline, so that the callback argument is guaranteed to become a compile time constant when the caller is dcache_readdir(). Otherwise, the indirect call overhead might impact its performance. > + void *data, bool start, > + bool (*callback)(void *, struct dentry *)) > { > - struct dentry *dentry = file->f_path.dentry; > - struct dentry *cursor = file->private_data; > struct dentry *next = NULL; > struct hlist_node **p; > > - if (!dir_emit_dots(file, ctx)) > - return 0; > - > - if (ctx->pos == 2) > + if (start) > p = &dentry->d_children.first; > else > p = &cursor->d_sib.next; > > while ((next = scan_positives(cursor, p, 1, next)) != NULL) { > - if (!dir_emit(ctx, next->d_name.name, next->d_name.len, > - d_inode(next)->i_ino, > - fs_umode_to_dtype(d_inode(next)->i_mode))) > + if (!callback(data, next)) > break; > - ctx->pos++; > p = &next->d_sib.next; > } > spin_lock(&dentry->d_lock); > @@ -219,6 +212,30 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) > hlist_add_before(&cursor->d_sib, &next->d_sib); > spin_unlock(&dentry->d_lock); > dput(next); > +} > + > +static bool dcache_readdir_callback(void *data, struct dentry *entry) > +{ > + struct dir_context *ctx = data; > + > + if (!dir_emit(ctx, entry->d_name.name, entry->d_name.len, > + d_inode(entry)->i_ino, > + fs_umode_to_dtype(d_inode(entry)->i_mode))) > + return false; > + ctx->pos++; > + return true; > +} > + > +int dcache_readdir(struct file *file, struct dir_context *ctx) > +{ > + struct dentry *dentry = file->f_path.dentry; > + struct dentry *cursor = file->private_data; > + > + if (!dir_emit_dots(file, ctx)) > + return 0; > + > + internal_readdir(dentry, cursor, ctx, ctx->pos == 2, > + dcache_readdir_callback); > > return 0; > } > -- > 2.43.0 >
On Tue, 18 Mar 2025 at 20:46, James Bottomley <James.Bottomley@hansenpartnership.com> wrote: > > This relieves us of the requirement to have a struct path and use file > operations, which greatly simplifies the code. The superblock is now > pinned by the blocking notifier (which is why deregistration moves > above kill_litter_super). > > Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com> > --- > fs/efivarfs/super.c | 103 +++----------------------------------------- > 1 file changed, 7 insertions(+), 96 deletions(-) > I like this a lot. Assuming it gets blessed by the VFS maintainers, Reviewed-by: Ard Biesheuvel <ardb@kernel.org> > diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c > index 81b3c6b7e100..135b0bb9b4b5 100644 > --- a/fs/efivarfs/super.c > +++ b/fs/efivarfs/super.c > @@ -393,29 +393,13 @@ static const struct fs_context_operations efivarfs_context_ops = { > .reconfigure = efivarfs_reconfigure, > }; > > -struct efivarfs_ctx { > - struct dir_context ctx; > - struct super_block *sb; > - struct dentry *dentry; > -}; > - > -static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len, > - loff_t offset, u64 ino, unsigned mode) > +static bool efivarfs_iterate_callback(void *data, struct dentry *dentry) > { > unsigned long size; > - struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx); > - struct qstr qstr = { .name = name, .len = len }; > - struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr); > - struct inode *inode; > - struct efivar_entry *entry; > + struct inode *inode = d_inode(dentry); > + struct efivar_entry *entry = efivar_entry(inode); > int err; > > - if (IS_ERR_OR_NULL(dentry)) > - return true; > - > - inode = d_inode(dentry); > - entry = efivar_entry(inode); > - > err = efivar_entry_size(entry, &size); > size += sizeof(__u32); /* attributes */ > if (err) > @@ -426,12 +410,10 @@ static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len, > inode_unlock(inode); > > if (!size) { > - ectx->dentry = dentry; > - return false; > + pr_info("efivarfs: removing variable %pd\n", dentry); > + simple_recursive_removal(dentry, NULL); > } > > - dput(dentry); > - > return true; > } > > @@ -474,33 +456,11 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, > return err; > } > > -static void efivarfs_deactivate_super_work(struct work_struct *work) > -{ > - struct super_block *s = container_of(work, struct super_block, > - destroy_work); > - /* > - * note: here s->destroy_work is free for reuse (which > - * will happen in deactivate_super) > - */ > - deactivate_super(s); > -} > - > -static struct file_system_type efivarfs_type; > - > static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, > void *ptr) > { > struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, > pm_nb); > - struct path path; > - struct efivarfs_ctx ectx = { > - .ctx = { > - .actor = efivarfs_actor, > - }, > - .sb = sfi->sb, > - }; > - struct file *file; > - struct super_block *s = sfi->sb; > static bool rescan_done = true; > > if (action == PM_HIBERNATION_PREPARE) { > @@ -513,64 +473,15 @@ static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, > if (rescan_done) > return NOTIFY_DONE; > > - /* ensure single superblock is alive and pin it */ > - if (!atomic_inc_not_zero(&s->s_active)) > - return NOTIFY_DONE; > - > pr_info("efivarfs: resyncing variable state\n"); > > - path.dentry = sfi->sb->s_root; > - > - /* > - * do not add SB_KERNMOUNT which a single superblock could > - * expose to userspace and which also causes MNT_INTERNAL, see > - * below > - */ > - path.mnt = vfs_kern_mount(&efivarfs_type, 0, > - efivarfs_type.name, NULL); > - if (IS_ERR(path.mnt)) { > - pr_err("efivarfs: internal mount failed\n"); > - /* > - * We may be the last pinner of the superblock but > - * calling efivarfs_kill_sb from within the notifier > - * here would deadlock trying to unregister it > - */ > - INIT_WORK(&s->destroy_work, efivarfs_deactivate_super_work); > - schedule_work(&s->destroy_work); > - return PTR_ERR(path.mnt); > - } > - > - /* path.mnt now has pin on superblock, so this must be above one */ > - atomic_dec(&s->s_active); > - > - file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME, > - current_cred()); > - /* > - * safe even if last put because no MNT_INTERNAL means this > - * will do delayed deactivate_super and not deadlock > - */ > - mntput(path.mnt); > - if (IS_ERR(file)) > - return NOTIFY_DONE; > - > rescan_done = true; > > /* > * First loop over the directory and verify each entry exists, > * removing it if it doesn't > */ > - file->f_pos = 2; /* skip . and .. */ > - do { > - ectx.dentry = NULL; > - iterate_dir(file, &ectx.ctx); > - if (ectx.dentry) { > - pr_info("efivarfs: removing variable %pd\n", > - ectx.dentry); > - simple_recursive_removal(ectx.dentry, NULL); > - dput(ectx.dentry); > - } > - } while (ectx.dentry); > - fput(file); > + simple_iterate_call(sfi->sb->s_root, NULL, efivarfs_iterate_callback); > > /* > * then loop over variables, creating them if there's no matching > @@ -609,8 +520,8 @@ static void efivarfs_kill_sb(struct super_block *sb) > struct efivarfs_fs_info *sfi = sb->s_fs_info; > > blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); > - kill_litter_super(sb); > unregister_pm_notifier(&sfi->pm_nb); > + kill_litter_super(sb); > > kfree(sfi); > } > -- > 2.43.0 > >
On Tue, 2025-03-18 at 22:32 +0100, Ard Biesheuvel wrote: > Hi James, > > Thanks for persisting with this. Heh, well, it is starting to feel a bit like the swamp I can't get out of ... > On Tue, 18 Mar 2025 at 20:44, James Bottomley > <James.Bottomley@hansenpartnership.com> wrote: > > > > No functional change. Preparatory to using the internal function > > to iterate a directory with just a dentry not a file. > > > > Signed-off-by: James Bottomley > > <James.Bottomley@HansenPartnership.com> > > --- > > fs/libfs.c | 41 +++++++++++++++++++++++++++++------------ > > 1 file changed, 29 insertions(+), 12 deletions(-) > > > > diff --git a/fs/libfs.c b/fs/libfs.c > > index 8444f5cc4064..816bfe6c0430 100644 > > --- a/fs/libfs.c > > +++ b/fs/libfs.c > > @@ -189,28 +189,21 @@ EXPORT_SYMBOL(dcache_dir_lseek); > > * for ramfs-type trees they can't go away without unlink() or > > rmdir(), > > * both impossible due to the lock on directory. > > */ > > - > > -int dcache_readdir(struct file *file, struct dir_context *ctx) > > +static void internal_readdir(struct dentry *dentry, struct dentry > > *cursor, > > It might make sense to make this __always_inline, so that the > callback argument is guaranteed to become a compile time constant > when the caller is dcache_readdir(). Otherwise, the indirect call > overhead might impact its performance. I was hoping the compiler would pick that up ... especially as it's a tail call, but I can add it if necessary. Regards, James
On Tue, Mar 18, 2025 at 03:41:08PM -0400, James Bottomley wrote: > [Note this is built on top of the previous patch to populate path.mnt] > > This turned out to be much simpler than I feared. The first patch > breaks out the core of the current dcache_readdir() into an internal > function with a callback (there should be no functional change). The > second adds a new API, simple_iterate_call(), which loops over the > dentries in the next level and executes a callback for each one and > the third which removes all the efivarfs superblock and mnt crud and > replaces it with this simple callback interface. I think the > diffstats of the third patch demonstrate how much nicer it is for us: I suspect that you are making it too generic for its own good. dcache_readdir() needs to cope with the situation when there are fuckloads of opened-and-unliked files in there. That's why we play those games with cursors under if (need_resched()) there. That's not the case for efivarfs. There you really want just "grab a reference to the next positive, drop the reference we were given" and that's it. IOW, find_next_child() instead of scan_positives(). Export that and it becomes just a simple loop - child = NULL; while ((child = find_next_child(parent, child)) != NULL) { struct inode *inode = d_inode(child); struct efivar_entry *entry = efivar_entry(inode); err = efivar_entry_size(entry, &size); inode_lock(inode); i_size_write(inode, err ? 0 : size + sizeof(__u32)); inode_unlock(inode); if (err) simple_recursive_removal(child, NULL); } and that's it. No callbacks, no cursors, no iterators - just an export of helper already there.
On Tue, Mar 18, 2025 at 11:45:05PM +0000, Al Viro wrote: > and it becomes just a simple loop - > child = NULL; > while ((child = find_next_child(parent, child)) != NULL) { that being root, obviously. And we might want a better function name than that...
On Tue, 2025-03-18 at 23:45 +0000, Al Viro wrote: [...] > I suspect that you are making it too generic for its own good. > > dcache_readdir() needs to cope with the situation when there are > fuckloads of opened-and-unliked files in there. That's why we > play those games with cursors under if (need_resched()) there. Yes, but those games will never really activate for the efivarfs code because we expect to be mostly positive. What I was aiming for was to make sure there's no duplication of the subtle code, so someone can't forget to update it in two places, so doing a callback approach seemed natural. > That's not the case for efivarfs. There you really want just > "grab a reference to the next positive, drop the reference we > were given" and that's it. > > IOW, find_next_child() instead of scan_positives(). Export that > and it becomes just a simple loop - > child = NULL; > while ((child = find_next_child(parent, child)) != NULL) { > struct inode *inode = d_inode(child); > struct efivar_entry *entry = efivar_entry(inode); > > err = efivar_entry_size(entry, &size); > > inode_lock(inode); > i_size_write(inode, err ? 0 : size + sizeof(__u32)); > inode_unlock(inode); > > if (err) > simple_recursive_removal(child, NULL); > } > and that's it. No callbacks, no cursors, no iterators - just an > export of helper already there. So we don't mind the callbacks; we have to do it for efivar_init() lower down anyway. However, I did take a cut at doing this based on simple positive (see below). I assume you won't like the way I have to allocate and toss a cursor for each iteration, nor the fact that there's still the cond_resched() in there? I think I can fix that but I'd have to slice apart simple_positive(), which is a bigger undertaking. > And we might want a better function name than that... I went for simple_next_child() ... Regards, James --- fs/libfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ 2 files changed, 43 insertions(+) diff --git a/fs/libfs.c b/fs/libfs.c index 8444f5cc4064..86f29cc2b85a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -146,6 +146,47 @@ static struct dentry *scan_positives(struct dentry *cursor, return found; } +/** + * simple_next_child - get next child of the parent directory + * + * @parent: the directory to scan + * @child: last child (or NULL to start at the beginning) + * + * returns the next positive child in sequence with the reference + * elevated but if a child is passed in will drop that + * reference. Returns NULL on either error or when directory has been + * fully scanned. + * + * The intended use is as an iterator because all the references will + * be dropped by the end of the while loop: + * + * child = NULL + * while(child = simple_next_child(parent, child)) { + * // do something + * } + */ +struct dentry *simple_next_child(struct dentry *parent, struct dentry *child) +{ + struct hlist_node **p; + struct dentry *cursor = d_alloc_cursor(parent); + + if (!cursor) { + dput(child); + return NULL; + } + + if (child) + p = &child->d_sib.next; + else + p = &parent->d_children.first; + + child = scan_positives(cursor, p, 1, child); + dput(cursor); + + return child; +} +EXPORT_SYMBOL(simple_next_child); + loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2788df98080f..dd84d1c3b8af 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3531,6 +3531,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); +extern struct dentry *simple_next_child(struct dentry *parent, + struct dentry *child); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *);
On Wed, 2025-03-19 at 12:46 -0400, James Bottomley wrote: > I assume you won't like the way I have to allocate and toss a cursor > for each iteration, nor the fact that there's still the > cond_resched() in there? I think I can fix that but I'd have to > slice apart simple_positive(), which is a bigger undertaking. So this is what I think that would look like: it still exports the same simple_next_child() function but doesn't need to allocate cursors or do any of the cond_resched() stuff. The down side is that the slice is a lot less easily verified than the other two patches, so this one's going to need a lot of code inspection to make sure I got it right. Regards, James --- fs/libfs.c | 85 +++++++++++++++++++++++++++++++++++++++------- include/linux/fs.h | 2 ++ 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 8444f5cc4064..3a32910f44dc 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -101,6 +101,34 @@ int dcache_dir_close(struct inode *inode, struct file *file) } EXPORT_SYMBOL(dcache_dir_close); +/* + * helper for positive scanning. Requires a nested while loop which should + * be broken if this returns false + */ +static bool internal_scan_positive(struct hlist_node **p, struct dentry *d, + struct dentry **found, loff_t *count) +{ + while (*p) { + p = &d->d_sib.next; + + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + + if (simple_positive(d) && !--*count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + *found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(*found)) + return true; + *count = 1; + } + return false; + } + return true; +} + /* parent is locked at least shared */ /* * Returns an element of siblings' list. @@ -118,19 +146,9 @@ static struct dentry *scan_positives(struct dentry *cursor, spin_lock(&dentry->d_lock); while (*p) { struct dentry *d = hlist_entry(*p, struct dentry, d_sib); - p = &d->d_sib.next; - // we must at least skip cursors, to avoid livelocks - if (d->d_flags & DCACHE_DENTRY_CURSOR) - continue; - if (simple_positive(d) && !--count) { - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); - if (simple_positive(d)) - found = dget_dlock(d); - spin_unlock(&d->d_lock); - if (likely(found)) - break; - count = 1; - } + + if (internal_scan_positive(p, d, &found, &count)) + break; if (need_resched()) { if (!hlist_unhashed(&cursor->d_sib)) __hlist_del(&cursor->d_sib); @@ -146,6 +164,47 @@ static struct dentry *scan_positives(struct dentry *cursor, return found; } +/** + * simple_next_child - get next child of the parent directory + * + * @parent: the directory to scan + * @child: last child (or NULL to start at the beginning) + * + * returns the next positive child in sequence with the reference + * elevated but if a child is passed in will drop that + * reference. Returns NULL on either error or when directory has been + * fully scanned. + * + * The intended use is as an iterator because all the references will + * be dropped by the end of the while loop: + * + * child = NULL + * while(child = simple_next_child(parent, child)) { + * // do something + * } + */ +struct dentry *simple_next_child(struct dentry *parent, struct dentry *child) +{ + struct hlist_node **p; + loff_t count = 1; + struct dentry *found = NULL; + + if (child) + p = &child->d_sib.next; + else + p = &parent->d_children.first; + + while (*p) { + struct dentry *d = hlist_entry(*p, struct dentry, d_sib); + + if (internal_scan_positive(p, d, &found, &count)) + break; + } + dput(child); + return found; +} +EXPORT_SYMBOL(simple_next_child); + loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2788df98080f..dd84d1c3b8af 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3531,6 +3531,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); +extern struct dentry *simple_next_child(struct dentry *parent, + struct dentry *child); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *);