mbox series

[RFC,0/3] create simple libfs directory iterator and make efivarfs use it

Message ID 20250318194111.19419-1-James.Bottomley@HansenPartnership.com
Headers show
Series create simple libfs directory iterator and make efivarfs use it | expand

Message

James Bottomley March 18, 2025, 7:41 p.m. UTC
[Note this is built on top of the previous patch to populate path.mnt]

This turned out to be much simpler than I feared.  The first patch
breaks out the core of the current dcache_readdir() into an internal
function with a callback (there should be no functional change).  The
second adds a new API, simple_iterate_call(), which loops over the
dentries in the next level and executes a callback for each one and
the third which removes all the efivarfs superblock and mnt crud and
replaces it with this simple callback interface.  I think the
diffstats of the third patch demonstrate how much nicer it is for us:

 1 file changed, 7 insertions(+), 96 deletions(-)

Regards,

James

---

James Bottomley (3):
  libfs: rework dcache_readdir to use an internal function with callback
  libfs: add simple directory iteration function with callback
  efivarfs: replace iterate_dir with libfs function simple_iterate_call

 fs/efivarfs/super.c | 103 +++-----------------------------------------
 fs/libfs.c          |  74 +++++++++++++++++++++++++------
 include/linux/fs.h  |   2 +
 3 files changed, 71 insertions(+), 108 deletions(-)

Comments

Ard Biesheuvel March 18, 2025, 9:32 p.m. UTC | #1
Hi James,

Thanks for persisting with this.


On Tue, 18 Mar 2025 at 20:44, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
>
> No functional change.  Preparatory to using the internal function to
> iterate a directory with just a dentry not a file.
>
> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
> ---
>  fs/libfs.c | 41 +++++++++++++++++++++++++++++------------
>  1 file changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 8444f5cc4064..816bfe6c0430 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -189,28 +189,21 @@ EXPORT_SYMBOL(dcache_dir_lseek);
>   * for ramfs-type trees they can't go away without unlink() or rmdir(),
>   * both impossible due to the lock on directory.
>   */
> -
> -int dcache_readdir(struct file *file, struct dir_context *ctx)
> +static void internal_readdir(struct dentry *dentry, struct dentry *cursor,

It might make sense to make this __always_inline, so that the callback
argument is guaranteed to become a compile time constant when the
caller is dcache_readdir(). Otherwise, the indirect call overhead
might impact its performance.

> +                            void *data, bool start,
> +                            bool (*callback)(void *, struct dentry *))
>  {
> -       struct dentry *dentry = file->f_path.dentry;
> -       struct dentry *cursor = file->private_data;
>         struct dentry *next = NULL;
>         struct hlist_node **p;
>
> -       if (!dir_emit_dots(file, ctx))
> -               return 0;
> -
> -       if (ctx->pos == 2)
> +       if (start)
>                 p = &dentry->d_children.first;
>         else
>                 p = &cursor->d_sib.next;
>
>         while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
> -               if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
> -                             d_inode(next)->i_ino,
> -                             fs_umode_to_dtype(d_inode(next)->i_mode)))
> +               if (!callback(data, next))
>                         break;
> -               ctx->pos++;
>                 p = &next->d_sib.next;
>         }
>         spin_lock(&dentry->d_lock);
> @@ -219,6 +212,30 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
>                 hlist_add_before(&cursor->d_sib, &next->d_sib);
>         spin_unlock(&dentry->d_lock);
>         dput(next);
> +}
> +
> +static bool dcache_readdir_callback(void *data, struct dentry *entry)
> +{
> +       struct dir_context *ctx = data;
> +
> +       if (!dir_emit(ctx, entry->d_name.name, entry->d_name.len,
> +                     d_inode(entry)->i_ino,
> +                     fs_umode_to_dtype(d_inode(entry)->i_mode)))
> +               return false;
> +       ctx->pos++;
> +       return true;
> +}
> +
> +int dcache_readdir(struct file *file, struct dir_context *ctx)
> +{
> +       struct dentry *dentry = file->f_path.dentry;
> +       struct dentry *cursor = file->private_data;
> +
> +       if (!dir_emit_dots(file, ctx))
> +               return 0;
> +
> +       internal_readdir(dentry, cursor, ctx, ctx->pos == 2,
> +                        dcache_readdir_callback);
>
>         return 0;
>  }
> --
> 2.43.0
>
Ard Biesheuvel March 18, 2025, 9:34 p.m. UTC | #2
On Tue, 18 Mar 2025 at 20:46, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
>
> This relieves us of the requirement to have a struct path and use file
> operations, which greatly simplifies the code.  The superblock is now
> pinned by the blocking notifier (which is why deregistration moves
> above kill_litter_super).
>
> Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
> ---
>  fs/efivarfs/super.c | 103 +++-----------------------------------------
>  1 file changed, 7 insertions(+), 96 deletions(-)
>

I like this a lot. Assuming it gets blessed by the VFS maintainers,

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>

> diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
> index 81b3c6b7e100..135b0bb9b4b5 100644
> --- a/fs/efivarfs/super.c
> +++ b/fs/efivarfs/super.c
> @@ -393,29 +393,13 @@ static const struct fs_context_operations efivarfs_context_ops = {
>         .reconfigure    = efivarfs_reconfigure,
>  };
>
> -struct efivarfs_ctx {
> -       struct dir_context ctx;
> -       struct super_block *sb;
> -       struct dentry *dentry;
> -};
> -
> -static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len,
> -                          loff_t offset, u64 ino, unsigned mode)
> +static bool efivarfs_iterate_callback(void *data, struct dentry *dentry)
>  {
>         unsigned long size;
> -       struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx);
> -       struct qstr qstr = { .name = name, .len = len };
> -       struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr);
> -       struct inode *inode;
> -       struct efivar_entry *entry;
> +       struct inode *inode = d_inode(dentry);
> +       struct efivar_entry *entry = efivar_entry(inode);
>         int err;
>
> -       if (IS_ERR_OR_NULL(dentry))
> -               return true;
> -
> -       inode = d_inode(dentry);
> -       entry = efivar_entry(inode);
> -
>         err = efivar_entry_size(entry, &size);
>         size += sizeof(__u32);  /* attributes */
>         if (err)
> @@ -426,12 +410,10 @@ static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len,
>         inode_unlock(inode);
>
>         if (!size) {
> -               ectx->dentry = dentry;
> -               return false;
> +               pr_info("efivarfs: removing variable %pd\n", dentry);
> +               simple_recursive_removal(dentry, NULL);
>         }
>
> -       dput(dentry);
> -
>         return true;
>  }
>
> @@ -474,33 +456,11 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
>         return err;
>  }
>
> -static void efivarfs_deactivate_super_work(struct work_struct *work)
> -{
> -       struct super_block *s = container_of(work, struct super_block,
> -                                            destroy_work);
> -       /*
> -        * note: here s->destroy_work is free for reuse (which
> -        * will happen in deactivate_super)
> -        */
> -       deactivate_super(s);
> -}
> -
> -static struct file_system_type efivarfs_type;
> -
>  static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
>                               void *ptr)
>  {
>         struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info,
>                                                     pm_nb);
> -       struct path path;
> -       struct efivarfs_ctx ectx = {
> -               .ctx = {
> -                       .actor  = efivarfs_actor,
> -               },
> -               .sb = sfi->sb,
> -       };
> -       struct file *file;
> -       struct super_block *s = sfi->sb;
>         static bool rescan_done = true;
>
>         if (action == PM_HIBERNATION_PREPARE) {
> @@ -513,64 +473,15 @@ static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
>         if (rescan_done)
>                 return NOTIFY_DONE;
>
> -       /* ensure single superblock is alive and pin it */
> -       if (!atomic_inc_not_zero(&s->s_active))
> -               return NOTIFY_DONE;
> -
>         pr_info("efivarfs: resyncing variable state\n");
>
> -       path.dentry = sfi->sb->s_root;
> -
> -       /*
> -        * do not add SB_KERNMOUNT which a single superblock could
> -        * expose to userspace and which also causes MNT_INTERNAL, see
> -        * below
> -        */
> -       path.mnt = vfs_kern_mount(&efivarfs_type, 0,
> -                                 efivarfs_type.name, NULL);
> -       if (IS_ERR(path.mnt)) {
> -               pr_err("efivarfs: internal mount failed\n");
> -               /*
> -                * We may be the last pinner of the superblock but
> -                * calling efivarfs_kill_sb from within the notifier
> -                * here would deadlock trying to unregister it
> -                */
> -               INIT_WORK(&s->destroy_work, efivarfs_deactivate_super_work);
> -               schedule_work(&s->destroy_work);
> -               return PTR_ERR(path.mnt);
> -       }
> -
> -       /* path.mnt now has pin on superblock, so this must be above one */
> -       atomic_dec(&s->s_active);
> -
> -       file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME,
> -                               current_cred());
> -       /*
> -        * safe even if last put because no MNT_INTERNAL means this
> -        * will do delayed deactivate_super and not deadlock
> -        */
> -       mntput(path.mnt);
> -       if (IS_ERR(file))
> -               return NOTIFY_DONE;
> -
>         rescan_done = true;
>
>         /*
>          * First loop over the directory and verify each entry exists,
>          * removing it if it doesn't
>          */
> -       file->f_pos = 2;        /* skip . and .. */
> -       do {
> -               ectx.dentry = NULL;
> -               iterate_dir(file, &ectx.ctx);
> -               if (ectx.dentry) {
> -                       pr_info("efivarfs: removing variable %pd\n",
> -                               ectx.dentry);
> -                       simple_recursive_removal(ectx.dentry, NULL);
> -                       dput(ectx.dentry);
> -               }
> -       } while (ectx.dentry);
> -       fput(file);
> +       simple_iterate_call(sfi->sb->s_root, NULL, efivarfs_iterate_callback);
>
>         /*
>          * then loop over variables, creating them if there's no matching
> @@ -609,8 +520,8 @@ static void efivarfs_kill_sb(struct super_block *sb)
>         struct efivarfs_fs_info *sfi = sb->s_fs_info;
>
>         blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
> -       kill_litter_super(sb);
>         unregister_pm_notifier(&sfi->pm_nb);
> +       kill_litter_super(sb);
>
>         kfree(sfi);
>  }
> --
> 2.43.0
>
>
James Bottomley March 18, 2025, 9:49 p.m. UTC | #3
On Tue, 2025-03-18 at 22:32 +0100, Ard Biesheuvel wrote:
> Hi James,
> 
> Thanks for persisting with this.

Heh, well, it is starting to feel a bit like the swamp I can't get out
of ...

> On Tue, 18 Mar 2025 at 20:44, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > 
> > No functional change.  Preparatory to using the internal function
> > to iterate a directory with just a dentry not a file.
> > 
> > Signed-off-by: James Bottomley
> > <James.Bottomley@HansenPartnership.com>
> > ---
> >  fs/libfs.c | 41 +++++++++++++++++++++++++++++------------
> >  1 file changed, 29 insertions(+), 12 deletions(-)
> > 
> > diff --git a/fs/libfs.c b/fs/libfs.c
> > index 8444f5cc4064..816bfe6c0430 100644
> > --- a/fs/libfs.c
> > +++ b/fs/libfs.c
> > @@ -189,28 +189,21 @@ EXPORT_SYMBOL(dcache_dir_lseek);
> >   * for ramfs-type trees they can't go away without unlink() or
> > rmdir(),
> >   * both impossible due to the lock on directory.
> >   */
> > -
> > -int dcache_readdir(struct file *file, struct dir_context *ctx)
> > +static void internal_readdir(struct dentry *dentry, struct dentry
> > *cursor,
> 
> It might make sense to make this __always_inline, so that the
> callback argument is guaranteed to become a compile time constant
> when the caller is dcache_readdir(). Otherwise, the indirect call
> overhead might impact its performance.

I was hoping the compiler would pick that up ... especially as it's a
tail call, but I can add it if necessary.

Regards,

James
Al Viro March 18, 2025, 11:45 p.m. UTC | #4
On Tue, Mar 18, 2025 at 03:41:08PM -0400, James Bottomley wrote:
> [Note this is built on top of the previous patch to populate path.mnt]
> 
> This turned out to be much simpler than I feared.  The first patch
> breaks out the core of the current dcache_readdir() into an internal
> function with a callback (there should be no functional change).  The
> second adds a new API, simple_iterate_call(), which loops over the
> dentries in the next level and executes a callback for each one and
> the third which removes all the efivarfs superblock and mnt crud and
> replaces it with this simple callback interface.  I think the
> diffstats of the third patch demonstrate how much nicer it is for us:

I suspect that you are making it too generic for its own good.

dcache_readdir() needs to cope with the situation when there are
fuckloads of opened-and-unliked files in there.  That's why we
play those games with cursors under if (need_resched()) there.
That's not the case for efivarfs.  There you really want just
"grab a reference to the next positive, drop the reference we
were given" and that's it.

IOW, find_next_child() instead of scan_positives().  Export that
and it becomes just a simple loop -
	child = NULL;
	while ((child = find_next_child(parent, child)) != NULL) {
		struct inode *inode = d_inode(child);
		struct efivar_entry *entry = efivar_entry(inode);

		err = efivar_entry_size(entry, &size);

		inode_lock(inode);
		i_size_write(inode, err ? 0 : size + sizeof(__u32));
		inode_unlock(inode);

		if (err)
			simple_recursive_removal(child, NULL);
	}
and that's it.  No callbacks, no cursors, no iterators - just an
export of helper already there.
Al Viro March 18, 2025, 11:49 p.m. UTC | #5
On Tue, Mar 18, 2025 at 11:45:05PM +0000, Al Viro wrote:
> and it becomes just a simple loop -
> 	child = NULL;
> 	while ((child = find_next_child(parent, child)) != NULL) {
					that being root, obviously.

And we might want a better function name than that...
James Bottomley March 19, 2025, 4:46 p.m. UTC | #6
On Tue, 2025-03-18 at 23:45 +0000, Al Viro wrote:
[...]
> I suspect that you are making it too generic for its own good.
> 
> dcache_readdir() needs to cope with the situation when there are
> fuckloads of opened-and-unliked files in there.  That's why we
> play those games with cursors under if (need_resched()) there.

Yes, but those games will never really activate for the efivarfs code
because we expect to be mostly positive.  What I was aiming for was to
make sure there's no duplication of the subtle code,  so someone can't
forget to update it in two places, so doing a callback approach seemed
natural.

> That's not the case for efivarfs.  There you really want just
> "grab a reference to the next positive, drop the reference we
> were given" and that's it.
> 
> IOW, find_next_child() instead of scan_positives().  Export that
> and it becomes just a simple loop -
> 	child = NULL;
> 	while ((child = find_next_child(parent, child)) != NULL) {
> 		struct inode *inode = d_inode(child);
> 		struct efivar_entry *entry = efivar_entry(inode);
> 
> 		err = efivar_entry_size(entry, &size);
> 
> 		inode_lock(inode);
> 		i_size_write(inode, err ? 0 : size + sizeof(__u32));
> 		inode_unlock(inode);
> 
> 		if (err)
> 			simple_recursive_removal(child, NULL);
> 	}
> and that's it.  No callbacks, no cursors, no iterators - just an
> export of helper already there.

So we don't mind the callbacks; we have to do it for efivar_init()
lower down anyway.  However, I did take a cut at doing this based on
simple positive (see below).  I assume you won't like the way I have to
allocate and toss a cursor for each iteration, nor the fact that
there's still the cond_resched() in there?  I think I can fix that but
I'd have to slice apart simple_positive(), which is a bigger
undertaking.

> And we might want a better function name than that...

I went for simple_next_child() ...

Regards,

James

---

 fs/libfs.c         | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 43 insertions(+)

diff --git a/fs/libfs.c b/fs/libfs.c
index 8444f5cc4064..86f29cc2b85a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -146,6 +146,47 @@ static struct dentry *scan_positives(struct dentry *cursor,
 	return found;
 }
 
+/**
+ * simple_next_child - get next child of the parent directory
+ *
+ * @parent: the directory to scan
+ * @child: last child (or NULL to start at the beginning)
+ *
+ * returns the next positive child in sequence with the reference
+ * elevated but if a child is passed in will drop that
+ * reference. Returns NULL on either error or when directory has been
+ * fully scanned.
+ *
+ * The intended use is as an iterator because all the references will
+ * be dropped by the end of the while loop:
+ *
+ *     child = NULL
+ *     while(child = simple_next_child(parent, child)) {
+ *          // do something
+ *     }
+ */
+struct dentry *simple_next_child(struct dentry *parent, struct dentry *child)
+{
+	struct hlist_node **p;
+	struct dentry *cursor = d_alloc_cursor(parent);
+
+	if (!cursor) {
+		dput(child);
+		return NULL;
+	}
+
+	if (child)
+		p = &child->d_sib.next;
+	else
+		p = &parent->d_children.first;
+
+	child = scan_positives(cursor, p, 1, child);
+	dput(cursor);
+
+	return child;
+}
+EXPORT_SYMBOL(simple_next_child);
+
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct dentry *dentry = file->f_path.dentry;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2788df98080f..dd84d1c3b8af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3531,6 +3531,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *,
 			 unsigned int);
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
+extern struct dentry *simple_next_child(struct dentry *parent,
+					struct dentry *child);
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
 extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
James Bottomley March 19, 2025, 6:45 p.m. UTC | #7
On Wed, 2025-03-19 at 12:46 -0400, James Bottomley wrote:
> I assume you won't like the way I have to allocate and toss a cursor
> for each iteration, nor the fact that there's still the
> cond_resched() in there?  I think I can fix that but I'd have to
> slice apart simple_positive(), which is a bigger undertaking.

So this is what I think that would look like: it still exports the same
simple_next_child() function but doesn't need to allocate cursors or do
any of the cond_resched() stuff.  The down side is that the slice is a
lot less easily verified than the other two patches, so this one's
going to need a lot of code inspection to make sure I got it right.

Regards,

James

---

 fs/libfs.c         | 85 +++++++++++++++++++++++++++++++++++++++-------
 include/linux/fs.h |  2 ++
 2 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index 8444f5cc4064..3a32910f44dc 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -101,6 +101,34 @@ int dcache_dir_close(struct inode *inode, struct
file *file)
 }
 EXPORT_SYMBOL(dcache_dir_close);
 
+/*
+ * helper for positive scanning.  Requires a nested while loop which should
+ * be broken if this returns false
+ */
+static bool internal_scan_positive(struct hlist_node **p, struct dentry *d,
+				   struct dentry **found, loff_t *count)
+{
+	while (*p) {
+		p = &d->d_sib.next;
+
+		// we must at least skip cursors, to avoid livelocks
+		if (d->d_flags & DCACHE_DENTRY_CURSOR)
+			continue;
+
+		if (simple_positive(d) && !--*count) {
+			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+			if (simple_positive(d))
+				*found = dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			if (likely(*found))
+				return true;
+			*count = 1;
+		}
+		return false;
+	}
+	return true;
+}
+
 /* parent is locked at least shared */
 /*
  * Returns an element of siblings' list.
@@ -118,19 +146,9 @@ static struct dentry *scan_positives(struct dentry *cursor,
 	spin_lock(&dentry->d_lock);
 	while (*p) {
 		struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
-		p = &d->d_sib.next;
-		// we must at least skip cursors, to avoid livelocks
-		if (d->d_flags & DCACHE_DENTRY_CURSOR)
-			continue;
-		if (simple_positive(d) && !--count) {
-			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-			if (simple_positive(d))
-				found = dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			if (likely(found))
-				break;
-			count = 1;
-		}
+
+		if (internal_scan_positive(p, d, &found, &count))
+			break;
 		if (need_resched()) {
 			if (!hlist_unhashed(&cursor->d_sib))
 				__hlist_del(&cursor->d_sib);
@@ -146,6 +164,47 @@ static struct dentry *scan_positives(struct dentry *cursor,
 	return found;
 }
 
+/**
+ * simple_next_child - get next child of the parent directory
+ *
+ * @parent: the directory to scan
+ * @child: last child (or NULL to start at the beginning)
+ *
+ * returns the next positive child in sequence with the reference
+ * elevated but if a child is passed in will drop that
+ * reference. Returns NULL on either error or when directory has been
+ * fully scanned.
+ *
+ * The intended use is as an iterator because all the references will
+ * be dropped by the end of the while loop:
+ *
+ *     child = NULL
+ *     while(child = simple_next_child(parent, child)) {
+ *          // do something
+ *     }
+ */
+struct dentry *simple_next_child(struct dentry *parent, struct dentry *child)
+{
+	struct hlist_node **p;
+	loff_t count = 1;
+	struct dentry *found = NULL;
+
+	if (child)
+		p = &child->d_sib.next;
+	else
+		p = &parent->d_children.first;
+
+	while (*p) {
+		struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
+
+		if (internal_scan_positive(p, d, &found, &count))
+			break;
+	}
+	dput(child);
+	return found;
+}
+EXPORT_SYMBOL(simple_next_child);
+
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct dentry *dentry = file->f_path.dentry;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2788df98080f..dd84d1c3b8af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3531,6 +3531,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *,
 			 unsigned int);
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
+extern struct dentry *simple_next_child(struct dentry *parent,
+					struct dentry *child);
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
 extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);