diff mbox series

[v5] ceph: fix slab-use-after-free in have_mon_and_osd_map()

Message ID 20250414221013.157819-1-slava@dubeyko.com
State New
Headers show
Series [v5] ceph: fix slab-use-after-free in have_mon_and_osd_map() | expand

Commit Message

Viacheslav Dubeyko April 14, 2025, 10:10 p.m. UTC
From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>

The generic/395 and generic/397 tests are capable of generating
an oops on line net/ceph/ceph_common.c:794 with
KASAN enabled.

BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305

CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
Call Trace:
<TASK>
dump_stack_lvl+0x57/0x80
? have_mon_and_osd_map+0x56/0x70
print_address_description.constprop.0+0x84/0x330
? have_mon_and_osd_map+0x56/0x70
print_report+0xe2/0x1e0
? rcu_read_unlock_sched+0x60/0x80
? kmem_cache_debug_flags+0xc/0x20
? fixup_red_left+0x17/0x30
? have_mon_and_osd_map+0x56/0x70
kasan_report+0x8d/0xc0
? have_mon_and_osd_map+0x56/0x70
have_mon_and_osd_map+0x56/0x70
ceph_open_session+0x182/0x290
? __pfx_ceph_open_session+0x10/0x10
? __init_swait_queue_head+0x8d/0xa0
? __pfx_autoremove_wake_function+0x10/0x10
? shrinker_register+0xdd/0xf0
ceph_get_tree+0x333/0x680
vfs_get_tree+0x49/0x180
do_new_mount+0x1a3/0x2d0
? __pfx_do_new_mount+0x10/0x10
? security_capable+0x39/0x70
path_mount+0x6dd/0x730
? __pfx_path_mount+0x10/0x10
? kmem_cache_free+0x1e5/0x270
? user_path_at+0x48/0x60
do_mount+0x99/0xe0
? __pfx_do_mount+0x10/0x10
? lock_release+0x155/0x190
__do_sys_mount+0x141/0x180
do_syscall_64+0x9f/0x100
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f01b1b14f3e
Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
</TASK>

Allocated by task 13305:
stack_trace_save+0x8c/0xc0
kasan_save_stack+0x1e/0x40
kasan_save_track+0x10/0x30
__kasan_kmalloc+0x3a/0x50
__kmalloc_noprof+0x247/0x290
ceph_osdmap_alloc+0x16/0x130
ceph_osdc_init+0x27a/0x4c0
ceph_create_client+0x153/0x190
create_fs_client+0x50/0x2a0
ceph_get_tree+0xff/0x680
vfs_get_tree+0x49/0x180
do_new_mount+0x1a3/0x2d0
path_mount+0x6dd/0x730
do_mount+0x99/0xe0
__do_sys_mount+0x141/0x180
do_syscall_64+0x9f/0x100
entry_SYSCALL_64_after_hwframe+0x76/0x7e

Freed by task 9475:
stack_trace_save+0x8c/0xc0
kasan_save_stack+0x1e/0x40
kasan_save_track+0x10/0x30
kasan_save_free_info+0x3b/0x50
__kasan_slab_free+0x18/0x30
kfree+0x212/0x290
handle_one_map+0x23c/0x3b0
ceph_osdc_handle_map+0x3c9/0x590
mon_dispatch+0x655/0x6f0
ceph_con_process_message+0xc3/0xe0
ceph_con_v1_try_read+0x614/0x760
ceph_con_workfn+0x2de/0x650
process_one_work+0x486/0x7c0
process_scheduled_works+0x73/0x90
worker_thread+0x1c8/0x2a0
kthread+0x2ec/0x300
ret_from_fork+0x24/0x40
ret_from_fork_asm+0x1a/0x30

The buggy address belongs to the object at ffff88811012d800
which belongs to the cache kmalloc-512 of size 512
The buggy address is located 16 bytes inside of
freed 512-byte region [ffff88811012d800, ffff88811012da00)

The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c
head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
flags: 0x200000000000040(head|node=0|zone=2)
page_type: f5(slab)
raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000
head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc

    ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

^
ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ==================================================================
Disabling lock debugging due to kernel taint
libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
libceph: mon0 (1)90.155.74.19:6789 session established
libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc

We have the following scenario:

Thread 1:
void ceph_osdmap_destroy(...) {
    <skipped>
    kfree(map);
}
Thread 1 sleep...

Thread 2:
static bool have_mon_and_osd_map(struct ceph_client *client) {
    return client->monc.monmap && client->monc.monmap->epoch &&
        client->osdc.osdmap && client->osdc.osdmap->epoch;
}
Thread 2 has oops...

Thread 1 wake up:
static int handle_one_map(...) {
    <skipped>
    osdc->osdmap = newmap;
    <skipped>
}

This patch introduces a have_mon_and_osd_map atomic_t
field in struct ceph_client. If there are no OSD and
monitor maps, then client->have_mon_and_osd_map
is equal to zero. The OSD and monitor map initialization
results in incrementing client->have_mon_and_osd_map
under the lock. As a result, the have_mon_and_osd_map() function
now simply checks that client->have_mon_and_osd_map is equal to
CEPH_CLIENT_HAS_MON_AND_OSD_MAP.

The patch adds locking in the ceph_osdc_stop()
method during the destruction of osdc->osdmap and
the assignment of NULL to the pointer. The lock is likewise
used in ceph_monc_stop() during the freeing of monc->monmap
and the assignment of NULL to the pointer. The monmap_show()
and osdmap_show() methods were reworked to prevent
a potential race condition during calls to these methods.

Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
---
 include/linux/ceph/libceph.h | 20 ++++++++++++++++++++
 net/ceph/ceph_common.c       |  6 ++++--
 net/ceph/debugfs.c           | 17 +++++++++++++----
 net/ceph/mon_client.c        | 18 +++++++++++++++++-
 net/ceph/osd_client.c        | 11 +++++++++++
 5 files changed, 65 insertions(+), 7 deletions(-)

Comments

Ilya Dryomov June 11, 2025, 11:22 a.m. UTC | #1
On Tue, Apr 15, 2025 at 12:10 AM Viacheslav Dubeyko <slava@dubeyko.com> wrote:
>
> From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
>
> The generic/395 and generic/397 is capable of generating
> the oops is on line net/ceph/ceph_common.c:794 with
> KASAN enabled.
>
> BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
> Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305
>
> CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-build2+ #1266
> Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x57/0x80
> ? have_mon_and_osd_map+0x56/0x70
> print_address_description.constprop.0+0x84/0x330
> ? have_mon_and_osd_map+0x56/0x70
> print_report+0xe2/0x1e0
> ? rcu_read_unlock_sched+0x60/0x80
> ? kmem_cache_debug_flags+0xc/0x20
> ? fixup_red_left+0x17/0x30
> ? have_mon_and_osd_map+0x56/0x70
> kasan_report+0x8d/0xc0
> ? have_mon_and_osd_map+0x56/0x70
> have_mon_and_osd_map+0x56/0x70
> ceph_open_session+0x182/0x290
> ? __pfx_ceph_open_session+0x10/0x10
> ? __init_swait_queue_head+0x8d/0xa0
> ? __pfx_autoremove_wake_function+0x10/0x10
> ? shrinker_register+0xdd/0xf0
> ceph_get_tree+0x333/0x680
> vfs_get_tree+0x49/0x180
> do_new_mount+0x1a3/0x2d0
> ? __pfx_do_new_mount+0x10/0x10
> ? security_capable+0x39/0x70
> path_mount+0x6dd/0x730
> ? __pfx_path_mount+0x10/0x10
> ? kmem_cache_free+0x1e5/0x270
> ? user_path_at+0x48/0x60
> do_mount+0x99/0xe0
> ? __pfx_do_mount+0x10/0x10
> ? lock_release+0x155/0x190
> __do_sys_mount+0x141/0x180
> do_syscall_64+0x9f/0x100
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> RIP: 0033:0x7f01b1b14f3e
> Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
> RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
> RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
> RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
> RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
> R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
> R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
> </TASK>
>
> Allocated by task 13305:
> stack_trace_save+0x8c/0xc0
> kasan_save_stack+0x1e/0x40
> kasan_save_track+0x10/0x30
> __kasan_kmalloc+0x3a/0x50
> __kmalloc_noprof+0x247/0x290
> ceph_osdmap_alloc+0x16/0x130
> ceph_osdc_init+0x27a/0x4c0
> ceph_create_client+0x153/0x190
> create_fs_client+0x50/0x2a0
> ceph_get_tree+0xff/0x680
> vfs_get_tree+0x49/0x180
> do_new_mount+0x1a3/0x2d0
> path_mount+0x6dd/0x730
> do_mount+0x99/0xe0
> __do_sys_mount+0x141/0x180
> do_syscall_64+0x9f/0x100
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Freed by task 9475:
> stack_trace_save+0x8c/0xc0
> kasan_save_stack+0x1e/0x40
> kasan_save_track+0x10/0x30
> kasan_save_free_info+0x3b/0x50
> __kasan_slab_free+0x18/0x30
> kfree+0x212/0x290
> handle_one_map+0x23c/0x3b0
> ceph_osdc_handle_map+0x3c9/0x590
> mon_dispatch+0x655/0x6f0
> ceph_con_process_message+0xc3/0xe0
> ceph_con_v1_try_read+0x614/0x760
> ceph_con_workfn+0x2de/0x650
> process_one_work+0x486/0x7c0
> process_scheduled_works+0x73/0x90
> worker_thread+0x1c8/0x2a0
> kthread+0x2ec/0x300
> ret_from_fork+0x24/0x40
> ret_from_fork_asm+0x1a/0x30
>
> The buggy address belongs to the object at ffff88811012d800
> which belongs to the cache kmalloc-512 of size 512
> The buggy address is located 16 bytes inside of
> freed 512-byte region [ffff88811012d800, ffff88811012da00)
>
> The buggy address belongs to the physical page:
> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11012c
> head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
> flags: 0x200000000000040(head|node=0|zone=2)
> page_type: f5(slab)
> raw: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
> raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
> head: 0200000000000040 ffff888100042c80 dead000000000100 dead000000000122
> head: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000
> head: 0200000000000002 ffffea0004404b01 ffffffffffffffff 0000000000000000
> head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
> ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>
>     ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>
> ^
> ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ==================================================================
> Disabling lock debugging due to kernel taint
> libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> libceph: mon0 (1)90.155.74.19:6789 session established
> libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc
>
> We have such scenario:
>
> Thread 1:
> void ceph_osdmap_destroy(...) {
>     <skipped>
>     kfree(map);
> }
> Thread 1 sleep...
>
> Thread 2:
> static bool have_mon_and_osd_map(struct ceph_client *client) {
>     return client->monc.monmap && client->monc.monmap->epoch &&
>         client->osdc.osdmap && client->osdc.osdmap->epoch;
> }
> Thread 2 has oops...
>
> Thread 1 wake up:
> static int handle_one_map(...) {
>     <skipped>
>     osdc->osdmap = newmap;
>     <skipped>
> }
>
> This patch introduces a have_mon_and_osd_map atomic_t
> field in struct ceph_client. If there is no OSD and
> monitor maps, then the client->have_mon_and_osd_map
> is equal to zero. The OSD and monitor maps initialization
> results in incrementing of client->have_mon_and_osd_map
> under the lock. As a result, have_mon_and_osd_map() function
> simply checks now that client->have_mon_and_osd_map is equal to
> CEPH_CLIENT_HAS_MON_AND_OSD_MAP.
>
> Patch adds locking in the ceph_osdc_stop()
> method during the destructruction of osdc->osdmap and
> assigning of NULL to the pointer. The lock is used
> in the ceph_monc_stop() during the freeing of monc->monmap
> and assigning NULL to the pointer too. The monmap_show()
> and osdmap_show() methods were reworked to prevent
> the potential race condition during the methods call.
>
> Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> ---
>  include/linux/ceph/libceph.h | 20 ++++++++++++++++++++
>  net/ceph/ceph_common.c       |  6 ++++--
>  net/ceph/debugfs.c           | 17 +++++++++++++----
>  net/ceph/mon_client.c        | 18 +++++++++++++++++-
>  net/ceph/osd_client.c        | 11 +++++++++++
>  5 files changed, 65 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 733e7f93db66..f5694bf5bd54 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -132,6 +132,7 @@ struct ceph_client {
>         struct ceph_messenger msgr;   /* messenger instance */
>         struct ceph_mon_client monc;
>         struct ceph_osd_client osdc;
> +       atomic_t have_mon_and_osd_map;
>
>  #ifdef CONFIG_DEBUG_FS
>         struct dentry *debugfs_dir;
> @@ -141,6 +142,25 @@ struct ceph_client {
>  #endif
>  };
>
> +/*
> + * The have_mon_and_osd_map possible states
> + */
> +enum {
> +       CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP = 0,
> +       CEPH_CLIENT_HAS_ONLY_ONE_MAP = 1,
> +       CEPH_CLIENT_HAS_MON_AND_OSD_MAP = 2,
> +       CEPH_CLIENT_MAP_STATE_UNKNOWN
> +};
> +
> +static inline
> +bool is_mon_and_osd_map_state_invalid(struct ceph_client *client)
> +{
> +       int have_mon_and_osd_map = atomic_read(&client->have_mon_and_osd_map);
> +
> +       return have_mon_and_osd_map < CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP ||
> +               have_mon_and_osd_map >= CEPH_CLIENT_MAP_STATE_UNKNOWN;
> +}
> +
>  #define from_msgr(ms)  container_of(ms, struct ceph_client, msgr)
>
>  static inline bool ceph_msgr2(struct ceph_client *client)
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index 4c6441536d55..62efceb3b19d 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -723,6 +723,8 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
>
>         mutex_init(&client->mount_mutex);
>         init_waitqueue_head(&client->auth_wq);
> +       atomic_set(&client->have_mon_and_osd_map,
> +                  CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP);
>         client->auth_err = 0;
>
>         client->extra_mon_dispatch = NULL;
> @@ -790,8 +792,8 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
>   */
>  static bool have_mon_and_osd_map(struct ceph_client *client)
>  {
> -       return client->monc.monmap && client->monc.monmap->epoch &&
> -              client->osdc.osdmap && client->osdc.osdmap->epoch;
> +       return atomic_read(&client->have_mon_and_osd_map) ==
> +                               CEPH_CLIENT_HAS_MON_AND_OSD_MAP;
>  }
>
>  /*
> diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> index 2110439f8a24..7b45c169a859 100644
> --- a/net/ceph/debugfs.c
> +++ b/net/ceph/debugfs.c
> @@ -36,8 +36,10 @@ static int monmap_show(struct seq_file *s, void *p)
>         int i;
>         struct ceph_client *client = s->private;
>
> +       mutex_lock(&client->monc.mutex);
> +
>         if (client->monc.monmap == NULL)
> -               return 0;
> +               goto out_unlock;
>
>         seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
>         for (i = 0; i < client->monc.monmap->num_mon; i++) {
> @@ -48,6 +50,10 @@ static int monmap_show(struct seq_file *s, void *p)
>                            ENTITY_NAME(inst->name),
>                            ceph_pr_addr(&inst->addr));
>         }
> +
> +out_unlock:
> +       mutex_unlock(&client->monc.mutex);
> +
>         return 0;
>  }
>
> @@ -56,13 +62,15 @@ static int osdmap_show(struct seq_file *s, void *p)
>         int i;
>         struct ceph_client *client = s->private;
>         struct ceph_osd_client *osdc = &client->osdc;
> -       struct ceph_osdmap *map = osdc->osdmap;
> +       struct ceph_osdmap *map = NULL;
>         struct rb_node *n;
>
> +       down_read(&osdc->lock);
> +
> +       map = osdc->osdmap;
>         if (map == NULL)
> -               return 0;
> +               goto out_unlock;
>
> -       down_read(&osdc->lock);
>         seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
>                         osdc->epoch_barrier, map->flags);
>
> @@ -131,6 +139,7 @@ static int osdmap_show(struct seq_file *s, void *p)
>                 seq_printf(s, "]\n");
>         }
>
> +out_unlock:
>         up_read(&osdc->lock);
>         return 0;
>  }
> diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> index ab66b599ac47..5cf802236426 100644
> --- a/net/ceph/mon_client.c
> +++ b/net/ceph/mon_client.c
> @@ -562,12 +562,16 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
>                 goto out;
>         }
>
> +       atomic_dec(&client->have_mon_and_osd_map);
> +
>         kfree(monc->monmap);
>         monc->monmap = monmap;
>
>         __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
>         client->have_fsid = true;
>
> +       atomic_inc(&client->have_mon_and_osd_map);
> +
>  out:
>         mutex_unlock(&monc->mutex);
>         wake_up_all(&client->auth_wq);
> @@ -1220,6 +1224,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
>
>         monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
>
> +       atomic_inc(&monc->client->have_mon_and_osd_map);

Hi Slava,

Incrementing client->have_mon_and_osd_map here and in ceph_osdc_init()
means that counter would be set to 2 (CEPH_CLIENT_HAS_MON_AND_OSD_MAP)
at the initialization time, way before a session with the monitor is
established and any map is received.  This effectively disables the
wait logic in __ceph_open_session() because of have_mon_and_osd_map()
immediately returning true.  __ceph_open_session() is responsible for
setting up the debugfs directory and that is affected too: it's created
as 00000000-0000-0000-0000-000000000000.client0 because neither the
cluster FSID nor the client ID is known without the monmap.

This patch seems to be over-complicated for what it needs to do:
I don't see a compelling reason for introducing the atomic and as
mentioned before there is no need to attempt to guard against someone
continuing to use the client after ceph_osdc_stop() and ceph_monc_stop()
are called.  It's the point of no return and the client itself gets
freed very shortly after.

Why not just open-code the wait loop in __ceph_open_session() to allow
for monc->mutex and osdc->lock (for read) to be taken freely?  It should
be a small change in __ceph_open_session() -- net/ceph/mon_client.c and
net/ceph/osd_client.c wouldn't need to be touched at all.

Thanks,

                Ilya

> +       WARN_ON(is_mon_and_osd_map_state_invalid(monc->client));
> +
>         return 0;
>
>  out_auth_reply:
> @@ -1232,6 +1239,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
>         ceph_auth_destroy(monc->auth);
>  out_monmap:
>         kfree(monc->monmap);
> +       monc->monmap = NULL;
>  out:
>         return err;
>  }
> @@ -1239,6 +1247,8 @@ EXPORT_SYMBOL(ceph_monc_init);
>
>  void ceph_monc_stop(struct ceph_mon_client *monc)
>  {
> +       struct ceph_monmap *old_monmap;
> +
>         dout("stop\n");
>
>         mutex_lock(&monc->mutex);
> @@ -1266,7 +1276,13 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
>         ceph_msg_put(monc->m_subscribe);
>         ceph_msg_put(monc->m_subscribe_ack);
>
> -       kfree(monc->monmap);
> +       mutex_lock(&monc->mutex);
> +       WARN_ON(is_mon_and_osd_map_state_invalid(monc->client));
> +       atomic_dec(&monc->client->have_mon_and_osd_map);
> +       old_monmap = monc->monmap;
> +       monc->monmap = NULL;
> +       mutex_unlock(&monc->mutex);
> +       kfree(old_monmap);
>  }
>  EXPORT_SYMBOL(ceph_monc_stop);
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index b24afec24138..14a91603bf6d 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -4068,8 +4068,10 @@ static int handle_one_map(struct ceph_osd_client *osdc,
>                         skipped_map = true;
>                 }
>
> +               atomic_dec(&osdc->client->have_mon_and_osd_map);
>                 ceph_osdmap_destroy(osdc->osdmap);
>                 osdc->osdmap = newmap;
> +               atomic_inc(&osdc->client->have_mon_and_osd_map);
>         }
>
>         was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
> @@ -5266,6 +5268,9 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>         schedule_delayed_work(&osdc->osds_timeout_work,
>             round_jiffies_relative(osdc->client->options->osd_idle_ttl));
>
> +       atomic_inc(&osdc->client->have_mon_and_osd_map);
> +       WARN_ON(is_mon_and_osd_map_state_invalid(osdc->client));
> +
>         return 0;
>
>  out_notify_wq:
> @@ -5278,6 +5283,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>         mempool_destroy(osdc->req_mempool);
>  out_map:
>         ceph_osdmap_destroy(osdc->osdmap);
> +       osdc->osdmap = NULL;
>  out:
>         return err;
>  }
> @@ -5306,10 +5312,15 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
>         WARN_ON(atomic_read(&osdc->num_requests));
>         WARN_ON(atomic_read(&osdc->num_homeless));
>
> +       down_write(&osdc->lock);
> +       WARN_ON(is_mon_and_osd_map_state_invalid(osdc->client));
> +       atomic_dec(&osdc->client->have_mon_and_osd_map);
>         ceph_osdmap_destroy(osdc->osdmap);
> +       osdc->osdmap = NULL;
>         mempool_destroy(osdc->req_mempool);
>         ceph_msgpool_destroy(&osdc->msgpool_op);
>         ceph_msgpool_destroy(&osdc->msgpool_op_reply);
> +       up_write(&osdc->lock);
>  }
>
>  int osd_req_op_copy_from_init(struct ceph_osd_request *req,
> --
> 2.48.0
>
Viacheslav Dubeyko June 11, 2025, 11:14 p.m. UTC | #2
On Wed, 2025-06-11 at 13:22 +0200, Ilya Dryomov wrote:
> On Tue, Apr 15, 2025 at 12:10 AM Viacheslav Dubeyko
> <slava@dubeyko.com> wrote:
> > 
> > From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > 
> > The generic/395 and generic/397 is capable of generating
> > the oops is on line net/ceph/ceph_common.c:794 with
> > KASAN enabled.
> > 
> > BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
> > Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305
> > 
> > CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-
> > build2+ #1266
> > Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
> > Call Trace:
> > <TASK>
> > dump_stack_lvl+0x57/0x80
> > ? have_mon_and_osd_map+0x56/0x70
> > print_address_description.constprop.0+0x84/0x330
> > ? have_mon_and_osd_map+0x56/0x70
> > print_report+0xe2/0x1e0
> > ? rcu_read_unlock_sched+0x60/0x80
> > ? kmem_cache_debug_flags+0xc/0x20
> > ? fixup_red_left+0x17/0x30
> > ? have_mon_and_osd_map+0x56/0x70
> > kasan_report+0x8d/0xc0
> > ? have_mon_and_osd_map+0x56/0x70
> > have_mon_and_osd_map+0x56/0x70
> > ceph_open_session+0x182/0x290
> > ? __pfx_ceph_open_session+0x10/0x10
> > ? __init_swait_queue_head+0x8d/0xa0
> > ? __pfx_autoremove_wake_function+0x10/0x10
> > ? shrinker_register+0xdd/0xf0
> > ceph_get_tree+0x333/0x680
> > vfs_get_tree+0x49/0x180
> > do_new_mount+0x1a3/0x2d0
> > ? __pfx_do_new_mount+0x10/0x10
> > ? security_capable+0x39/0x70
> > path_mount+0x6dd/0x730
> > ? __pfx_path_mount+0x10/0x10
> > ? kmem_cache_free+0x1e5/0x270
> > ? user_path_at+0x48/0x60
> > do_mount+0x99/0xe0
> > ? __pfx_do_mount+0x10/0x10
> > ? lock_release+0x155/0x190
> > __do_sys_mount+0x141/0x180
> > do_syscall_64+0x9f/0x100
> > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > RIP: 0033:0x7f01b1b14f3e
> > Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f
> > 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05
> > <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
> > RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX:
> > 00000000000000a5
> > RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
> > RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
> > RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
> > R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
> > R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
> > </TASK>
> > 
> > Allocated by task 13305:
> > stack_trace_save+0x8c/0xc0
> > kasan_save_stack+0x1e/0x40
> > kasan_save_track+0x10/0x30
> > __kasan_kmalloc+0x3a/0x50
> > __kmalloc_noprof+0x247/0x290
> > ceph_osdmap_alloc+0x16/0x130
> > ceph_osdc_init+0x27a/0x4c0
> > ceph_create_client+0x153/0x190
> > create_fs_client+0x50/0x2a0
> > ceph_get_tree+0xff/0x680
> > vfs_get_tree+0x49/0x180
> > do_new_mount+0x1a3/0x2d0
> > path_mount+0x6dd/0x730
> > do_mount+0x99/0xe0
> > __do_sys_mount+0x141/0x180
> > do_syscall_64+0x9f/0x100
> > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > 
> > Freed by task 9475:
> > stack_trace_save+0x8c/0xc0
> > kasan_save_stack+0x1e/0x40
> > kasan_save_track+0x10/0x30
> > kasan_save_free_info+0x3b/0x50
> > __kasan_slab_free+0x18/0x30
> > kfree+0x212/0x290
> > handle_one_map+0x23c/0x3b0
> > ceph_osdc_handle_map+0x3c9/0x590
> > mon_dispatch+0x655/0x6f0
> > ceph_con_process_message+0xc3/0xe0
> > ceph_con_v1_try_read+0x614/0x760
> > ceph_con_workfn+0x2de/0x650
> > process_one_work+0x486/0x7c0
> > process_scheduled_works+0x73/0x90
> > worker_thread+0x1c8/0x2a0
> > kthread+0x2ec/0x300
> > ret_from_fork+0x24/0x40
> > ret_from_fork_asm+0x1a/0x30
> > 
> > The buggy address belongs to the object at ffff88811012d800
> > which belongs to the cache kmalloc-512 of size 512
> > The buggy address is located 16 bytes inside of
> > freed 512-byte region [ffff88811012d800, ffff88811012da00)
> > 
> > The buggy address belongs to the physical page:
> > page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0
> > pfn:0x11012c
> > head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0
> > pincount:0
> > flags: 0x200000000000040(head|node=0|zone=2)
> > page_type: f5(slab)
> > raw: 0200000000000040 ffff888100042c80 dead000000000100
> > dead000000000122
> > raw: 0000000000000000 0000000080100010 00000000f5000000
> > 0000000000000000
> > head: 0200000000000040 ffff888100042c80 dead000000000100
> > dead000000000122
> > head: 0000000000000000 0000000080100010 00000000f5000000
> > 0000000000000000
> > head: 0200000000000002 ffffea0004404b01 ffffffffffffffff
> > 0000000000000000
> > head: 0000000000000004 0000000000000000 00000000ffffffff
> > 0000000000000000
> > page dumped because: kasan: bad access detected
> > 
> > Memory state around the buggy address:
> > ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > 
> >     ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > fb
> > 
> > ^
> > ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ==================================================================
> > Disabling lock debugging due to kernel taint
> > libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > libceph: mon0 (1)90.155.74.19:6789 session established
> > libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > 
> > We have such scenario:
> > 
> > Thread 1:
> > void ceph_osdmap_destroy(...) {
> >     <skipped>
> >     kfree(map);
> > }
> > Thread 1 sleep...
> > 
> > Thread 2:
> > static bool have_mon_and_osd_map(struct ceph_client *client) {
> >     return client->monc.monmap && client->monc.monmap->epoch &&
> >         client->osdc.osdmap && client->osdc.osdmap->epoch;
> > }
> > Thread 2 has oops...
> > 
> > Thread 1 wake up:
> > static int handle_one_map(...) {
> >     <skipped>
> >     osdc->osdmap = newmap;
> >     <skipped>
> > }
> > 
> > This patch introduces a have_mon_and_osd_map atomic_t
> > field in struct ceph_client. If there is no OSD and
> > monitor maps, then the client->have_mon_and_osd_map
> > is equal to zero. The OSD and monitor maps initialization
> > results in incrementing of client->have_mon_and_osd_map
> > under the lock. As a result, have_mon_and_osd_map() function
> > simply checks now that client->have_mon_and_osd_map is equal to
> > CEPH_CLIENT_HAS_MON_AND_OSD_MAP.
> > 
> > Patch adds locking in the ceph_osdc_stop()
> > method during the destructruction of osdc->osdmap and
> > assigning of NULL to the pointer. The lock is used
> > in the ceph_monc_stop() during the freeing of monc->monmap
> > and assigning NULL to the pointer too. The monmap_show()
> > and osdmap_show() methods were reworked to prevent
> > the potential race condition during the methods call.
> > 
> > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > ---
> >  include/linux/ceph/libceph.h | 20 ++++++++++++++++++++
> >  net/ceph/ceph_common.c       |  6 ++++--
> >  net/ceph/debugfs.c           | 17 +++++++++++++----
> >  net/ceph/mon_client.c        | 18 +++++++++++++++++-
> >  net/ceph/osd_client.c        | 11 +++++++++++
> >  5 files changed, 65 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/linux/ceph/libceph.h
> > b/include/linux/ceph/libceph.h
> > index 733e7f93db66..f5694bf5bd54 100644
> > --- a/include/linux/ceph/libceph.h
> > +++ b/include/linux/ceph/libceph.h
> > @@ -132,6 +132,7 @@ struct ceph_client {
> >         struct ceph_messenger msgr;   /* messenger instance */
> >         struct ceph_mon_client monc;
> >         struct ceph_osd_client osdc;
> > +       atomic_t have_mon_and_osd_map;
> > 
> >  #ifdef CONFIG_DEBUG_FS
> >         struct dentry *debugfs_dir;
> > @@ -141,6 +142,25 @@ struct ceph_client {
> >  #endif
> >  };
> > 
> > +/*
> > + * The have_mon_and_osd_map possible states
> > + */
> > +enum {
> > +       CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP = 0,
> > +       CEPH_CLIENT_HAS_ONLY_ONE_MAP = 1,
> > +       CEPH_CLIENT_HAS_MON_AND_OSD_MAP = 2,
> > +       CEPH_CLIENT_MAP_STATE_UNKNOWN
> > +};
> > +
> > +static inline
> > +bool is_mon_and_osd_map_state_invalid(struct ceph_client *client)
> > +{
> > +       int have_mon_and_osd_map = atomic_read(&client-
> > >have_mon_and_osd_map);
> > +
> > +       return have_mon_and_osd_map <
> > CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP ||
> > +               have_mon_and_osd_map >=
> > CEPH_CLIENT_MAP_STATE_UNKNOWN;
> > +}
> > +
> >  #define from_msgr(ms)  container_of(ms, struct ceph_client, msgr)
> > 
> >  static inline bool ceph_msgr2(struct ceph_client *client)
> > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> > index 4c6441536d55..62efceb3b19d 100644
> > --- a/net/ceph/ceph_common.c
> > +++ b/net/ceph/ceph_common.c
> > @@ -723,6 +723,8 @@ struct ceph_client *ceph_create_client(struct
> > ceph_options *opt, void *private)
> > 
> >         mutex_init(&client->mount_mutex);
> >         init_waitqueue_head(&client->auth_wq);
> > +       atomic_set(&client->have_mon_and_osd_map,
> > +                  CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP);
> >         client->auth_err = 0;
> > 
> >         client->extra_mon_dispatch = NULL;
> > @@ -790,8 +792,8 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
> >   */
> >  static bool have_mon_and_osd_map(struct ceph_client *client)
> >  {
> > -       return client->monc.monmap && client->monc.monmap->epoch &&
> > -              client->osdc.osdmap && client->osdc.osdmap->epoch;
> > +       return atomic_read(&client->have_mon_and_osd_map) ==
> > +                               CEPH_CLIENT_HAS_MON_AND_OSD_MAP;
> >  }
> > 
> >  /*
> > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> > index 2110439f8a24..7b45c169a859 100644
> > --- a/net/ceph/debugfs.c
> > +++ b/net/ceph/debugfs.c
> > @@ -36,8 +36,10 @@ static int monmap_show(struct seq_file *s, void
> > *p)
> >         int i;
> >         struct ceph_client *client = s->private;
> > 
> > +       mutex_lock(&client->monc.mutex);
> > +
> >         if (client->monc.monmap == NULL)
> > -               return 0;
> > +               goto out_unlock;
> > 
> >         seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
> >         for (i = 0; i < client->monc.monmap->num_mon; i++) {
> > @@ -48,6 +50,10 @@ static int monmap_show(struct seq_file *s, void
> > *p)
> >                            ENTITY_NAME(inst->name),
> >                            ceph_pr_addr(&inst->addr));
> >         }
> > +
> > +out_unlock:
> > +       mutex_unlock(&client->monc.mutex);
> > +
> >         return 0;
> >  }
> > 
> > @@ -56,13 +62,15 @@ static int osdmap_show(struct seq_file *s, void
> > *p)
> >         int i;
> >         struct ceph_client *client = s->private;
> >         struct ceph_osd_client *osdc = &client->osdc;
> > -       struct ceph_osdmap *map = osdc->osdmap;
> > +       struct ceph_osdmap *map = NULL;
> >         struct rb_node *n;
> > 
> > +       down_read(&osdc->lock);
> > +
> > +       map = osdc->osdmap;
> >         if (map == NULL)
> > -               return 0;
> > +               goto out_unlock;
> > 
> > -       down_read(&osdc->lock);
> >         seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map-
> > >epoch,
> >                         osdc->epoch_barrier, map->flags);
> > 
> > @@ -131,6 +139,7 @@ static int osdmap_show(struct seq_file *s, void
> > *p)
> >                 seq_printf(s, "]\n");
> >         }
> > 
> > +out_unlock:
> >         up_read(&osdc->lock);
> >         return 0;
> >  }
> > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> > index ab66b599ac47..5cf802236426 100644
> > --- a/net/ceph/mon_client.c
> > +++ b/net/ceph/mon_client.c
> > @@ -562,12 +562,16 @@ static void ceph_monc_handle_map(struct
> > ceph_mon_client *monc,
> >                 goto out;
> >         }
> > 
> > +       atomic_dec(&client->have_mon_and_osd_map);
> > +
> >         kfree(monc->monmap);
> >         monc->monmap = monmap;
> > 
> >         __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap-
> > >epoch);
> >         client->have_fsid = true;
> > 
> > +       atomic_inc(&client->have_mon_and_osd_map);
> > +
> >  out:
> >         mutex_unlock(&monc->mutex);
> >         wake_up_all(&client->auth_wq);
> > @@ -1220,6 +1224,9 @@ int ceph_monc_init(struct ceph_mon_client
> > *monc, struct ceph_client *cl)
> > 
> >         monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
> > 
> > +       atomic_inc(&monc->client->have_mon_and_osd_map);
> 
> Hi Slava,
> 
> Incrementing client->have_mon_and_osd_map here and in
> ceph_osdc_init()
> means that counter would be set to 2
> (CEPH_CLIENT_HAS_MON_AND_OSD_MAP)
> at the initialization time, way before a session with the monitor is
> established and any map is received.  This effectively disables the
> wait logic in __ceph_open_session() because of have_mon_and_osd_map()
> immediately returning true.  __ceph_open_session() is responsible for
> setting up the debugfs directory and that is affected too: it's
> created
> as 00000000-0000-0000-0000-000000000000.client0 because neither the
> cluster FSID nor the client ID is known without the monmap.
> 
> This patch seems to be over-complicated for what it needs to do:
> I don't see a compelling reason for introducing the atomic and as
> mentioned before there is no need to attempt to guard against someone
> continuing to use the client after ceph_osdc_stop() and
> ceph_monc_stop()
> are called.  It's the point of no return and the client itself gets
> freed very shortly after.
> 
> Why not just open-code the wait loop in __ceph_open_session() to
> allow
> for monc->mutex and osdc->lock (for read) to be taken freely?  It
> should
> be a small change in __ceph_open_session() -- net/ceph/mon_client.c
> and
> net/ceph/osd_client.c wouldn't need to be touched at all.
> 

Hi Ilya,

Frankly speaking, I don't quite follow your point. The main issue
happens when one thread calls ceph_osdc_handle_map() [1] ->
handle_one_map() [2]:

ceph_osdmap_destroy() [3] -> kfree(map) -> go to sleep

<-- another thread is scheduled and executes:
have_mon_and_osd_map() BUT osdc->osdmap has already been freed and is
invalid here!!!

osdc->osdmap = newmap;

So, it's not about ceph_osdc_stop() or ceph_monc_stop() but it's about
regular operations.

I've tried to exclude the necessity to use locks at all in
have_mon_and_osd_map(). Do you mean that a wait loop would be a better
solution? It sounds pretty complicated too, for my taste, and it will
require coordination among threads. No? I am not completely sure that I
follow your vision.

Thanks,
Slava. 

[1]
https://elixir.bootlin.com/linux/v6.15/source/net/ceph/osd_client.c#L4130
[2]
https://elixir.bootlin.com/linux/v6.15/source/net/ceph/osd_client.c#L4048
[3]
https://elixir.bootlin.com/linux/v6.15/source/net/ceph/osdmap.c#L1143


> Thanks,
> 
>                 Ilya
> 
> > +       WARN_ON(is_mon_and_osd_map_state_invalid(monc->client));
> > +
> >         return 0;
> > 
> >  out_auth_reply:
> > @@ -1232,6 +1239,7 @@ int ceph_monc_init(struct ceph_mon_client
> > *monc, struct ceph_client *cl)
> >         ceph_auth_destroy(monc->auth);
> >  out_monmap:
> >         kfree(monc->monmap);
> > +       monc->monmap = NULL;
> >  out:
> >         return err;
> >  }
> > @@ -1239,6 +1247,8 @@ EXPORT_SYMBOL(ceph_monc_init);
> > 
> >  void ceph_monc_stop(struct ceph_mon_client *monc)
> >  {
> > +       struct ceph_monmap *old_monmap;
> > +
> >         dout("stop\n");
> > 
> >         mutex_lock(&monc->mutex);
> > @@ -1266,7 +1276,13 @@ void ceph_monc_stop(struct ceph_mon_client
> > *monc)
> >         ceph_msg_put(monc->m_subscribe);
> >         ceph_msg_put(monc->m_subscribe_ack);
> > 
> > -       kfree(monc->monmap);
> > +       mutex_lock(&monc->mutex);
> > +       WARN_ON(is_mon_and_osd_map_state_invalid(monc->client));
> > +       atomic_dec(&monc->client->have_mon_and_osd_map);
> > +       old_monmap = monc->monmap;
> > +       monc->monmap = NULL;
> > +       mutex_unlock(&monc->mutex);
> > +       kfree(old_monmap);
> >  }
> >  EXPORT_SYMBOL(ceph_monc_stop);
> > 
> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > index b24afec24138..14a91603bf6d 100644
> > --- a/net/ceph/osd_client.c
> > +++ b/net/ceph/osd_client.c
> > @@ -4068,8 +4068,10 @@ static int handle_one_map(struct
> > ceph_osd_client *osdc,
> >                         skipped_map = true;
> >                 }
> > 
> > +               atomic_dec(&osdc->client->have_mon_and_osd_map);
> >                 ceph_osdmap_destroy(osdc->osdmap);
> >                 osdc->osdmap = newmap;
> > +               atomic_inc(&osdc->client->have_mon_and_osd_map);
> >         }
> > 
> >         was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
> > @@ -5266,6 +5268,9 @@ int ceph_osdc_init(struct ceph_osd_client
> > *osdc, struct ceph_client *client)
> >         schedule_delayed_work(&osdc->osds_timeout_work,
> >             round_jiffies_relative(osdc->client->options-
> > >osd_idle_ttl));
> > 
> > +       atomic_inc(&osdc->client->have_mon_and_osd_map);
> > +       WARN_ON(is_mon_and_osd_map_state_invalid(osdc->client));
> > +
> >         return 0;
> > 
> >  out_notify_wq:
> > @@ -5278,6 +5283,7 @@ int ceph_osdc_init(struct ceph_osd_client
> > *osdc, struct ceph_client *client)
> >         mempool_destroy(osdc->req_mempool);
> >  out_map:
> >         ceph_osdmap_destroy(osdc->osdmap);
> > +       osdc->osdmap = NULL;
> >  out:
> >         return err;
> >  }
> > @@ -5306,10 +5312,15 @@ void ceph_osdc_stop(struct ceph_osd_client
> > *osdc)
> >         WARN_ON(atomic_read(&osdc->num_requests));
> >         WARN_ON(atomic_read(&osdc->num_homeless));
> > 
> > +       down_write(&osdc->lock);
> > +       WARN_ON(is_mon_and_osd_map_state_invalid(osdc->client));
> > +       atomic_dec(&osdc->client->have_mon_and_osd_map);
> >         ceph_osdmap_destroy(osdc->osdmap);
> > +       osdc->osdmap = NULL;
> >         mempool_destroy(osdc->req_mempool);
> >         ceph_msgpool_destroy(&osdc->msgpool_op);
> >         ceph_msgpool_destroy(&osdc->msgpool_op_reply);
> > +       up_write(&osdc->lock);
> >  }
> > 
> >  int osd_req_op_copy_from_init(struct ceph_osd_request *req,
> > --
> > 2.48.0
> >
Ilya Dryomov June 12, 2025, 8:20 a.m. UTC | #3
On Thu, Jun 12, 2025 at 1:14 AM Viacheslav Dubeyko <slava@dubeyko.com> wrote:
>
> On Wed, 2025-06-11 at 13:22 +0200, Ilya Dryomov wrote:
> > On Tue, Apr 15, 2025 at 12:10 AM Viacheslav Dubeyko
> > <slava@dubeyko.com> wrote:
> > >
> > > From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > >
> > > The generic/395 and generic/397 is capable of generating
> > > the oops is on line net/ceph/ceph_common.c:794 with
> > > KASAN enabled.
> > >
> > > BUG: KASAN: slab-use-after-free in have_mon_and_osd_map+0x56/0x70
> > > Read of size 4 at addr ffff88811012d810 by task mount.ceph/13305
> > >
> > > CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-rc2-
> > > build2+ #1266
> > > Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
> > > Call Trace:
> > > <TASK>
> > > dump_stack_lvl+0x57/0x80
> > > ? have_mon_and_osd_map+0x56/0x70
> > > print_address_description.constprop.0+0x84/0x330
> > > ? have_mon_and_osd_map+0x56/0x70
> > > print_report+0xe2/0x1e0
> > > ? rcu_read_unlock_sched+0x60/0x80
> > > ? kmem_cache_debug_flags+0xc/0x20
> > > ? fixup_red_left+0x17/0x30
> > > ? have_mon_and_osd_map+0x56/0x70
> > > kasan_report+0x8d/0xc0
> > > ? have_mon_and_osd_map+0x56/0x70
> > > have_mon_and_osd_map+0x56/0x70
> > > ceph_open_session+0x182/0x290
> > > ? __pfx_ceph_open_session+0x10/0x10
> > > ? __init_swait_queue_head+0x8d/0xa0
> > > ? __pfx_autoremove_wake_function+0x10/0x10
> > > ? shrinker_register+0xdd/0xf0
> > > ceph_get_tree+0x333/0x680
> > > vfs_get_tree+0x49/0x180
> > > do_new_mount+0x1a3/0x2d0
> > > ? __pfx_do_new_mount+0x10/0x10
> > > ? security_capable+0x39/0x70
> > > path_mount+0x6dd/0x730
> > > ? __pfx_path_mount+0x10/0x10
> > > ? kmem_cache_free+0x1e5/0x270
> > > ? user_path_at+0x48/0x60
> > > do_mount+0x99/0xe0
> > > ? __pfx_do_mount+0x10/0x10
> > > ? lock_release+0x155/0x190
> > > __do_sys_mount+0x141/0x180
> > > do_syscall_64+0x9f/0x100
> > > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > RIP: 0033:0x7f01b1b14f3e
> > > Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f
> > > 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05
> > > <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89 01 48
> > > RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX:
> > > 00000000000000a5
> > > RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX: 00007f01b1b14f3e
> > > RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI: 0000564ec0147a20
> > > RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09: 0000000000000080
> > > R10: 0000000000000000 R11: 0000000000000246 R12: 00007fffd12a194e
> > > R13: 0000000000000000 R14: 00007fffd129fa50 R15: 00007fffd129fa40
> > > </TASK>
> > >
> > > Allocated by task 13305:
> > > stack_trace_save+0x8c/0xc0
> > > kasan_save_stack+0x1e/0x40
> > > kasan_save_track+0x10/0x30
> > > __kasan_kmalloc+0x3a/0x50
> > > __kmalloc_noprof+0x247/0x290
> > > ceph_osdmap_alloc+0x16/0x130
> > > ceph_osdc_init+0x27a/0x4c0
> > > ceph_create_client+0x153/0x190
> > > create_fs_client+0x50/0x2a0
> > > ceph_get_tree+0xff/0x680
> > > vfs_get_tree+0x49/0x180
> > > do_new_mount+0x1a3/0x2d0
> > > path_mount+0x6dd/0x730
> > > do_mount+0x99/0xe0
> > > __do_sys_mount+0x141/0x180
> > > do_syscall_64+0x9f/0x100
> > > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > >
> > > Freed by task 9475:
> > > stack_trace_save+0x8c/0xc0
> > > kasan_save_stack+0x1e/0x40
> > > kasan_save_track+0x10/0x30
> > > kasan_save_free_info+0x3b/0x50
> > > __kasan_slab_free+0x18/0x30
> > > kfree+0x212/0x290
> > > handle_one_map+0x23c/0x3b0
> > > ceph_osdc_handle_map+0x3c9/0x590
> > > mon_dispatch+0x655/0x6f0
> > > ceph_con_process_message+0xc3/0xe0
> > > ceph_con_v1_try_read+0x614/0x760
> > > ceph_con_workfn+0x2de/0x650
> > > process_one_work+0x486/0x7c0
> > > process_scheduled_works+0x73/0x90
> > > worker_thread+0x1c8/0x2a0
> > > kthread+0x2ec/0x300
> > > ret_from_fork+0x24/0x40
> > > ret_from_fork_asm+0x1a/0x30
> > >
> > > The buggy address belongs to the object at ffff88811012d800
> > > which belongs to the cache kmalloc-512 of size 512
> > > The buggy address is located 16 bytes inside of
> > > freed 512-byte region [ffff88811012d800, ffff88811012da00)
> > >
> > > The buggy address belongs to the physical page:
> > > page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0
> > > pfn:0x11012c
> > > head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0
> > > pincount:0
> > > flags: 0x200000000000040(head|node=0|zone=2)
> > > page_type: f5(slab)
> > > raw: 0200000000000040 ffff888100042c80 dead000000000100
> > > dead000000000122
> > > raw: 0000000000000000 0000000080100010 00000000f5000000
> > > 0000000000000000
> > > head: 0200000000000040 ffff888100042c80 dead000000000100
> > > dead000000000122
> > > head: 0000000000000000 0000000080100010 00000000f5000000
> > > 0000000000000000
> > > head: 0200000000000002 ffffea0004404b01 ffffffffffffffff
> > > 0000000000000000
> > > head: 0000000000000004 0000000000000000 00000000ffffffff
> > > 0000000000000000
> > > page dumped because: kasan: bad access detected
> > >
> > > Memory state around the buggy address:
> > > ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > > ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > >
> > >     ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > > fb
> > >
> > > ^
> > > ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > > ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > > ==================================================================
> > > Disabling lock debugging due to kernel taint
> > > libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > > libceph: mon0 (1)90.155.74.19:6789 session established
> > > libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > >
> > > We have such scenario:
> > >
> > > Thread 1:
> > > void ceph_osdmap_destroy(...) {
> > >     <skipped>
> > >     kfree(map);
> > > }
> > > Thread 1 sleep...
> > >
> > > Thread 2:
> > > static bool have_mon_and_osd_map(struct ceph_client *client) {
> > >     return client->monc.monmap && client->monc.monmap->epoch &&
> > >         client->osdc.osdmap && client->osdc.osdmap->epoch;
> > > }
> > > Thread 2 has oops...
> > >
> > > Thread 1 wake up:
> > > static int handle_one_map(...) {
> > >     <skipped>
> > >     osdc->osdmap = newmap;
> > >     <skipped>
> > > }
> > >
> > > This patch introduces a have_mon_and_osd_map atomic_t
> > > field in struct ceph_client. If there is no OSD and
> > > monitor maps, then the client->have_mon_and_osd_map
> > > is equal to zero. The OSD and monitor maps initialization
> > > results in incrementing of client->have_mon_and_osd_map
> > > under the lock. As a result, have_mon_and_osd_map() function
> > > simply checks now that client->have_mon_and_osd_map is equal to
> > > CEPH_CLIENT_HAS_MON_AND_OSD_MAP.
> > >
> > > Patch adds locking in the ceph_osdc_stop()
> > > method during the destructruction of osdc->osdmap and
> > > assigning of NULL to the pointer. The lock is used
> > > in the ceph_monc_stop() during the freeing of monc->monmap
> > > and assigning NULL to the pointer too. The monmap_show()
> > > and osdmap_show() methods were reworked to prevent
> > > the potential race condition during the methods call.
> > >
> > > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > > ---
> > >  include/linux/ceph/libceph.h | 20 ++++++++++++++++++++
> > >  net/ceph/ceph_common.c       |  6 ++++--
> > >  net/ceph/debugfs.c           | 17 +++++++++++++----
> > >  net/ceph/mon_client.c        | 18 +++++++++++++++++-
> > >  net/ceph/osd_client.c        | 11 +++++++++++
> > >  5 files changed, 65 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/include/linux/ceph/libceph.h
> > > b/include/linux/ceph/libceph.h
> > > index 733e7f93db66..f5694bf5bd54 100644
> > > --- a/include/linux/ceph/libceph.h
> > > +++ b/include/linux/ceph/libceph.h
> > > @@ -132,6 +132,7 @@ struct ceph_client {
> > >         struct ceph_messenger msgr;   /* messenger instance */
> > >         struct ceph_mon_client monc;
> > >         struct ceph_osd_client osdc;
> > > +       atomic_t have_mon_and_osd_map;
> > >
> > >  #ifdef CONFIG_DEBUG_FS
> > >         struct dentry *debugfs_dir;
> > > @@ -141,6 +142,25 @@ struct ceph_client {
> > >  #endif
> > >  };
> > >
> > > +/*
> > > + * The have_mon_and_osd_map possible states
> > > + */
> > > +enum {
> > > +       CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP = 0,
> > > +       CEPH_CLIENT_HAS_ONLY_ONE_MAP = 1,
> > > +       CEPH_CLIENT_HAS_MON_AND_OSD_MAP = 2,
> > > +       CEPH_CLIENT_MAP_STATE_UNKNOWN
> > > +};
> > > +
> > > +static inline
> > > +bool is_mon_and_osd_map_state_invalid(struct ceph_client *client)
> > > +{
> > > +       int have_mon_and_osd_map = atomic_read(&client-
> > > >have_mon_and_osd_map);
> > > +
> > > +       return have_mon_and_osd_map <
> > > CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP ||
> > > +               have_mon_and_osd_map >=
> > > CEPH_CLIENT_MAP_STATE_UNKNOWN;
> > > +}
> > > +
> > >  #define from_msgr(ms)  container_of(ms, struct ceph_client, msgr)
> > >
> > >  static inline bool ceph_msgr2(struct ceph_client *client)
> > > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> > > index 4c6441536d55..62efceb3b19d 100644
> > > --- a/net/ceph/ceph_common.c
> > > +++ b/net/ceph/ceph_common.c
> > > @@ -723,6 +723,8 @@ struct ceph_client *ceph_create_client(struct
> > > ceph_options *opt, void *private)
> > >
> > >         mutex_init(&client->mount_mutex);
> > >         init_waitqueue_head(&client->auth_wq);
> > > +       atomic_set(&client->have_mon_and_osd_map,
> > > +                  CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP);
> > >         client->auth_err = 0;
> > >
> > >         client->extra_mon_dispatch = NULL;
> > > @@ -790,8 +792,8 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
> > >   */
> > >  static bool have_mon_and_osd_map(struct ceph_client *client)
> > >  {
> > > -       return client->monc.monmap && client->monc.monmap->epoch &&
> > > -              client->osdc.osdmap && client->osdc.osdmap->epoch;
> > > +       return atomic_read(&client->have_mon_and_osd_map) ==
> > > +                               CEPH_CLIENT_HAS_MON_AND_OSD_MAP;
> > >  }
> > >
> > >  /*
> > > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> > > index 2110439f8a24..7b45c169a859 100644
> > > --- a/net/ceph/debugfs.c
> > > +++ b/net/ceph/debugfs.c
> > > @@ -36,8 +36,10 @@ static int monmap_show(struct seq_file *s, void
> > > *p)
> > >         int i;
> > >         struct ceph_client *client = s->private;
> > >
> > > +       mutex_lock(&client->monc.mutex);
> > > +
> > >         if (client->monc.monmap == NULL)
> > > -               return 0;
> > > +               goto out_unlock;
> > >
> > >         seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
> > >         for (i = 0; i < client->monc.monmap->num_mon; i++) {
> > > @@ -48,6 +50,10 @@ static int monmap_show(struct seq_file *s, void
> > > *p)
> > >                            ENTITY_NAME(inst->name),
> > >                            ceph_pr_addr(&inst->addr));
> > >         }
> > > +
> > > +out_unlock:
> > > +       mutex_unlock(&client->monc.mutex);
> > > +
> > >         return 0;
> > >  }
> > >
> > > @@ -56,13 +62,15 @@ static int osdmap_show(struct seq_file *s, void
> > > *p)
> > >         int i;
> > >         struct ceph_client *client = s->private;
> > >         struct ceph_osd_client *osdc = &client->osdc;
> > > -       struct ceph_osdmap *map = osdc->osdmap;
> > > +       struct ceph_osdmap *map = NULL;
> > >         struct rb_node *n;
> > >
> > > +       down_read(&osdc->lock);
> > > +
> > > +       map = osdc->osdmap;
> > >         if (map == NULL)
> > > -               return 0;
> > > +               goto out_unlock;
> > >
> > > -       down_read(&osdc->lock);
> > >         seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map-
> > > >epoch,
> > >                         osdc->epoch_barrier, map->flags);
> > >
> > > @@ -131,6 +139,7 @@ static int osdmap_show(struct seq_file *s, void
> > > *p)
> > >                 seq_printf(s, "]\n");
> > >         }
> > >
> > > +out_unlock:
> > >         up_read(&osdc->lock);
> > >         return 0;
> > >  }
> > > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> > > index ab66b599ac47..5cf802236426 100644
> > > --- a/net/ceph/mon_client.c
> > > +++ b/net/ceph/mon_client.c
> > > @@ -562,12 +562,16 @@ static void ceph_monc_handle_map(struct
> > > ceph_mon_client *monc,
> > >                 goto out;
> > >         }
> > >
> > > +       atomic_dec(&client->have_mon_and_osd_map);
> > > +
> > >         kfree(monc->monmap);
> > >         monc->monmap = monmap;
> > >
> > >         __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap-
> > > >epoch);
> > >         client->have_fsid = true;
> > >
> > > +       atomic_inc(&client->have_mon_and_osd_map);
> > > +
> > >  out:
> > >         mutex_unlock(&monc->mutex);
> > >         wake_up_all(&client->auth_wq);
> > > @@ -1220,6 +1224,9 @@ int ceph_monc_init(struct ceph_mon_client
> > > *monc, struct ceph_client *cl)
> > >
> > >         monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
> > >
> > > +       atomic_inc(&monc->client->have_mon_and_osd_map);
> >
> > Hi Slava,
> >
> > Incrementing client->have_mon_and_osd_map here and in
> > ceph_osdc_init()
> > means that counter would be set to 2
> > (CEPH_CLIENT_HAS_MON_AND_OSD_MAP)
> > at the initialization time, way before a session with the monitor is
> > established and any map is received.  This effectively disables the
> > wait logic in __ceph_open_session() because of have_mon_and_osd_map()
> > immediately returning true.  __ceph_open_session() is responsible for
> > setting up the debugfs directory and that is affected too: it's
> > created
> > as 00000000-0000-0000-0000-000000000000.client0 because neither the
> > cluster FSID nor the client ID is known without the monmap.
> >
> > This patch seems to be over-complicated for what it needs to do:
> > I don't see a compelling reason for introducing the atomic and as
> > mentioned before there is no need to attempt to guard against someone
> > continuing to use the client after ceph_osdc_stop() and
> > ceph_monc_stop()
> > are called.  It's the point of no return and the client itself gets
> > freed very shortly after.
> >
> > Why not just open-code the wait loop in __ceph_open_session() to
> > allow
> > for monc->mutex and osdc->lock (for read) to be taken freely?  It
> > should
> > be a small change in __ceph_open_session() -- net/ceph/mon_client.c
> > and
> > net/ceph/osd_client.c wouldn't need to be touched at all.
> >
>
> Hi Ilya,
>
> Frankly speaking, I don't quite follow to your point. The main issue
> happens when one thread calls ceph_osdc_handle_map() [1] ->
> handle_one_map() [2]:
>
> ceph_osdmap_destroy() [3] -> kfree(map) -> go to sleep
>
> <-- another thread receives time slices to execute:
> have_mon_and_osd_map() BUT osdc->osdmap is already freed and invalid
> here!!!
>
> osdc->osdmap = newmap;
>
> So, it's not about ceph_osdc_stop() or ceph_monc_stop() but it's about
> regular operations.

I know, but on top of the regular operations (to be precise, one
regular operation -- __ceph_open_session()) the current patch also
tries to harden ceph_osdc_stop() and ceph_monc_stop().  I wanted to
reiterate that it's not needed.

>
> I've tried to exclude the necessity to use locks at all in
> have_mon_and_osd_map(). Do you mean that wait loop will be better
> solution?

Yes, it seems preferable over an otherwise redundant (i.e. not used for
anything else) atomic which turned out to be tricky enough to get right
on the first try.

> It sounds pretty complicated too for my taste and it will
> require coordination among threads. No? I am not completely sure that I
> follow to your vision.

With the help of woken_wake_function() primitive it shouldn't be
complicated at all.  The diff would be limited to __ceph_open_session()
and I would expect it to be on par with the current patch.  Making it
possible to freely take locks there would also squash another related
buglet: client->auth_err shouldn't be accessed outside of monc->mutex
either.  Being just an int, it's not complained about by KASAN ;)

Since __ceph_open_session() is the only user of have_mon_and_osd_map()
it could be open-coded inside of the wait loop.

Thanks,

                Ilya
Viacheslav Dubeyko June 12, 2025, 7:50 p.m. UTC | #4
On Thu, 2025-06-12 at 10:20 +0200, Ilya Dryomov wrote:
> On Thu, Jun 12, 2025 at 1:14 AM Viacheslav Dubeyko
> <slava@dubeyko.com> wrote:
> > 
> > On Wed, 2025-06-11 at 13:22 +0200, Ilya Dryomov wrote:
> > > On Tue, Apr 15, 2025 at 12:10 AM Viacheslav Dubeyko
> > > <slava@dubeyko.com> wrote:
> > > > 
> > > > From: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > > > 
> > > > The generic/395 and generic/397 is capable of generating
> > > > the oops is on line net/ceph/ceph_common.c:794 with
> > > > KASAN enabled.
> > > > 
> > > > BUG: KASAN: slab-use-after-free in
> > > > have_mon_and_osd_map+0x56/0x70
> > > > Read of size 4 at addr ffff88811012d810 by task
> > > > mount.ceph/13305
> > > > 
> > > > CPU: 2 UID: 0 PID: 13305 Comm: mount.ceph Not tainted 6.14.0-
> > > > rc2-
> > > > build2+ #1266
> > > > Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
> > > > Call Trace:
> > > > <TASK>
> > > > dump_stack_lvl+0x57/0x80
> > > > ? have_mon_and_osd_map+0x56/0x70
> > > > print_address_description.constprop.0+0x84/0x330
> > > > ? have_mon_and_osd_map+0x56/0x70
> > > > print_report+0xe2/0x1e0
> > > > ? rcu_read_unlock_sched+0x60/0x80
> > > > ? kmem_cache_debug_flags+0xc/0x20
> > > > ? fixup_red_left+0x17/0x30
> > > > ? have_mon_and_osd_map+0x56/0x70
> > > > kasan_report+0x8d/0xc0
> > > > ? have_mon_and_osd_map+0x56/0x70
> > > > have_mon_and_osd_map+0x56/0x70
> > > > ceph_open_session+0x182/0x290
> > > > ? __pfx_ceph_open_session+0x10/0x10
> > > > ? __init_swait_queue_head+0x8d/0xa0
> > > > ? __pfx_autoremove_wake_function+0x10/0x10
> > > > ? shrinker_register+0xdd/0xf0
> > > > ceph_get_tree+0x333/0x680
> > > > vfs_get_tree+0x49/0x180
> > > > do_new_mount+0x1a3/0x2d0
> > > > ? __pfx_do_new_mount+0x10/0x10
> > > > ? security_capable+0x39/0x70
> > > > path_mount+0x6dd/0x730
> > > > ? __pfx_path_mount+0x10/0x10
> > > > ? kmem_cache_free+0x1e5/0x270
> > > > ? user_path_at+0x48/0x60
> > > > do_mount+0x99/0xe0
> > > > ? __pfx_do_mount+0x10/0x10
> > > > ? lock_release+0x155/0x190
> > > > __do_sys_mount+0x141/0x180
> > > > do_syscall_64+0x9f/0x100
> > > > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > RIP: 0033:0x7f01b1b14f3e
> > > > Code: 48 8b 0d d5 3e 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e
> > > > 0f
> > > > 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f
> > > > 05
> > > > <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a2 3e 0f 00 f7 d8 64 89
> > > > 01 48
> > > > RSP: 002b:00007fffd129fa08 EFLAGS: 00000246 ORIG_RAX:
> > > > 00000000000000a5
> > > > RAX: ffffffffffffffda RBX: 0000564ec01a7850 RCX:
> > > > 00007f01b1b14f3e
> > > > RDX: 0000564ec00f2225 RSI: 00007fffd12a1964 RDI:
> > > > 0000564ec0147a20
> > > > RBP: 00007fffd129fbd0 R08: 0000564ec014da90 R09:
> > > > 0000000000000080
> > > > R10: 0000000000000000 R11: 0000000000000246 R12:
> > > > 00007fffd12a194e
> > > > R13: 0000000000000000 R14: 00007fffd129fa50 R15:
> > > > 00007fffd129fa40
> > > > </TASK>
> > > > 
> > > > Allocated by task 13305:
> > > > stack_trace_save+0x8c/0xc0
> > > > kasan_save_stack+0x1e/0x40
> > > > kasan_save_track+0x10/0x30
> > > > __kasan_kmalloc+0x3a/0x50
> > > > __kmalloc_noprof+0x247/0x290
> > > > ceph_osdmap_alloc+0x16/0x130
> > > > ceph_osdc_init+0x27a/0x4c0
> > > > ceph_create_client+0x153/0x190
> > > > create_fs_client+0x50/0x2a0
> > > > ceph_get_tree+0xff/0x680
> > > > vfs_get_tree+0x49/0x180
> > > > do_new_mount+0x1a3/0x2d0
> > > > path_mount+0x6dd/0x730
> > > > do_mount+0x99/0xe0
> > > > __do_sys_mount+0x141/0x180
> > > > do_syscall_64+0x9f/0x100
> > > > entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > > 
> > > > Freed by task 9475:
> > > > stack_trace_save+0x8c/0xc0
> > > > kasan_save_stack+0x1e/0x40
> > > > kasan_save_track+0x10/0x30
> > > > kasan_save_free_info+0x3b/0x50
> > > > __kasan_slab_free+0x18/0x30
> > > > kfree+0x212/0x290
> > > > handle_one_map+0x23c/0x3b0
> > > > ceph_osdc_handle_map+0x3c9/0x590
> > > > mon_dispatch+0x655/0x6f0
> > > > ceph_con_process_message+0xc3/0xe0
> > > > ceph_con_v1_try_read+0x614/0x760
> > > > ceph_con_workfn+0x2de/0x650
> > > > process_one_work+0x486/0x7c0
> > > > process_scheduled_works+0x73/0x90
> > > > worker_thread+0x1c8/0x2a0
> > > > kthread+0x2ec/0x300
> > > > ret_from_fork+0x24/0x40
> > > > ret_from_fork_asm+0x1a/0x30
> > > > 
> > > > The buggy address belongs to the object at ffff88811012d800
> > > > which belongs to the cache kmalloc-512 of size 512
> > > > The buggy address is located 16 bytes inside of
> > > > freed 512-byte region [ffff88811012d800, ffff88811012da00)
> > > > 
> > > > The buggy address belongs to the physical page:
> > > > page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0
> > > > pfn:0x11012c
> > > > head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0
> > > > pincount:0
> > > > flags: 0x200000000000040(head|node=0|zone=2)
> > > > page_type: f5(slab)
> > > > raw: 0200000000000040 ffff888100042c80 dead000000000100
> > > > dead000000000122
> > > > raw: 0000000000000000 0000000080100010 00000000f5000000
> > > > 0000000000000000
> > > > head: 0200000000000040 ffff888100042c80 dead000000000100
> > > > dead000000000122
> > > > head: 0000000000000000 0000000080100010 00000000f5000000
> > > > 0000000000000000
> > > > head: 0200000000000002 ffffea0004404b01 ffffffffffffffff
> > > > 0000000000000000
> > > > head: 0000000000000004 0000000000000000 00000000ffffffff
> > > > 0000000000000000
> > > > page dumped because: kasan: bad access detected
> > > > 
> > > > Memory state around the buggy address:
> > > > ffff88811012d700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > > > fc
> > > > ffff88811012d780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > > > fc
> > > > 
> > > >     ffff88811012d800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb
> > > > fb
> > > > fb
> > > > 
> > > > ^
> > > > ffff88811012d880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > > > fb
> > > > ffff88811012d900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > > > fb
> > > > ===============================================================
> > > > ===
> > > > Disabling lock debugging due to kernel taint
> > > > libceph: client274326 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > > > libceph: mon0 (1)90.155.74.19:6789 session established
> > > > libceph: client274327 fsid 8598140e-35c2-11ee-b97c-001517c545cc
> > > > 
> > > > We have such scenario:
> > > > 
> > > > Thread 1:
> > > > void ceph_osdmap_destroy(...) {
> > > >     <skipped>
> > > >     kfree(map);
> > > > }
> > > > Thread 1 sleep...
> > > > 
> > > > Thread 2:
> > > > static bool have_mon_and_osd_map(struct ceph_client *client) {
> > > >     return client->monc.monmap && client->monc.monmap->epoch &&
> > > >         client->osdc.osdmap && client->osdc.osdmap->epoch;
> > > > }
> > > > Thread 2 has oops...
> > > > 
> > > > Thread 1 wake up:
> > > > static int handle_one_map(...) {
> > > >     <skipped>
> > > >     osdc->osdmap = newmap;
> > > >     <skipped>
> > > > }
> > > > 
> > > > This patch introduces a have_mon_and_osd_map atomic_t
> > > > field in struct ceph_client. If there is no OSD and
> > > > monitor maps, then the client->have_mon_and_osd_map
> > > > is equal to zero. The OSD and monitor maps initialization
> > > > results in incrementing of client->have_mon_and_osd_map
> > > > under the lock. As a result, have_mon_and_osd_map() function
> > > > simply checks now that client->have_mon_and_osd_map is equal to
> > > > CEPH_CLIENT_HAS_MON_AND_OSD_MAP.
> > > > 
> > > > Patch adds locking in the ceph_osdc_stop()
> > > > method during the destruction of osdc->osdmap and
> > > > assigning of NULL to the pointer. The lock is used
> > > > in the ceph_monc_stop() during the freeing of monc->monmap
> > > > and assigning NULL to the pointer too. The monmap_show()
> > > > and osdmap_show() methods were reworked to prevent
> > > > the potential race condition during the method calls.
> > > > 
> > > > Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
> > > > ---
> > > >  include/linux/ceph/libceph.h | 20 ++++++++++++++++++++
> > > >  net/ceph/ceph_common.c       |  6 ++++--
> > > >  net/ceph/debugfs.c           | 17 +++++++++++++----
> > > >  net/ceph/mon_client.c        | 18 +++++++++++++++++-
> > > >  net/ceph/osd_client.c        | 11 +++++++++++
> > > >  5 files changed, 65 insertions(+), 7 deletions(-)
> > > > 
> > > > diff --git a/include/linux/ceph/libceph.h
> > > > b/include/linux/ceph/libceph.h
> > > > index 733e7f93db66..f5694bf5bd54 100644
> > > > --- a/include/linux/ceph/libceph.h
> > > > +++ b/include/linux/ceph/libceph.h
> > > > @@ -132,6 +132,7 @@ struct ceph_client {
> > > >         struct ceph_messenger msgr;   /* messenger instance */
> > > >         struct ceph_mon_client monc;
> > > >         struct ceph_osd_client osdc;
> > > > +       atomic_t have_mon_and_osd_map;
> > > > 
> > > >  #ifdef CONFIG_DEBUG_FS
> > > >         struct dentry *debugfs_dir;
> > > > @@ -141,6 +142,25 @@ struct ceph_client {
> > > >  #endif
> > > >  };
> > > > 
> > > > +/*
> > > > + * The have_mon_and_osd_map possible states
> > > > + */
> > > > +enum {
> > > > +       CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP = 0,
> > > > +       CEPH_CLIENT_HAS_ONLY_ONE_MAP = 1,
> > > > +       CEPH_CLIENT_HAS_MON_AND_OSD_MAP = 2,
> > > > +       CEPH_CLIENT_MAP_STATE_UNKNOWN
> > > > +};
> > > > +
> > > > +static inline
> > > > +bool is_mon_and_osd_map_state_invalid(struct ceph_client
> > > > *client)
> > > > +{
> > > > +       int have_mon_and_osd_map = atomic_read(&client-
> > > > > have_mon_and_osd_map);
> > > > +
> > > > +       return have_mon_and_osd_map <
> > > > CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP ||
> > > > +               have_mon_and_osd_map >=
> > > > CEPH_CLIENT_MAP_STATE_UNKNOWN;
> > > > +}
> > > > +
> > > >  #define from_msgr(ms)  container_of(ms, struct ceph_client,
> > > > msgr)
> > > > 
> > > >  static inline bool ceph_msgr2(struct ceph_client *client)
> > > > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> > > > index 4c6441536d55..62efceb3b19d 100644
> > > > --- a/net/ceph/ceph_common.c
> > > > +++ b/net/ceph/ceph_common.c
> > > > @@ -723,6 +723,8 @@ struct ceph_client
> > > > *ceph_create_client(struct
> > > > ceph_options *opt, void *private)
> > > > 
> > > >         mutex_init(&client->mount_mutex);
> > > >         init_waitqueue_head(&client->auth_wq);
> > > > +       atomic_set(&client->have_mon_and_osd_map,
> > > > +                  CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP);
> > > >         client->auth_err = 0;
> > > > 
> > > >         client->extra_mon_dispatch = NULL;
> > > > @@ -790,8 +792,8 @@ EXPORT_SYMBOL(ceph_reset_client_addr);
> > > >   */
> > > >  static bool have_mon_and_osd_map(struct ceph_client *client)
> > > >  {
> > > > -       return client->monc.monmap && client->monc.monmap-
> > > > >epoch &&
> > > > -              client->osdc.osdmap && client->osdc.osdmap-
> > > > >epoch;
> > > > +       return atomic_read(&client->have_mon_and_osd_map) ==
> > > > +                              
> > > > CEPH_CLIENT_HAS_MON_AND_OSD_MAP;
> > > >  }
> > > > 
> > > >  /*
> > > > diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
> > > > index 2110439f8a24..7b45c169a859 100644
> > > > --- a/net/ceph/debugfs.c
> > > > +++ b/net/ceph/debugfs.c
> > > > @@ -36,8 +36,10 @@ static int monmap_show(struct seq_file *s,
> > > > void
> > > > *p)
> > > >         int i;
> > > >         struct ceph_client *client = s->private;
> > > > 
> > > > +       mutex_lock(&client->monc.mutex);
> > > > +
> > > >         if (client->monc.monmap == NULL)
> > > > -               return 0;
> > > > +               goto out_unlock;
> > > > 
> > > >         seq_printf(s, "epoch %d\n", client->monc.monmap-
> > > > >epoch);
> > > >         for (i = 0; i < client->monc.monmap->num_mon; i++) {
> > > > @@ -48,6 +50,10 @@ static int monmap_show(struct seq_file *s,
> > > > void
> > > > *p)
> > > >                            ENTITY_NAME(inst->name),
> > > >                            ceph_pr_addr(&inst->addr));
> > > >         }
> > > > +
> > > > +out_unlock:
> > > > +       mutex_unlock(&client->monc.mutex);
> > > > +
> > > >         return 0;
> > > >  }
> > > > 
> > > > @@ -56,13 +62,15 @@ static int osdmap_show(struct seq_file *s,
> > > > void
> > > > *p)
> > > >         int i;
> > > >         struct ceph_client *client = s->private;
> > > >         struct ceph_osd_client *osdc = &client->osdc;
> > > > -       struct ceph_osdmap *map = osdc->osdmap;
> > > > +       struct ceph_osdmap *map = NULL;
> > > >         struct rb_node *n;
> > > > 
> > > > +       down_read(&osdc->lock);
> > > > +
> > > > +       map = osdc->osdmap;
> > > >         if (map == NULL)
> > > > -               return 0;
> > > > +               goto out_unlock;
> > > > 
> > > > -       down_read(&osdc->lock);
> > > >         seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map-
> > > > > epoch,
> > > >                         osdc->epoch_barrier, map->flags);
> > > > 
> > > > @@ -131,6 +139,7 @@ static int osdmap_show(struct seq_file *s,
> > > > void
> > > > *p)
> > > >                 seq_printf(s, "]\n");
> > > >         }
> > > > 
> > > > +out_unlock:
> > > >         up_read(&osdc->lock);
> > > >         return 0;
> > > >  }
> > > > diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> > > > index ab66b599ac47..5cf802236426 100644
> > > > --- a/net/ceph/mon_client.c
> > > > +++ b/net/ceph/mon_client.c
> > > > @@ -562,12 +562,16 @@ static void ceph_monc_handle_map(struct
> > > > ceph_mon_client *monc,
> > > >                 goto out;
> > > >         }
> > > > 
> > > > +       atomic_dec(&client->have_mon_and_osd_map);
> > > > +
> > > >         kfree(monc->monmap);
> > > >         monc->monmap = monmap;
> > > > 
> > > >         __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc-
> > > > >monmap-
> > > > > epoch);
> > > >         client->have_fsid = true;
> > > > 
> > > > +       atomic_inc(&client->have_mon_and_osd_map);
> > > > +
> > > >  out:
> > > >         mutex_unlock(&monc->mutex);
> > > >         wake_up_all(&client->auth_wq);
> > > > @@ -1220,6 +1224,9 @@ int ceph_monc_init(struct ceph_mon_client
> > > > *monc, struct ceph_client *cl)
> > > > 
> > > >         monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
> > > > 
> > > > +       atomic_inc(&monc->client->have_mon_and_osd_map);
> > > 
> > > Hi Slava,
> > > 
> > > Incrementing client->have_mon_and_osd_map here and in
> > > ceph_osdc_init()
> > > means that counter would be set to 2
> > > (CEPH_CLIENT_HAS_MON_AND_OSD_MAP)
> > > at the initialization time, way before a session with the monitor
> > > is
> > > established and any map is received.  This effectively disables
> > > the
> > > wait logic in __ceph_open_session() because of
> > > have_mon_and_osd_map()
> > > immediately returning true.  __ceph_open_session() is responsible
> > > for
> > > setting up the debugfs directory and that is affected too: it's
> > > created
> > > as 00000000-0000-0000-0000-000000000000.client0 because neither
> > > the
> > > cluster FSID nor the client ID is known without the monmap.
> > > 
> > > This patch seems to be over-complicated for what it needs to do:
> > > I don't see a compelling reason for introducing the atomic and as
> > > mentioned before there is no need to attempt to guard against
> > > someone
> > > continuing to use the client after ceph_osdc_stop() and
> > > ceph_monc_stop()
> > > are called.  It's the point of no return and the client itself
> > > gets
> > > freed very shortly after.
> > > 
> > > Why not just open-code the wait loop in __ceph_open_session() to
> > > allow
> > > for monc->mutex and osdc->lock (for read) to be taken freely?  It
> > > should
> > > be a small change in __ceph_open_session() --
> > > net/ceph/mon_client.c
> > > and
> > > net/ceph/osd_client.c wouldn't need to be touched at all.
> > > 
> > 
> > Hi Ilya,
> > 
> > Frankly speaking, I don't quite follow your point. The main
> > issue
> > happens when one thread calls ceph_osdc_handle_map() [1] ->
> > handle_one_map() [2]:
> > 
> > ceph_osdmap_destroy() [3] -> kfree(map) -> go to sleep
> > 
> > <-- another thread receives time slices to execute:
> > have_mon_and_osd_map() BUT osdc->osdmap is already freed and
> > invalid
> > here!!!
> > 
> > osdc->osdmap = newmap;
> > 
> > So, it's not about ceph_osdc_stop() or ceph_monc_stop() but it's
> > about
> > regular operations.
> 
> I know, but on top of the regular operations (to be precise, one
> regular operation -- __ceph_open_session()) the current patch also
> tries to harden ceph_osdc_stop() and ceph_monc_stop().  I wanted to
> reiterate that it's not needed.
> 
> > 
> > I've tried to exclude the necessity to use locks at all in
> > have_mon_and_osd_map(). Do you mean that wait loop will be better
> > solution?
> 
> Yes, it seems preferable over an otherwise redundant (i.e. not used
> for
> anything else) atomic which turned out to be tricky enough to get
> right
> on the first try.
> 
> > It sounds pretty complicated too for my taste and it will
> > require coordination among threads. No? I am not completely sure
> > that I
> > follow your vision.
> 
> With the help of woken_wake_function() primitive it shouldn't be
> complicated at all.  The diff would be limited to
> __ceph_open_session()
> and I would expect it to be on par with the current patch.  Making it
> possible to freely take locks there would also squash another related
> buglet: client->auth_err shouldn't be accessed outside of monc->mutex
> either.  Being just an int, it's not complained about by KASAN ;)
> 
> Since __ceph_open_session() is the only user of
> have_mon_and_osd_map()
> it could be open-coded inside of the wait loop.
> 

OK. Let me try to rework my patch yet another time. :)

Thanks,
Slava.
diff mbox series

Patch

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 733e7f93db66..f5694bf5bd54 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -132,6 +132,7 @@  struct ceph_client {
 	struct ceph_messenger msgr;   /* messenger instance */
 	struct ceph_mon_client monc;
 	struct ceph_osd_client osdc;
+	atomic_t have_mon_and_osd_map;
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_dir;
@@ -141,6 +142,25 @@  struct ceph_client {
 #endif
 };
 
+/*
+ * The have_mon_and_osd_map possible states
+ */
+enum {
+	CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP = 0,
+	CEPH_CLIENT_HAS_ONLY_ONE_MAP = 1,
+	CEPH_CLIENT_HAS_MON_AND_OSD_MAP = 2,
+	CEPH_CLIENT_MAP_STATE_UNKNOWN
+};
+
+static inline
+bool is_mon_and_osd_map_state_invalid(struct ceph_client *client)
+{
+	int have_mon_and_osd_map = atomic_read(&client->have_mon_and_osd_map);
+
+	return have_mon_and_osd_map < CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP ||
+		have_mon_and_osd_map >= CEPH_CLIENT_MAP_STATE_UNKNOWN;
+}
+
 #define from_msgr(ms)	container_of(ms, struct ceph_client, msgr)
 
 static inline bool ceph_msgr2(struct ceph_client *client)
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4c6441536d55..62efceb3b19d 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -723,6 +723,8 @@  struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
 
 	mutex_init(&client->mount_mutex);
 	init_waitqueue_head(&client->auth_wq);
+	atomic_set(&client->have_mon_and_osd_map,
+		   CEPH_CLIENT_HAS_NO_MON_AND_NO_OSD_MAP);
 	client->auth_err = 0;
 
 	client->extra_mon_dispatch = NULL;
@@ -790,8 +792,8 @@  EXPORT_SYMBOL(ceph_reset_client_addr);
  */
 static bool have_mon_and_osd_map(struct ceph_client *client)
 {
-	return client->monc.monmap && client->monc.monmap->epoch &&
-	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+	return atomic_read(&client->have_mon_and_osd_map) ==
+				CEPH_CLIENT_HAS_MON_AND_OSD_MAP;
 }
 
 /*
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 2110439f8a24..7b45c169a859 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -36,8 +36,10 @@  static int monmap_show(struct seq_file *s, void *p)
 	int i;
 	struct ceph_client *client = s->private;
 
+	mutex_lock(&client->monc.mutex);
+
 	if (client->monc.monmap == NULL)
-		return 0;
+		goto out_unlock;
 
 	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
 	for (i = 0; i < client->monc.monmap->num_mon; i++) {
@@ -48,6 +50,10 @@  static int monmap_show(struct seq_file *s, void *p)
 			   ENTITY_NAME(inst->name),
 			   ceph_pr_addr(&inst->addr));
 	}
+
+out_unlock:
+	mutex_unlock(&client->monc.mutex);
+
 	return 0;
 }
 
@@ -56,13 +62,15 @@  static int osdmap_show(struct seq_file *s, void *p)
 	int i;
 	struct ceph_client *client = s->private;
 	struct ceph_osd_client *osdc = &client->osdc;
-	struct ceph_osdmap *map = osdc->osdmap;
+	struct ceph_osdmap *map = NULL;
 	struct rb_node *n;
 
+	down_read(&osdc->lock);
+
+	map = osdc->osdmap;
 	if (map == NULL)
-		return 0;
+		goto out_unlock;
 
-	down_read(&osdc->lock);
 	seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
 			osdc->epoch_barrier, map->flags);
 
@@ -131,6 +139,7 @@  static int osdmap_show(struct seq_file *s, void *p)
 		seq_printf(s, "]\n");
 	}
 
+out_unlock:
 	up_read(&osdc->lock);
 	return 0;
 }
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ab66b599ac47..5cf802236426 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -562,12 +562,16 @@  static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 		goto out;
 	}
 
+	atomic_dec(&client->have_mon_and_osd_map);
+
 	kfree(monc->monmap);
 	monc->monmap = monmap;
 
 	__ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
 	client->have_fsid = true;
 
+	atomic_inc(&client->have_mon_and_osd_map);
+
 out:
 	mutex_unlock(&monc->mutex);
 	wake_up_all(&client->auth_wq);
@@ -1220,6 +1224,9 @@  int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 	monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
 
+	atomic_inc(&monc->client->have_mon_and_osd_map);
+	WARN_ON(is_mon_and_osd_map_state_invalid(monc->client));
+
 	return 0;
 
 out_auth_reply:
@@ -1232,6 +1239,7 @@  int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	ceph_auth_destroy(monc->auth);
 out_monmap:
 	kfree(monc->monmap);
+	monc->monmap = NULL;
 out:
 	return err;
 }
@@ -1239,6 +1247,8 @@  EXPORT_SYMBOL(ceph_monc_init);
 
 void ceph_monc_stop(struct ceph_mon_client *monc)
 {
+	struct ceph_monmap *old_monmap;
+
 	dout("stop\n");
 
 	mutex_lock(&monc->mutex);
@@ -1266,7 +1276,13 @@  void ceph_monc_stop(struct ceph_mon_client *monc)
 	ceph_msg_put(monc->m_subscribe);
 	ceph_msg_put(monc->m_subscribe_ack);
 
-	kfree(monc->monmap);
+	mutex_lock(&monc->mutex);
+	WARN_ON(is_mon_and_osd_map_state_invalid(monc->client));
+	atomic_dec(&monc->client->have_mon_and_osd_map);
+	old_monmap = monc->monmap;
+	monc->monmap = NULL;
+	mutex_unlock(&monc->mutex);
+	kfree(old_monmap);
 }
 EXPORT_SYMBOL(ceph_monc_stop);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b24afec24138..14a91603bf6d 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4068,8 +4068,10 @@  static int handle_one_map(struct ceph_osd_client *osdc,
 			skipped_map = true;
 		}
 
+		atomic_dec(&osdc->client->have_mon_and_osd_map);
 		ceph_osdmap_destroy(osdc->osdmap);
 		osdc->osdmap = newmap;
+		atomic_inc(&osdc->client->have_mon_and_osd_map);
 	}
 
 	was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
@@ -5266,6 +5268,9 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	schedule_delayed_work(&osdc->osds_timeout_work,
 	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
+	atomic_inc(&osdc->client->have_mon_and_osd_map);
+	WARN_ON(is_mon_and_osd_map_state_invalid(osdc->client));
+
 	return 0;
 
 out_notify_wq:
@@ -5278,6 +5283,7 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	mempool_destroy(osdc->req_mempool);
 out_map:
 	ceph_osdmap_destroy(osdc->osdmap);
+	osdc->osdmap = NULL;
 out:
 	return err;
 }
@@ -5306,10 +5312,15 @@  void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	WARN_ON(atomic_read(&osdc->num_requests));
 	WARN_ON(atomic_read(&osdc->num_homeless));
 
+	down_write(&osdc->lock);
+	WARN_ON(is_mon_and_osd_map_state_invalid(osdc->client));
+	atomic_dec(&osdc->client->have_mon_and_osd_map);
 	ceph_osdmap_destroy(osdc->osdmap);
+	osdc->osdmap = NULL;
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+	up_write(&osdc->lock);
 }
 
 int osd_req_op_copy_from_init(struct ceph_osd_request *req,