diff mbox series

[v2] scsi: mpt3sas: fix uaf in _scsih_fw_event_cleanup_queue() during hard reset

Message ID 20250528004935.2000196-1-zhengqixing@huaweicloud.com
State New
Headers show
Series [v2] scsi: mpt3sas: fix uaf in _scsih_fw_event_cleanup_queue() during hard reset | expand

Commit Message

Zheng Qixing May 28, 2025, 12:49 a.m. UTC
From: Zheng Qixing <zhengqixing@huawei.com>

Changes in v2:

  1. Reference counting: Ensuring the fw_event_work has proper reference
    counting so it's not freed while still in use
  2. Read-write locks: Protecting access to ioc->current_event with locks
     to prevent race conditions between readers and writers

During mpt3sas hard reset, there are two asynchronous execution paths that
can lead to use-after-free issues:

Path A (cleanup):
  _base_clear_outstanding_commands()
    mpt3sas_scsih_clear_outstanding_scsi_tm_commands()
      _scsih_fw_event_cleanup_queue()
	cancel_work_sync // UAF!

Path B (recovery):
  _base_reset_done_handler()
    mpt3sas_scsih_reset_done_handler()
      _scsih_error_recovery_delete_devices()
        alloc_fw_event_work()
        _scsih_fw_event_add()
          _firmware_event_work()
            _mpt3sas_fw_work() // free fw_event

Here is a use-after-free issue during hard reset:

==================================================================
BUG: KASAN: slab-use-after-free in __cancel_work_timer+0x172/0x3e0
Read of size 8 at addr ffff88be08e72610 by task scsi_eh_10/980

CPU: 50 PID: 980 Comm: scsi_eh_10 Kdump: loaded Not tainted 6.6.0+ #24
Call Trace:
 <TASK>
 __cancel_work_timer+0x172/0x3e0
 _scsih_fw_event_cleanup_queue+0x2a2/0x570 [mpt3sas]
 mpt3sas_scsih_clear_outstanding_scsi_tm_commands+0x171/0x2c0 [mpt3sas]
 mpt3sas_base_hard_reset_handler+0x2e8/0x9d0 [mpt3sas]
 mpt3sas_scsih_issue_tm+0xaa1/0xc10 [mpt3sas]
 scsih_target_reset+0x344/0x7f0 [mpt3sas]
 scsi_try_target_reset+0xa7/0x1f0
 scsi_eh_target_reset+0x4e8/0xc50
 scsi_eh_ready_devs+0xc8/0x5b0
 scsi_unjam_host+0x2fa/0x700
 scsi_error_handler+0x434/0x700
 kthread+0x2d1/0x3b0
 ret_from_fork+0x2b/0x70
 ret_from_fork_asm+0x1b/0x30
 </TASK>

Allocated by task 980:
 mpt3sas_scsih_reset_done_handler+0x575/0x7f0 [mpt3sas]
 mpt3sas_base_hard_reset_handler+0x7a7/0x9d0 [mpt3sas]
 mpt3sas_scsih_issue_tm+0xaa1/0xc10 [mpt3sas]
 scsih_dev_reset+0x354/0x8e0 [mpt3sas]
 scsi_eh_bus_device_reset+0x255/0x7a0
 scsi_eh_ready_devs+0xb6/0x5b0
 scsi_unjam_host+0x2fa/0x700
 scsi_error_handler+0x434/0x700
 kthread+0x2d1/0x3b0
 ret_from_fork+0x2b/0x70
 ret_from_fork_asm+0x1b/0x30

Freed by task 660838:
 __kmem_cache_free+0x174/0x370
 _mpt3sas_fw_work+0x269/0x2510 [mpt3sas]
 process_one_work+0x578/0xc60
 worker_thread+0x6c0/0xc90
 kthread+0x2d1/0x3b0
 ret_from_fork+0x2b/0x70
 ret_from_fork_asm+0x1b/0x30

Last potentially related work creation:
 insert_work+0x24/0x230
 __queue_work.part.0+0x3d2/0x840
 queue_work_on+0x4b/0x60
 _scsih_fw_event_add.part.0+0x20e/0x2c0 [mpt3sas]
 mpt3sas_scsih_reset_done_handler+0x64b/0x7f0 [mpt3sas]
 mpt3sas_base_hard_reset_handler+0x7a7/0x9d0 [mpt3sas]
 mpt3sas_scsih_issue_tm+0xaa1/0xc10 [mpt3sas]
 scsih_dev_reset+0x354/0x8e0 [mpt3sas]
 scsi_eh_bus_device_reset+0x255/0x7a0
 scsi_eh_ready_devs+0xb6/0x5b0
 scsi_unjam_host+0x2fa/0x700
 scsi_error_handler+0x434/0x700
 kthread+0x2d1/0x3b0
 ret_from_fork+0x2b/0x70
 ret_from_fork_asm+0x1b/0x30

Second to last potentially related work creation:
 kasan_save_stack+0x21/0x40
 __kasan_record_aux_stack+0x94/0xa0
 kvfree_call_rcu+0x25/0xa20
 kernfs_unlink_open_file+0x2dd/0x410
 kernfs_fop_release+0xc4/0x320
 __fput+0x35e/0xa10
 __se_sys_close+0x4f/0xa0
 do_syscall_64+0x55/0x100
 entry_SYSCALL_64_after_hwframe+0x78/0xe2

After commit 991df3dd5144 ("scsi: mpt3sas: Fix use-after-free warning"),
a race condition exists in _scsih_fw_event_cleanup_queue(). When Path A
dequeues a fw_event and Path B concurrently processes the same fw_event,
the reference count can drop to zero before cancel_work_sync() is called
in Path A, leading to use-after-free when accessing the already freed
fw_event structure.

Fix this by:
1. Protecting all accesses to ioc->current_event with ioc->fw_event_lock
2. Adding reference counting when accessing current_event
3. Moving the fw_event_work_put() call from dequeue_next_fw_event() to
   the caller, ensuring fw_event remains valid during cancel_work_sync()

Fixes: 991df3dd5144 ("scsi: mpt3sas: Fix use-after-free warning")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
---
 drivers/scsi/mpt3sas/mpt3sas_scsih.c | 40 +++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 9 deletions(-)
diff mbox series

Patch

diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 508861e88d9f..a17963ce3f93 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -3659,7 +3659,6 @@  static struct fw_event_work *dequeue_next_fw_event(struct MPT3SAS_ADAPTER *ioc)
 		fw_event = list_first_entry(&ioc->fw_event_list,
 				struct fw_event_work, list);
 		list_del_init(&fw_event->list);
-		fw_event_work_put(fw_event);
 	}
 	spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 
@@ -3679,10 +3678,15 @@  static void
 _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 {
 	struct fw_event_work *fw_event;
+	unsigned long flags;
 
+	spin_lock_irqsave(&ioc->fw_event_lock, flags);
 	if ((list_empty(&ioc->fw_event_list) && !ioc->current_event) ||
-	    !ioc->firmware_event_thread)
+	    !ioc->firmware_event_thread) {
+		spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 		return;
+	}
+
 	/*
 	 * Set current running event as ignore, so that
 	 * current running event will exit quickly.
@@ -3691,10 +3695,21 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 	 */
 	if (ioc->shost_recovery && ioc->current_event)
 		ioc->current_event->ignore = 1;
+	spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 
 	ioc->fw_events_cleanup = 1;
-	while ((fw_event = dequeue_next_fw_event(ioc)) ||
-	     (fw_event = ioc->current_event)) {
+	while (true) {
+		fw_event = dequeue_next_fw_event(ioc);
+
+		spin_lock_irqsave(&ioc->fw_event_lock, flags);
+		if (!fw_event) {
+			fw_event = ioc->current_event;
+			if (!fw_event) {
+				spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
+				break;
+			}
+			fw_event_work_get(fw_event);
+		}
 
 		/*
 		 * Don't call cancel_work_sync() for current_event
@@ -3714,8 +3729,11 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 		    ioc->current_event->event !=
 		    MPT3SAS_REMOVE_UNRESPONDING_DEVICES) {
 			ioc->current_event = NULL;
+			spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
+			fw_event_work_put(fw_event);
 			continue;
 		}
+		spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 
 		/*
 		 * Driver has to clear ioc->start_scan flag when
@@ -3741,6 +3759,7 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 		if (cancel_work_sync(&fw_event->work))
 			fw_event_work_put(fw_event);
 
+		fw_event_work_put(fw_event);
 	}
 	ioc->fw_events_cleanup = 0;
 }
@@ -10690,15 +10709,16 @@  mpt3sas_scsih_reset_done_handler(struct MPT3SAS_ADAPTER *ioc)
 static void
 _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioc->fw_event_lock, flags);
 	ioc->current_event = fw_event;
+	spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 	_scsih_fw_event_del_from_list(ioc, fw_event);
 
 	/* the queue is being flushed so ignore this event */
-	if (ioc->remove_host || ioc->pci_error_recovery) {
-		fw_event_work_put(fw_event);
-		ioc->current_event = NULL;
-		return;
-	}
+	if (ioc->remove_host || ioc->pci_error_recovery)
+		goto out;
 
 	switch (fw_event->event) {
 	case MPT3SAS_PROCESS_TRIGGER_DIAG:
@@ -10794,8 +10814,10 @@  _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
 		return;
 	}
 out:
+	spin_lock_irqsave(&ioc->fw_event_lock, flags);
 	fw_event_work_put(fw_event);
 	ioc->current_event = NULL;
+	spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 }
 
 /**