Message ID | 20210410184016.21603-1-michael.christie@oracle.com |
---|---|
Headers | show |
Series | : qedi tmf fixes | expand |
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Subject: [EXT] [RFC PATCH 0/13]: qedi tmf fixes > > External Email > > ---------------------------------------------------------------------- > The following patches made over Linus's tree fix a bunch of issues in qedi's tmf > handling. > > Manish, I did some tests here. Please test it out and review. > Mike, Thanks for the fixes. Various recovery tests with and without IOs with the patch set went fine.
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Cc: Mike Christie <michael.christie@oracle.com> > Subject: [EXT] [PATCH 04/13] scsi: qedi: fix abort vs cmd re-use race > > External Email > > ---------------------------------------------------------------------- > If the scsi cmd completes after qedi_tmf_work calls iscsi_itt_to_task then the > qedi qedi_cmd->task_id could be freed and used for another cmd. If we then call > qedi_iscsi_cleanup_task with that task_id we will be cleaning up the wrong cmd. > > This patch has us wait to release the task_id until the last put has been done on > the iscsi_task. Because libiscsi grabs a ref to the task when sending the abort, > we know that for the non abort timeout case that the task_id we are > referencing is for the cmd that was supposed to be aborted. > > The next patch will fix the case where the abort times out while we are running > qedi_tmf_work. 
> > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/scsi/qedi/qedi_fw.c | 13 ------------- > drivers/scsi/qedi/qedi_iscsi.c | 20 +++++++++++++++++--- > 2 files changed, 17 insertions(+), 16 deletions(-) > > diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index > cf57b4e49700..ad4357e4c15d 100644 > --- a/drivers/scsi/qedi/qedi_fw.c > +++ b/drivers/scsi/qedi/qedi_fw.c > @@ -73,7 +73,6 @@ static void qedi_process_logout_resp(struct qedi_ctx > *qedi, > spin_unlock(&qedi_conn->list_lock); > > cmd->state = RESPONSE_RECEIVED; > - qedi_clear_task_idx(qedi, cmd->task_id); > __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr, NULL, 0); > > spin_unlock(&session->back_lock); > @@ -138,7 +137,6 @@ static void qedi_process_text_resp(struct qedi_ctx > *qedi, > spin_unlock(&qedi_conn->list_lock); > > cmd->state = RESPONSE_RECEIVED; > - qedi_clear_task_idx(qedi, cmd->task_id); > > __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, > qedi_conn->gen_pdu.resp_buf, > @@ -164,13 +162,11 @@ static void qedi_tmf_resp_work(struct work_struct > *work) > iscsi_block_session(session->cls_session); > rval = qedi_cleanup_all_io(qedi, qedi_conn, qedi_cmd->task, true); > if (rval) { > - qedi_clear_task_idx(qedi, qedi_cmd->task_id); > iscsi_unblock_session(session->cls_session); > goto exit_tmf_resp; > } > > iscsi_unblock_session(session->cls_session); > - qedi_clear_task_idx(qedi, qedi_cmd->task_id); > > spin_lock(&session->back_lock); > __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, NULL, 0); > @@ -245,8 +241,6 @@ static void qedi_process_tmf_resp(struct qedi_ctx > *qedi, > goto unblock_sess; > } > > - qedi_clear_task_idx(qedi, qedi_cmd->task_id); > - > __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, NULL, 0); > kfree(resp_hdr_ptr); > > @@ -314,7 +308,6 @@ static void qedi_process_login_resp(struct qedi_ctx > *qedi, > "Freeing tid=0x%x for cid=0x%x\n", > cmd->task_id, qedi_conn->iscsi_conn_id); > 
cmd->state = RESPONSE_RECEIVED; > - qedi_clear_task_idx(qedi, cmd->task_id); > } > > static void qedi_get_rq_bdq_buf(struct qedi_ctx *qedi, @@ -468,7 +461,6 @@ > static int qedi_process_nopin_mesg(struct qedi_ctx *qedi, > } > > spin_unlock(&qedi_conn->list_lock); > - qedi_clear_task_idx(qedi, cmd->task_id); > } > > done: > @@ -673,7 +665,6 @@ static void qedi_scsi_completion(struct qedi_ctx *qedi, > if (qedi_io_tracing) > qedi_trace_io(qedi, task, cmd->task_id, QEDI_IO_TRACE_RSP); > > - qedi_clear_task_idx(qedi, cmd->task_id); > __iscsi_complete_pdu(conn, (struct iscsi_hdr *)hdr, > conn->data, datalen); > error: > @@ -730,7 +721,6 @@ static void qedi_process_nopin_local_cmpl(struct > qedi_ctx *qedi, > cqe->itid, cmd->task_id); > > cmd->state = RESPONSE_RECEIVED; > - qedi_clear_task_idx(qedi, cmd->task_id); > > spin_lock_bh(&session->back_lock); > __iscsi_put_task(task); > @@ -821,8 +811,6 @@ static void qedi_process_cmd_cleanup_resp(struct > qedi_ctx *qedi, > if (qedi_cmd->state == CLEANUP_WAIT_FAILED) > qedi_cmd->state = CLEANUP_RECV; > > - qedi_clear_task_idx(qedi_conn->qedi, rtid); > - > spin_lock(&qedi_conn->list_lock); > if (likely(dbg_cmd->io_cmd_in_list)) { > dbg_cmd->io_cmd_in_list = false; > @@ -856,7 +844,6 @@ static void qedi_process_cmd_cleanup_resp(struct > qedi_ctx *qedi, > QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, > "Freeing tid=0x%x for cid=0x%x\n", > cqe->itid, qedi_conn->iscsi_conn_id); > - qedi_clear_task_idx(qedi_conn->qedi, cqe->itid); > > } else { > qedi_get_proto_itt(qedi, cqe->itid, &ptmp_itt); diff --git > a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index > 08c05403cd72..d1da34a938da 100644 > --- a/drivers/scsi/qedi/qedi_iscsi.c > +++ b/drivers/scsi/qedi/qedi_iscsi.c > @@ -772,7 +772,6 @@ static int qedi_mtask_xmit(struct iscsi_conn *conn, > struct iscsi_task *task) > } > > cmd->conn = conn->dd_data; > - cmd->scsi_cmd = NULL; > return qedi_iscsi_send_generic_request(task); > } > > @@ -783,6 +782,10 @@ static int 
qedi_task_xmit(struct iscsi_task *task) > struct qedi_cmd *cmd = task->dd_data; > struct scsi_cmnd *sc = task->sc; > > + /* Clear now so in cleanup_task we know it didn't make it */ > + cmd->scsi_cmd = NULL; > + cmd->task_id = -1; > + > if (test_bit(QEDI_IN_SHUTDOWN, &qedi_conn->qedi->flags)) > return -ENODEV; > > @@ -1383,13 +1386,24 @@ static umode_t qedi_attr_is_visible(int > param_type, int param) > > static void qedi_cleanup_task(struct iscsi_task *task) { > - if (!task->sc || task->state == ISCSI_TASK_PENDING) { > + struct qedi_cmd *cmd; > + > + if (task->state == ISCSI_TASK_PENDING) { > QEDI_INFO(NULL, QEDI_LOG_IO, "Returning ref_cnt=%d\n", > refcount_read(&task->refcount)); > return; > } > > - qedi_iscsi_unmap_sg_list(task->dd_data); > + if (task->sc) > + qedi_iscsi_unmap_sg_list(task->dd_data); > + > + cmd = task->dd_data; > + if (cmd->task_id != -1) > + qedi_clear_task_idx(iscsi_host_priv(task->conn->session->host), > + cmd->task_id); > + > + cmd->task_id = -1; > + cmd->scsi_cmd = NULL; > } > > struct iscsi_transport qedi_iscsi_transport = { > -- > 2.25.1 Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Cc: Mike Christie <michael.christie@oracle.com> > Subject: [EXT] [PATCH 05/13] scsi: qedi: fix use after free during abort cleanup > > External Email > > ---------------------------------------------------------------------- > This fixes two bugs: > > 1. The scsi cmd task could be completed and the abort could timeout while we > are running qedi_tmf_work so we need to get a ref to the task. > > 2. If qedi_tmf_work's qedi_wait_for_cleanup_request call times out then we will > also force the clean up of the qedi_work_map but > qedi_process_cmd_cleanup_resp could still be accessing the qedi_cmd for the > abort tmf. We can then race where qedi_process_cmd_cleanup_resp is still > accessing the mtask's qedi_cmd but libiscsi has escalated to session level > cleanup and is cleaning up the mtask while we are still accessing it. 
> > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/scsi/qedi/qedi_fw.c | 110 ++++++++++++++++++--------------- > drivers/scsi/qedi/qedi_iscsi.h | 1 + > 2 files changed, 61 insertions(+), 50 deletions(-) > > diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index > ad4357e4c15d..c5699421ec37 100644 > --- a/drivers/scsi/qedi/qedi_fw.c > +++ b/drivers/scsi/qedi/qedi_fw.c > @@ -729,7 +729,6 @@ static void qedi_process_nopin_local_cmpl(struct > qedi_ctx *qedi, > > static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, > struct iscsi_cqe_solicited *cqe, > - struct iscsi_task *task, > struct iscsi_conn *conn) > { > struct qedi_work_map *work, *work_tmp; @@ -742,7 +741,7 @@ > static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, > u32 iscsi_cid; > struct qedi_conn *qedi_conn; > struct qedi_cmd *dbg_cmd; > - struct iscsi_task *mtask; > + struct iscsi_task *mtask, *task; > struct iscsi_tm *tmf_hdr = NULL; > > iscsi_cid = cqe->conn_id; > @@ -768,6 +767,7 @@ static void qedi_process_cmd_cleanup_resp(struct > qedi_ctx *qedi, > } > found = 1; > mtask = qedi_cmd->task; > + task = work->ctask; > tmf_hdr = (struct iscsi_tm *)mtask->hdr; > rtid = work->rtid; > > @@ -776,52 +776,47 @@ static void qedi_process_cmd_cleanup_resp(struct > qedi_ctx *qedi, > qedi_cmd->list_tmf_work = NULL; > } > } > - spin_unlock_bh(&qedi_conn->tmf_work_lock); > - > - if (found) { > - QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, > - "TMF work, cqe->tid=0x%x, tmf flags=0x%x, > cid=0x%x\n", > - proto_itt, tmf_hdr->flags, qedi_conn->iscsi_conn_id); > - > - if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == > - ISCSI_TM_FUNC_ABORT_TASK) { > - spin_lock_bh(&conn->session->back_lock); > > - protoitt = build_itt(get_itt(tmf_hdr->rtt), > - conn->session->age); > - task = iscsi_itt_to_task(conn, protoitt); > - > - spin_unlock_bh(&conn->session->back_lock); > + if (!found) { > + spin_unlock_bh(&qedi_conn->tmf_work_lock); > + goto check_cleanup_reqs; 
> + } > > - if (!task) { > - QEDI_NOTICE(&qedi->dbg_ctx, > - "IO task completed, tmf rtt=0x%x, > cid=0x%x\n", > - get_itt(tmf_hdr->rtt), > - qedi_conn->iscsi_conn_id); > - return; > - } > + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, > + "TMF work, cqe->tid=0x%x, tmf flags=0x%x, cid=0x%x\n", > + proto_itt, tmf_hdr->flags, qedi_conn->iscsi_conn_id); > + > + spin_lock_bh(&conn->session->back_lock); > + if (iscsi_task_is_completed(task)) { > + QEDI_NOTICE(&qedi->dbg_ctx, > + "IO task completed, tmf rtt=0x%x, cid=0x%x\n", > + get_itt(tmf_hdr->rtt), qedi_conn->iscsi_conn_id); > + goto unlock; > + } > > - dbg_cmd = task->dd_data; > + dbg_cmd = task->dd_data; > > - QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, > - "Abort tmf rtt=0x%x, i/o itt=0x%x, i/o > tid=0x%x, cid=0x%x\n", > - get_itt(tmf_hdr->rtt), get_itt(task->itt), > - dbg_cmd->task_id, qedi_conn- > >iscsi_conn_id); > + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, > + "Abort tmf rtt=0x%x, i/o itt=0x%x, i/o tid=0x%x, cid=0x%x\n", > + get_itt(tmf_hdr->rtt), get_itt(task->itt), dbg_cmd->task_id, > + qedi_conn->iscsi_conn_id); > > - if (qedi_cmd->state == CLEANUP_WAIT_FAILED) > - qedi_cmd->state = CLEANUP_RECV; > + spin_lock(&qedi_conn->list_lock); > + if (likely(dbg_cmd->io_cmd_in_list)) { > + dbg_cmd->io_cmd_in_list = false; > + list_del_init(&dbg_cmd->io_cmd); > + qedi_conn->active_cmd_count--; > + } > + spin_unlock(&qedi_conn->list_lock); > + qedi_cmd->state = CLEANUP_RECV; > +unlock: > + spin_unlock_bh(&conn->session->back_lock); > + spin_unlock_bh(&qedi_conn->tmf_work_lock); > + wake_up_interruptible(&qedi_conn->wait_queue); > + return; > > - spin_lock(&qedi_conn->list_lock); > - if (likely(dbg_cmd->io_cmd_in_list)) { > - dbg_cmd->io_cmd_in_list = false; > - list_del_init(&dbg_cmd->io_cmd); > - qedi_conn->active_cmd_count--; > - } > - spin_unlock(&qedi_conn->list_lock); > - qedi_cmd->state = CLEANUP_RECV; > - wake_up_interruptible(&qedi_conn->wait_queue); > - } > - } else if (qedi_conn->cmd_cleanup_req > 0) { > 
+check_cleanup_reqs: > + if (qedi_conn->cmd_cleanup_req > 0) { > spin_lock_bh(&conn->session->back_lock); > qedi_get_proto_itt(qedi, cqe->itid, &ptmp_itt); > protoitt = build_itt(ptmp_itt, conn->session->age); @@ -844,6 > +839,7 @@ static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, > QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, > "Freeing tid=0x%x for cid=0x%x\n", > cqe->itid, qedi_conn->iscsi_conn_id); > + qedi_clear_task_idx(qedi_conn->qedi, cqe->itid); > > } else { > qedi_get_proto_itt(qedi, cqe->itid, &ptmp_itt); @@ -946,8 > +942,7 @@ void qedi_fp_process_cqes(struct qedi_work *work) > goto exit_fp_process; > case ISCSI_CQE_TYPE_TASK_CLEANUP: > QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, "CleanUp > CqE\n"); > - qedi_process_cmd_cleanup_resp(qedi, &cqe->cqe_solicited, > task, > - conn); > + qedi_process_cmd_cleanup_resp(qedi, &cqe->cqe_solicited, > conn); > goto exit_fp_process; > default: > QEDI_ERR(&qedi->dbg_ctx, "Error cqe.\n"); @@ -1374,12 > +1369,22 @@ static void qedi_tmf_work(struct work_struct *work) > tmf_hdr = (struct iscsi_tm *)mtask->hdr; > set_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > > - ctask = iscsi_itt_to_task(conn, tmf_hdr->rtt); > - if (!ctask || !ctask->sc) { > + spin_lock_bh(&conn->session->back_lock); > + ctask = iscsi_itt_to_ctask(conn, tmf_hdr->rtt); > + if (!ctask || iscsi_task_is_completed(ctask)) { > + spin_unlock_bh(&conn->session->back_lock); > QEDI_ERR(&qedi->dbg_ctx, "Task already completed\n"); > - goto abort_ret; > + goto clear_cleanup; > } > > + /* > + * libiscsi gets a ref before sending the abort, but if libiscsi times > + * it out then it could release it and the cmd could complete from > + * under us. 
> + */ > + __iscsi_get_task(ctask); > + spin_unlock_bh(&conn->session->back_lock); > + > cmd = (struct qedi_cmd *)ctask->dd_data; > QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, > "Abort tmf rtt=0x%x, cmd itt=0x%x, cmd tid=0x%x, > cid=0x%x\n", @@ -1389,19 +1394,20 @@ static void qedi_tmf_work(struct > work_struct *work) > if (qedi_do_not_recover) { > QEDI_ERR(&qedi->dbg_ctx, "DONT SEND CLEANUP/ABORT > %d\n", > qedi_do_not_recover); > - goto abort_ret; > + goto put_task; > } > > list_work = kzalloc(sizeof(*list_work), GFP_ATOMIC); > if (!list_work) { > QEDI_ERR(&qedi->dbg_ctx, "Memory allocation failed\n"); > - goto abort_ret; > + goto put_task; > } > > qedi_cmd->type = TYPEIO; > list_work->qedi_cmd = qedi_cmd; > list_work->rtid = cmd->task_id; > list_work->state = QEDI_WORK_SCHEDULED; > + list_work->ctask = ctask; > qedi_cmd->list_tmf_work = list_work; > > QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, @@ -1434,7 +1440,9 > @@ static void qedi_tmf_work(struct work_struct *work) > qedi_cmd->task_id = tid; > qedi_send_iscsi_tmf(qedi_conn, qedi_cmd->task); > > -abort_ret: > +put_task: > + iscsi_put_task(ctask); > +clear_cleanup: > clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > return; > > @@ -1455,6 +1463,8 @@ static void qedi_tmf_work(struct work_struct *work) > } > spin_unlock(&qedi_conn->list_lock); > > + iscsi_put_task(ctask); > + > clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); } > > diff --git a/drivers/scsi/qedi/qedi_iscsi.h b/drivers/scsi/qedi/qedi_iscsi.h index > 39dc27c85e3c..68ef519f5480 100644 > --- a/drivers/scsi/qedi/qedi_iscsi.h > +++ b/drivers/scsi/qedi/qedi_iscsi.h > @@ -212,6 +212,7 @@ struct qedi_cmd { > struct qedi_work_map { > struct list_head list; > struct qedi_cmd *qedi_cmd; > + struct iscsi_task *ctask; > int rtid; > > int state; > -- > 2.25.1 Thanks, Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Cc: Mike Christie <michael.christie@oracle.com> > Subject: [EXT] [PATCH 07/13] scsi: qedi: use GFP_NOIO for tmf allocation > > External Email > > ---------------------------------------------------------------------- > We run from a workqueue with no locks held so use GFP_NOIO. > > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/scsi/qedi/qedi_fw.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index > 542255c94d96..84402e827d42 100644 > --- a/drivers/scsi/qedi/qedi_fw.c > +++ b/drivers/scsi/qedi/qedi_fw.c > @@ -1396,7 +1396,7 @@ static void qedi_abort_work(struct work_struct > *work) > goto put_task; > } > > - list_work = kzalloc(sizeof(*list_work), GFP_ATOMIC); > + list_work = kzalloc(sizeof(*list_work), GFP_NOIO); > if (!list_work) { > QEDI_ERR(&qedi->dbg_ctx, "Memory allocation failed\n"); > goto put_task; > -- > 2.25.1 Thanks, Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Cc: Mike Christie <michael.christie@oracle.com> > Subject: [EXT] [PATCH 10/13] scsi: qedi: fix session block/unblock abuse during > cleanup > > External Email > > ---------------------------------------------------------------------- > Drivers shouldn't be calling block/unblock session for cmd cleanup because the > functions can change the session state from under libiscsi. > This adds a new driver level bit so it can block all IO on the host while it drains the > card. > > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/scsi/qedi/qedi.h | 1 + > drivers/scsi/qedi/qedi_iscsi.c | 17 +++++++++++++++-- > 2 files changed, 16 insertions(+), 2 deletions(-) > > diff --git a/drivers/scsi/qedi/qedi.h b/drivers/scsi/qedi/qedi.h index > c342defc3f52..ce199a7a16b8 100644 > --- a/drivers/scsi/qedi/qedi.h > +++ b/drivers/scsi/qedi/qedi.h > @@ -284,6 +284,7 @@ struct qedi_ctx { > #define QEDI_IN_RECOVERY 5 > #define QEDI_IN_OFFLINE 6 > #define QEDI_IN_SHUTDOWN 7 > +#define QEDI_BLOCK_IO 8 > > u8 mac[ETH_ALEN]; > u32 src_ip[4]; > diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index > 821225f9beb0..536d6653ef8e 100644 > --- a/drivers/scsi/qedi/qedi_iscsi.c > +++ b/drivers/scsi/qedi/qedi_iscsi.c > @@ -330,12 +330,22 @@ qedi_conn_create(struct iscsi_cls_session > *cls_session, uint32_t cid) > > void qedi_mark_device_missing(struct iscsi_cls_session *cls_session) { > - iscsi_block_session(cls_session); > + struct iscsi_session *session = cls_session->dd_data; > + struct qedi_conn *qedi_conn = session->leadconn->dd_data; > + > + spin_lock_bh(&session->frwd_lock); > + set_bit(QEDI_BLOCK_IO, &qedi_conn->qedi->flags); > + 
spin_unlock_bh(&session->frwd_lock); > } > > void qedi_mark_device_available(struct iscsi_cls_session *cls_session) { > - iscsi_unblock_session(cls_session); > + struct iscsi_session *session = cls_session->dd_data; > + struct qedi_conn *qedi_conn = session->leadconn->dd_data; > + > + spin_lock_bh(&session->frwd_lock); > + clear_bit(QEDI_BLOCK_IO, &qedi_conn->qedi->flags); > + spin_unlock_bh(&session->frwd_lock); > } > > static int qedi_bind_conn_to_iscsi_cid(struct qedi_ctx *qedi, @@ -789,6 +799,9 > @@ static int qedi_task_xmit(struct iscsi_task *task) > if (test_bit(QEDI_IN_SHUTDOWN, &qedi_conn->qedi->flags)) > return -ENODEV; > > + if (test_bit(QEDI_BLOCK_IO, &qedi_conn->qedi->flags)) > + return -EACCES; > + > cmd->state = 0; > cmd->task = NULL; > cmd->use_slowpath = false; > -- > 2.25.1 Thanks, Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Cc: Mike Christie <michael.christie@oracle.com> > Subject: [EXT] [PATCH 11/13] scsi: qedi: pass send_iscsi_tmf task to abort > > External Email > > ---------------------------------------------------------------------- > qedi_abort_work knows what task to abort so just pass it to send_iscsi_tmf. > > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/scsi/qedi/qedi_fw.c | 17 +++++------------ > 1 file changed, 5 insertions(+), 12 deletions(-) > > diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index > f13f8af6d931..475cb7823cf1 100644 > --- a/drivers/scsi/qedi/qedi_fw.c > +++ b/drivers/scsi/qedi/qedi_fw.c > @@ -15,7 +15,7 @@ > #include "qedi_fw_scsi.h" > > static int send_iscsi_tmf(struct qedi_conn *qedi_conn, > - struct iscsi_task *mtask); > + struct iscsi_task *mtask, struct iscsi_task *ctask); > > void qedi_iscsi_unmap_sg_list(struct qedi_cmd *cmd) { @@ -1425,7 +1425,7 > @@ static void qedi_abort_work(struct work_struct *work) > goto ldel_exit; > } > > - send_iscsi_tmf(qedi_conn, qedi_cmd->task); > + send_iscsi_tmf(qedi_conn, qedi_cmd->task, ctask); > > put_task: > iscsi_put_task(ctask); > @@ -1455,14 +1455,13 @@ static void qedi_abort_work(struct work_struct > *work) > clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); } > > -static int send_iscsi_tmf(struct qedi_conn *qedi_conn, struct iscsi_task *mtask) > +static int send_iscsi_tmf(struct qedi_conn *qedi_conn, struct iscsi_task *mtask, > + struct iscsi_task *ctask) > { > struct iscsi_tmf_request_hdr tmf_pdu_header; > struct iscsi_task_params task_params; > struct qedi_ctx *qedi = qedi_conn->qedi; > struct e4_iscsi_task_context *fw_task_ctx; > - struct 
iscsi_conn *conn = qedi_conn->cls_conn->dd_data; > - struct iscsi_task *ctask; > struct iscsi_tm *tmf_hdr; > struct qedi_cmd *qedi_cmd; > struct qedi_cmd *cmd; > @@ -1502,12 +1501,6 @@ static int send_iscsi_tmf(struct qedi_conn > *qedi_conn, struct iscsi_task *mtask) > > if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == > ISCSI_TM_FUNC_ABORT_TASK) { > - ctask = iscsi_itt_to_task(conn, tmf_hdr->rtt); > - if (!ctask || !ctask->sc) { > - QEDI_ERR(&qedi->dbg_ctx, > - "Could not get reference task\n"); > - return 0; > - } > cmd = (struct qedi_cmd *)ctask->dd_data; > tmf_pdu_header.rtt = > qedi_set_itt(cmd->task_id, > @@ -1560,7 +1553,7 @@ int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn, > struct iscsi_task *mtask) > case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET: > case ISCSI_TM_FUNC_TARGET_WARM_RESET: > case ISCSI_TM_FUNC_TARGET_COLD_RESET: > - rc = send_iscsi_tmf(qedi_conn, mtask); > + rc = send_iscsi_tmf(qedi_conn, mtask, NULL); > break; > default: > QEDI_ERR(&qedi->dbg_ctx, "Invalid tmf, cid=0x%x\n", > -- > 2.25.1 Thanks, Reviewed-by: Manish Rangankar <mrangankar@marvell.com>
> -----Original Message----- > From: Mike Christie <michael.christie@oracle.com> > Sent: Sunday, April 11, 2021 12:10 AM > To: lduncan@suse.com; martin.petersen@oracle.com; Manish Rangankar > <mrangankar@marvell.com>; Santosh Vernekar <svernekar@marvell.com>; > linux-scsi@vger.kernel.org; jejb@linux.ibm.com > Cc: Mike Christie <michael.christie@oracle.com> > Subject: [EXT] [PATCH 12/13] scsi: qedi: make sure tmf works are done before > disconnect > > External Email > > ---------------------------------------------------------------------- > We need to make sure that abort and reset completion work has completed > before ep_disconnect returns. After ep_disconnect we can't manipulate cmds > because libiscsi will call conn_stop and take ownership. > > We are trying to make sure abort work and reset completion work has > completed before we do the cmd clean up in ep_disconnect. The problem is > that: > > 1. the work function sets the QEDI_CONN_FW_CLEANUP bit, so if the work was > still pending we would not see the bit set. We need to do this before the work is > queued. > > 2. If we had multiple works queued then we could break from the loop in > qedi_ep_disconnect early because when abort work 1 completes it could clear > QEDI_CONN_FW_CLEANUP. qedi_ep_disconnect could then see that before > work 2 has run. > > 3. A tmf reset completion work could run after ep_disconnect starts cleaning up > cmds via qedi_clearsq. ep_disconnect's call to qedi_clearsq > -> qedi_cleanup_all_io might think it's done cleaning up cmds, > but the reset completion work could still be running. We then return from > ep_disconnect while still doing cleanup. > > This replaces the bit with a counter and adds a bool to prevent new works from > starting from the completion path once an ep_disconnect starts. 
> > Signed-off-by: Mike Christie <michael.christie@oracle.com> > --- > drivers/scsi/qedi/qedi_fw.c | 46 +++++++++++++++++++++------------- > drivers/scsi/qedi/qedi_iscsi.c | 23 +++++++++++------ > drivers/scsi/qedi/qedi_iscsi.h | 4 +-- > 3 files changed, 47 insertions(+), 26 deletions(-) > > diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index > 475cb7823cf1..13dd06915d74 100644 > --- a/drivers/scsi/qedi/qedi_fw.c > +++ b/drivers/scsi/qedi/qedi_fw.c > @@ -156,7 +156,6 @@ static void qedi_tmf_resp_work(struct work_struct > *work) > struct iscsi_tm_rsp *resp_hdr_ptr; > int rval = 0; > > - set_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > resp_hdr_ptr = (struct iscsi_tm_rsp *)qedi_cmd->tmf_resp_buf; > > rval = qedi_cleanup_all_io(qedi, qedi_conn, qedi_cmd->task, true); @@ > -169,7 +168,10 @@ static void qedi_tmf_resp_work(struct work_struct *work) > > exit_tmf_resp: > kfree(resp_hdr_ptr); > - clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > + > + spin_lock(&qedi_conn->tmf_work_lock); > + qedi_conn->fw_cleanup_works--; > + spin_unlock(&qedi_conn->tmf_work_lock); > } > > static void qedi_process_tmf_resp(struct qedi_ctx *qedi, @@ -225,16 +227,25 > @@ static void qedi_process_tmf_resp(struct qedi_ctx *qedi, > } > spin_unlock(&qedi_conn->list_lock); > > - if (((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == > - ISCSI_TM_FUNC_LOGICAL_UNIT_RESET) || > - ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == > - ISCSI_TM_FUNC_TARGET_WARM_RESET) || > - ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == > - ISCSI_TM_FUNC_TARGET_COLD_RESET)) { > + spin_lock(&qedi_conn->tmf_work_lock); > + switch (tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) { > + case ISCSI_TM_FUNC_LOGICAL_UNIT_RESET: > + case ISCSI_TM_FUNC_TARGET_WARM_RESET: > + case ISCSI_TM_FUNC_TARGET_COLD_RESET: > + if (qedi_conn->ep_disconnect_starting) { > + /* Session is down so ep_disconnect will clean up */ > + spin_unlock(&qedi_conn->tmf_work_lock); > + goto unblock_sess; > + } > + > + 
qedi_conn->fw_cleanup_works++; > + spin_unlock(&qedi_conn->tmf_work_lock); > + > INIT_WORK(&qedi_cmd->tmf_work, qedi_tmf_resp_work); > queue_work(qedi->tmf_thread, &qedi_cmd->tmf_work); > goto unblock_sess; > } > + spin_unlock(&qedi_conn->tmf_work_lock); > > __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, NULL, 0); > kfree(resp_hdr_ptr); > @@ -1361,7 +1372,6 @@ static void qedi_abort_work(struct work_struct > *work) > > mtask = qedi_cmd->task; > tmf_hdr = (struct iscsi_tm *)mtask->hdr; > - set_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > > spin_lock_bh(&conn->session->back_lock); > ctask = iscsi_itt_to_ctask(conn, tmf_hdr->rtt); @@ -1427,11 +1437,7 > @@ static void qedi_abort_work(struct work_struct *work) > > send_iscsi_tmf(qedi_conn, qedi_cmd->task, ctask); > > -put_task: > - iscsi_put_task(ctask); > -clear_cleanup: > - clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > - return; > + goto put_task; > > ldel_exit: > spin_lock_bh(&qedi_conn->tmf_work_lock); > @@ -1449,10 +1455,12 @@ static void qedi_abort_work(struct work_struct > *work) > qedi_conn->active_cmd_count--; > } > spin_unlock(&qedi_conn->list_lock); > - > +put_task: > iscsi_put_task(ctask); > - > - clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > +clear_cleanup: > + spin_lock(&qedi_conn->tmf_work_lock); > + qedi_conn->fw_cleanup_works--; > + spin_unlock(&qedi_conn->tmf_work_lock); > } > > static int send_iscsi_tmf(struct qedi_conn *qedi_conn, struct iscsi_task *mtask, > @@ -1547,6 +1555,10 @@ int qedi_send_iscsi_tmf(struct qedi_conn > *qedi_conn, struct iscsi_task *mtask) > > switch (tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) { > case ISCSI_TM_FUNC_ABORT_TASK: > + spin_lock(&qedi_conn->tmf_work_lock); > + qedi_conn->fw_cleanup_works++; > + spin_unlock(&qedi_conn->tmf_work_lock); > + > INIT_WORK(&qedi_cmd->tmf_work, qedi_abort_work); > queue_work(qedi->tmf_thread, &qedi_cmd->tmf_work); > break; > diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index > 
536d6653ef8e..8bbdd45ff2a1 100644 > --- a/drivers/scsi/qedi/qedi_iscsi.c > +++ b/drivers/scsi/qedi/qedi_iscsi.c > @@ -592,7 +592,11 @@ static int qedi_conn_start(struct iscsi_cls_conn > *cls_conn) > goto start_err; > } > > - clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); > + spin_lock(&qedi_conn->tmf_work_lock); > + qedi_conn->fw_cleanup_works = 0; > + qedi_conn->ep_disconnect_starting = false; > + spin_unlock(&qedi_conn->tmf_work_lock); > + > qedi_conn->abrt_conn = 0; > > rval = iscsi_conn_start(cls_conn); > @@ -1009,7 +1013,6 @@ static void qedi_ep_disconnect(struct iscsi_endpoint > *ep) > int ret = 0; > int wait_delay; > int abrt_conn = 0; > - int count = 10; > > wait_delay = 60 * HZ + DEF_MAX_RT_TIME; > qedi_ep = ep->dd_data; > @@ -1027,13 +1030,19 @@ static void qedi_ep_disconnect(struct > iscsi_endpoint *ep) > iscsi_suspend_queue(conn); > abrt_conn = qedi_conn->abrt_conn; > > - while (count--) { > - if (!test_bit(QEDI_CONN_FW_CLEANUP, > - &qedi_conn->flags)) { > - break; > - } > + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, > + "cid=0x%x qedi_ep=%p waiting for %d tmfs\n", > + qedi_ep->iscsi_cid, qedi_ep, > + qedi_conn->fw_cleanup_works); > + > + spin_lock(&qedi_conn->tmf_work_lock); > + qedi_conn->ep_disconnect_starting = true; > + while (qedi_conn->fw_cleanup_works > 0) { > + spin_unlock(&qedi_conn->tmf_work_lock); > msleep(1000); > + spin_lock(&qedi_conn->tmf_work_lock); > } > + spin_unlock(&qedi_conn->tmf_work_lock); > > if (test_bit(QEDI_IN_RECOVERY, &qedi->flags)) { > if (qedi_do_not_recover) { > diff --git a/drivers/scsi/qedi/qedi_iscsi.h b/drivers/scsi/qedi/qedi_iscsi.h index > 68ef519f5480..758735209e15 100644 > --- a/drivers/scsi/qedi/qedi_iscsi.h > +++ b/drivers/scsi/qedi/qedi_iscsi.h > @@ -169,8 +169,8 @@ struct qedi_conn { > struct list_head tmf_work_list; > wait_queue_head_t wait_queue; > spinlock_t tmf_work_lock; /* tmf work lock */ > - unsigned long flags; > -#define QEDI_CONN_FW_CLEANUP 1 > + bool ep_disconnect_starting; > + int 
fw_cleanup_works; > }; > > struct qedi_cmd { > -- > 2.25.1 Thanks, Reviewed-by: Manish Rangankar <mrangankar@marvell.com>