@@ -3293,6 +3293,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
+ bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p);
@@ -3309,8 +3310,28 @@ static void handle_forward(struct ceph_mds_client *mdsc,
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
+ /*
+ * The type of 'num_fwd' in ceph 'MClientRequestForward'
+ * is 'int32_t', while in 'ceph_mds_request_head' the
+ * type is '__u8'. So in case the request bounces between
+ * MDSes exceeding 256 times, the client will get stuck.
+ *
+ * In this case it's ususally a bug in MDS and continue
+ * bouncing the request makes no sense.
+ */
+ if (req->r_num_fwd == 256) {
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EIO;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ dout("forward tid %llu to mds%d - seq overflowed %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ goto out;
+ } else {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ }
} else {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -3322,9 +3343,13 @@ static void handle_forward(struct ceph_mds_client *mdsc,
put_request_session(req);
__do_request(mdsc, req);
}
- ceph_mdsc_put_request(req);
out:
mutex_unlock(&mdsc->mutex);
+
+ /* kick calling process */
+ if (aborted)
+ complete_request(mdsc, req);
+ ceph_mdsc_put_request(req);
return;
bad: