diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/fs/ceph/caps.c | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
Collisions have been removed because its logic was found on the
source already.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/fs/ceph/caps.c')
-rw-r--r-- | kernel/fs/ceph/caps.c | 832 |
1 files changed, 571 insertions, 261 deletions
diff --git a/kernel/fs/ceph/caps.c b/kernel/fs/ceph/caps.c index be5ea6af8..c69e1253b 100644 --- a/kernel/fs/ceph/caps.c +++ b/kernel/fs/ceph/caps.c @@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + if (ci->i_rdcache_ref || + (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from session list */ spin_lock(&session->s_cap_lock); - /* - * s_cap_reconnect is protected by s_cap_lock. no one changes - * s_cap_gen while session is in the reconnect state. - */ - if (queue_release && - (!session->s_cap_reconnect || - cap->cap_gen == session->s_cap_gen)) - __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, - cap->mseq, cap->issue_seq); - if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; + + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + cap->queue_release = 1; + if (removed) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + spin_unlock(&session->s_cap_lock); /* remove from inode list */ @@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) static int send_cap_msg(struct ceph_mds_session *session, u64 ino, u64 cid, int op, int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, - u64 size, u64 max_size, + u32 seq, u64 flush_tid, u64 oldest_flush_tid, + u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, @@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session, size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u mseq %u follows %lld size %llu/%llu" + " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), ceph_cap_string(dirty), - seq, issue_seq, mseq, follows, size, max_size, + seq, issue_seq, flush_tid, oldest_flush_tid, + mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size */ - extra_len = 4 + 8 + 4; + /* flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ + extra_len = 4 + 8 + 4 + 4 + 8; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; + msg->hdr.version = cpu_to_le16(6); msg->hdr.tid = cpu_to_le64(flush_tid); fc = msg->front.iov_base; @@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); + /* osd_epoch_barrier */ + ceph_encode_32(&p, 0); + /* oldest_flush_tid */ + ceph_encode_64(&p, oldest_flush_tid); fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { @@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %llx release to mds%d msg %p (%d left)\n", - ino, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ino); - item->cap_id = cpu_to_le64(cap_id); - item->migrate_seq = cpu_to_le32(migrate_seq); - item->seq = cpu_to_le32(issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } -} - /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. @@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode) */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, - unsigned *pflush_tid) + u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; @@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; int delayed = 0; - u64 flush_tid = 0; - int i; int ret; bool inline_data; @@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - if (flushing) { - /* - * assign a tid for flush operations so we can avoid - * flush1 -> dirty1 -> flush2 -> flushack1 -> mark - * clean type races. track latest tid for every bit - * so we can handle flush AxFw, flush Fw, and have the - * first ack clean Ax. - */ - flush_tid = ++ci->i_cap_flush_last_tid; - if (pflush_tid) - *pflush_tid = flush_tid; - dout(" cap_flush_tid %d\n", (int)flush_tid); - for (i = 0; i < CEPH_CAP_BITS; i++) - if (flushing & (1 << i)) - ci->i_cap_flush_tid[i] = flush_tid; - - follows = ci->i_head_snapc->seq; - } else { - follows = 0; - } + follows = flushing ? ci->i_head_snapc->seq : 0; keep = cap->implemented; seq = cap->seq; @@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + op, keep, want, flushing, seq, + flush_tid, oldest_flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, follows, inline_data); @@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @again is true, skip cap_snaps that were already sent to + * Unless @kick is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, - int again) + int kick) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { @@ -1297,11 +1257,8 @@ retry: if (capsnap->dirty_pages || capsnap->writing) break; - /* - * if cap writeback already occurred, we should have dropped - * the capsnap in ceph_put_wrbuffer_cap_refs. - */ - BUG_ON(capsnap->dirty == 0); + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); /* pick mds, take s_mutex */ if (ci->i_auth_cap == NULL) { @@ -1310,7 +1267,7 @@ retry: } /* only flush each capsnap once */ - if (!again && !list_empty(&capsnap->flushing_item)) { + if (!kick && !list_empty(&capsnap->flushing_item)) { dout("already flushed %p, skipping\n", capsnap); continue; } @@ -1320,6 +1277,9 @@ retry: if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); + if (kick) + goto out; + mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; @@ -1343,20 +1303,22 @@ retry: goto retry; } - capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->flush_tid = ++mdsc->last_cap_flush_tid; + spin_unlock(&mdsc->cap_dirty_lock); + atomic_inc(&capsnap->nref); - if (!list_empty(&capsnap->flushing_item)) - list_del_init(&capsnap->flushing_item); - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); + if (list_empty(&capsnap->flushing_item)) + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, - capsnap->size, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, + 0, mseq, capsnap->size, 0, &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, @@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ -int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - if (!ci->i_head_snapc) + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); + } dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); @@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ihold(inode); dirty |= I_DIRTY_SYNC; } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && @@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) return dirty; } +static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &ci->i_cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, i_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->i_node, parent, p); + rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); +} + +static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &mdsc->cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, g_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->g_node, parent, p); + rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); +} + +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) +{ + struct rb_node *n = rb_first(&mdsc->cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, g_node); + return cf->tid; + } + return 0; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. @@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; int flushing; BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", @@ -1463,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode, ci->i_dirty_caps = 0; dout(" inode %p now !dirty\n", inode); + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); + cf->tid = ++mdsc->last_cap_flush_tid; + __add_cap_flushing_to_mdsc(mdsc, cf); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing tid %llu\n", inode, cf->tid); } else { list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing (more) tid %llu\n", + inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); + __add_cap_flushing_to_inode(ci, cf); + + *flush_tid = cf->tid; return flushing; } @@ -1524,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; @@ -1553,13 +1603,13 @@ retry: retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); - want = file_wanted | used; issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; - retain = want | CEPH_CAP_PIN; + want = file_wanted; + retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { - if (want) { + if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && @@ -1602,11 +1652,11 @@ retry_locked: * If we fail, it's because pages are locked.... try again later. */ if ((!is_delayed || mdsc->stopping) && - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ - (revoking & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ + !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { dout("check_caps trying to invalidate on %p\n", inode); if (try_nonblocking_invalidate(inode) < 0) { @@ -1742,17 +1792,25 @@ ack: took_snap_rwsem = 1; } - if (cap == ci->i_auth_cap && ci->i_dirty_caps) - flushing = __mark_caps_flushing(inode, session); - else + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = __mark_caps_flushing(inode, session, + &flush_tid, + &oldest_flush_tid); + } else { flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } mds = cap->mds; /* remember mds, so we don't repeat */ sent++; /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, NULL); + want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1781,12 +1839,13 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int flushing = 0; struct ceph_mds_session *session = NULL; + int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1811,42 +1870,54 @@ retry: if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session); + flushing = __mark_caps_flushing(inode, session, &flush_tid, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - cap->issued | cap->implemented, flushing, - flush_tid); - if (!delayed) - goto out_unlocked; + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + struct rb_node *n = rb_last(&ci->i_cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, i_node); + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); } out: - spin_unlock(&ci->i_ceph_lock); -out_unlocked: if (session) mutex_unlock(&session->s_mutex); + + *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ -static int caps_are_flushed(struct inode *inode, unsigned tid) +static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int i, ret = 1; + struct ceph_cap_flush *cf; + struct rb_node *n; + int ret = 1; spin_lock(&ci->i_ceph_lock); - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((ci->i_flushing_caps & (1 << i)) && - ci->i_cap_flush_tid[i] <= tid) { - /* still flushing this bit */ + n = rb_first(&ci->i_cap_flush_tree); + if (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid <= flush_tid) ret = 0; - break; - } + } spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1864,13 +1935,16 @@ static void sync_write_wait(struct inode *inode) struct ceph_osd_request *req; u64 last_tid; + if (!S_ISREG(inode->i_mode)) + return; + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; /* set upper bound as _last_ entry in chain */ - req = list_entry(head->prev, struct ceph_osd_request, - r_unsafe_item); + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); last_tid = req->r_tid; do { @@ -1888,18 +1962,61 @@ static void sync_write_wait(struct inode *inode) */ if (list_empty(head)) break; - req = list_entry(head->next, struct ceph_osd_request, - r_unsafe_item); + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); } +/* + * wait for any unsafe requests to complete. + */ +static int unsafe_request_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req1 = NULL, *req2 = NULL; + int ret, err = 0; + + spin_lock(&ci->i_unsafe_lock); + if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { + req1 = list_last_entry(&ci->i_unsafe_dirops, + struct ceph_mds_request, + r_unsafe_dir_item); + ceph_mdsc_get_request(req1); + } + if (!list_empty(&ci->i_unsafe_iops)) { + req2 = list_last_entry(&ci->i_unsafe_iops, + struct ceph_mds_request, + r_unsafe_target_item); + ceph_mdsc_get_request(req2); + } + spin_unlock(&ci->i_unsafe_lock); + + dout("unsafe_requeset_wait %p wait on tid %llu %llu\n", + inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); + if (req1) { + ret = !wait_for_completion_timeout(&req1->r_safe_completion, + ceph_timeout_jiffies(req1->r_timeout)); + if (ret) + err = -EIO; + ceph_mdsc_put_request(req1); + } + if (req2) { + ret = !wait_for_completion_timeout(&req2->r_safe_completion, + ceph_timeout_jiffies(req2->r_timeout)); + if (ret) + err = -EIO; + ceph_mdsc_put_request(req2); + } + return err; +} + int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int ret; int dirty; @@ -1908,25 +2025,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) - return ret; + goto out; + + if (datasync) + goto out; + mutex_lock(&inode->i_mutex); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + ret = unsafe_request_wait(inode); + /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - dout("fsync waiting for flush_tid %u\n", flush_tid); + if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { ret = wait_event_interruptible(ci->i_cap_wq, - caps_are_flushed(inode, flush_tid)); + caps_are_flushed(inode, flush_tid)); } - - dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); mutex_unlock(&inode->i_mutex); +out: + dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; } @@ -1939,7 +2061,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int err = 0; int dirty; int wait = wbc->sync_mode == WB_SYNC_ALL; @@ -1994,6 +2116,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } } +static int __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + struct rb_node *n; + int delayed = 0; + u64 first_tid = 0; + u64 oldest_flush_tid; + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + while (true) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + break; + } + + for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid >= first_tid) + break; + } + if (!n) { + spin_unlock(&ci->i_ceph_lock); + break; + } + + cf = rb_entry(n, struct ceph_cap_flush, i_node); + + first_tid = cf->tid + 1; + + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, + cap, cf->tid, ceph_cap_string(cf->caps)); + delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + } + return delayed; +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + dout("early_kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. This guarantees MDS * processes + * the cap flush message before issuing the flushing caps to + * other client. + */ + if ((cap->issued & ci->i_flushing_caps) != + ci->i_flushing_caps) { + spin_unlock(&ci->i_ceph_lock); + if (!__kick_flushing_caps(mdsc, session, ci)) + continue; + spin_lock(&ci->i_ceph_lock); + } + + spin_unlock(&ci->i_ceph_lock); + } +} + void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { @@ -2003,28 +2212,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - int delayed = 0; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p %s\n", inode, - cap, ceph_cap_string(ci->i_flushing_caps)); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); + int delayed = __kick_flushing_caps(mdsc, session, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } } @@ -2036,26 +2227,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - int delayed = 0; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, - ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + dout("kick_flushing_inode_caps %p flushing %s\n", inode, + ceph_cap_string(ci->i_flushing_caps)); __ceph_flush_snaps(ci, &session, 1); if (ci->i_flushing_caps) { + int delayed; + spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); + spin_unlock(&ci->i_ceph_lock); + + delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2073,7 +2263,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got) +static void __take_cap_refs(struct ceph_inode_info *ci, int got, + bool snap_rwsem_locked) { if (got & CEPH_CAP_PIN) ci->i_pin_ref++; @@ -2081,8 +2272,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; - if (got & CEPH_CAP_FILE_WR) + if (got & CEPH_CAP_FILE_WR) { + if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { + BUG_ON(!snap_rwsem_locked); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } ci->i_wr_ref++; + } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); @@ -2100,16 +2297,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, int *check_max, int *err) + loff_t endoff, bool nonblock, int *got, int *err) { struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; int file_wanted; + bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2125,6 +2325,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) { + up_read(&mdsc->snap_rwsem); + snap_rwsem_locked = false; + } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } @@ -2136,7 +2340,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) { - *check_max = 1; + *err = -EAGAIN; ret = 1; } goto out_unlock; @@ -2164,8 +2368,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (nonblock) { + *err = -EAGAIN; + ret = 1; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } *got = need | (have & want); - __take_cap_refs(ci, *got); + __take_cap_refs(ci, *got, true); ret = 1; } } else { @@ -2184,11 +2409,21 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out_unlock; } + if (!__ceph_is_any_caps(ci) && + ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); @@ -2231,50 +2466,70 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, check_max, ret, err = 0; + int _got, ret, err = 0; -retry: - if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); - _got = 0; - check_max = 0; - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - &_got, &check_max, &err)); - if (err) - ret = err; + ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - if (check_max) - goto retry; + while (true) { + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { - struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. - */ - ceph_put_cap_refs(ci, _got); + err = 0; _got = 0; + ret = try_get_cap_refs(ci, need, want, endoff, + false, &_got, &err); + if (ret) { + if (err == -EAGAIN) + continue; + if (err < 0) + return err; + } else { + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)); + if (err == -EAGAIN) + continue; + if (err < 0) + ret = err; + if (ret < 0) + return ret; + } - /* getattr request will bring inline data into page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret < 0) - return ret; - goto retry; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = + find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + break; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while + * holding * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* + * getattr request will bring inline data into + * page cache + */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, + true); + if (ret < 0) + return ret; + continue; + } + break; } -out: + *got = _got; return 0; } @@ -2286,10 +2541,31 @@ out: void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps); + __take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } + +/* + * drop cap_snap that is not associated with any snapshot. + * we don't need to send FLUSHSNAP message for it. + */ +static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (!capsnap->need_flush && + !capsnap->writing && !capsnap->dirty_pages) { + + dout("dropping cap_snap %p follows %llu\n", + capsnap, capsnap->follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + return 1; + } + return 0; +} + /* * Release cap refs. * @@ -2303,7 +2579,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; - struct ceph_cap_snap *capsnap; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -2325,17 +2600,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { last++; - if (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - if (capsnap->writing) { - capsnap->writing = 0; - flushsnaps = - __ceph_finish_cap_snap(ci, - capsnap); - wake = 1; - } + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(capsnap)) + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) @@ -2352,7 +2634,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ceph_flush_snaps(ci); if (wake) wake_up_all(&ci->i_cap_wq); - if (put) + while (put-- > 0) iput(inode); } @@ -2380,7 +2662,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && - ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2401,25 +2685,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = 1; - if (capsnap->dirty == 0) - /* cap writeback completed before we created - * the cap_snap; no FLUSHSNAP is needed */ - drop_capsnap = 1; + drop_capsnap = ceph_try_drop_cap_snap(capsnap); } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s%s\n", + " snap %lld %d/%d -> %d/%d %s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : "", - drop_capsnap ? " (drop capsnap)" : ""); - if (drop_capsnap) { - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - } + complete_capsnap ? " (complete capsnap)" : ""); } spin_unlock(&ci->i_ceph_lock); @@ -2526,7 +2800,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode)) { @@ -2732,16 +3007,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *cf; + struct rb_node *n; + LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; int drop = 0; - int i; - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((dirty & (1 << i)) && - (u16)flush_tid == ci->i_cap_flush_tid[i]) - cleaned |= 1 << i; + n = rb_first(&ci->i_cap_flush_tree); + while (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + n = rb_next(&cf->i_node); + if (cf->tid == flush_tid) + cleaned = cf->caps; + if (cf->tid <= flush_tid) { + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add_tail(&cf->list, &to_remove); + } else { + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," " flushing %s -> %s\n", @@ -2749,12 +3037,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); - if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); + + if (!list_empty(&to_remove)) { + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (!cf || cf->tid > flush_tid) + wake_up_all(&mdsc->cap_flushing_wq); + } + if (ci->i_flushing_caps == 0) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) @@ -2764,14 +3063,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; - if (ci->i_wrbuffer_ref_head == 0) { + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2785,6 +3084,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, out: spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } if (drop) iput(inode); } @@ -2800,6 +3106,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; int drop = 0; @@ -2823,6 +3130,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); + wake_up_all(&mdsc->cap_flushing_wq); drop = 1; break; } else { @@ -2971,7 +3279,6 @@ retry: mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } - ceph_add_cap_releases(mdsc, tsession); new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); @@ -3167,16 +3474,20 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); - if (op == CEPH_CAP_OP_IMPORT) - ceph_add_cap_releases(mdsc, session); - if (!inode) { dout(" i don't have ino %llx\n", vino.ino); if (op == CEPH_CAP_OP_IMPORT) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = cap_id; + cap->mseq = mseq; + cap->seq = seq; spin_lock(&session->s_cap_lock); - __queue_cap_release(session, vino.ino, cap_id, - mseq, seq); + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; spin_unlock(&session->s_cap_lock); } goto flush_cap_releases; @@ -3252,11 +3563,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, flush_cap_releases: /* - * send any full release message to try to move things + * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); done: |