Message ID | 20200916173854.330265-1-jlayton@kernel.org |
---|---|
Headers | show |
Series | ceph: addr.c cleanups | expand |
On Wed, 2020-09-16 at 13:38 -0400, Jeff Layton wrote: > ...and reorganize the loop for better clarity. > > Signed-off-by: Jeff Layton <jlayton@kernel.org> > --- > fs/ceph/addr.c | 148 ++++++++++++++++++++++--------------------------- > 1 file changed, 65 insertions(+), 83 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index a6c54cfc3fea..89c4d8a9a5eb 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -1251,6 +1251,8 @@ static int context_is_writeable_or_written(struct inode *inode, > * @inode: inode associated with page > * @page: page being dirtied > * > + * We are only allowed to write into/dirty the page if the page is > + * clean, or already dirty within the same snap context. > * Returns NULL on success, negative error code on error, and a snapc ref that should be > * waited on otherwise. > */ > @@ -1307,104 +1309,84 @@ ceph_find_incompatible(struct inode *inode, struct page *page) > /* > * We are only allowed to write into/dirty the page if the page is > * clean, or already dirty within the same snap context. > - * > - * called with page locked. > - * return success with page locked, > - * or any failure (incl -EAGAIN) with page unlocked. > */ > -static int ceph_update_writeable_page(struct file *file, > - loff_t pos, unsigned len, > - struct page *page) > +static int ceph_write_begin(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned flags, > + struct page **pagep, void **fsdata) > { > struct inode *inode = file_inode(file); > struct ceph_inode_info *ci = ceph_inode(inode); > struct ceph_snap_context *snapc; > - loff_t page_off = pos & PAGE_MASK; > + struct page *page = NULL; > + pgoff_t index = pos >> PAGE_SHIFT; > int pos_in_page = pos & ~PAGE_MASK; > - int end_in_page = pos_in_page + len; > - loff_t i_size; > - int r; > + int r = 0; > > -retry_locked: > - snapc = ceph_find_incompatible(inode, page); > - if (snapc) { > - if (IS_ERR(snapc)) { > - r = PTR_ERR(snapc); > - goto fail_unlock; > + dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); > + > + for (;;) { > + page = grab_cache_page_write_begin(mapping, index, 0); > + if (!page) { > + r = -ENOMEM; > + break; > } > - unlock_page(page); > - ceph_queue_writeback(inode); > - r = wait_event_killable(ci->i_cap_wq, > - context_is_writeable_or_written(inode, snapc)); > - ceph_put_snap_context(snapc); > - return -EAGAIN; > - } > > - if (PageUptodate(page)) { > - dout(" page %p already uptodate\n", page); > - return 0; > - } > + snapc = ceph_find_incompatible(inode, page); > + if (snapc) { > + if (IS_ERR(snapc)) { > + r = PTR_ERR(snapc); > + break; > + } > + unlock_page(page); > + ceph_queue_writeback(inode); > + r = wait_event_killable(ci->i_cap_wq, > + context_is_writeable_or_written(inode, snapc)); > + ceph_put_snap_context(snapc); > + put_page(page); > + page = NULL; Now that I look a bit more, there is probably no need to hold the page reference across this wait. I'll change that to put it after unlocking and re-test. > + if (r != 0) > + break; > + continue; > + } > > - /* full page? */ > - if (pos_in_page == 0 && len == PAGE_SIZE) > - return 0; > + if (PageUptodate(page)) { > + dout(" page %p already uptodate\n", page); > + break; > + } > > - /* past end of file? */ > - i_size = i_size_read(inode); > - > - if (page_off >= i_size || > - (pos_in_page == 0 && (pos+len) >= i_size && > - end_in_page - pos_in_page != PAGE_SIZE)) { > - dout(" zeroing %p 0 - %d and %d - %d\n", > - page, pos_in_page, end_in_page, (int)PAGE_SIZE); > - zero_user_segments(page, > - 0, pos_in_page, > - end_in_page, PAGE_SIZE); > - return 0; > - } > + /* > + * In some cases we don't need to read at all: > + * - full page write > + * - write that lies completely beyond EOF > + * - write that covers the the page from start to EOF or beyond it > + */ > + if ((pos_in_page == 0 && len == PAGE_SIZE) || > + (pos >= i_size_read(inode)) || > + (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { > + zero_user_segments(page, 0, pos_in_page, > + pos_in_page + len, PAGE_SIZE); > + break; > + } > > - /* we need to read it. */ > - r = ceph_do_readpage(file, page); > - if (r < 0) { > - if (r == -EINPROGRESS) > - return -EAGAIN; > - goto fail_unlock; > + /* > + * We need to read it. If we get back -EINPROGRESS, then the page was > + * handed off to fscache and it will be unlocked when the read completes. > + * Refind the page in that case so we can reacquire the page lock. Otherwise > + * we got a hard error or the read was completed synchronously. > + */ > + r = ceph_do_readpage(file, page); > + if (r != -EINPROGRESS) > + break; > } > - goto retry_locked; > -fail_unlock: > - unlock_page(page); > - return r; > -} > > -/* > - * We are only allowed to write into/dirty the page if the page is > - * clean, or already dirty within the same snap context. > - */ > -static int ceph_write_begin(struct file *file, struct address_space *mapping, > - loff_t pos, unsigned len, unsigned flags, > - struct page **pagep, void **fsdata) > -{ > - struct inode *inode = file_inode(file); > - struct page *page; > - pgoff_t index = pos >> PAGE_SHIFT; > - int r; > - > - do { > - /* get a page */ > - page = grab_cache_page_write_begin(mapping, index, 0); > - if (!page) > - return -ENOMEM; > - > - dout("write_begin file %p inode %p page %p %d~%d\n", file, > - inode, page, (int)pos, (int)len); > - > - r = ceph_update_writeable_page(file, pos, len, page); > - if (r < 0) > + if (r < 0) { > + if (page) { > + unlock_page(page); > put_page(page); > - else > - *pagep = page; > - } while (r == -EAGAIN); > - > + } > + } else { > + *pagep = page; > + } > return r; > } > -- Jeff Layton <jlayton@kernel.org>
We're seeing file corruption while running 5.10, bisected to 1cc1699070bd: >> +static int ceph_write_begin(struct file *file, struct address_space *mapping, >> + loff_t pos, unsigned len, unsigned flags, >> + struct page **pagep, void **fsdata) <snip> >> + /* >> + * In some cases we don't need to read at all: >> + * - full page write >> + * - write that lies completely beyond EOF >> + * - write that covers the the page from start to EOF or beyond it >> + */ >> + if ((pos_in_page == 0 && len == PAGE_SIZE) || >> + (pos >= i_size_read(inode)) || Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ? Seems like fs/netfs/read_helper.c currently has the same issue? >> + (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { >> + zero_user_segments(page, 0, pos_in_page, >> + pos_in_page + len, PAGE_SIZE); >> + break; >> + } -- Andrew W. Elble Infrastructure Engineer Information and Technology Services Rochester Institute of Technology tel: (585)-475-2411 | aweits@rit.edu PGP: BFAD 8461 4CCF DC95 DA2C B0EB 965B 082E 863E C912
On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote: > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd: > > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping, > > > + loff_t pos, unsigned len, unsigned flags, > > > + struct page **pagep, void **fsdata) > > <snip> > > > > + /* > > > + * In some cases we don't need to read at all: > > > + * - full page write > > > + * - write that lies completely beyond EOF > > > + * - write that covers the the page from start to EOF or beyond it > > > + */ > > > + if ((pos_in_page == 0 && len == PAGE_SIZE) || > > > + (pos >= i_size_read(inode)) || > > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ? > > Seems like fs/netfs/read_helper.c currently has the same issue? > Yeah...I think you may be right. Have you tried a patch that does that and does it fix the issue? Also, do you happen to have a testcase that we could stick in xfstests for this? If not, we can probably write one, but if you already have one that'd be great. > > > + (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { > > > + zero_user_segments(page, 0, pos_in_page, > > > + pos_in_page + len, PAGE_SIZE); > > > + break; > > > + } > -- Jeff Layton <jlayton@kernel.org>
Jeff Layton <jlayton@kernel.org> wrote: > On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote: > > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd: > > > > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping, > > > > + loff_t pos, unsigned len, unsigned flags, > > > > + struct page **pagep, void **fsdata) > > > > <snip> > > > > > > + /* > > > > + * In some cases we don't need to read at all: > > > > + * - full page write > > > > + * - write that lies completely beyond EOF > > > > + * - write that covers the the page from start to EOF or beyond it > > > > + */ > > > > + if ((pos_in_page == 0 && len == PAGE_SIZE) || > > > > + (pos >= i_size_read(inode)) || > > > > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ? > > > > Seems like fs/netfs/read_helper.c currently has the same issue? That's not quite right either. page may be larger than PAGE_MASK if grab_cache_page_write_begin() returns a THP (if that's possible). Maybe: (pos & thp_size(page) - 1) >= i_size_read(inode) Really, we want something like thp_pos(). Maybe Willy has something like that up his sleeve. In fact, in netfs_write_begin(), index and pos_in_page should be calculated after grab_cache_page_write_begin() has been called, just in case the new page extends before the page containing the requested position. David
On Fri, Jun 11, 2021 at 04:11:49PM +0100, David Howells wrote: > Jeff Layton <jlayton@kernel.org> wrote: > > > On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote: > > > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd: > > > > > > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping, > > > > > + loff_t pos, unsigned len, unsigned flags, > > > > > + struct page **pagep, void **fsdata) > > > > > > <snip> > > > > > > > > + /* > > > > > + * In some cases we don't need to read at all: > > > > > + * - full page write > > > > > + * - write that lies completely beyond EOF > > > > > + * - write that covers the the page from start to EOF or beyond it > > > > > + */ > > > > > + if ((pos_in_page == 0 && len == PAGE_SIZE) || > > > > > + (pos >= i_size_read(inode)) || > > > > > > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ? > > > > > > Seems like fs/netfs/read_helper.c currently has the same issue? How does (pos & PAGE_MASK) >= i_size_read() make sense? That could only be true if the file is less than a page in size, whereas it should always be true if the write starts outside the current i_size. > That's not quite right either. page may be larger than PAGE_MASK if > grab_cache_page_write_begin() returns a THP (if that's possible). > > Maybe: > > (pos & thp_size(page) - 1) >= i_size_read(inode) > > Really, we want something like thp_pos(). Maybe Willy has something like that > up his sleeve. > > In fact, in netfs_write_begin(), index and pos_in_page should be calculated > after grab_cache_page_write_begin() has been called, just in case the new page > extends before the page containing the requested position. Yes. I do that kind of thing in iomap. What you're doing there looks a bit like offset_in_folio(), but I don't understand the problem with just checking pos against i_size directly. https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio contains a number of commits that start 'iomap:' which may be of interest.
Matthew Wilcox <willy@infradead.org> wrote: > Yes. I do that kind of thing in iomap. What you're doing there looks > a bit like offset_in_folio(), but I don't understand the problem with > just checking pos against i_size directly. pos can be in the middle of a page. If i_size is at, say, the same point in the middle of that page and the page isn't currently in the cache, then we'll just clear the entire page and not read the bottom part of it (ie. the bit prior to the EOF). It's odd, though, that xfstests doesn't catch this. David
On Fri, 2021-06-11 at 16:20 +0100, Matthew Wilcox wrote: > On Fri, Jun 11, 2021 at 04:11:49PM +0100, David Howells wrote: > > Jeff Layton <jlayton@kernel.org> wrote: > > > > > On Fri, 2021-06-11 at 10:14 -0400, Andrew W Elble wrote: > > > > We're seeing file corruption while running 5.10, bisected to 1cc1699070bd: > > > > > > > > > > +static int ceph_write_begin(struct file *file, struct address_space *mapping, > > > > > > + loff_t pos, unsigned len, unsigned flags, > > > > > > + struct page **pagep, void **fsdata) > > > > > > > > <snip> > > > > > > > > > > + /* > > > > > > + * In some cases we don't need to read at all: > > > > > > + * - full page write > > > > > > + * - write that lies completely beyond EOF > > > > > > + * - write that covers the the page from start to EOF or beyond it > > > > > > + */ > > > > > > + if ((pos_in_page == 0 && len == PAGE_SIZE) || > > > > > > + (pos >= i_size_read(inode)) || > > > > > > > > Shouldn't this be '((pos & PAGE_MASK) >= i_size_read(inode)) ||' ? > > > > > > > > Seems like fs/netfs/read_helper.c currently has the same issue? > > How does (pos & PAGE_MASK) >= i_size_read() make sense? That could only > be true if the file is less than a page in size, whereas it should > always be true if the write starts outside the current i_size. > Yeah, I guess what we really need is to round the i_size up to the start of the next page and then compare whether pos is beyond that. > > That's not quite right either. page may be larger than PAGE_MASK if > > grab_cache_page_write_begin() returns a THP (if that's possible). > > > > Maybe: > > > > (pos & thp_size(page) - 1) >= i_size_read(inode) > > > > Really, we want something like thp_pos(). Maybe Willy has something like that > > up his sleeve. > > > > In fact, in netfs_write_begin(), index and pos_in_page should be calculated > > after grab_cache_page_write_begin() has been called, just in case the new page > > extends before the page containing the requested position. > > Yes. I do that kind of thing in iomap. What you're doing there looks > a bit like offset_in_folio(), but I don't understand the problem with > just checking pos against i_size directly. > Suppose the i_size is 3 and you do a 1 byte write at offset 5. You're beyond the EOF, so the condition would return true, but you still need to read in the start of the page in that case. I think we probably need a testcase that does this in xfstests: open file write 3 bytes at start close unmount or drop pagecache in some way then write 1 byte at offset 5 see whether the resulting contents match the expect ones > https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio > contains a number of commits that start 'iomap:' which may be of interest. -- Jeff Layton <jlayton@kernel.org>
David Howells <dhowells@redhat.com> writes: > Matthew Wilcox <willy@infradead.org> wrote: > >> Yes. I do that kind of thing in iomap. What you're doing there looks >> a bit like offset_in_folio(), but I don't understand the problem with >> just checking pos against i_size directly. > > pos can be in the middle of a page. If i_size is at, say, the same point in > the middle of that page and the page isn't currently in the cache, then we'll > just clear the entire page and not read the bottom part of it (ie. the bit > prior to the EOF). > > It's odd, though, that xfstests doesn't catch this. Some more details/information: 1.) The test is a really simple piece of code that essentially writes a incrementing number to a file on 'node a', while a 'tail -f' is run and exited multiple times on 'node b'. We haven't had much luck in this causing the problem when done on a single node 2.) ((pos & PAGE_MASK) >= i_size_read(inode)) ||' merely reflects the logic that was in place prior to 1cc1699070bd. Patching fs/ceph/addr.c with this does seem to eliminate the corruption. (We'll test with the patch Jeff posted here shortly) 3.) After finding all of this, I went upstream to look at fs/ceph/addr.c and discovered the move to leveraging fs/netfs/read_helper.c - we've only just compiled against git master and replicated the corruption there. -- Andrew W. Elble Infrastructure Engineer Information and Technology Services Rochester Institute of Technology tel: (585)-475-2411 | aweits@rit.edu PGP: BFAD 8461 4CCF DC95 DA2C B0EB 965B 082E 863E C912
On Fri, Jun 11, 2021 at 04:35:25PM +0100, David Howells wrote: > Matthew Wilcox <willy@infradead.org> wrote: > > > Yes. I do that kind of thing in iomap. What you're doing there looks > > a bit like offset_in_folio(), but I don't understand the problem with > > just checking pos against i_size directly. > > pos can be in the middle of a page. If i_size is at, say, the same point in > the middle of that page and the page isn't currently in the cache, then we'll > just clear the entire page and not read the bottom part of it (ie. the bit > prior to the EOF). The comment says that's expected, though. I think the comment is wrong; consider the case where the page size is 4kB, file is 5kB long and we do a 1kB write at offset=6kB. The existing CEPH code (prior to netfs) will zero out 4-6KB and 7-8kB, which is wrong. Anyway, looking at netfs_write_begin(), it's wrong too, in a bunch of ways. You don't need to zero out the part of the page you're going to copy into. And the condition is overly complicated which makes it hard to know what's going on. Setting aside the is_cache_enabled part, I think you want: if (offset == 0 && len >= thp_size(page)) goto have_page_no_wait; if (page_offset(page) >= size) { zero_user_segments(page, 0, offset, offset + len, thp_size(page)); goto have_page_no_wait; } ... read the interesting chunks of page ...
Matthew Wilcox <willy@infradead.org> wrote: > Anyway, looking at netfs_write_begin(), it's wrong too, in a bunch of > ways. You don't need to zero out the part of the page you're going to > copy into. Zeroing it out isn't 'wrong', per se, just inefficient. Fixing that needs the filesystem to deal with it if the copy fails. > And the condition is overly complicated which makes it > hard to know what's going on. Setting aside the is_cache_enabled part, > I think you want: > > if (offset == 0 && len >= thp_size(page)) > goto have_page_no_wait; > if (page_offset(page) >= size) { > zero_user_segments(page, 0, offset, > offset + len, thp_size(page)); There's a third case too: where the write starts at the beginning of the page and goes to/straddles the EOF - but doesn't continue to the end of the page. You also didn't set PG_uptodate - presumably deliberately because there's a hole potentially containing random rubbish in the middle. > goto have_page_no_wait; > } > ... read the interesting chunks of page ... David