From 666525dfbdca09bbd4848ac711e4a4dbd6921325 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 7 Apr 2014 10:18:56 -0400 Subject: ext4: fix 64-bit number truncation warning '0x7FDEADBEEF' will be truncated to 32-bit number under unicore32. Need append 'ULL' for it. The related warning (with allmodconfig under unicore32): CC [M] fs/ext4/extents_status.o fs/ext4/extents_status.c: In function "__es_remove_extent": fs/ext4/extents_status.c:813: warning: integer constant is too large for "long" type Signed-off-by: Chen Gang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a014a7194b2..0ebc21204b51 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -810,7 +810,7 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; - block = 0x7FDEADBEEF; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + -- cgit v1.2.3 From 4adb6ab3e0fa71363a5ef229544b2d17de6600d7 Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Mon, 7 Apr 2014 10:53:28 -0400 Subject: ext4: FIBMAP ioctl causes BUG_ON due to handle EXT_MAX_BLOCKS When we try to get 2^32-1 block of the file which has the extent (ee_block=2^32-2, ee_len=1) with FIBMAP ioctl, it causes BUG_ON in ext4_ext_put_gap_in_cache(). To avoid the problem, ext4_map_blocks() needs to check the file logical block number. ext4_ext_put_gap_in_cache() called via ext4_map_blocks() cannot handle 2^32-1 because the maximum file logical block number is 2^32-2. Note that ext4_ind_map_blocks() returns -EIO when the block number is invalid. So ext4_map_blocks() should also return the same errno. Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b0d2c7d5408..93f16c5e8a8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -522,6 +522,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); -- cgit v1.2.3 From 007649375f6af242d5b1df2c15996949714303ba Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 7 Apr 2014 10:54:20 -0400 Subject: ext4: initialize multi-block allocator before checking block descriptors With EXT4FS_DEBUG ext4_count_free_clusters() will call ext4_read_block_bitmap() without s_group_info initialized, so we need to initialize multi-block allocator before. And dependencies that must be solved, to allow this: - multi-block allocator needs in group descriptors - need to install s_op before initializing multi-block allocator, because in ext4_mb_init_backend() new inode is created. - initialize number of group desc blocks (s_gdb_count) otherwise number of clusters returned by ext4_free_clusters_after_init() is not correct. (see ext4_bg_num_gdb_nometa()) Here is the stack backtrace: (gdb) bt #0 ext4_get_group_info (group=0, sb=0xffff880079a10000) at ext4.h:2430 #1 ext4_validate_block_bitmap (sb=sb@entry=0xffff880079a10000, desc=desc@entry=0xffff880056510000, block_group=block_group@entry=0, bh=bh@entry=0xffff88007bf2b2d8) at balloc.c:358 #2 0xffffffff81232202 in ext4_wait_block_bitmap (sb=sb@entry=0xffff880079a10000, block_group=block_group@entry=0, bh=bh@entry=0xffff88007bf2b2d8) at balloc.c:476 #3 0xffffffff81232eaf in ext4_read_block_bitmap (sb=sb@entry=0xffff880079a10000, block_group=block_group@entry=0) at balloc.c:489 #4 0xffffffff81232fc0 in ext4_count_free_clusters (sb=sb@entry=0xffff880079a10000) at balloc.c:665 #5 0xffffffff81259ffa in ext4_check_descriptors (first_not_zeroed=, sb=0xffff880079a10000) at super.c:2143 #6 ext4_fill_super (sb=sb@entry=0xffff880079a10000, data=, data@entry=0x0 , silent=silent@entry=0) at super.c:3851 ... Signed-off-by: Azat Khuzhin Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f3c667091618..6f9e6fadac04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3869,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + + /* + * set up enough so that it can read an inode, + * and create new inode for buddy allocator + */ + sbi->s_gdb_count = db_count; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount2; + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; + goto failed_mount2a; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2; + goto failed_mount2a; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3916,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4113,21 +4124,13 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; + goto failed_mount5; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); goto failed_mount5; } @@ -4204,11 +4207,8 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4a: +failed_mount5: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4232,11 +4232,14 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); +failed_mount2a: + ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { -- cgit v1.2.3 From 9503c67c93ed0b95ba62d12d1fd09da6245dbdd6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Apr 2014 10:54:20 -0400 Subject: ext4: note the error in ext4_end_bio() ext4_end_bio() currently throws away the error that it receives. Chances are this is part of a spate of errors, one of which will end up getting the error returned to userspace somehow, but we shouldn't take that risk. Also print out the errno to aid in debug. Signed-off-by: Matthew Wilcox Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/page-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d40..c18d95b50540 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { -- cgit v1.2.3 From ec4cb1aa2b7bae18dd8164f2e9c7c51abcf61280 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Apr 2014 10:54:21 -0400 Subject: ext4: fix jbd2 warning under heavy xattr load When heavily exercising xattr code the assertion that jbd2_journal_dirty_metadata() shouldn't return error was triggered: WARNING: at /srv/autobuild-ceph/gitbuilder.git/build/fs/jbd2/transaction.c:1237 jbd2_journal_dirty_metadata+0x1ba/0x260() CPU: 0 PID: 8877 Comm: ceph-osd Tainted: G W 3.10.0-ceph-00049-g68d04c9 #1 Hardware name: Dell Inc. PowerEdge R410/01V648, BIOS 1.6.3 02/07/2011 ffffffff81a1d3c8 ffff880214469928 ffffffff816311b0 ffff880214469968 ffffffff8103fae0 ffff880214469958 ffff880170a9dc30 ffff8802240fbe80 0000000000000000 ffff88020b366000 ffff8802256e7510 ffff880214469978 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x70/0xa0 [] warn_slowpath_null+0x1a/0x20 [] jbd2_journal_dirty_metadata+0x1ba/0x260 [] __ext4_handle_dirty_metadata+0xa3/0x140 [] ext4_xattr_release_block+0x103/0x1f0 [] ext4_xattr_block_set+0x1e0/0x910 [] ext4_xattr_set_handle+0x38b/0x4a0 [] ? trace_hardirqs_on+0xd/0x10 [] ext4_xattr_set+0xc2/0x140 [] ext4_xattr_user_set+0x47/0x50 [] generic_setxattr+0x6e/0x90 [] __vfs_setxattr_noperm+0x7b/0x1c0 [] vfs_setxattr+0xc4/0xd0 [] setxattr+0x13e/0x1e0 [] ? __sb_start_write+0xe7/0x1b0 [] ? mnt_want_write_file+0x28/0x60 [] ? fget_light+0x3c/0x130 [] ? mnt_want_write_file+0x28/0x60 [] ? __mnt_want_write+0x58/0x70 [] SyS_fsetxattr+0xbe/0x100 [] system_call_fastpath+0x16/0x1b The reason for the warning is that buffer_head passed into jbd2_journal_dirty_metadata() didn't have journal_head attached. This is caused by the following race of two ext4_xattr_release_block() calls: CPU1 CPU2 ext4_xattr_release_block() ext4_xattr_release_block() lock_buffer(bh); /* False */ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); unlock_buffer(bh); lock_buffer(bh); /* True */ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) get_bh(bh); ext4_free_blocks() ... jbd2_journal_forget() jbd2_journal_unfile_buffer() -> JH is gone error = ext4_handle_dirty_xattr_block(handle, inode, bh); -> triggers the warning We fix the problem by moving ext4_handle_dirty_xattr_block() under the buffer lock. Sadly this cannot be done in nojournal mode as that function can call sync_dirty_buffer() which would deadlock. Luckily in nojournal mode the race is harmless (we only dirty already freed buffer) and thus for nojournal mode we leave the dirtying outside of the buffer lock. Reported-by: Sage Weil Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1f5cf5880718..4eec399ec807 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -520,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -542,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); -- cgit v1.2.3 From 87f7e41636ff201148443551d06bc74497160aac Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 8 Apr 2014 11:38:28 -0400 Subject: ext4: update PF_MEMALLOC handling in ext4_write_inode() The special handling of PF_MEMALLOC callers in ext4_write_inode() shouldn't be necessary as there shouldn't be any. Warn about it. Also update comment before the function as it seems somewhat outdated. (Changes modeled on an ext3 patch posted by Jan Kara to the linux-ext4 mailing list on Februaryt 28, 2014, which apparently never went into the ext3 tree.) Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/inode.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 93f16c5e8a8e..7b93df9aa182 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4427,21 +4427,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4453,15 +4452,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { -- cgit v1.2.3 From 1ce01c4a199c50b023802be25261c0c02b2f0214 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 10 Apr 2014 22:58:20 -0400 Subject: ext4: fix COLLAPSE_RANGE test failure in data journalling mode When mounting ext4 with data=journal option, xfstest shared/002 and shared/004 are currently failing as checksum computed for testfile does not match with the checksum computed in other journal modes. In case of data=journal mode, a call to filemap_write_and_wait_range will not flush anything to disk as buffers are not marked dirty in write_end. In collapse range this call is followed by a call to truncate_pagecache_range. Due to this, when checksum is computed, a portion of file is re-read from disk which replace valid data with NULL bytes and hence the reason for the difference in checksum. Calling ext4_force_commit before filemap_write_and_wait_range solves the issue as it will mark the buffers dirty during commit transaction which can be later synced by a call to filemap_write_and_wait_range. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 82df3ce9874a..be1e56cbbf32 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5383,6 +5383,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); if (ret) -- cgit v1.2.3 From c57ab39b9658315a742b6e61fdc86bb4d20cf566 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Thu, 10 Apr 2014 23:03:43 -0400 Subject: ext4: return ENOMEM rather than EIO when find_###_page() fails Return ENOMEM rather than EIO when find_get_page() fails in ext4_mb_get_buddy_page_lock() and find_or_create_page() fails in ext4_mb_load_buddy(). Signed-off-by: Younger Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a888cac76e9c..73ccbb3b973b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } -- cgit v1.2.3 From 622cad1325e404598fe3b148c3fa640dbaabc235 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Apr 2014 10:35:17 -0400 Subject: ext4: move ext4_update_i_disksize() into mpage_map_and_submit_extent() The function ext4_update_i_disksize() is used in only one place, in the function mpage_map_and_submit_extent(). Move its code to simplify the code paths, and also move the call to ext4_mark_inode_dirty() into the i_data_sem's critical region, to be consistent with all of the other places where we update i_disksize. That way, we also keep the raw_inode's i_disksize protected, to avoid the following race: CPU #1 CPU #2 down_write(&i_data_sem) Modify i_disk_size up_write(&i_data_sem) down_write(&i_data_sem) Modify i_disk_size Copy i_disk_size to on-disk inode up_write(&i_data_sem) Copy i_disk_size to on-disk inode Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 17 ----------------- fs/ext4/inode.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f1c65dc7cc0a..66946aa62127 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2466,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7b93df9aa182..f023f0cb46fc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2247,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", -- cgit v1.2.3 From 9ef06cec7c96f6bf59f1dd8b64b9645820099051 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:47:00 -0400 Subject: ext4: remove unnecessary check for APPEND and IMMUTABLE All the checks IS_APPEND and IS_IMMUTABLE for the fallocate operation on the inode are done in vfs. No need to do this again in ext4. Remove it. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ------ fs/ext4/inode.c | 6 +----- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index be1e56cbbf32..ed4ec48239b6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5398,12 +5398,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f023f0cb46fc..e2bba76f0d7b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3541,11 +3541,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } + if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; -- cgit v1.2.3 From 8fc61d92630d1c96057a94c61e1643475045b25b Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:51:34 -0400 Subject: fs: prevent doing FALLOC_FL_ZERO_RANGE on append only file Currently punch hole and collapse range fallocate operation are not allowed on append only file. This should be case for zero range as well. Fix it by allowing only pure fallocate (possibly with keep size set). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/open.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 631aea815def..3a83253d3373 100644 --- a/fs/open.c +++ b/fs/open.c @@ -254,11 +254,9 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EBADF; /* - * It's not possible to punch hole or perform collapse range - * on append only file + * We can only allow pure fallocate on append only files */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE) - && IS_APPEND(inode)) + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) -- cgit v1.2.3 From 23fffa925ea2c9a2bcb1a4453e2c542635aa3545 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:56:41 -0400 Subject: fs: move falloc collapse range check into the filesystem methods Currently in do_fallocate in collapse range case we're checking whether offset + len is not bigger than i_size. However there is nothing which would prevent i_size from changing so the check is pointless. It should be done in the file system itself and the file system needs to make sure that i_size is not going to change. The i_size check for the other fallocate modes are also done in the filesystems. As it is now we can easily crash the kernel by having two processes doing truncate and fallocate collapse range at the same time. This can be reproduced on ext4 and it is theoretically possible on xfs even though I was not able to trigger it with this simple test. This commit removes the check from do_fallocate and adds it to the file system. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Acked-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/ext4/extents.c | 11 +++++++++-- fs/open.c | 8 -------- fs/xfs/xfs_file.c | 10 +++++++++- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ed4ec48239b6..ac5460d0d133 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5368,8 +5368,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - BUG_ON(offset + len > i_size_read(inode)); - /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || len & (EXT4_BLOCK_SIZE(sb) - 1)) @@ -5398,6 +5396,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Take mutex lock */ mutex_lock(&inode->i_mutex); + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; diff --git a/fs/open.c b/fs/open.c index 3a83253d3373..adf34202213a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -284,14 +284,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, in which case - * it is effectively a truncate operation - */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (offset + len >= i_size_read(inode))) - return -EINVAL; - if (!file->f_op->fallocate) return -EOPNOTSUPP; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7abff8c16ca..3cb528c4f27c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -840,7 +840,15 @@ xfs_file_fallocate( goto out_unlock; } - ASSERT(offset + len < i_size_read(inode)); + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + new_size = i_size_read(inode) - len; error = xfs_collapse_file_space(ip, offset, len); -- cgit v1.2.3 From 0790b31b69374ddadefebb156251b319e5b43345 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 10:05:37 -0400 Subject: fs: disallow all fallocate operation on active swapfile Currently some file system have IS_SWAPFILE check in their fallocate implementations and some do not. However we should really prevent any fallocate operation on swapfile so move the check to vfs and remove the redundant checks from the file systems fallocate implementations. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ceph/file.c | 3 --- fs/ext4/extents.c | 5 ----- fs/ext4/inode.c | 5 ----- fs/open.c | 7 +++++++ 4 files changed, 7 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 09c7afe32e49..596e6cc9f9c4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1215,9 +1215,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac5460d0d133..b2d3869b5762 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5405,11 +5405,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } - /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e2bba76f0d7b..b74cfd2a42ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3542,11 +3542,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) mutex_lock(&inode->i_mutex); - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } - /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; diff --git a/fs/open.c b/fs/open.c index adf34202213a..7b823daa6a93 100644 --- a/fs/open.c +++ b/fs/open.c @@ -262,6 +262,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_IMMUTABLE(inode)) return -EPERM; + /* + * We can not allow to do any fallocate operation on an active + * swapfile + */ + if (IS_SWAPFILE(inode)) + ret = -ETXTBSY; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. -- cgit v1.2.3 From 6e6358fc3c3c862bfe9a5bc029d3f8ce43dc9765 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Apr 2014 12:45:25 -0400 Subject: ext4: use i_size_read in ext4_unaligned_aio() We haven't taken i_mutex yet, so we need to use i_size_read(). Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6db7f7db7777..bc765591101a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) -- cgit v1.2.3 From 847c6c422aa0ae81a5517a9558ec2737806dca48 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sat, 12 Apr 2014 12:45:55 -0400 Subject: ext4: fix byte order problems introduced by the COLLAPSE_RANGE patches This commit tries to fix some byte order issues that is found by sparse check. $ make M=fs/ext4 C=2 CF=-D__CHECK_ENDIAN__ ... CHECK fs/ext4/extents.c fs/ext4/extents.c:5232:41: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5236:52: warning: bad assignment (-=) to restricted __le32 fs/ext4/extents.c:5258:45: warning: bad assignment (-=) to restricted __le32 fs/ext4/extents.c:5303:28: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5318:18: warning: incorrect type in assignment (different base types) fs/ext4/extents.c:5318:18: expected unsigned int [unsigned] [usertype] ex_start fs/ext4/extents.c:5318:18: got restricted __le32 [usertype] ee_block fs/ext4/extents.c:5319:24: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5334:31: warning: incorrect type in assignment (different base types) ... Cc: Andreas Dilger Cc: Namjae Jeon Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b2d3869b5762..f24ef8697609 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5229,11 +5229,11 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = ex_last->ee_block + + *start = le32_to_cpu(ex_last->ee_block) + ext4_ext_get_actual_len(ex_last); while (ex_start <= ex_last) { - ex_start->ee_block -= shift; + le32_add_cpu(&ex_start->ee_block, -shift); if (ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) { if (ext4_ext_try_to_merge_right(inode, @@ -5255,7 +5255,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - path[depth].p_idx->ei_block -= shift; + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5300,7 +5300,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return ret; } - stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5315,8 +5316,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, path = ext4_ext_find_extent(inode, start - 1, NULL, 0); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = extent->ee_block; - ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5331,7 +5333,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - current_block = extent->ee_block; + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ ret = mext_next_extent(inode, path, &extent); -- cgit v1.2.3 From 40c406c74eb9eed58ae7d4d12a0197f7279c9499 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Apr 2014 22:53:53 -0400 Subject: ext4: COLLAPSE_RANGE only works on extent-based files Unfortunately, we weren't checking to make sure of this the inode was extent-based before attempt operate on it. Hilarity ensues. Signed-off-by: "Theodore Ts'o" Cc: Namjae Jeon --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f24ef8697609..96e0a4bc8faa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4878,9 +4878,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4892,6 +4889,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); -- cgit v1.2.3 From e2cbd587418251bb73c4c1e8e2c7c1816d7a98d9 Mon Sep 17 00:00:00 2001 From: jon ernst Date: Sat, 12 Apr 2014 23:01:28 -0400 Subject: ext4: silence sparse check warning for function ext4_trim_extent This fixes the following sparse warning: CHECK fs/ext4/mballoc.c fs/ext4/mballoc.c:5019:9: warning: context imbalance in 'ext4_trim_extent' - unexpected unlock Signed-off-by: "Jon Ernst" Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 73ccbb3b973b..c8238a26818c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5016,6 +5016,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; -- cgit v1.2.3 From 8dc79ec4c0537e1b83c0739af82a7babefb30012 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 13 Apr 2014 15:05:42 -0400 Subject: ext4: fix error handling in ext4_ext_shift_extents Fix error handling by adding some. :-) Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 96e0a4bc8faa..38be06354b88 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5314,11 +5314,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * enough to accomodate the shift. */ path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } ext4_ext_drop_refs(path); kfree(path); -- cgit v1.2.3 From a18ed359bdddcded4f97ff5e2f07793ff9336913 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 13 Apr 2014 15:41:13 -0400 Subject: ext4: always check ext4_ext_find_extent result Where are some places where logic guaranties us that extent we are searching exits, but this may not be true due to on-disk data corruption. If such corruption happens we must prevent possible null pointer dereferences. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 38be06354b88..64b400356cad 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3313,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } uninitialized = ext4_ext_is_uninitialized(ex); split_flag1 = 0; @@ -3694,6 +3699,12 @@ static int ext4_convert_initialized_extents(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } } err = ext4_ext_get_access(handle, inode, path + depth); @@ -5340,6 +5351,12 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ -- cgit v1.2.3 From 4ab9ed578e82851645f3dd69d36d91ae77564d6c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:11:58 +1000 Subject: xfs: kill buffers over failed write ranges properly When a write fails, if we don't clear the delalloc flags from the buffers over the failed range, they can persist beyond EOF and cause problems. writeback will see the pages in the page cache, see they are dirty and continually retry the write, assuming that the page beyond EOF is just racing with a truncate. The page will eventually be released due to some other operation (e.g. direct IO), and it will not pass through invalidation because it is dirty. Hence it will be released with buffer_delay set on it, and trigger warnings in xfs_vm_releasepage() and assert fail in xfs_file_aio_write_direct because invalidation failed and we didn't write the corect amount. This causes failures on block size < page size filesystems in fsx and fsstress workloads run by xfstests. Fix it by completely trashing any state on the buffer that could be used to imply that it contains valid data when the delalloc range over the buffer is punched out during the failed write handling. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d09f75..282c726d04d0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1566,6 +1566,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } -- cgit v1.2.3 From 72ab70a19b4ebb19dbe2a79faaa6a4ccead58e70 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:13:29 +1000 Subject: xfs: write failure beyond EOF truncates too much data If we fail a write beyond EOF and have to handle it in xfs_vm_write_begin(), we truncate the inode back to the current inode size. This doesn't take into account the fact that we may have already made successful writes to the same page (in the case of block size < page size) and hence we can truncate the page cache away from blocks with valid data in them. If these blocks are delayed allocation blocks, we now have a mismatch between the page cache and the extent tree, and this will trigger - at minimum - a delayed block count mismatch assert when the inode is evicted from the cache. We can also trip over it when block mapping for direct IO - this is the most common symptom seen from fsx and fsstress when run from xfstests. Fix it by only truncating away the exact range we are updating state for in this write_begin call. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 282c726d04d0..5f296934879f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1609,12 +1609,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; -- cgit v1.2.3 From aad3f3755e7f043789b772856d1a2935f2b41a4b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:14:11 +1000 Subject: xfs: xfs_vm_write_end truncates too much on failure Similar to the write_begin problem, xfs-vm_write_end will truncate back to the old EOF, potentially removing page cache from over the top of delalloc blocks with valid data in them. Fix this by truncating back to just the start of the failed write. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f296934879f..e0a793113ea9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1634,9 +1634,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1659,8 +1662,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; -- cgit v1.2.3 From 897b73b6a2ee5d3c06648b601beb1724f7fbd678 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 14 Apr 2014 18:15:11 +1000 Subject: xfs: zeroing space needs to punch delalloc blocks When we are zeroing space andit is covered by a delalloc range, we need to punch the delalloc range out before we truncate the page cache. Failing to do so leaves and inconsistency between the page cache and the extent tree, which we later trip over when doing direct IO over the same range. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 13 ++++++++++++- fs/xfs/xfs_trace.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a646caa1..296160b8e78c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1418,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a8..65d8c793a25c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL -- cgit v1.2.3 From 036acea2ceabd19cb5734ae7a9d64c0a5ef90484 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 14 Apr 2014 23:36:15 -0400 Subject: ext4: fix ext4_count_free_clusters() with EXT4FS_DEBUG and bigalloc enabled With bigalloc enabled we must use EXT4_CLUSTERS_PER_GROUP() instead of EXT4_BLOCKS_PER_GROUP() otherwise we will go beyond the allocated buffer. $ mount -t ext4 /dev/vde /vde [ 70.573993] EXT4-fs DEBUG (fs/ext4/mballoc.c, 2346): ext4_mb_alloc_groupinfo: [ 70.575174] allocated s_groupinfo array for 1 meta_bg's [ 70.576172] EXT4-fs DEBUG (fs/ext4/super.c, 2092): ext4_check_descriptors: [ 70.576972] Checking group descriptorsBUG: unable to handle kernel paging request at ffff88006ab56000 [ 72.463686] IP: [] __bitmap_weight+0x2a/0x7f [ 72.464168] PGD 295e067 PUD 2961067 PMD 7fa8e067 PTE 800000006ab56060 [ 72.464738] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 72.465139] Modules linked in: [ 72.465402] CPU: 1 PID: 3560 Comm: mount Tainted: G W 3.14.0-rc2-00069-ge57bce1 #60 [ 72.466079] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 72.466505] task: ffff88007ce6c8a0 ti: ffff88006b7f0000 task.ti: ffff88006b7f0000 [ 72.466505] RIP: 0010:[] [] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP: 0018:ffff88006b7f1c00 EFLAGS: 00010206 [ 72.466505] RAX: 0000000000000000 RBX: 000000000000050a RCX: 0000000000000040 [ 72.466505] RDX: 0000000000000000 RSI: 0000000000080000 RDI: 0000000000000000 [ 72.466505] RBP: ffff88006b7f1c28 R08: 0000000000000002 R09: 0000000000000000 [ 72.466505] R10: 000000000000babe R11: 0000000000000400 R12: 0000000000080000 [ 72.466505] R13: 0000000000000200 R14: 0000000000002000 R15: ffff88006ab55000 [ 72.466505] FS: 00007f43ba1fa840(0000) GS:ffff88007f800000(0000) knlGS:0000000000000000 [ 72.466505] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 72.466505] CR2: ffff88006ab56000 CR3: 000000006b7e6000 CR4: 00000000000006e0 [ 72.466505] Stack: [ 72.466505] ffff88006ab65000 0000000000000000 0000000000000000 0000000000010000 [ 72.466505] ffff88006ab6f400 ffff88006b7f1c58 ffffffff81396bb8 0000000000010000 [ 72.466505] 0000000000000000 ffff88007b869a90 ffff88006a48a000 ffff88006b7f1c70 [ 72.466505] Call Trace: [ 72.466505] [] memweight+0x5f/0x8a [ 72.466505] [] ext4_count_free+0x13/0x21 [ 72.466505] [] ext4_count_free_clusters+0xdb/0x171 [ 72.466505] [] ext4_fill_super+0x117c/0x28ef [ 72.466505] [] ? vsnprintf+0x1c7/0x3f7 [ 72.466505] [] mount_bdev+0x145/0x19c [ 72.466505] [] ? ext4_calculate_overhead+0x2a1/0x2a1 [ 72.466505] [] ext4_mount+0x15/0x17 [ 72.466505] [] mount_fs+0x67/0x150 [ 72.466505] [] vfs_kern_mount+0x64/0xde [ 72.466505] [] do_mount+0x6fe/0x7f5 [ 72.466505] [] ? strndup_user+0x3a/0xd9 [ 72.466505] [] SyS_mount+0x85/0xbe [ 72.466505] [] tracesys+0xdd/0xe2 [ 72.466505] Code: c3 89 f0 b9 40 00 00 00 55 99 48 89 e5 41 57 f7 f9 41 56 49 89 ff 41 55 45 31 ed 41 54 41 89 f4 53 31 db 41 89 c6 45 39 ee 7e 10 <4b> 8b 3c ef 49 ff c5 e8 bf ff ff ff 01 c3 eb eb 31 c0 45 85 f6 [ 72.466505] RIP [] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP [ 72.466505] CR2: ffff88006ab56000 [ 72.466505] ---[ end trace 7d051a08ae138573 ]--- Killed Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bbc..5c56785007e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; -- cgit v1.2.3 From c11f1df5003d534fd067f0168bfad7befffb3b5c Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 11 Mar 2014 16:11:47 +0000 Subject: cifs: Wait for writebacks to complete before attempting write. Problem reported in Red Hat bz 1040329 for strict writes where we cache only when we hold oplock and write direct to the server when we don't. When we receive an oplock break, we first change the oplock value for the inode in cifsInodeInfo->oplock to indicate that we no longer hold the oplock before we enqueue a task to flush changes to the backing device. Once we have completed flushing the changes, we return the oplock to the server. There are 2 ways here where we can have data corruption 1) While we flush changes to the backing device as part of the oplock break, we can have processes write to the file. These writes check for the oplock, find none and attempt to write directly to the server. These direct writes made while we are flushing from cache could be overwritten by data being flushed from the cache causing data corruption. 2) While a thread runs in cifs_strict_writev, the machine could receive and process an oplock break after the thread has checked the oplock and found that it allows us to cache and before we have made changes to the cache. In that case, we end up with a dirty page in cache when we shouldn't have any. This will be flushed later and will overwrite all subsequent writes to the part of the file represented by this page. Before making any writes to the server, we need to confirm that we are not in the process of flushing data to the server and if we are, we should wait until the process is complete before we attempt the write. We should also wait for existing writes to complete before we process an oplock break request which changes oplock values. We add a version specific downgrade_oplock() operation to allow for differences in the oplock values set for the different smb versions. Cc: stable@vger.kernel.org Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 14 +++++++++- fs/cifs/cifsglob.h | 8 ++++++ fs/cifs/cifsproto.h | 3 +++ fs/cifs/file.c | 31 +++++++++++++++++++--- fs/cifs/misc.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/cifs/smb1ops.c | 11 ++++++++ fs/cifs/smb2misc.c | 18 ++++++++++--- fs/cifs/smb2ops.c | 14 ++++++++++ 8 files changed, 164 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index df9c9141c099..5be1f997ecde 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; + written = cifs_get_writer(cinode); + if (written) + return written; + written = generic_file_aio_write(iocb, iov, nr_segs, pos); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a8..30f6e9251a4a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -1113,6 +1115,12 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + unsigned long flags; +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed075..ca7980a1e303 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8add25538a3b..d8ee76241b64 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2621,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_aio_write( + iocb, iov, nr_segs, pos); + goto out; + } + written = cifs_writev(iocb, iov, nr_segs, pos); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2646,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -3621,6 +3631,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3628,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3666,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679d..3b0c62e622da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f9230..d1fdfa848703 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6e..b8021fde987d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf1..35ddc3ed119d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -904,6 +904,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) @@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, -- cgit v1.2.3 From 60977fcc808664f82412bb37da7be17640ba99d9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 25 Mar 2014 19:46:36 -0500 Subject: Return correct error on query of xattr on file with empty xattrs xfstest 020 detected a problem with cifs xattr handling. When a file had an empty xattr list, we returned success (with an empty xattr value) on query of particular xattrs rather than returning ENODATA. This patch fixes it so that query of an xattr returns ENODATA when the xattr list is empty for the file. Signed-off-by: Steve French Reviewed-by: Jeff Layton --- fs/cifs/cifssmb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83d..6ce4e0954b98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } -- cgit v1.2.3 From 8e3ecc87695f4a7e9e217ebd55ca6a39b6a451b8 Mon Sep 17 00:00:00 2001 From: Cyril Roelandt Date: Fri, 4 Apr 2014 00:05:21 +0200 Subject: fs: cifs: remove unused variable. In SMB2_set_compression(), the "res_key" variable is only initialized to NULL and later kfreed. It is therefore useless and should be removed. Found with the following semantic patch: @@ identifier foo; identifier f; type T; @@ * f(...) { ... * T *foo = NULL; ... when forall when != foo * kfree(foo); ... } Signed-off-by: Cyril Roelandt Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 860344701067..3802f8c94acc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } -- cgit v1.2.3 From a2a4dc494a7b7135f460e38e788c4a58f65e4ac3 Mon Sep 17 00:00:00 2001 From: Thomas Bächler Date: Thu, 3 Apr 2014 21:55:37 +0200 Subject: fs: Don't return 0 from get_anon_bdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9e30cc9595303b27b48 removed an internal mount. This has the side-effect that rootfs now has FSID 0. Many userspace utilities assume that st_dev in struct stat is never 0, so this change breaks a number of tools in early userspace. Since we don't know how many userspace programs are affected, make sure that FSID is at least 1. References: http://article.gmane.org/gmane.linux.kernel/1666905 References: http://permalink.gmane.org/gmane.linux.utilities.util-linux-ng/8557 Cc: 3.14 Signed-off-by: Thomas Bächler Acked-by: Tejun Heo Acked-by: H. Peter Anvin Tested-by: Alexandre Demers Signed-off-by: Greg Kroah-Hartman --- fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index e9dc3c3fe159..48377f7463c0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,7 +800,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { -- cgit v1.2.3 From 4afddd60a770560d370d6f85c5aef57c16bf7502 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Apr 2014 16:40:52 -0400 Subject: kernfs: protect lazy kernfs_iattrs allocation with mutex kernfs_iattrs is allocated lazily when operations which require it take place; unfortunately, the lazy allocation and returning weren't properly synchronized and when there are multiple concurrent operations, it might end up returning kernfs_iattrs which hasn't finished initialization yet or different copies to different callers. Fix it by synchronizing with a mutex. This can be smarter with memory barriers but let's go there if it actually turns out to be necessary. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/533ABA32.9080602@oracle.com Reported-by: Sasha Levin Cc: stable@vger.kernel.org # 3.14 Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index abb0f1f53d93..985217626e66 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) -- cgit v1.2.3 From 33ac1257ff0dee2e9c7f009b1c1914b7990217b2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Jan 2014 08:57:31 -0500 Subject: sysfs, driver-core: remove unused {sysfs|device}_schedule_callback_owner() All device_schedule_callback_owner() users are converted to use device_remove_file_self(). Remove now unused {sysfs|device}_schedule_callback_owner(). Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 33 ------------------ fs/sysfs/file.c | 92 -------------------------------------------------- include/linux/device.h | 11 +----- include/linux/sysfs.h | 9 ----- 4 files changed, 1 insertion(+), 144 deletions(-) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index 0dd65281cc65..20da3ad1696b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -614,39 +614,6 @@ void device_remove_bin_file(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_bin_file); -/** - * device_schedule_callback_owner - helper to schedule a callback for a device - * @dev: device. - * @func: callback function to invoke later. - * @owner: module owning the callback routine - * - * Attribute methods must not unregister themselves or their parent device - * (which would amount to the same thing). Attempts to do so will deadlock, - * since unregistration is mutually exclusive with driver callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @dev as its - * argument in the workqueue's process context. @dev will be pinned until - * @func returns. - * - * This routine is usually called via the inline device_schedule_callback(), - * which automatically sets @owner to THIS_MODULE. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available. - * - * NOTE: This routine won't work if CONFIG_SYSFS isn't set! It uses an - * underlying sysfs routine (since it is intended for use by attribute - * methods), and if sysfs isn't available you'll get nothing but -ENOSYS. - */ -int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *), struct module *owner) -{ - return sysfs_schedule_callback(&dev->kobj, - (void (*)(void *)) func, dev, owner); -} -EXPORT_SYMBOL_GPL(device_schedule_callback_owner); - static void klist_children_get(struct klist_node *n) { struct device_private *p = to_device_private_parent(n); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fdb..28cc1acd5439 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -453,95 +453,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/include/linux/device.h b/include/linux/device.h index 233bbbeb768d..d1d1c055b48e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -566,12 +566,6 @@ extern int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); -extern int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *dev), struct module *owner); - -/* This is a macro to avoid include problems with THIS_MODULE */ -#define device_schedule_callback(dev, func) \ - device_schedule_callback_owner(dev, func, THIS_MODULE) /* device resource management */ typedef void (*dr_release_t)(struct device *dev, void *res); @@ -932,10 +926,7 @@ extern int device_online(struct device *dev); extern struct device *__root_device_register(const char *name, struct module *owner); -/* - * This is a macro to avoid include problems with THIS_MODULE, - * just as per what is done for device_schedule_callback() above. - */ +/* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 084354b0e814..5ffaa3443712 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -179,9 +179,6 @@ struct sysfs_ops { #ifdef CONFIG_SYSFS -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner); - int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, @@ -255,12 +252,6 @@ static inline void sysfs_enable_ns(struct kernfs_node *kn) #else /* CONFIG_SYSFS */ -static inline int sysfs_schedule_callback(struct kobject *kobj, - void (*func)(void *), void *data, struct module *owner) -{ - return -ENOSYS; -} - static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; -- cgit v1.2.3 From 0e1f789d0dc38db79dfc4ddfd9cf541a8c198b7a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:19 +1000 Subject: xfs: don't map ranges that span EOF for direct IO Al Viro tracked down the problem that has caused generic/263 to fail on XFS since the test was introduced. If is caused by xfs_get_blocks() mapping a single extent that spans EOF without marking it as buffer-new() so that the direct IO code does not zero the tail of the block at the new EOF. This is a long standing bug that has been around for many, many years. Because xfs_get_blocks() starts the map before EOF, it can't set buffer_new(), because that causes he direct IO code to also zero unaligned sectors at the head of the IO. This would overwrite valid data with zeros, and hence we cannot validly return a single extent that spans EOF to direct IO. Fix this by detecting a mapping that spans EOF and truncate it down to EOF. This results in the the direct IO code doing the right thing for unaligned data blocks before EOF, and then returning to get another mapping for the region beyond EOF which XFS treats correctly by setting buffer_new() on it. This makes direct Io behave correctly w.r.t. tail block zeroing beyond EOF, and fsx is happy about that. Again, thanks to Al Viro for finding what I couldn't. [ dchinner: Fix for __divdi3 build error: Reported-by: Paul Gortmaker Tested-by: Paul Gortmaker Signed-off-by: Mark Tinguely Reviewed-by: Eric Sandeen ] Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e0a793113ea9..0479c32c5eb1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1344,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; -- cgit v1.2.3 From d39a2ced0fa0172faa46df0866fc22419b876e2a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:25 +1000 Subject: xfs: collapse range is delalloc challenged FSX has been detecting data corruption after to collapse range calls. The key observation is that the offset of the last extent in the file was not being shifted, and hence when the file size was adjusted it was truncating away data because the extents handled been correctly shifted. Tracing indicated that before the collapse, the extent list looked like: .... ino 0x5788 state idx 6 offset 26 block 195904 count 10 flag 0 ino 0x5788 state idx 7 offset 39 block 195917 count 35 flag 0 ino 0x5788 state idx 8 offset 86 block 195964 count 32 flag 0 and after the shift of 2 blocks: ino 0x5788 state idx 6 offset 24 block 195904 count 10 flag 0 ino 0x5788 state idx 7 offset 37 block 195917 count 35 flag 0 ino 0x5788 state idx 8 offset 86 block 195964 count 32 flag 0 Note that the last extent did not change offset. After the changing of the file size: ino 0x5788 state idx 6 offset 24 block 195904 count 10 flag 0 ino 0x5788 state idx 7 offset 37 block 195917 count 35 flag 0 ino 0x5788 state idx 8 offset 86 block 195964 count 30 flag 0 You can see that the last extent had it's length truncated, indicating that we've lost data. The reason for this is that the xfs_bmap_shift_extents() loop uses XFS_IFORK_NEXTENTS() to determine how many extents are in the inode. This, unfortunately, doesn't take into account delayed allocation extents - it's a count of physically allocated extents - and hence when the file being collapsed has a delalloc extent like this one does prior to the range being collapsed: .... ino 0x5788 state idx 4 offset 11 block 4503599627239429 count 1 flag 0 .... it gets the count wrong and terminates the shift loop early. Fix it by using the in-memory extent array size that includes delayed allocation extents to determine the number of extents on the inode. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092ef51ef..f0efc7e970ef 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5574,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } -- cgit v1.2.3 From 9c23eccc1e746f64b18fab070a37189b4422e44a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:26 +1000 Subject: xfs: unmount does not wait for shutdown during unmount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And interesting situation can occur if a log IO error occurs during the unmount of a filesystem. The cases reported have the same signature - the update of the superblock counters fails due to a log write IO error: XFS (dm-16): xfs_do_force_shutdown(0x2) called from line 1170 of file fs/xfs/xfs_log.c. Return address = 0xffffffffa08a44a1 XFS (dm-16): Log I/O Error Detected. Shutting down filesystem XFS (dm-16): Unable to update superblock counters. Freespace may not be correct on next mount. XFS (dm-16): xfs_log_force: error 5 returned. XFS (¿-¿¿¿): Please umount the filesystem and rectify the problem(s) It can be seen that the last line of output contains a corrupt device name - this is because the log and xfs_mount structures have already been freed by the time this message is printed. A kernel oops closely follows. The issue is that the shutdown is occurring in a separate IO completion thread to the unmount. Once the shutdown processing has started and all the iclogs are marked with XLOG_STATE_IOERROR, the log shutdown code wakes anyone waiting on a log force so they can process the shutdown error. This wakes up the unmount code that is doing a synchronous transaction to update the superblock counters. The unmount path now sees all the iclogs are marked with XLOG_STATE_IOERROR and so never waits on them again, knowing that if it does, there will not be a wakeup trigger for it and we will hang the unmount if we do. Hence the unmount runs through all the remaining code and frees all the filesystem structures while the xlog_iodone() is still processing the shutdown. When the log shutdown processing completes, xfs_do_force_shutdown() emits the "Please umount the filesystem and rectify the problem(s)" message, and xlog_iodone() then aborts all the objects attached to the iclog. An iclog that has already been freed.... The real issue here is that there is no serialisation point between the log IO and the unmount. We have serialisations points for log writes, log forces, reservations, etc, but we don't actually have any code that wakes for log IO to fully complete. We do that for all other types of object, so why not iclogbufs? Well, it turns out that we can easily do this. We've got xfs_buf handles, and that's what everyone else uses for IO serialisation. i.e. bp->b_sema. So, lets hold iclogbufs locked over IO, and only release the lock in xlog_iodone() when we are finished with the buffer. That way before we tear down the iclog, we can lock and unlock the buffer to ensure IO completion has finished completely before we tear it down. Signed-off-by: Dave Chinner Tested-by: Mike Snitzer Tested-by: Bob Mastors Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 53 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..08624dc67317 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1371,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1409,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1436,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1644,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1657,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1665,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1868,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; il_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); -- cgit v1.2.3 From 07d5035a289f8bebe0ea86c293b2d5412478c481 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Apr 2014 08:15:27 +1000 Subject: xfs: wrong error sign conversion during failed DIO writes We negate the error value being returned from a generic function incorrectly. The code path that it is running in returned negative errors, so there is no need to negate it to get the correct error signs here. This was uncovered by generic/019. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79e96ce98733..82afdcb33183 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; -- cgit v1.2.3 From 8d6c121018bf60d631c05a4a2efc468a392b97bb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 17 Apr 2014 08:15:28 +1000 Subject: xfs: fix buffer use after free on IO error When testing exhaustion of dm snapshots, the following appeared with CONFIG_DEBUG_OBJECTS_FREE enabled: ODEBUG: free active (active state 0) object type: work_struct hint: xfs_buf_iodone_work+0x0/0x1d0 [xfs] indicating that we'd freed a buffer which still had a pending reference, down this path: [ 190.867975] [] debug_check_no_obj_freed+0x22b/0x270 [ 190.880820] [] kmem_cache_free+0xd0/0x370 [ 190.892615] [] xfs_buf_free+0xe4/0x210 [xfs] [ 190.905629] [] xfs_buf_rele+0xe7/0x270 [xfs] [ 190.911770] [] xfs_trans_read_buf_map+0x7b6/0xac0 [xfs] At issue is the fact that if IO fails in xfs_buf_iorequest, we'll queue completion unconditionally, and then call xfs_buf_rele; but if IO failed, there are no IOs remaining, and xfs_buf_rele will free the bp while work is still queued. Fix this by not scheduling completion if the buffer has an error on it; run it immediately. The rest is only comment changes. Thanks to dchinner for spotting the root cause. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fdfe41f..cb10a0aaab3a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1372,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( -- cgit v1.2.3 From 330033d697ed8d296fa52b5303db9d802ad901cc Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 17 Apr 2014 08:15:30 +1000 Subject: xfs: fix tmpfile/selinux deadlock and initialize security xfstests generic/004 reproduces an ilock deadlock using the tmpfile interface when selinux is enabled. This occurs because xfs_create_tmpfile() takes the ilock and then calls d_tmpfile(). The latter eventually calls into xfs_xattr_get() which attempts to get the lock again. E.g.: xfs_io D ffffffff81c134c0 4096 3561 3560 0x00000080 ffff8801176a1a68 0000000000000046 ffff8800b401b540 ffff8801176a1fd8 00000000001d5800 00000000001d5800 ffff8800b401b540 ffff8800b401b540 ffff8800b73a6bd0 fffffffeffffffff ffff8800b73a6bd8 ffff8800b5ddb480 Call Trace: [] schedule+0x29/0x70 [] rwsem_down_read_failed+0xc5/0x120 [] ? xfs_ilock_attr_map_shared+0x1f/0x50 [xfs] [] call_rwsem_down_read_failed+0x14/0x30 [] ? down_read_nested+0x89/0xa0 [] ? xfs_ilock+0x122/0x250 [xfs] [] xfs_ilock+0x122/0x250 [xfs] [] xfs_ilock_attr_map_shared+0x1f/0x50 [xfs] [] xfs_attr_get+0x90/0xe0 [xfs] [] xfs_xattr_get+0x37/0x50 [xfs] [] generic_getxattr+0x4f/0x70 [] inode_doinit_with_dentry+0x1ae/0x650 [] selinux_d_instantiate+0x1c/0x20 [] security_d_instantiate+0x1b/0x30 [] d_instantiate+0x50/0x70 [] d_tmpfile+0xb5/0xc0 [] xfs_create_tmpfile+0x362/0x410 [xfs] [] xfs_vn_tmpfile+0x18/0x20 [xfs] [] path_openat+0x228/0x6a0 [] ? sched_clock+0x9/0x10 [] ? kvm_clock_read+0x27/0x40 [] ? __alloc_fd+0xaf/0x1f0 [] do_filp_open+0x3a/0x90 [] ? _raw_spin_unlock+0x27/0x40 [] ? __alloc_fd+0xaf/0x1f0 [] do_sys_open+0x12e/0x210 [] SyS_open+0x1e/0x20 [] system_call_fastpath+0x16/0x1b xfs_vn_tmpfile() also fails to initialize security on the newly created inode. Pull the d_tmpfile() call up into xfs_vn_tmpfile() after the transaction has been committed and the inode unlocked. Also, initialize security on the inode based on the parent directory provided via the tmpfile call. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 5 +++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_iops.c | 20 +++++++++++++++++--- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee6..768087bedbac 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1334,7 +1334,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1403,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1415,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0d..f2fcde52b66d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca28..ef1ca010f417 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1053,11 +1053,25 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; + int error; + struct xfs_inode *ip; + struct inode *inode; - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + if (unlikely(error)) + return -error; - return -error; + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) { + iput(inode); + return -error; + } + + d_tmpfile(dentry, inode); + + return 0; } static const struct inode_operations xfs_inode_operations = { -- cgit v1.2.3 From bae9f746a18ee31bbeeb25ae6615805ed6eca173 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Apr 2014 12:48:49 -0400 Subject: cifs: fix error handling cifs_user_readv Coverity says: *** CID 1202537: Dereference after null check (FORWARD_NULL) /fs/cifs/file.c: 2873 in cifs_user_readv() 2867 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); 2868 npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); 2869 2870 /* allocate a readdata struct */ 2871 rdata = cifs_readdata_alloc(npages, 2872 cifs_uncached_readv_complete); >>> CID 1202537: Dereference after null check (FORWARD_NULL) >>> Comparing "rdata" to null implies that "rdata" might be null. 2873 if (!rdata) { 2874 rc = -ENOMEM; 2875 goto error; 2876 } 2877 2878 rc = cifs_read_allocate_pages(rdata, npages); ...when we "goto error", rc will be non-zero, and then we end up trying to do a kref_put on the rdata (which is NULL). Fix this by replacing the "goto error" with a "break". Reported-by: Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d8ee76241b64..a875eedfd928 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2882,7 +2882,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); -- cgit v1.2.3 From 1f80c0cc39e587edd06a36b43ba3a3b09d4ac428 Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Tue, 15 Apr 2014 10:06:50 +0200 Subject: cif: fix dead code This issue was found by Coverity (CID 1202536) This proposes a fix for a statement that creates dead code. The "rc < 0" statement is within code that is run with "rc > 0". It seems like "err < 0" was meant to be used here. This way, the error code is returned by the function. Signed-off-by: Michael Opdenacker Acked-by: Al Viro Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a875eedfd928..5ed03e0b8b40 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2599,7 +2599,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, ssize_t err; err = generic_write_sync(file, iocb->ki_pos - rc, rc); - if (rc < 0) + if (err < 0) rc = err; } } else { -- cgit v1.2.3 From 694c793fc1ade0946149c5f8d43f71e0728c4e81 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:21:15 -0400 Subject: ext4: use truncate_pagecache() in collapse range We should be using truncate_pagecache() instead of truncate_pagecache_range() in the collapse range because we're truncating page cache from offset to the end of file. truncate_pagecache() also get rid of the private COWed pages from the range because we're going to shift the end of the file. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 64b400356cad..3de9b2d7028c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5437,7 +5437,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache_range(inode, offset, -1); + truncate_pagecache(inode, offset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); -- cgit v1.2.3 From 1a66c7c3bea52ba0f7596b8940d74fce75281d16 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:41:52 -0400 Subject: ext4: use filemap_write_and_wait_range() correctly in collapse range Currently we're passing -1 as lend argumnet for filemap_write_and_wait_range() which is wrong since lend is signed type so it would cause some confusion and we might not write_and_wait for the entire range we're expecting to write. Fix it by using LLONG_MAX instead. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3de9b2d7028c..f4a676908b0b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5415,7 +5415,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) return ret; -- cgit v1.2.3 From 2c1d23289bc2f7cfa358bc856b87a992dcb11ad5 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:43:21 -0400 Subject: ext4: fix removing status extents in ext4_collapse_range() Currently in ext4_collapse_range() when calling ext4_es_remove_extent() to remove status extents we're passing (EXT_MAX_BLOCKS - punch_start - 1) in order to remove all extents from start of the collapse range to the end of the file. However this is wrong because we might miss the possible extent covering the last block of the file. Fix it by removing the -1. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Namjae Jeon --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f4a676908b0b..c6f624582d37 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5454,7 +5454,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start - 1); + EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; -- cgit v1.2.3 From 9337d5d31ab798f0c74150506371551a9195251a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:48:25 -0400 Subject: ext4: no need to truncate pagecache twice in collapse range We're already calling truncate_pagecache() before we attempt to do any actual job so there is not need to truncate pagecache once more using truncate_setsize() after we're finished. Remove truncate_setsize() and replace it just with i_size_write() note that we're holding appropriate locks. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c6f624582d37..3ee60e2e2ac7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5474,7 +5474,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } new_size = i_size_read(inode) - len; - truncate_setsize(inode, new_size); + i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; ext4_discard_preallocations(inode); -- cgit v1.2.3 From ef24f6c234de9a03aed9368163dbaad9a4f6391f Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:50:23 -0400 Subject: ext4: discard preallocations after removing space Currently in ext4_collapse_range() and ext4_punch_hole() we're discarding preallocation twice. Once before we attempt to do any changes and second time after we're done with the changes. While the second call to ext4_discard_preallocations() in ext4_punch_hole() case is not needed, we need to discard preallocation right after ext4_ext_remove_space() in collapse range case because in the case we had to restart a transaction in the middle of removing space we might have new preallocations created. Remove unneeded ext4_discard_preallocations() ext4_punch_hole() and move it to the better place in ext4_collapse_range() Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ee60e2e2ac7..eb7be8f08e10 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5465,6 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start); @@ -5477,7 +5478,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b74cfd2a42ec..d7b7462a0e13 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3621,7 +3621,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); -- cgit v1.2.3 From 6dd834effc12ba71092d9d1e4944530234b58ab1 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:55:24 -0400 Subject: ext4: fix extent merging in ext4_ext_shift_path_extents() There is a bug in ext4_ext_shift_path_extents() where if we actually manage to merge a extent we would skip shifting the next extent. This will result in in one extent in the extent tree not being properly shifted. This is causing failure in various xfstests tests using fsx or fsstress with collapse range support. It will also cause file system corruption which looks something like: e2fsck 1.42.9 (4-Feb-2014) Pass 1: Checking inodes, blocks, and sizes Inode 20 has out of order extents (invalid logical block 3, physical block 492938, len 2) Clear? yes ... when running e2fsck. It's also very easily reproducible just by running fsx without any parameters. I can usually hit the problem within a minute. Fix it by increasing ex_start only if we're not merging the extent. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Namjae Jeon --- fs/ext4/extents.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb7be8f08e10..d0860f2d36d0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5245,13 +5245,14 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, while (ex_start <= ex_last) { le32_add_cpu(&ex_start->ee_block, -shift); - if (ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) { - if (ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) - ex_last--; - } - ex_start++; + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } err = ext4_ext_dirty(handle, inode, path + depth); if (err) -- cgit v1.2.3 From 6c5e73d3a26b73bfcac0b4a932cb918177d067f2 Mon Sep 17 00:00:00 2001 From: jon ernst Date: Fri, 18 Apr 2014 11:50:35 -0400 Subject: ext4: enforce we are operating on a regular file in ext4_zero_range() Signed-off-by: Jon Ernst Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d0860f2d36d0..2f49b12a4c40 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4741,6 +4741,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* * Write out all dirty pages to avoid race conditions * Then release them. -- cgit v1.2.3 From 86f1ca3889142d5959362c5694db3f3dc26f377a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Apr 2014 11:52:11 -0400 Subject: ext4: use EINVAL if not a regular file in ext4_collapse_range() Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f49b12a4c40..9b9251adb400 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5404,7 +5404,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return -EINVAL; if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; + return -EINVAL; trace_ext4_collapse_range(inode, offset, len); -- cgit v1.2.3 From 404ca80eb5c2727d78cd517d12108b040c522e12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Apr 2014 10:15:07 -0700 Subject: coredump: fix va_list corruption A va_list needs to be copied in case it needs to be used twice. Thanks to Hugh for debugging this issue, leading to various panics. Tested: lpq84:~# echo "|/foobar12345 %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h" >/proc/sys/kernel/core_pattern 'produce_core' is simply : main() { *(int *)0 = 1;} lpq84:~# ./produce_core Segmentation fault (core dumped) lpq84:~# dmesg | tail -1 [ 614.352947] Core dump to |/foobar12345 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 lpq84 (null) pipe failed Notice the last argument was replaced by a NULL (we were lucky enough to not crash, but do not try this on your production machine !) After fix : lpq83:~# echo "|/foobar12345 %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h %h" >/proc/sys/kernel/core_pattern lpq83:~# ./produce_core Segmentation fault lpq83:~# dmesg | tail -1 [ 740.800441] Core dump to |/foobar12345 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 lpq83 pipe failed Fixes: 5fe9d8ca21cc ("coredump: cn_vprintf() has no reason to call vsnprintf() twice") Signed-off-by: Eric Dumazet Diagnosed-by: Hugh Dickins Acked-by: Oleg Nesterov Cc: Neil Horman Cc: Andrew Morton Cc: stable@vger.kernel.org # 3.11+ Signed-off-by: Linus Torvalds --- fs/coredump.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index e3ad709a4232..0b2528fb640e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; -- cgit v1.2.3 From a8680e0d5efd46aa54d7085e5b4a268f726922c7 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 19 Apr 2014 16:37:31 -0400 Subject: ext4: fix COLLAPSE_RANGE failure with 1KB block size When formatting with 1KB or 2KB(not aligned with PAGE SIZE) block size, xfstests generic/075 and 091 are failing. The offset supplied to function truncate_pagecache_range is block size aligned. In this function start offset is re-aligned to PAGE_SIZE by rounding_up to the next page boundary. Due to this rounding up, old data remains in the page cache when blocksize is less than page size and start offset is not aligned with page size. In case of collapse range, we need to align start offset to page size boundary by doing a round down operation instead of round up. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9b9251adb400..d6bca2a1debe 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5395,7 +5395,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; - loff_t new_size; + loff_t new_size, ioffset; int ret; /* Collapse range works only on fs block size aligned offsets. */ @@ -5418,8 +5418,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); if (ret) return ret; @@ -5441,7 +5448,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, offset); + truncate_pagecache(inode, ioffset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); -- cgit v1.2.3 From 0a04b248532b358b27a8da050642da6f5f304b03 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 19 Apr 2014 16:38:21 -0400 Subject: ext4: disable COLLAPSE_RANGE for bigalloc Once COLLAPSE RANGE is be disable for ext4 with bigalloc feature till finding root-cause of problem. It will be enable with fixing that regression of xfstest(generic 075 and 091) again. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Reviewed-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d6bca2a1debe..01b0c208f625 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5406,6 +5406,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); -- cgit v1.2.3