From 666525dfbdca09bbd4848ac711e4a4dbd6921325 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 7 Apr 2014 10:18:56 -0400 Subject: ext4: fix 64-bit number truncation warning '0x7FDEADBEEF' will be truncated to 32-bit number under unicore32. Need append 'ULL' for it. The related warning (with allmodconfig under unicore32): CC [M] fs/ext4/extents_status.o fs/ext4/extents_status.c: In function "__es_remove_extent": fs/ext4/extents_status.c:813: warning: integer constant is too large for "long" type Signed-off-by: Chen Gang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a014a7194b2..0ebc21204b51 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -810,7 +810,7 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; - block = 0x7FDEADBEEF; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + -- cgit v1.2.3 From 4adb6ab3e0fa71363a5ef229544b2d17de6600d7 Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Mon, 7 Apr 2014 10:53:28 -0400 Subject: ext4: FIBMAP ioctl causes BUG_ON due to handle EXT_MAX_BLOCKS When we try to get 2^32-1 block of the file which has the extent (ee_block=2^32-2, ee_len=1) with FIBMAP ioctl, it causes BUG_ON in ext4_ext_put_gap_in_cache(). To avoid the problem, ext4_map_blocks() needs to check the file logical block number. ext4_ext_put_gap_in_cache() called via ext4_map_blocks() cannot handle 2^32-1 because the maximum file logical block number is 2^32-2. Note that ext4_ind_map_blocks() returns -EIO when the block number is invalid. So ext4_map_blocks() should also return the same errno. Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b0d2c7d5408..93f16c5e8a8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -522,6 +522,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); -- cgit v1.2.3 From 007649375f6af242d5b1df2c15996949714303ba Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 7 Apr 2014 10:54:20 -0400 Subject: ext4: initialize multi-block allocator before checking block descriptors With EXT4FS_DEBUG ext4_count_free_clusters() will call ext4_read_block_bitmap() without s_group_info initialized, so we need to initialize multi-block allocator before. And dependencies that must be solved, to allow this: - multi-block allocator needs in group descriptors - need to install s_op before initializing multi-block allocator, because in ext4_mb_init_backend() new inode is created. - initialize number of group desc blocks (s_gdb_count) otherwise number of clusters returned by ext4_free_clusters_after_init() is not correct. (see ext4_bg_num_gdb_nometa()) Here is the stack backtrace: (gdb) bt #0 ext4_get_group_info (group=0, sb=0xffff880079a10000) at ext4.h:2430 #1 ext4_validate_block_bitmap (sb=sb@entry=0xffff880079a10000, desc=desc@entry=0xffff880056510000, block_group=block_group@entry=0, bh=bh@entry=0xffff88007bf2b2d8) at balloc.c:358 #2 0xffffffff81232202 in ext4_wait_block_bitmap (sb=sb@entry=0xffff880079a10000, block_group=block_group@entry=0, bh=bh@entry=0xffff88007bf2b2d8) at balloc.c:476 #3 0xffffffff81232eaf in ext4_read_block_bitmap (sb=sb@entry=0xffff880079a10000, block_group=block_group@entry=0) at balloc.c:489 #4 0xffffffff81232fc0 in ext4_count_free_clusters (sb=sb@entry=0xffff880079a10000) at balloc.c:665 #5 0xffffffff81259ffa in ext4_check_descriptors (first_not_zeroed=, sb=0xffff880079a10000) at super.c:2143 #6 ext4_fill_super (sb=sb@entry=0xffff880079a10000, data=, data@entry=0x0 , silent=silent@entry=0) at super.c:3851 ... Signed-off-by: Azat Khuzhin Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f3c667091618..6f9e6fadac04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3869,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + + /* + * set up enough so that it can read an inode, + * and create new inode for buddy allocator + */ + sbi->s_gdb_count = db_count; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount2; + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; + goto failed_mount2a; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2; + goto failed_mount2a; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3916,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4113,21 +4124,13 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; + goto failed_mount5; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); goto failed_mount5; } @@ -4204,11 +4207,8 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4a: +failed_mount5: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4232,11 +4232,14 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); +failed_mount2a: + ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { -- cgit v1.2.3 From 9503c67c93ed0b95ba62d12d1fd09da6245dbdd6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Apr 2014 10:54:20 -0400 Subject: ext4: note the error in ext4_end_bio() ext4_end_bio() currently throws away the error that it receives. Chances are this is part of a spate of errors, one of which will end up getting the error returned to userspace somehow, but we shouldn't take that risk. Also print out the errno to aid in debug. Signed-off-by: Matthew Wilcox Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/page-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d40..c18d95b50540 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { -- cgit v1.2.3 From ec4cb1aa2b7bae18dd8164f2e9c7c51abcf61280 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Apr 2014 10:54:21 -0400 Subject: ext4: fix jbd2 warning under heavy xattr load When heavily exercising xattr code the assertion that jbd2_journal_dirty_metadata() shouldn't return error was triggered: WARNING: at /srv/autobuild-ceph/gitbuilder.git/build/fs/jbd2/transaction.c:1237 jbd2_journal_dirty_metadata+0x1ba/0x260() CPU: 0 PID: 8877 Comm: ceph-osd Tainted: G W 3.10.0-ceph-00049-g68d04c9 #1 Hardware name: Dell Inc. PowerEdge R410/01V648, BIOS 1.6.3 02/07/2011 ffffffff81a1d3c8 ffff880214469928 ffffffff816311b0 ffff880214469968 ffffffff8103fae0 ffff880214469958 ffff880170a9dc30 ffff8802240fbe80 0000000000000000 ffff88020b366000 ffff8802256e7510 ffff880214469978 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x70/0xa0 [] warn_slowpath_null+0x1a/0x20 [] jbd2_journal_dirty_metadata+0x1ba/0x260 [] __ext4_handle_dirty_metadata+0xa3/0x140 [] ext4_xattr_release_block+0x103/0x1f0 [] ext4_xattr_block_set+0x1e0/0x910 [] ext4_xattr_set_handle+0x38b/0x4a0 [] ? trace_hardirqs_on+0xd/0x10 [] ext4_xattr_set+0xc2/0x140 [] ext4_xattr_user_set+0x47/0x50 [] generic_setxattr+0x6e/0x90 [] __vfs_setxattr_noperm+0x7b/0x1c0 [] vfs_setxattr+0xc4/0xd0 [] setxattr+0x13e/0x1e0 [] ? __sb_start_write+0xe7/0x1b0 [] ? mnt_want_write_file+0x28/0x60 [] ? fget_light+0x3c/0x130 [] ? mnt_want_write_file+0x28/0x60 [] ? __mnt_want_write+0x58/0x70 [] SyS_fsetxattr+0xbe/0x100 [] system_call_fastpath+0x16/0x1b The reason for the warning is that buffer_head passed into jbd2_journal_dirty_metadata() didn't have journal_head attached. This is caused by the following race of two ext4_xattr_release_block() calls: CPU1 CPU2 ext4_xattr_release_block() ext4_xattr_release_block() lock_buffer(bh); /* False */ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); unlock_buffer(bh); lock_buffer(bh); /* True */ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) get_bh(bh); ext4_free_blocks() ... jbd2_journal_forget() jbd2_journal_unfile_buffer() -> JH is gone error = ext4_handle_dirty_xattr_block(handle, inode, bh); -> triggers the warning We fix the problem by moving ext4_handle_dirty_xattr_block() under the buffer lock. Sadly this cannot be done in nojournal mode as that function can call sync_dirty_buffer() which would deadlock. Luckily in nojournal mode the race is harmless (we only dirty already freed buffer) and thus for nojournal mode we leave the dirtying outside of the buffer lock. Reported-by: Sage Weil Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1f5cf5880718..4eec399ec807 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -520,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -542,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); -- cgit v1.2.3 From 87f7e41636ff201148443551d06bc74497160aac Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 8 Apr 2014 11:38:28 -0400 Subject: ext4: update PF_MEMALLOC handling in ext4_write_inode() The special handling of PF_MEMALLOC callers in ext4_write_inode() shouldn't be necessary as there shouldn't be any. Warn about it. Also update comment before the function as it seems somewhat outdated. (Changes modeled on an ext3 patch posted by Jan Kara to the linux-ext4 mailing list on Februaryt 28, 2014, which apparently never went into the ext3 tree.) Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/inode.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 93f16c5e8a8e..7b93df9aa182 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4427,21 +4427,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4453,15 +4452,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { -- cgit v1.2.3 From 1ce01c4a199c50b023802be25261c0c02b2f0214 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 10 Apr 2014 22:58:20 -0400 Subject: ext4: fix COLLAPSE_RANGE test failure in data journalling mode When mounting ext4 with data=journal option, xfstest shared/002 and shared/004 are currently failing as checksum computed for testfile does not match with the checksum computed in other journal modes. In case of data=journal mode, a call to filemap_write_and_wait_range will not flush anything to disk as buffers are not marked dirty in write_end. In collapse range this call is followed by a call to truncate_pagecache_range. Due to this, when checksum is computed, a portion of file is re-read from disk which replace valid data with NULL bytes and hence the reason for the difference in checksum. Calling ext4_force_commit before filemap_write_and_wait_range solves the issue as it will mark the buffers dirty during commit transaction which can be later synced by a call to filemap_write_and_wait_range. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 82df3ce9874a..be1e56cbbf32 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5383,6 +5383,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); if (ret) -- cgit v1.2.3 From c57ab39b9658315a742b6e61fdc86bb4d20cf566 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Thu, 10 Apr 2014 23:03:43 -0400 Subject: ext4: return ENOMEM rather than EIO when find_###_page() fails Return ENOMEM rather than EIO when find_get_page() fails in ext4_mb_get_buddy_page_lock() and find_or_create_page() fails in ext4_mb_load_buddy(). Signed-off-by: Younger Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a888cac76e9c..73ccbb3b973b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } -- cgit v1.2.3 From 622cad1325e404598fe3b148c3fa640dbaabc235 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Apr 2014 10:35:17 -0400 Subject: ext4: move ext4_update_i_disksize() into mpage_map_and_submit_extent() The function ext4_update_i_disksize() is used in only one place, in the function mpage_map_and_submit_extent(). Move its code to simplify the code paths, and also move the call to ext4_mark_inode_dirty() into the i_data_sem's critical region, to be consistent with all of the other places where we update i_disksize. That way, we also keep the raw_inode's i_disksize protected, to avoid the following race: CPU #1 CPU #2 down_write(&i_data_sem) Modify i_disk_size up_write(&i_data_sem) down_write(&i_data_sem) Modify i_disk_size Copy i_disk_size to on-disk inode up_write(&i_data_sem) Copy i_disk_size to on-disk inode Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 17 ----------------- fs/ext4/inode.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f1c65dc7cc0a..66946aa62127 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2466,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7b93df9aa182..f023f0cb46fc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2247,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", -- cgit v1.2.3 From 9ef06cec7c96f6bf59f1dd8b64b9645820099051 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:47:00 -0400 Subject: ext4: remove unnecessary check for APPEND and IMMUTABLE All the checks IS_APPEND and IS_IMMUTABLE for the fallocate operation on the inode are done in vfs. No need to do this again in ext4. Remove it. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ------ fs/ext4/inode.c | 6 +----- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index be1e56cbbf32..ed4ec48239b6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5398,12 +5398,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f023f0cb46fc..e2bba76f0d7b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3541,11 +3541,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } + if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; -- cgit v1.2.3 From 8fc61d92630d1c96057a94c61e1643475045b25b Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:51:34 -0400 Subject: fs: prevent doing FALLOC_FL_ZERO_RANGE on append only file Currently punch hole and collapse range fallocate operation are not allowed on append only file. This should be case for zero range as well. Fix it by allowing only pure fallocate (possibly with keep size set). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/open.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 631aea815def..3a83253d3373 100644 --- a/fs/open.c +++ b/fs/open.c @@ -254,11 +254,9 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EBADF; /* - * It's not possible to punch hole or perform collapse range - * on append only file + * We can only allow pure fallocate on append only files */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE) - && IS_APPEND(inode)) + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) -- cgit v1.2.3 From 23fffa925ea2c9a2bcb1a4453e2c542635aa3545 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:56:41 -0400 Subject: fs: move falloc collapse range check into the filesystem methods Currently in do_fallocate in collapse range case we're checking whether offset + len is not bigger than i_size. However there is nothing which would prevent i_size from changing so the check is pointless. It should be done in the file system itself and the file system needs to make sure that i_size is not going to change. The i_size check for the other fallocate modes are also done in the filesystems. As it is now we can easily crash the kernel by having two processes doing truncate and fallocate collapse range at the same time. This can be reproduced on ext4 and it is theoretically possible on xfs even though I was not able to trigger it with this simple test. This commit removes the check from do_fallocate and adds it to the file system. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Acked-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/ext4/extents.c | 11 +++++++++-- fs/open.c | 8 -------- fs/xfs/xfs_file.c | 10 +++++++++- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ed4ec48239b6..ac5460d0d133 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5368,8 +5368,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - BUG_ON(offset + len > i_size_read(inode)); - /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || len & (EXT4_BLOCK_SIZE(sb) - 1)) @@ -5398,6 +5396,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Take mutex lock */ mutex_lock(&inode->i_mutex); + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; diff --git a/fs/open.c b/fs/open.c index 3a83253d3373..adf34202213a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -284,14 +284,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, in which case - * it is effectively a truncate operation - */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (offset + len >= i_size_read(inode))) - return -EINVAL; - if (!file->f_op->fallocate) return -EOPNOTSUPP; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7abff8c16ca..3cb528c4f27c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -840,7 +840,15 @@ xfs_file_fallocate( goto out_unlock; } - ASSERT(offset + len < i_size_read(inode)); + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + new_size = i_size_read(inode) - len; error = xfs_collapse_file_space(ip, offset, len); -- cgit v1.2.3 From 0790b31b69374ddadefebb156251b319e5b43345 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 10:05:37 -0400 Subject: fs: disallow all fallocate operation on active swapfile Currently some file system have IS_SWAPFILE check in their fallocate implementations and some do not. However we should really prevent any fallocate operation on swapfile so move the check to vfs and remove the redundant checks from the file systems fallocate implementations. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ceph/file.c | 3 --- fs/ext4/extents.c | 5 ----- fs/ext4/inode.c | 5 ----- fs/open.c | 7 +++++++ 4 files changed, 7 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 09c7afe32e49..596e6cc9f9c4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1215,9 +1215,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac5460d0d133..b2d3869b5762 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5405,11 +5405,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } - /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e2bba76f0d7b..b74cfd2a42ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3542,11 +3542,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) mutex_lock(&inode->i_mutex); - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } - /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; diff --git a/fs/open.c b/fs/open.c index adf34202213a..7b823daa6a93 100644 --- a/fs/open.c +++ b/fs/open.c @@ -262,6 +262,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_IMMUTABLE(inode)) return -EPERM; + /* + * We can not allow to do any fallocate operation on an active + * swapfile + */ + if (IS_SWAPFILE(inode)) + ret = -ETXTBSY; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. -- cgit v1.2.3 From 6e6358fc3c3c862bfe9a5bc029d3f8ce43dc9765 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Apr 2014 12:45:25 -0400 Subject: ext4: use i_size_read in ext4_unaligned_aio() We haven't taken i_mutex yet, so we need to use i_size_read(). Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6db7f7db7777..bc765591101a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) -- cgit v1.2.3 From 847c6c422aa0ae81a5517a9558ec2737806dca48 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sat, 12 Apr 2014 12:45:55 -0400 Subject: ext4: fix byte order problems introduced by the COLLAPSE_RANGE patches This commit tries to fix some byte order issues that is found by sparse check. $ make M=fs/ext4 C=2 CF=-D__CHECK_ENDIAN__ ... CHECK fs/ext4/extents.c fs/ext4/extents.c:5232:41: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5236:52: warning: bad assignment (-=) to restricted __le32 fs/ext4/extents.c:5258:45: warning: bad assignment (-=) to restricted __le32 fs/ext4/extents.c:5303:28: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5318:18: warning: incorrect type in assignment (different base types) fs/ext4/extents.c:5318:18: expected unsigned int [unsigned] [usertype] ex_start fs/ext4/extents.c:5318:18: got restricted __le32 [usertype] ee_block fs/ext4/extents.c:5319:24: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5334:31: warning: incorrect type in assignment (different base types) ... Cc: Andreas Dilger Cc: Namjae Jeon Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b2d3869b5762..f24ef8697609 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5229,11 +5229,11 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = ex_last->ee_block + + *start = le32_to_cpu(ex_last->ee_block) + ext4_ext_get_actual_len(ex_last); while (ex_start <= ex_last) { - ex_start->ee_block -= shift; + le32_add_cpu(&ex_start->ee_block, -shift); if (ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) { if (ext4_ext_try_to_merge_right(inode, @@ -5255,7 +5255,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - path[depth].p_idx->ei_block -= shift; + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5300,7 +5300,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return ret; } - stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5315,8 +5316,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, path = ext4_ext_find_extent(inode, start - 1, NULL, 0); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = extent->ee_block; - ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5331,7 +5333,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - current_block = extent->ee_block; + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ ret = mext_next_extent(inode, path, &extent); -- cgit v1.2.3 From 40c406c74eb9eed58ae7d4d12a0197f7279c9499 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Apr 2014 22:53:53 -0400 Subject: ext4: COLLAPSE_RANGE only works on extent-based files Unfortunately, we weren't checking to make sure of this the inode was extent-based before attempt operate on it. Hilarity ensues. Signed-off-by: "Theodore Ts'o" Cc: Namjae Jeon --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f24ef8697609..96e0a4bc8faa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4878,9 +4878,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4892,6 +4889,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); -- cgit v1.2.3 From e2cbd587418251bb73c4c1e8e2c7c1816d7a98d9 Mon Sep 17 00:00:00 2001 From: jon ernst Date: Sat, 12 Apr 2014 23:01:28 -0400 Subject: ext4: silence sparse check warning for function ext4_trim_extent This fixes the following sparse warning: CHECK fs/ext4/mballoc.c fs/ext4/mballoc.c:5019:9: warning: context imbalance in 'ext4_trim_extent' - unexpected unlock Signed-off-by: "Jon Ernst" Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 73ccbb3b973b..c8238a26818c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5016,6 +5016,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; -- cgit v1.2.3 From 8dc79ec4c0537e1b83c0739af82a7babefb30012 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 13 Apr 2014 15:05:42 -0400 Subject: ext4: fix error handling in ext4_ext_shift_extents Fix error handling by adding some. :-) Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 96e0a4bc8faa..38be06354b88 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5314,11 +5314,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * enough to accomodate the shift. */ path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } ext4_ext_drop_refs(path); kfree(path); -- cgit v1.2.3 From a18ed359bdddcded4f97ff5e2f07793ff9336913 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 13 Apr 2014 15:41:13 -0400 Subject: ext4: always check ext4_ext_find_extent result Where are some places where logic guaranties us that extent we are searching exits, but this may not be true due to on-disk data corruption. If such corruption happens we must prevent possible null pointer dereferences. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 38be06354b88..64b400356cad 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3313,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } uninitialized = ext4_ext_is_uninitialized(ex); split_flag1 = 0; @@ -3694,6 +3699,12 @@ static int ext4_convert_initialized_extents(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } } err = ext4_ext_get_access(handle, inode, path + depth); @@ -5340,6 +5351,12 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ -- cgit v1.2.3 From 0040e606e35a0db80fc3fac04ccc7c7176a8e2b1 Mon Sep 17 00:00:00 2001 From: Christoph Jaeger Date: Sat, 12 Apr 2014 13:33:13 +0200 Subject: btrfs: fix use-after-free in mount_subvol() Pointer 'newargs' is used after the memory that it points to has already been freed. Picked up by Coverity - CID 1201425. Fixes: 0723a0473f ("btrfs: allow mounting btrfs subvolumes with different ro/rw options") Signed-off-by: Christoph Jaeger Signed-off-by: Chris Mason --- fs/btrfs/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 994c40955315..53bc3733d483 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1186,7 +1186,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_PTR(-ENOMEM); mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - kfree(newargs); if (PTR_RET(mnt) == -EBUSY) { if (flags & MS_RDONLY) { @@ -1196,17 +1195,22 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, int r; mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, newargs); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + kfree(newargs); return ERR_CAST(mnt); + } r = btrfs_remount(mnt->mnt_sb, &flags, NULL); if (r < 0) { /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); return ERR_PTR(r); } } } + kfree(newargs); + if (IS_ERR(mnt)) return ERR_CAST(mnt); -- cgit v1.2.3 From 036acea2ceabd19cb5734ae7a9d64c0a5ef90484 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 14 Apr 2014 23:36:15 -0400 Subject: ext4: fix ext4_count_free_clusters() with EXT4FS_DEBUG and bigalloc enabled With bigalloc enabled we must use EXT4_CLUSTERS_PER_GROUP() instead of EXT4_BLOCKS_PER_GROUP() otherwise we will go beyond the allocated buffer. $ mount -t ext4 /dev/vde /vde [ 70.573993] EXT4-fs DEBUG (fs/ext4/mballoc.c, 2346): ext4_mb_alloc_groupinfo: [ 70.575174] allocated s_groupinfo array for 1 meta_bg's [ 70.576172] EXT4-fs DEBUG (fs/ext4/super.c, 2092): ext4_check_descriptors: [ 70.576972] Checking group descriptorsBUG: unable to handle kernel paging request at ffff88006ab56000 [ 72.463686] IP: [] __bitmap_weight+0x2a/0x7f [ 72.464168] PGD 295e067 PUD 2961067 PMD 7fa8e067 PTE 800000006ab56060 [ 72.464738] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 72.465139] Modules linked in: [ 72.465402] CPU: 1 PID: 3560 Comm: mount Tainted: G W 3.14.0-rc2-00069-ge57bce1 #60 [ 72.466079] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 72.466505] task: ffff88007ce6c8a0 ti: ffff88006b7f0000 task.ti: ffff88006b7f0000 [ 72.466505] RIP: 0010:[] [] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP: 0018:ffff88006b7f1c00 EFLAGS: 00010206 [ 72.466505] RAX: 0000000000000000 RBX: 000000000000050a RCX: 0000000000000040 [ 72.466505] RDX: 0000000000000000 RSI: 0000000000080000 RDI: 0000000000000000 [ 72.466505] RBP: ffff88006b7f1c28 R08: 0000000000000002 R09: 0000000000000000 [ 72.466505] R10: 000000000000babe R11: 0000000000000400 R12: 0000000000080000 [ 72.466505] R13: 0000000000000200 R14: 0000000000002000 R15: ffff88006ab55000 [ 72.466505] FS: 00007f43ba1fa840(0000) GS:ffff88007f800000(0000) knlGS:0000000000000000 [ 72.466505] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 72.466505] CR2: ffff88006ab56000 CR3: 000000006b7e6000 CR4: 00000000000006e0 [ 72.466505] Stack: [ 72.466505] ffff88006ab65000 0000000000000000 0000000000000000 0000000000010000 [ 72.466505] ffff88006ab6f400 ffff88006b7f1c58 ffffffff81396bb8 0000000000010000 [ 72.466505] 0000000000000000 ffff88007b869a90 ffff88006a48a000 ffff88006b7f1c70 [ 72.466505] Call Trace: [ 72.466505] [] memweight+0x5f/0x8a [ 72.466505] [] ext4_count_free+0x13/0x21 [ 72.466505] [] ext4_count_free_clusters+0xdb/0x171 [ 72.466505] [] ext4_fill_super+0x117c/0x28ef [ 72.466505] [] ? vsnprintf+0x1c7/0x3f7 [ 72.466505] [] mount_bdev+0x145/0x19c [ 72.466505] [] ? ext4_calculate_overhead+0x2a1/0x2a1 [ 72.466505] [] ext4_mount+0x15/0x17 [ 72.466505] [] mount_fs+0x67/0x150 [ 72.466505] [] vfs_kern_mount+0x64/0xde [ 72.466505] [] do_mount+0x6fe/0x7f5 [ 72.466505] [] ? strndup_user+0x3a/0xd9 [ 72.466505] [] SyS_mount+0x85/0xbe [ 72.466505] [] tracesys+0xdd/0xe2 [ 72.466505] Code: c3 89 f0 b9 40 00 00 00 55 99 48 89 e5 41 57 f7 f9 41 56 49 89 ff 41 55 45 31 ed 41 54 41 89 f4 53 31 db 41 89 c6 45 39 ee 7e 10 <4b> 8b 3c ef 49 ff c5 e8 bf ff ff ff 01 c3 eb eb 31 c0 45 85 f6 [ 72.466505] RIP [] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP [ 72.466505] CR2: ffff88006ab56000 [ 72.466505] ---[ end trace 7d051a08ae138573 ]--- Killed Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bbc..5c56785007e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; -- cgit v1.2.3 From f1c6bb2cb8b81013e8979806f8e15e3d53efb96d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Apr 2014 06:17:49 -0400 Subject: locks: allow __break_lease to sleep even when break_time is 0 A fl->fl_break_time of 0 has a special meaning to the lease break code that basically means "never break the lease". knfsd uses this to ensure that leases don't disappear out from under it. Unfortunately, the code in __break_lease can end up passing this value to wait_event_interruptible as a timeout, which prevents it from going to sleep at all. This makes __break_lease to spin in a tight loop and causes soft lockups. Fix this by ensuring that we pass a minimum value of 1 as a timeout instead. Cc: Cc: J. Bruce Fields Reported-by: Terry Barnaby Signed-off-by: Jeff Layton --- fs/locks.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 13fc7a6d380a..b380f5543614 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1391,11 +1391,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, -- cgit v1.2.3 From e02ba72aabfade4c9cd6e3263e9b57bf890ad25c Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 15 Apr 2014 11:31:33 -0700 Subject: aio: block io_destroy() until all context requests are completed deletes aio context and all resources related to. It makes sense that no IO operations connected to the context should be running after the context is destroyed. As we removed io_context we have no chance to get requests status or call io_getevents(). man page for io_destroy says that this function may block until all context's requests are completed. Before kernel 3.11 io_destroy() blocked indeed, but since aio refactoring in 3.11 it is not true anymore. Here is a pseudo-code that shows a testcase for a race condition discovered in 3.11: initialize io_context io_submit(read to buffer) io_destroy() // context is destroyed so we can free the resources free(buffers); // if the buffer is allocated by some other user he'll be surprised // to learn that the buffer still filled by an outstanding operation // from the destroyed io_context The fix is straight-forward - add a completion struct and wait on it in io_destroy, complete() should be called when number of in-fligh requests reaches zero. If two or more io_destroy() called for the same context simultaneously then only the first one waits for IO completion, other calls behaviour is undefined. Tested: ran http://pastebin.com/LrPsQ4RL testcase for several hours and do not see the race condition anymore. Signed-off-by: Anatol Pomozov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 12a3de0ee6da..2adbb0398ab9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -112,6 +112,11 @@ struct kioctx { struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); } @@ -718,7 +727,8 @@ err: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { if (!atomic_xchg(&ctx->dead, 1)) { struct kioctx_table *table; @@ -747,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); + ctx->requests_done = requests_done; percpu_ref_kill(&ctx->users); + } else { + if (requests_done) + complete(requests_done); } } @@ -809,7 +823,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } @@ -1185,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1203,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + wait_for_completion(&requests_done); + return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); -- cgit v1.2.3 From 4991a628a789dc5954e98e79476d9808812292ec Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Apr 2014 08:44:12 -0400 Subject: locks: allow __break_lease to sleep even when break_time is 0 A fl->fl_break_time of 0 has a special meaning to the lease break code that basically means "never break the lease". knfsd uses this to ensure that leases don't disappear out from under it. Unfortunately, the code in __break_lease can end up passing this value to wait_event_interruptible as a timeout, which prevents it from going to sleep at all. This causes __break_lease to spin in a tight loop and causes soft lockups. Fix this by ensuring that we pass a minimum value of 1 as a timeout instead. Cc: Cc: J. Bruce Fields Reported-by: Terry Barnaby Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/locks.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 13fc7a6d380a..b380f5543614 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1391,11 +1391,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, -- cgit v1.2.3 From 3758cf7e14b753838fe754ede3862af10b35fdac Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Apr 2014 08:51:48 -0400 Subject: nfsd: set timeparms.to_maxval in setup_callback_client ...otherwise the logic in the timeout handling doesn't work correctly. Spotted-by: Trond Myklebust Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 39c8ef875f91..2c73cae9899d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -654,9 +654,11 @@ static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { - .to_initval = max_cb_time(clp->net), + .to_initval = maxtime, .to_retries = 0, + .to_maxval = maxtime, }; struct rpc_create_args args = { .net = clp->net, -- cgit v1.2.3 From fc208d026be0c7d60db9118583fc62f6ca97743d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 9 Apr 2014 11:07:01 -0400 Subject: Revert "nfsd4: fix nfs4err_resource in 4.1 case" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we're still limiting attributes to a page, the result here is that a large getattr result will return NFS4ERR_REP_TOO_BIG/TOO_BIG_TO_CACHE instead of NFS4ERR_RESOURCE. Both error returns are wrong, and the real bug here is the arbitrary limit on getattr results, fixed by as-yet out-of-tree patches. But at a minimum we can make life easier for clients by sticking to one broken behavior in released kernels instead of two.... Trond says: one immediate consequence of this patch will be that NFSv4.1 clients will now report EIO instead of EREMOTEIO if they hit the problem. That may make debugging a little less obvious. Another consequence will be that if we ever do try to add client side handling of NFS4ERR_REP_TOO_BIG, then we now have to deal with the “handle existing buggy server” syndrome. Reported-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2723c1badd01..18881f34737a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3627,14 +3627,6 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) /* nfsd4_check_resp_size guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); - if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { - struct nfsd4_slot *slot = resp->cstate.slot; - - if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) - op->status = nfserr_rep_too_big_to_cache; - else - op->status = nfserr_rep_too_big; - } if (so) { so->so_replay.rp_status = op->status; so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); -- cgit v1.2.3 From 694c793fc1ade0946149c5f8d43f71e0728c4e81 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:21:15 -0400 Subject: ext4: use truncate_pagecache() in collapse range We should be using truncate_pagecache() instead of truncate_pagecache_range() in the collapse range because we're truncating page cache from offset to the end of file. truncate_pagecache() also get rid of the private COWed pages from the range because we're going to shift the end of the file. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 64b400356cad..3de9b2d7028c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5437,7 +5437,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache_range(inode, offset, -1); + truncate_pagecache(inode, offset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); -- cgit v1.2.3 From 1a66c7c3bea52ba0f7596b8940d74fce75281d16 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:41:52 -0400 Subject: ext4: use filemap_write_and_wait_range() correctly in collapse range Currently we're passing -1 as lend argumnet for filemap_write_and_wait_range() which is wrong since lend is signed type so it would cause some confusion and we might not write_and_wait for the entire range we're expecting to write. Fix it by using LLONG_MAX instead. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3de9b2d7028c..f4a676908b0b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5415,7 +5415,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) return ret; -- cgit v1.2.3 From 2c1d23289bc2f7cfa358bc856b87a992dcb11ad5 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:43:21 -0400 Subject: ext4: fix removing status extents in ext4_collapse_range() Currently in ext4_collapse_range() when calling ext4_es_remove_extent() to remove status extents we're passing (EXT_MAX_BLOCKS - punch_start - 1) in order to remove all extents from start of the collapse range to the end of the file. However this is wrong because we might miss the possible extent covering the last block of the file. Fix it by removing the -1. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Namjae Jeon --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f4a676908b0b..c6f624582d37 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5454,7 +5454,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start - 1); + EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; -- cgit v1.2.3 From 9337d5d31ab798f0c74150506371551a9195251a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:48:25 -0400 Subject: ext4: no need to truncate pagecache twice in collapse range We're already calling truncate_pagecache() before we attempt to do any actual job so there is not need to truncate pagecache once more using truncate_setsize() after we're finished. Remove truncate_setsize() and replace it just with i_size_write() note that we're holding appropriate locks. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c6f624582d37..3ee60e2e2ac7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5474,7 +5474,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } new_size = i_size_read(inode) - len; - truncate_setsize(inode, new_size); + i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; ext4_discard_preallocations(inode); -- cgit v1.2.3 From ef24f6c234de9a03aed9368163dbaad9a4f6391f Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:50:23 -0400 Subject: ext4: discard preallocations after removing space Currently in ext4_collapse_range() and ext4_punch_hole() we're discarding preallocation twice. Once before we attempt to do any changes and second time after we're done with the changes. While the second call to ext4_discard_preallocations() in ext4_punch_hole() case is not needed, we need to discard preallocation right after ext4_ext_remove_space() in collapse range case because in the case we had to restart a transaction in the middle of removing space we might have new preallocations created. Remove unneeded ext4_discard_preallocations() ext4_punch_hole() and move it to the better place in ext4_collapse_range() Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ee60e2e2ac7..eb7be8f08e10 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5465,6 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start); @@ -5477,7 +5478,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b74cfd2a42ec..d7b7462a0e13 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3621,7 +3621,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); -- cgit v1.2.3 From 6dd834effc12ba71092d9d1e4944530234b58ab1 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:55:24 -0400 Subject: ext4: fix extent merging in ext4_ext_shift_path_extents() There is a bug in ext4_ext_shift_path_extents() where if we actually manage to merge a extent we would skip shifting the next extent. This will result in in one extent in the extent tree not being properly shifted. This is causing failure in various xfstests tests using fsx or fsstress with collapse range support. It will also cause file system corruption which looks something like: e2fsck 1.42.9 (4-Feb-2014) Pass 1: Checking inodes, blocks, and sizes Inode 20 has out of order extents (invalid logical block 3, physical block 492938, len 2) Clear? yes ... when running e2fsck. It's also very easily reproducible just by running fsx without any parameters. I can usually hit the problem within a minute. Fix it by increasing ex_start only if we're not merging the extent. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Namjae Jeon --- fs/ext4/extents.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb7be8f08e10..d0860f2d36d0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5245,13 +5245,14 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, while (ex_start <= ex_last) { le32_add_cpu(&ex_start->ee_block, -shift); - if (ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) { - if (ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) - ex_last--; - } - ex_start++; + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } err = ext4_ext_dirty(handle, inode, path + depth); if (err) -- cgit v1.2.3 From 6c5e73d3a26b73bfcac0b4a932cb918177d067f2 Mon Sep 17 00:00:00 2001 From: jon ernst Date: Fri, 18 Apr 2014 11:50:35 -0400 Subject: ext4: enforce we are operating on a regular file in ext4_zero_range() Signed-off-by: Jon Ernst Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d0860f2d36d0..2f49b12a4c40 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4741,6 +4741,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* * Write out all dirty pages to avoid race conditions * Then release them. -- cgit v1.2.3 From 86f1ca3889142d5959362c5694db3f3dc26f377a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Apr 2014 11:52:11 -0400 Subject: ext4: use EINVAL if not a regular file in ext4_collapse_range() Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f49b12a4c40..9b9251adb400 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5404,7 +5404,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return -EINVAL; if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; + return -EINVAL; trace_ext4_collapse_range(inode, offset, len); -- cgit v1.2.3 From 22213318af7ae265bc6cd8aef2febbc2d69a2440 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 12:30:58 -0400 Subject: fix races between __d_instantiate() and checks of dentry flags in non-lazy walk we need to be careful about dentry switching from negative to positive - both ->d_flags and ->d_inode are updated, and in some places we might see only one store. The cases where dentry has been obtained by dcache lookup with ->i_mutex held on parent are safe - ->d_lock and ->i_mutex provide all the barriers we need. However, there are several places where we run into trouble: * do_last() fetches ->d_inode, then checks ->d_flags and assumes that inode won't be NULL unless d_is_negative() is true. Race with e.g. creat() - we might have fetched the old value of ->d_inode (still NULL) and new value of ->d_flags (already not DCACHE_MISS_TYPE). Lin Ming has observed and reported the resulting oops. * a bunch of places checks ->d_inode for being non-NULL, then checks ->d_flags for "is it a symlink". Race with symlink(2) in case if our CPU sees ->d_inode update first - we see non-NULL there, but ->d_flags still contains DCACHE_MISS_TYPE instead of DCACHE_SYMLINK_TYPE. Result: false negative on "should we follow link here?", with subsequent unpleasantness. Cc: stable@vger.kernel.org # 3.13 and 3.14 need that one Reported-and-tested-by: Lin Ming Signed-off-by: Al Viro --- fs/dcache.c | 3 +-- fs/namei.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 40707d88a945..494a9def5dce 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1647,8 +1647,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_ENTRY_TYPE; - dentry->d_flags |= add_flags; + __d_set_type(dentry, add_flags); if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/fs/namei.c b/fs/namei.c index c6157c894fce..80168273396b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1542,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; if (should_follow_link(path->dentry, follow)) { @@ -2249,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; @@ -2994,7 +2994,7 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (d_is_negative(path->dentry)) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } -- cgit v1.2.3 From a8680e0d5efd46aa54d7085e5b4a268f726922c7 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 19 Apr 2014 16:37:31 -0400 Subject: ext4: fix COLLAPSE_RANGE failure with 1KB block size When formatting with 1KB or 2KB(not aligned with PAGE SIZE) block size, xfstests generic/075 and 091 are failing. The offset supplied to function truncate_pagecache_range is block size aligned. In this function start offset is re-aligned to PAGE_SIZE by rounding_up to the next page boundary. Due to this rounding up, old data remains in the page cache when blocksize is less than page size and start offset is not aligned with page size. In case of collapse range, we need to align start offset to page size boundary by doing a round down operation instead of round up. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9b9251adb400..d6bca2a1debe 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5395,7 +5395,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; - loff_t new_size; + loff_t new_size, ioffset; int ret; /* Collapse range works only on fs block size aligned offsets. */ @@ -5418,8 +5418,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); if (ret) return ret; @@ -5441,7 +5448,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, offset); + truncate_pagecache(inode, ioffset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); -- cgit v1.2.3 From 0a04b248532b358b27a8da050642da6f5f304b03 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 19 Apr 2014 16:38:21 -0400 Subject: ext4: disable COLLAPSE_RANGE for bigalloc Once COLLAPSE RANGE is be disable for ext4 with bigalloc feature till finding root-cause of problem. It will be enable with fixing that regression of xfstest(generic 075 and 091) again. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Reviewed-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d6bca2a1debe..01b0c208f625 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5406,6 +5406,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); -- cgit v1.2.3 From 0d3f7a2dd2f5cf9642982515e020c1aee2cf7af6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 22 Apr 2014 08:23:58 -0400 Subject: locks: rename file-private locks to "open file description locks" File-private locks have been merged into Linux for v3.15, and *now* people are commenting that the name and macro definitions for the new file-private locks suck. ...and I can't even disagree. The names and command macros do suck. We're going to have to live with these for a long time, so it's important that we be happy with the names before we're stuck with them. The consensus on the lists so far is that they should be rechristened as "open file description locks". The name isn't a big deal for the kernel, but the command macros are not visually distinct enough from the traditional POSIX lock macros. The glibc and documentation folks are recommending that we change them to look like F_OFD_{GETLK|SETLK|SETLKW}. That lessens the chance that a programmer will typo one of the commands wrong, and also makes it easier to spot this difference when reading code. This patch makes the following changes that I think are necessary before v3.15 ships: 1) rename the command macros to their new names. These end up in the uapi headers and so are part of the external-facing API. It turns out that glibc doesn't actually use the fcntl.h uapi header, but it's hard to be sure that something else won't. Changing it now is safest. 2) make the the /proc/locks output display these as type "OFDLCK" Cc: Michael Kerrisk Cc: Christoph Hellwig Cc: Carlos O'Donell Cc: Stefan Metzmacher Cc: Andy Lutomirski Cc: Frank Filz Cc: Theodore Ts'o Signed-off-by: Jeff Layton --- arch/arm/kernel/sys_oabi-compat.c | 6 +++--- fs/compat.c | 14 +++++++------- fs/fcntl.c | 12 ++++++------ fs/locks.c | 14 +++++++------- include/uapi/asm-generic/fcntl.h | 20 ++++++++++---------- security/selinux/hooks.c | 6 +++--- 6 files changed, 36 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 702bd329d9d0..e90a3148f385 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -203,9 +203,9 @@ asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd, int ret; switch (cmd) { - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: case F_GETLK64: case F_SETLK64: case F_SETLKW64: diff --git a/fs/compat.c b/fs/compat.c index ca926ad0430c..66d3d3c6b4b2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -457,9 +457,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: ret = get_compat_flock64(&f, compat_ptr(arg)); if (ret != 0) break; @@ -468,7 +468,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, conv_cmd = convert_fcntl_cmd(cmd); ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) { + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { /* need to return lock information - see above for commentary */ if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; @@ -493,9 +493,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: return -EINVAL; } return compat_sys_fcntl64(fd, cmd, arg); diff --git a/fs/fcntl.c b/fs/fcntl.c index 9ead1596399a..72c82f69b01b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -274,15 +274,15 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_GETLKP: + case F_OFD_GETLK: #endif case F_GETLK: err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: #endif /* Fallthrough */ case F_SETLK: @@ -399,13 +399,13 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: - case F_GETLKP: + case F_OFD_GETLK: err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); break; case F_SETLK64: case F_SETLKW64: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: err = fcntl_setlk64(fd, f.file, cmd, (struct flock64 __user *) arg); break; diff --git a/fs/locks.c b/fs/locks.c index b380f5543614..e1023b504279 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1941,7 +1941,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; @@ -2076,7 +2076,7 @@ again: * FL_FILE_PVT flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; @@ -2085,7 +2085,7 @@ again: file_lock->fl_flags |= FL_FILE_PVT; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; @@ -2143,7 +2143,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; @@ -2211,7 +2211,7 @@ again: * FL_FILE_PVT flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; @@ -2220,7 +2220,7 @@ again: file_lock->fl_flags |= FL_FILE_PVT; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; @@ -2413,7 +2413,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (fl->fl_flags & FL_ACCESS) seq_printf(f, "ACCESS"); else if (IS_FILE_PVT(fl)) - seq_printf(f, "FLPVT "); + seq_printf(f, "OFDLCK"); else seq_printf(f, "POSIX "); diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index a9b13f8b3595..7543b3e51331 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -133,20 +133,20 @@ #endif /* - * fd "private" POSIX locks. + * Open File Description Locks * - * Usually POSIX locks held by a process are released on *any* close and are + * Usually record locks held by a process are released on *any* close and are * not inherited across a fork(). * - * These cmd values will set locks that conflict with normal POSIX locks, but - * are "owned" by the opened file, not the process. This means that they are - * inherited across fork() like BSD (flock) locks, and they are only released - * automatically when the last reference to the the open file against which - * they were acquired is put. + * These cmd values will set locks that conflict with process-associated + * record locks, but are "owned" by the open file description, not the + * process. This means that they are inherited across fork() like BSD (flock) + * locks, and they are only released automatically when the last reference to + * the the open file against which they were acquired is put. */ -#define F_GETLKP 36 -#define F_SETLKP 37 -#define F_SETLKPW 38 +#define F_OFD_GETLK 36 +#define F_OFD_SETLK 37 +#define F_OFD_SETLKW 38 #define F_OWNER_TID 0 #define F_OWNER_PID 1 diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b4beb77967b1..2c7341dbc5d6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3317,9 +3317,9 @@ static int selinux_file_fcntl(struct file *file, unsigned int cmd, case F_GETLK: case F_SETLK: case F_SETLKW: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: #if BITS_PER_LONG == 32 case F_GETLK64: case F_SETLK64: -- cgit v1.2.3 From cff2fce58b2b0f59089e7edcdc38803d65057b9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 22 Apr 2014 08:24:32 -0400 Subject: locks: rename FL_FILE_PVT and IS_FILE_PVT to use "*_OFDLCK" instead File-private locks have been re-christened as "open file description" locks. Finish the symbol name cleanup in the internal implementation. Signed-off-by: Jeff Layton --- fs/locks.c | 34 +++++++++++++++++----------------- include/linux/fs.h | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index e1023b504279..e663aeac579e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,7 +135,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) -#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) { @@ -564,7 +564,7 @@ static void __locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); - if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker)) + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); } @@ -759,12 +759,12 @@ EXPORT_SYMBOL(posix_test_lock); * of tasks (such as posix threads) sharing the same open file table. * To handle those cases, we just bail out after a few iterations. * - * For FL_FILE_PVT locks, the owner is the filp, not the files_struct. + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. * Because the owner is not even nominally tied to a thread of * execution, the deadlock detection below can't reasonably work well. Just * skip it for those. * - * In principle, we could do a more limited deadlock detection on FL_FILE_PVT + * In principle, we could do a more limited deadlock detection on FL_OFDLCK * locks that just checks for the case where two tasks are attempting to * upgrade from read to write locks on the same inode. */ @@ -791,9 +791,9 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, /* * This deadlock detector can't reasonably detect deadlocks with - * FL_FILE_PVT locks, since they aren't owned by a process, per-se. + * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ - if (IS_FILE_PVT(caller_fl)) + if (IS_OFDLCK(caller_fl)) return 0; while ((block_fl = what_owner_is_waiting_for(block_fl))) { @@ -1890,7 +1890,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1912,7 +1912,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1947,7 +1947,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2073,7 +2073,7 @@ again: /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { case F_OFD_SETLK: @@ -2082,7 +2082,7 @@ again: goto out; cmd = F_SETLK; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; case F_OFD_SETLKW: @@ -2091,7 +2091,7 @@ again: goto out; cmd = F_SETLKW; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW: @@ -2149,7 +2149,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2208,7 +2208,7 @@ again: /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { case F_OFD_SETLK: @@ -2217,7 +2217,7 @@ again: goto out; cmd = F_SETLK64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; case F_OFD_SETLKW: @@ -2226,7 +2226,7 @@ again: goto out; cmd = F_SETLKW64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW64: @@ -2412,7 +2412,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, if (IS_POSIX(fl)) { if (fl->fl_flags & FL_ACCESS) seq_printf(f, "ACCESS"); - else if (IS_FILE_PVT(fl)) + else if (IS_OFDLCK(fl)) seq_printf(f, "OFDLCK"); else seq_printf(f, "POSIX "); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a9c5bca2b76..878031227c57 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -815,7 +815,7 @@ static inline struct file *get_file(struct file *f) #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ -#define FL_FILE_PVT 1024 /* lock is private to the file */ +#define FL_OFDLCK 1024 /* lock is "owned" by struct file */ /* * Special return value from posix_lock_file() and vfs_lock_file() for -- cgit v1.2.3 From c5f7d0bb29df2e1848a236e58e201daf5b4e0f21 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 15 Apr 2014 10:41:00 +0800 Subject: btrfs: Change the hole range to a more accurate value. Commit 3ac0d7b96a268a98bd474cab8bce3a9f125aaccf fixed the btrfs expanding write problem but the hole punched is sometimes too large for some iovec, which has unmapped data ranges. This patch will change to hole range to a more accurate value using the counts checked by the write check routines. Reported-by: Al Viro Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 23f6a9d9f104..e7e78fa9085e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1783,7 +1783,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + iov->iov_len, root->sectorsize); + end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 3f9e3df8da3c51649c15db249978a10f7374236a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Apr 2014 18:50:17 +0200 Subject: btrfs: replace error code from btrfs_drop_extents There's a case which clone does not handle and used to BUG_ON instead, (testcase xfstests/btrfs/035), now returns EINVAL. This error code is confusing to the ioctl caller, as it normally signifies errorneous arguments. Change it to ENOPNOTSUPP which allows a fall back to copy instead of clone. This does not affect the common reflink operation. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/file.c | 6 +++--- fs/btrfs/ioctl.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e7e78fa9085e..1eee3f79d75f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -805,7 +805,7 @@ next_slot: if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -851,7 +851,7 @@ next_slot: */ if (start <= key.offset && end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -877,7 +877,7 @@ next_slot: if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f2e8fcc279a3..7b001abc73c7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3088,7 +3088,7 @@ process_slot: new_key.offset + datal, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); @@ -3163,7 +3163,7 @@ process_slot: new_key.offset + datal, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); -- cgit v1.2.3 From 9d89ce658718d9d21465666127a15ca2c82c4ea7 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 23 Apr 2014 19:33:33 +0800 Subject: Btrfs: move btrfs_{set,clear}_and_info() to ctree.h Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 ++++++++++++++ fs/btrfs/super.c | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad1a5943a923..e7c9e1c540f4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2058,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 53bc3733d483..363404b9713f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -385,20 +385,6 @@ static match_table_t tokens = { {Opt_err, NULL}, }; -#define btrfs_set_and_info(root, opt, fmt, args...) \ -{ \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ -} - -#define btrfs_clear_and_info(root, opt, fmt, args...) \ -{ \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ -} - /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. -- cgit v1.2.3 From e60efa84252c059bde5f65fccc6af94478d39e3b Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 23 Apr 2014 19:33:34 +0800 Subject: Btrfs: avoid triggering bug_on() when we fail to start inode caching task When running stress test(including snapshots,balance,fstress), we trigger the following BUG_ON() which is because we fail to start inode caching task. [ 181.131945] kernel BUG at fs/btrfs/inode-map.c:179! [ 181.137963] invalid opcode: 0000 [#1] SMP [ 181.217096] CPU: 11 PID: 2532 Comm: btrfs Not tainted 3.14.0 #1 [ 181.240521] task: ffff88013b621b30 ti: ffff8800b6ada000 task.ti: ffff8800b6ada000 [ 181.367506] Call Trace: [ 181.371107] [] btrfs_return_ino+0x9e/0x110 [btrfs] [ 181.379191] [] btrfs_evict_inode+0x46b/0x4c0 [btrfs] [ 181.387464] [] ? autoremove_wake_function+0x40/0x40 [ 181.395642] [] evict+0x9e/0x190 [ 181.401882] [] iput+0xf3/0x180 [ 181.408025] [] btrfs_orphan_cleanup+0x1ee/0x430 [btrfs] [ 181.416614] [] btrfs_mksubvol.isra.29+0x3bd/0x450 [btrfs] [ 181.425399] [] btrfs_ioctl_snap_create_transid+0x186/0x190 [btrfs] [ 181.435059] [] btrfs_ioctl_snap_create_v2+0xeb/0x130 [btrfs] [ 181.444148] [] btrfs_ioctl+0xf76/0x2b90 [btrfs] [ 181.451971] [] ? handle_mm_fault+0x475/0xe80 [ 181.459509] [] ? __do_page_fault+0x1ec/0x520 [ 181.467046] [] ? do_mmap_pgoff+0x2f5/0x3c0 [ 181.474393] [] do_vfs_ioctl+0x2d8/0x4b0 [ 181.481450] [] SyS_ioctl+0x81/0xa0 [ 181.488021] [] system_call_fastpath+0x16/0x1b We should avoid triggering BUG_ON() here, instead, we output warning messages and clear inode_cache option. Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc8ca193d830..8ad529e3e67e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) -- cgit v1.2.3 From 28c16cbbc32781224309e50cc99c684f2498bc59 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 23 Apr 2014 19:33:35 +0800 Subject: Btrfs: fix possible memory leaks in open_ctree() Fix possible memory leaks in the following error handling paths: read_tree_block() btrfs_recover_log_trees btrfs_commit_super() btrfs_find_orphan_roots() btrfs_cleanup_fs_roots() Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d1ac7d46f81..0e4fb4a5c7f0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2864,7 +2864,7 @@ retry_root_backup: printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2873,24 +2873,24 @@ retry_root_backup: "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) { ret = btrfs_commit_super(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; ret = btrfs_recover_relocation(tree_root); if (ret < 0) { -- cgit v1.2.3 From 1c70d8fb4dfa95bee491816b2a6767b5ca1080e7 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 23 Apr 2014 19:33:36 +0800 Subject: Btrfs: fix inode caching vs tree log Currently, with inode cache enabled, we will reuse its inode id immediately after unlinking file, we may hit something like following: |->iput inode |->return inode id into inode cache |->create dir,fsync |->power off An easy way to reproduce this problem is: mkfs.btrfs -f /dev/sdb mount /dev/sdb /mnt -o inode_cache,commit=100 dd if=/dev/zero of=/mnt/data bs=1M count=10 oflag=sync inode_id=`ls -i /mnt/data | awk '{print $1}'` rm -f /mnt/data i=1 while [ 1 ] do mkdir /mnt/dir_$i test1=`stat /mnt/dir_$i | grep Inode: | awk '{print $4}'` if [ $test1 -eq $inode_id ] then dd if=/dev/zero of=/mnt/dir_$i/data bs=1M count=1 oflag=sync echo b > /proc/sysrq-trigger fi sleep 1 i=$(($i+1)) done mount /dev/sdb /mnt umount /dev/sdb btrfs check /dev/sdb We fix this problem by adding unlinked inode's id into pinned tree, and we can not reuse them until committing transaction. Cc: stable@vger.kernel.org Signed-off-by: Miao Xie Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8ad529e3e67e..86935f5ae291 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -209,24 +209,14 @@ again: void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - again: if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { @@ -238,11 +228,7 @@ again: start_caching(root); - if (objectid <= root->cache_progress || - objectid >= root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); up_write(&root->fs_info->commit_root_sem); } -- cgit v1.2.3 From 9ce49a0b4ff7f13961d8d106ffae959823d2e758 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 24 Apr 2014 15:15:28 +0100 Subject: Btrfs: use correct key when repeating search for extent item If skinny metadata is enabled and our first tree search fails to find a skinny extent item, we may repeat a tree search for a "fat" extent item (if the previous item in the leaf is not the "fat" extent we're looking for). However we were not setting the new key's objectid to the right value, as we previously used the same key variable to peek at the previous item in the leaf, which has a different objectid. So just set the right objectid to avoid modifying/deleting a wrong item if we repeat the tree search. Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1306487c82cf..678cb352902c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1542,6 +1542,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -5719,6 +5720,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); -- cgit v1.2.3 From f8213bdc89719bad895a02c62c4a85066ff76720 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 24 Apr 2014 15:15:29 +0100 Subject: Btrfs: correctly set profile flags on seqlock retry If we had to retry on the profiles seqlock (due to a concurrent write), we would set bits on the input flags that corresponded both to the current profile and to previous values of the profile. Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 678cb352902c..5590af92094b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3543,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) -- cgit v1.2.3 From c1befb885939cdaaf420c10bbe9ff57aa00446ea Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Thu, 17 Apr 2014 17:52:10 +0800 Subject: kernfs: fix a subdir count leak Currently kernfs_link_sibling() increates parent->dir.subdirs before adding the node into parent's chidren rb tree. Because it is possible that kernfs_link_sibling() couldn't find a suitable slot and bail out, this leads to a mismatch between elevated subdir count with actual children node numbers. This patches fix this problem, by moving the subdir accouting after the actual addtion happening. Signed-off-by: Jianyu Zhan Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 78f3403300af..ac127cd008bf 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -232,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn) struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; - if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; - while (*node) { struct kernfs_node *pos; int result; @@ -249,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn) else return -EEXIST; } + /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + return 0; } -- cgit v1.2.3 From b44b2140265ddfde03acbe809336111d31adb0d1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 20 Apr 2014 08:29:21 -0400 Subject: kernfs: add back missing error check in kernfs_fop_mmap() While updating how mmap enabled kernfs files are handled by lockdep, 9b2db6e18945 ("sysfs: bail early from kernfs_file_mmap() to avoid spurious lockdep warning") inadvertently dropped error return check from kernfs_file_mmap(). The intention was just dropping "if (ops->mmap)" check as the control won't reach the point if the mmap callback isn't implemented, but I mistakenly removed the error return check together with it. This led to Xorg crash on i810 which was reported and bisected to the commit and then to the specific change by Tobias. Signed-off-by: Tejun Heo Reported-and-bisected-by: Tobias Powalowski Tested-by: Tobias Powalowski References: http://lkml.kernel.org/g/533D01BD.1010200@googlemail.com Cc: stable # 3.14 Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8034706a7af8..e01ea4a14a01 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -484,6 +484,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); + if (rc) + goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() -- cgit v1.2.3 From cfd4a535b68faf651b238586011f5bae128391c4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 26 Apr 2014 05:02:03 -0700 Subject: Btrfs: limit the path size in send to PATH_MAX fs_path_ensure_buf is used to make sure our path buffers for send are big enough for the path names as we construct them. The buffer size is limited to 32K by the length field in the struct. But bugs in the path construction can end up trying to build a huge buffer, and we'll do invalid memmmoves when the buffer length field wraps. This patch is step one, preventing the overflows. Signed-off-by: Chris Mason --- fs/btrfs/send.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1ac3ca98c429..eb6537a08c1b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -349,6 +349,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; -- cgit v1.2.3 From 7736e8cc51bbfdbd538c1870c314eb3483fe04ed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 23 Apr 2014 18:14:42 +0200 Subject: fuse: add __exit to fuse_ctl_cleanup fuse_ctl_cleanup is only called by __exit fuse_exit Signed-off-by: Fabian Frederick Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 2 +- fs/fuse/fuse_i.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a9..205e0d5d5307 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a257ed8ebee6..adfa2d505c1a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -725,7 +725,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request -- cgit v1.2.3 From 4adb83029de8ef5144a14dbb5c21de0f156c1a03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:21 +0200 Subject: fuse: check fallocate mode Don't allow new fallocate modes until we figure out what (if anything) that takes. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 13f8bdec5110..982288a1c6ab 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2972,6 +2972,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; -- cgit v1.2.3 From aeb4eb6b55261f44d3705f0a3b9e4906bfa5e416 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:21 +0200 Subject: fuse: fix mtime update error in fsync Bad case of shadowing. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 982288a1c6ab..7ae1e6972caa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -501,7 +501,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - int err = fuse_flush_mtime(file, false); + err = fuse_flush_mtime(file, false); if (err) goto out; } -- cgit v1.2.3 From d31433c8b06d44e27f7637574137dc4b5e6fd1d1 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:21 +0200 Subject: fuse: do not use uninitialized i_mode When inode is in I_NEW state, inode->i_mode is not initialized yet. Do not use it before fuse_init_inode() is called. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8d611696fcad..299e553fcdfd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -303,7 +303,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if ((inode->i_state & I_NEW)) { inode->i_flags |= S_NOATIME; - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; -- cgit v1.2.3 From 009dd694e82098a703b8b5c8dd9f54c131dbb9b3 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:22 +0200 Subject: fuse: update mtime on truncate(2) Handling truncate(2), VFS doesn't set ATTR_MTIME bit in iattr structure; only ATTR_SIZE bit is set. In-kernel fuse must handle the case by setting mtime fields of struct fuse_setattr_in to "now" and set FATTR_MTIME bit even though ATTR_MTIME was not set. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5b4e035b364c..5e361b122526 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1678,6 +1678,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_mtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME; } memset(&inarg, 0, sizeof(inarg)); -- cgit v1.2.3 From 75caeecdf9c7151af5f7d972e2dabbff1bef30a7 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:22 +0200 Subject: fuse: update mtime on open(O_TRUNC) in atomic_o_trunc mode In case of fc->atomic_o_trunc is set, fuse does nothing in fuse_do_setattr() while handling open(O_TRUNC). Hence, i_mtime must be updated explicitly in fuse_finish_open(). The patch also adds extra locking encompassing open(O_TRUNC) operation to avoid races between the truncation and updating i_mtime. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7ae1e6972caa..e68d8c3f063a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) -- cgit v1.2.3 From 93d2269d2ffb871fdfc5555cb5d4a7c0fc56e7fe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:22 +0200 Subject: fuse: fuse: fallocate: use file_update_time() in preparation for getting rid of FUSE_I_MTIME_DIRTY. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e68d8c3f063a..3391f6a840bd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3030,12 +3030,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) { - struct fuse_inode *fi = get_fuse_inode(inode); - - inode->i_mtime = current_fs_time(inode->i_sb); - set_bit(FUSE_I_MTIME_DIRTY, &fi->state); - } + if (changed && fc->writeback_cache) + file_update_time(file); } if (mode & FALLOC_FL_PUNCH_HOLE) -- cgit v1.2.3 From 22401e7b7a686bff02549cd648ba6f73f8dba868 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:23 +0200 Subject: fuse: clean up fsync Don't need to start I/O twice (once without i_mutex and one within). Also make sure that even if the userspace filesystem doesn't support FSYNC we do all the steps other than sending the message. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3391f6a840bd..65586a567d0d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -490,13 +490,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -504,7 +497,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; @@ -515,6 +508,8 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (err) goto out; } + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { -- cgit v1.2.3 From 1e18bda86e2dcc4ecb176213ee34649c93ad1396 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:23 +0200 Subject: fuse: add .write_inode ...and flush mtime from this. This allows us to use the kernel infrastructure for writing out dirty metadata (mtime at this point, but ctime in the next patches and also maybe atime). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 28 +++++++++++----------------- fs/fuse/file.c | 44 +++++++++++++++++++++++++++++++------------- fs/fuse/fuse_i.h | 5 ++--- fs/fuse/inode.c | 1 + 4 files changed, 45 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e361b122526..8c233834591f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1597,23 +1597,17 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct file *file, bool nofail) +int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff) { - struct inode *inode = file->f_mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = NULL; + struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; int err; - if (nofail) { - req = fuse_get_req_nofail_nopages(fc, file); - } else { - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - } + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -1621,15 +1615,15 @@ int fuse_flush_mtime(struct file *file, bool nofail) inarg.valid |= FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); - return err; } @@ -1715,7 +1709,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, /* the kernel maintains i_mtime locally */ if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { inode->i_mtime = attr->ia_mtime; - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, @@ -1953,7 +1947,7 @@ static int fuse_update_time(struct inode *inode, struct timespec *now, { if (flags & S_MTIME) { inode->i_mtime = *now; - set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); + mark_inode_dirty_sync(inode); BUG_ON(!S_ISREG(inode->i_mode)); } return 0; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 65586a567d0d..d228c3962ffd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -324,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file) /* see fuse_vma_close() for !writeback_cache case */ if (fc->writeback_cache) - filemap_write_and_wait(file->f_mapping); - - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) - fuse_flush_mtime(file, true); + write_inode_now(inode, 1); fuse_release_common(file, FUSE_RELEASE); @@ -449,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - err = filemap_write_and_wait(file->f_mapping); + err = write_inode_now(inode, 1); if (err) return err; @@ -502,12 +499,10 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - err = fuse_flush_mtime(file, false); - if (err) - goto out; - } if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) goto out; @@ -1664,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_writepage_free(fc, req); } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) { struct fuse_file *ff = NULL; spin_lock(&fc->lock); - if (!WARN_ON(list_empty(&fi->write_files))) { + if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); @@ -1680,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, return ff; } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_mtime(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index adfa2d505c1a..d2f10054b9a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,8 +119,6 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, - /** i_mtime has been updated locally; a flush to userspace needed */ - FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -891,7 +889,8 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct file *file, bool nofail); +int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 299e553fcdfd..5997e4940512 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -788,6 +788,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, -- cgit v1.2.3 From e27c9d3877a0d0479711a55f5cdd7ee91442da53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:23 +0200 Subject: fuse: fuse: add time_gran to INIT_OUT Allow userspace fs to specify time granularity. This is needed because with writeback_cache mode the kernel is responsible for generating mtime and ctime, but if the underlying filesystem doesn't support nanosecond granularity then the cache will contain a different value from the one stored on the filesystem resulting in a change of times after a cache flush. Make the default granularity 1s. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 5 +++++ include/uapi/linux/fuse.h | 7 +++++++ 2 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5997e4940512..560eafcdd6a7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -891,6 +891,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; + else + fc->sb->s_time_gran = 1000000000; + } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index cf4750e1bb49..d1b4e2ca9672 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -96,6 +96,8 @@ * * 7.23 * - add FUSE_WRITEBACK_CACHE + * - add time_gran to fuse_init_out + * - add reserved space to fuse_init_out */ #ifndef _LINUX_FUSE_H @@ -559,6 +561,9 @@ struct fuse_init_in { uint32_t flags; }; +#define FUSE_COMPAT_INIT_OUT_SIZE 8 +#define FUSE_COMPAT_22_INIT_OUT_SIZE 24 + struct fuse_init_out { uint32_t major; uint32_t minor; @@ -567,6 +572,8 @@ struct fuse_init_out { uint16_t max_background; uint16_t congestion_threshold; uint32_t max_write; + uint32_t time_gran; + uint32_t unused[9]; }; #define CUSE_INIT_INFO_MAX 4096 -- cgit v1.2.3 From ab9e13f7c771b511d8f71666e83cb27bcc635b98 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: fuse: allow ctime flushing to userspace The patch extends fuse_setattr_in, and extends the flush procedure (fuse_flush_times()) called on ->write_inode() to send the ctime as well as mtime. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 9 +++++++-- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 2 +- include/uapi/linux/fuse.h | 7 +++++-- 4 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8c233834591f..e6ba579e2937 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1597,7 +1597,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff) +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -1612,9 +1612,14 @@ int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff) memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - inarg.valid |= FATTR_MTIME; + inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } if (ff) { inarg.valid |= FATTR_FH; inarg.fh = ff->fh; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d228c3962ffd..96d513e01a5d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1691,7 +1691,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) int err; ff = __fuse_write_file_get(fc, fi); - err = fuse_flush_mtime(inode, ff); + err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, 0); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d2f10054b9a1..40677e33504f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -889,7 +889,7 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff); +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d1b4e2ca9672..e86a21acef75 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -98,6 +98,8 @@ * - add FUSE_WRITEBACK_CACHE * - add time_gran to fuse_init_out * - add reserved space to fuse_init_out + * - add FATTR_CTIME + * - add ctime and ctimensec to fuse_setattr_in */ #ifndef _LINUX_FUSE_H @@ -193,6 +195,7 @@ struct fuse_file_lock { #define FATTR_ATIME_NOW (1 << 7) #define FATTR_MTIME_NOW (1 << 8) #define FATTR_LOCKOWNER (1 << 9) +#define FATTR_CTIME (1 << 10) /** * Flags returned by the OPEN request @@ -440,10 +443,10 @@ struct fuse_setattr_in { uint64_t lock_owner; uint64_t atime; uint64_t mtime; - uint64_t unused2; + uint64_t ctime; uint32_t atimensec; uint32_t mtimensec; - uint32_t unused3; + uint32_t ctimensec; uint32_t mode; uint32_t unused4; uint32_t uid; -- cgit v1.2.3 From 8b47e73e91c064cd75e8bf458ce738e1bfe2e700 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: fuse: remove .update_time This implements updating ctime as well as mtime on file_update_time(). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e6ba579e2937..65357bd1d17b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1947,17 +1947,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name) return err; } -static int fuse_update_time(struct inode *inode, struct timespec *now, - int flags) -{ - if (flags & S_MTIME) { - inode->i_mtime = *now; - mark_inode_dirty_sync(inode); - BUG_ON(!S_ISREG(inode->i_mode)); - } - return 0; -} - static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, @@ -1997,7 +1986,6 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, - .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { -- cgit v1.2.3 From 31f3267b4ba16b12fb9dd3b1953ea0f221cc2ab4 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: fuse: trust kernel i_ctime only Let the kernel maintain i_ctime locally: update i_ctime explicitly on truncate, fallocate, open(O_TRUNC), setxattr, removexattr, link, rename, unlink. The inode flag I_DIRTY_SYNC serves as indication that local i_ctime should be flushed to the server eventually. The patch sets the flag and updates i_ctime in course of operations listed above. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 22 ++++++++++++++++++++-- fs/fuse/inode.c | 6 ++++-- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 65357bd1d17b..f62ab8eedb5f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -771,6 +780,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); fuse_invalidate_attr(olddir); if (olddir != newdir) @@ -780,6 +790,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -829,6 +840,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -846,6 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; @@ -1811,8 +1825,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1942,8 +1958,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 560eafcdd6a7..e9ecb1878109 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; } - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); -- cgit v1.2.3 From 3ad22c62dd23ad26c8737c300f455de60ba01f40 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:25 +0200 Subject: fuse: clear FUSE_I_CTIME_DIRTY flag on setattr The patch addresses two use-cases when the flag may be safely cleared: 1. fuse_do_setattr() is called with ATTR_CTIME flag set in attr->ia_valid. In this case attr->ia_ctime bears actual value. In-kernel fuse must send it to the userspace server and then assign the value to inode->i_ctime. 2. fuse_do_setattr() is called with ATTR_SIZE flag set in attr->ia_valid, whereas ATTR_CTIME is not set (truncate(2)). In this case in-kernel fuse must sent "now" to the userspace server and then assign the value to inode->i_ctime. In both cases we could clear I_DIRTY_SYNC, but that needs more thought. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f62ab8eedb5f..843dcf1222eb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1518,7 +1518,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) } static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_mtime) + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1537,13 +1537,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1666,7 +1671,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; - bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1691,13 +1696,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - if (trust_local_mtime && attr->ia_size != inode->i_size) - attr->ia_valid |= ATTR_MTIME; + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_mtime); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1726,8 +1731,11 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, spin_lock(&fc->lock); /* the kernel maintains i_mtime locally */ - if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { - inode->i_mtime = attr->ia_mtime; + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; /* FIXME: clear I_DIRTY_SYNC? */ } -- cgit v1.2.3 From 4ace1f85a7cdab5453c2e12029ff978dd4cd6155 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:25 +0200 Subject: fuse: clear MS_I_VERSION Fuse doesn't support i_version (yet). Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e9ecb1878109..754dcf23de8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1004,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; -- cgit v1.2.3 From 1560c974dcd40a8d3f193283acd7cc6aee13dc13 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 16:43:44 +0200 Subject: fuse: add renameat2 support Support RENAME_EXCHANGE and RENAME_NOREPLACE flags on the userspace ABI. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 55 ++++++++++++++++++++++++++++++++++++++++------- fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 8 +++++++ 3 files changed, 58 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 843dcf1222eb..42198359fa1b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -752,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -782,12 +785,17 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(oldent->d_inode); fuse_update_ctime(oldent->d_inode); + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } + fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); fuse_update_ctime(newent->d_inode); @@ -806,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, sizeof(struct fuse_rename_in)); +} + +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + return err; + +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -1980,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 40677e33504f..7aa5c75e0de1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -542,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e86a21acef75..40b5ca8a1b1f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -100,6 +100,7 @@ * - add reserved space to fuse_init_out * - add FATTR_CTIME * - add ctime and ctimensec to fuse_setattr_in + * - add FUSE_RENAME2 request */ #ifndef _LINUX_FUSE_H @@ -353,6 +354,7 @@ enum fuse_opcode { FUSE_BATCH_FORGET = 42, FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -431,6 +433,12 @@ struct fuse_rename_in { uint64_t newdir; }; +struct fuse_rename2_in { + uint64_t newdir; + uint32_t flags; + uint32_t padding; +}; + struct fuse_link_in { uint64_t oldnodeid; }; -- cgit v1.2.3 From 0081bd83c089ef3d0c9a4e4e869e2ab75f2cb379 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 8 Apr 2014 21:42:59 +0800 Subject: ceph: check directory's completeness before emitting directory entry Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 766410a12c2c..8c7f90b96913 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -182,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -198,19 +205,12 @@ more: return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } -- cgit v1.2.3 From 6da5246dd4b077ab229481ca342802f7fdcdab59 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 9 Apr 2014 09:35:19 +0800 Subject: ceph: use fpos_cmp() to compare dentry positions Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8c7f90b96913..fb4f7a203eda 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; -- cgit v1.2.3 From 0a8a70f96fe1bd3e07c15bb86fd247e76102398a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 14 Apr 2014 13:13:02 +0800 Subject: ceph: clear directory's completeness when creating file When creating a file, ceph_set_dentry_offset() puts the new dentry at the end of directory's d_subdirs, then set the dentry's offset based on directory's max offset. The offset does not reflect the real postion of the dentry in directory. Later readdir reply from MDS may change the dentry's position/offset. This inconsistency can cause missing/duplicate entries in readdir result if readdir is partly satisfied by dcache_readdir(). The fix is clear directory's completeness after creating/renaming file. It prevents later readdir from using dcache_readdir(). Fixes: http://tracker.ceph.com/issues/8025 Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 9 ++++---- fs/ceph/inode.c | 71 +++++++++++++-------------------------------------------- fs/ceph/super.h | 1 - 3 files changed, 21 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index fb4f7a203eda..c29d6ae68874 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -448,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -937,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0b0728e5be2d..233c6f96910a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -744,7 +744,6 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; } no_change: /* only update max_size on auth cap */ @@ -889,41 +888,6 @@ out_unlock: return; } -/* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - /* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. @@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; @@ -1161,6 +1122,9 @@ retry_lookup: /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1180,13 +1144,10 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1213,8 +1174,9 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease, true); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1235,17 +1197,16 @@ retry_lookup: (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, NULL, true); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1407,7 +1368,7 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL, false); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); dn = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7866cd05a6bb..ead05cc1f447 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -266,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; -- cgit v1.2.3 From fd7b95cd1b58171a0b931b2063729a032bec4ac2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 17 Apr 2014 08:02:02 +0800 Subject: ceph: avoid releasing caps that are being used To avoid releasing caps that are being used, encode_inode_release() should send implemented caps to MDS. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2e5e648eb5c3..c561b628ebce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3261,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; -- cgit v1.2.3 From 3bd58143bafc56dbc07f4f085e4d7e018d332674 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 27 Apr 2014 09:17:45 +0800 Subject: ceph: reserve caps for file layout/lock MDS requests Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/ioctl.c | 3 +++ fs/ceph/locks.c | 1 + 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index efbe08289292..2042fd16d1a8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -110,6 +110,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -154,6 +156,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d94ba0df9f4d..191398852a2e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) -- cgit v1.2.3 From 03b3b889e79cdb6b806fc0ba9be0d71c186bbfaa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 15:45:28 -0400 Subject: fold d_kill() and d_free() Signed-off-by: Al Viro --- fs/dcache.c | 76 +++++++++++++++++++------------------------------------------ 1 file changed, 24 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 494a9def5dce..9b15c5c37277 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,23 +246,6 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) -{ - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - - /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) - __d_free(&dentry->d_u.d_rcu); - else - call_rcu(&dentry->d_u.d_rcu, __d_free); -} - /** * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups * @dentry: the target dentry @@ -419,40 +402,6 @@ static void dentry_lru_del(struct dentry *dentry) } } -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -546,7 +495,30 @@ relock: dentry_lru_del(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); + return parent; } /* -- cgit v1.2.3 From 5c47e6d0ad608987b91affbcf7d1fc12dfbe8fb4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 16:13:18 -0400 Subject: fold try_prune_one_dentry() Signed-off-by: Al Viro --- fs/dcache.c | 75 +++++++++++++++++++++---------------------------------------- 1 file changed, 25 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 9b15c5c37277..a5540d491954 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -787,47 +787,9 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; + struct dentry *dentry, *parent; rcu_read_lock(); for (;;) { @@ -863,22 +825,35 @@ static void shrink_dentry_list(struct list_head *list) } rcu_read_unlock(); + parent = dentry_kill(dentry, 0); /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. + * If dentry_kill returns NULL, we have nothing more to do. */ - dentry = try_prune_one_dentry(dentry); - - rcu_read_lock(); - if (dentry) { + if (!parent) { + rcu_read_lock(); + continue; + } + if (unlikely(parent == dentry)) { + /* + * trylocks have failed and d_lock has been held the + * whole time, so it could not have been added to any + * other lists. Just add it back to the shrink list. + */ + rcu_read_lock(); d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + continue; } + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) + dentry = dentry_kill(dentry, 1); + rcu_read_lock(); } rcu_read_unlock(); } -- cgit v1.2.3 From b4f0354e968f5fabd39bc85b99fedae4a97589fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 23:40:14 -0400 Subject: new helper: dentry_free() The part of old d_free() that dealt with actual freeing of dentry. Taken out of dentry_kill() into a separate function. Signed-off-by: Al Viro --- fs/dcache.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index a5540d491954..dab7db10d685 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,6 +246,15 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void dentry_free(struct dentry *dentry) +{ + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); +} + /** * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups * @dentry: the target dentry @@ -513,11 +522,7 @@ relock: if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) - __d_free(&dentry->d_u.d_rcu); - else - call_rcu(&dentry->d_u.d_rcu, __d_free); + dentry_free(dentry); return parent; } -- cgit v1.2.3 From 01b6035190b024240a43ac1d8e9c6f964f5f1c63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 23:42:52 -0400 Subject: expand the call of dentry_lru_del() in dentry_kill() Signed-off-by: Al Viro --- fs/dcache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index dab7db10d685..e482775343a0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -501,7 +501,12 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + else + d_shrink_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); list_del(&dentry->d_u.d_child); -- cgit v1.2.3 From 754320d6e166d3a12cb4810a452bde00afbd4e9a Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Thu, 1 May 2014 03:31:28 +0000 Subject: aio: fix potential leak in aio_run_iocb(). iovec should be reclaimed whenever caller of rw_copy_check_uvector() returns, but it doesn't hold when failure happens right after aio_setup_vectored_rw(). Fix that in a such way to avoid hairy goto. Signed-off-by: Leon Yu Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 2adbb0398ab9..a0ed6c7d2cd2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1327,10 +1327,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); -- cgit v1.2.3 From 41edf278fc2f042f4e22a12ed87d19c5201210e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 May 2014 10:30:00 -0400 Subject: dentry_kill(): don't try to remove from shrink list If the victim in on the shrink list, don't remove it from there. If shrink_dentry_list() manages to remove it from the list before we are done - fine, we'll just free it as usual. If not - mark it with new flag (DCACHE_MAY_FREE) and leave it there. Eventually, shrink_dentry_list() will get to it, remove the sucker from shrink list and call dentry_kill(dentry, 0). Which is where we'll deal with freeing. Since now dentry_kill(dentry, 0) may happen after or during dentry_kill(dentry, 1), we need to recognize that (by seeing DCACHE_DENTRY_KILLED already set), unlock everything and either free the sucker (in case DCACHE_MAY_FREE has been set) or leave it for ongoing dentry_kill(dentry, 1) to deal with. Signed-off-by: Al Viro --- fs/dcache.c | 27 +++++++++++++++++++-------- include/linux/dcache.h | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index e482775343a0..58e26bee7ef4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -468,7 +468,14 @@ dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { struct inode *inode; - struct dentry *parent; + struct dentry *parent = NULL; + bool can_free = true; + + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + goto out; + } inode = dentry->d_inode; if (inode && !spin_trylock(&inode->i_lock)) { @@ -479,9 +486,7 @@ relock: } return dentry; /* try again with same dentry */ } - if (IS_ROOT(dentry)) - parent = NULL; - else + if (!IS_ROOT(dentry)) parent = dentry->d_parent; if (parent && !spin_trylock(&parent->d_lock)) { if (inode) @@ -504,8 +509,6 @@ relock: if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); - else - d_shrink_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); @@ -527,7 +530,15 @@ relock: if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - dentry_free(dentry); + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); +out: + if (likely(can_free)) + dentry_free(dentry); return parent; } @@ -829,7 +840,7 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); continue; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3b9bfdb83ba6..3c7ec327ebd2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -221,6 +221,8 @@ struct dentry_operations { #define DCACHE_SYMLINK_TYPE 0x00300000 /* Symlink */ #define DCACHE_FILE_TYPE 0x00400000 /* Other file type */ +#define DCACHE_MAY_FREE 0x00800000 + extern seqlock_t rename_lock; static inline int dname_external(const struct dentry *dentry) -- cgit v1.2.3 From fe91522a7ba82ca1a51b07e19954b3825e4aaa22 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 May 2014 00:02:25 -0400 Subject: don't remove from shrink list in select_collect() If we find something already on a shrink list, just increment data->found and do nothing else. Loops in shrink_dcache_parent() and check_submounts_and_drop() will do the right thing - everything we did put into our list will be evicted and if there had been nothing, but data->found got non-zero, well, we have somebody else shrinking those guys; just try again. Signed-off-by: Al Viro --- fs/dcache.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 58e26bee7ef4..f39a6f5a1220 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1229,34 +1229,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } -- cgit v1.2.3 From 9c8c10e262e0f62cb2530f1b076de979123183dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 2 May 2014 20:36:10 -0400 Subject: more graceful recovery in umount_collect() Start with shrink_dcache_parent(), then scan what remains. First of all, BUG() is very much an overkill here; we are holding ->s_umount, and hitting BUG() means that a lot of interesting stuff will be hanging after that point (sync(2), for example). Moreover, in cases when there had been more than one leak, we'll be better off reporting all of them. And more than just the last component of pathname - %pd is there for just such uses... That was the last user of dentry_lru_del(), so kill it off... Signed-off-by: Al Viro --- fs/dcache.c | 101 +++++++++++++++--------------------------------------------- 1 file changed, 25 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index f39a6f5a1220..2321e1a861f6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -395,22 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -1275,45 +1259,35 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); -static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { - struct select_data *data = _data; - enum d_walk_ret ret = D_WALK_CONTINUE; + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - if (likely(!list_empty(&dentry->d_subdirs))) - goto out; - if (dentry == data->start && dentry->d_lockref.count == 1) - goto out; - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, + dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - BUG(); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); - data->found++; - ret = D_WALK_NORETRY; - } -out: - if (data->found && need_resched()) - ret = D_WALK_QUIT; - return ret; + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); } /* @@ -1323,40 +1297,15 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; - for (;;) { - struct select_data data; - - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (!data.found) - break; - - shrink_dentry_list(&data.dispose); - cond_resched(); - } - d_drop(dentry); - dput(dentry); + do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { - struct select_data data; - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - - INIT_LIST_HEAD(&data.dispose); - data.start = NULL; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (data.found) - shrink_dentry_list(&data.dispose); - cond_resched(); + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); } } -- cgit v1.2.3 From 60942f2f235ce7b817166cdf355eed729094834d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 2 May 2014 15:38:39 -0400 Subject: dcache: don't need rcu in shrink_dentry_list() Since now the shrink list is private and nobody can free the dentry while it is on the shrink list, we can remove RCU protection from this. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/dcache.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2321e1a861f6..42ae01eefc07 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -796,23 +796,9 @@ static void shrink_dentry_list(struct list_head *list) { struct dentry *dentry, *parent; - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ - - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } - /* * The dispose list is isolated and dentries are not accounted * to the LRU here, so we can simply remove it from the list @@ -828,23 +814,20 @@ static void shrink_dentry_list(struct list_head *list) spin_unlock(&dentry->d_lock); continue; } - rcu_read_unlock(); parent = dentry_kill(dentry, 0); /* * If dentry_kill returns NULL, we have nothing more to do. */ - if (!parent) { - rcu_read_lock(); + if (!parent) continue; - } + if (unlikely(parent == dentry)) { /* * trylocks have failed and d_lock has been held the * whole time, so it could not have been added to any * other lists. Just add it back to the shrink list. */ - rcu_read_lock(); d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); continue; @@ -858,9 +841,7 @@ static void shrink_dentry_list(struct list_head *list) dentry = parent; while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) dentry = dentry_kill(dentry, 1); - rcu_read_lock(); } - rcu_read_unlock(); } static enum lru_status -- cgit v1.2.3 From c99d609a16506602a7398eea7d12b13513f3d889 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 May 2014 16:18:37 +1000 Subject: xfs: fully support v5 format filesystems We have had this code in the kernel for over a year now and have shaken all the known issues out of the code over the past few releases. It's now time to remove the experimental warnings during mount and fully support the new filesystem format in production systems. Remove the experimental warning, and add a version number to the initial "mounting filesystem" message to tell use what type of filesystem is being mounted. Also, remove the temporary inode cluster size output at mount time now we know that this code works fine. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 10 ++++++---- fs/xfs/xfs_mount.c | 2 -- fs/xfs/xfs_sb.c | 4 ---- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 08624dc67317..a5f8bd9899d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -616,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 993cb19e7d39..944f3d9456a8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -743,8 +743,6 @@ xfs_mountfs( new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; - xfs_info(mp, "Using inode cluster size of %d bytes", - mp->m_inode_cluster_size); } /* diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 0c0e41bbe4e3..8baf61afae1d 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -201,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, -- cgit v1.2.3 From fcdd57c8902a936a616df2066d873b38ef3ed364 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 7 Apr 2014 13:39:06 +0300 Subject: UBIFS: fix remount error path Dan's "smatch" checker found out that there was a bug in the error path of the 'ubifs_remount_rw()' function. Instead of jumping to the "out" label which cleans-things up, we just returned. This patch fixes the problem. Reported-by: Dan Carpenter Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a1266089eca1..a81c7b556896 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); -- cgit v1.2.3 From d540e43b0ab134b22f015f725ce6e070d12b0244 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 May 2014 07:34:28 +1000 Subject: xfs: initialize default acls for ->tmpfile() The current tmpfile handler does not initialize default ACLs. Doing so within xfs_vn_tmpfile() makes it roughly equivalent to xfs_vn_mknod(), which is already used as a common create handler. xfs_vn_mknod() does not currently have a mechanism to determine whether to link the file into the namespace. Therefore, further abstract xfs_vn_mknod() into a new xfs_generic_create() handler with a tmpfile parameter. This new handler calls xfs_create_tmpfile() and d_tmpfile() on the dentry when called via ->tmpfile(). Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 55 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ef1ca010f417..301ecbfcc0be 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -124,15 +124,15 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -156,8 +156,12 @@ xfs_vn_mknod( if (error) return error; - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } if (unlikely(error)) goto out_free_acl; @@ -180,7 +184,11 @@ xfs_vn_mknod( } #endif - d_instantiate(dentry, inode); + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -189,10 +197,22 @@ xfs_vn_mknod( return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); goto out_free_acl; } +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + STATIC int xfs_vn_create( struct inode *dir, @@ -353,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -1053,25 +1074,7 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; - struct xfs_inode *ip; - struct inode *inode; - - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); - if (unlikely(error)) - return -error; - - inode = VFS_I(ip); - - error = xfs_init_security(inode, dir, &dentry->d_name); - if (unlikely(error)) { - iput(inode); - return -error; - } - - d_tmpfile(dentry, inode); - - return 0; + return xfs_generic_create(dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { -- cgit v1.2.3 From 8275cdd0e7ac550dcce2b3ef6d2fb3b808c1ae59 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 May 2014 07:37:31 +1000 Subject: xfs: remote attribute overwrite causes transaction overrun Commit e461fcb ("xfs: remote attribute lookups require the value length") passes the remote attribute length in the xfs_da_args structure on lookup so that CRC calculations and validity checking can be performed correctly by related code. This, unfortunately has the side effect of changing the args->valuelen parameter in cases where it shouldn't. That is, when we replace a remote attribute, the incoming replacement stores the value and length in args->value and args->valuelen, but then the lookup which finds the existing remote attribute overwrites args->valuelen with the length of the remote attribute being replaced. Hence when we go to create the new attribute, we create it of the size of the existing remote attribute, not the size it is supposed to be. When the new attribute is much smaller than the old attribute, this results in a transaction overrun and an ASSERT() failure on a debug kernel: XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331 Fix this by keeping the remote attribute value length separate to the attribute value length in the xfs_da_args structure. The enables us to pass the length of the remote attribute to be removed without overwriting the new attribute's length. Also, ensure that when we save remote block contexts for a later rename we zero the original state variables so that we don't confuse the state of the attribute to be removes with the state of the new attribute that we just added. [Spotted by Brain Foster.] Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr.c | 24 +++++++++++++++++++++++- fs/xfs/xfs_attr_leaf.c | 21 +++++++++++---------- fs/xfs/xfs_attr_list.c | 1 + fs/xfs/xfs_attr_remote.c | 8 +++++--- fs/xfs/xfs_da_btree.h | 2 ++ 5 files changed, 42 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01b6a0102fbd..abda1124a70f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -213,7 +213,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -999,13 +1011,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1133,6 +1154,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index fe9587fab17a..511c283459b1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 01db96f60cf0..833fe5d98d80 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int( args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 6e37823e2932..d2e6e948cec7 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -337,7 +337,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -347,7 +347,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -415,7 +417,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -480,7 +482,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d7..201c6091d26a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -60,10 +60,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; -- cgit v1.2.3 From 50c6e282bdf5e8dabf8d7cf7b162545a55645fd9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 4 May 2014 13:03:32 +0200 Subject: posix_acl: handle NULL ACL in posix_acl_equiv_mode Various filesystems don't bother checking for a NULL ACL in posix_acl_equiv_mode, and thus can dereference a NULL pointer when it gets passed one. This usually happens from the NFS server, as the ACL tools never pass a NULL ACL, but instead of one representing the mode bits. Instead of adding boilerplat to all filesystems put this check into one place, which will allow us to remove the check from other filesystems as well later on. Signed-off-by: Christoph Hellwig Reported-by: Ben Greear Reported-by: Marco Munderloh , Cc: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/posix_acl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9e363e41dacc..0855f772cd41 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: -- cgit v1.2.3 From 457c1b27ed56ec472d202731b12417bff023594a Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Tue, 6 May 2014 12:50:00 -0700 Subject: hugetlb: ensure hugepage access is denied if hugepages are not supported Currently, I am seeing the following when I `mount -t hugetlbfs /none /dev/hugetlbfs`, and then simply do a `ls /dev/hugetlbfs`. I think it's related to the fact that hugetlbfs is properly not correctly setting itself up in this state?: Unable to handle kernel paging request for data at address 0x00000031 Faulting instruction address: 0xc000000000245710 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA pSeries .... In KVM guests on Power, in a guest not backed by hugepages, we see the following: AnonHugePages: 0 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 64 kB HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages are not supported at boot-time, but this is only checked in hugetlb_init(). Extract the check to a helper function, and use it in a few relevant places. This does make hugetlbfs not supported (not registered at all) in this environment. I believe this is fine, as there are no valid hugepages and that won't change at runtime. [akpm@linux-foundation.org: use pr_info(), per Mel] [akpm@linux-foundation.org: fix build when HPAGE_SHIFT is undefined] Signed-off-by: Nishanth Aravamudan Reviewed-by: Aneesh Kumar K.V Acked-by: Mel Gorman Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 5 +++++ include/linux/hugetlb.h | 10 ++++++++++ mm/hugetlb.c | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 204027520937..e19d4c0cacae 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5b337cf8fb86..b65166de1d9d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -412,6 +412,16 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, return &mm->page_table_lock; } +static inline bool hugepages_supported(void) +{ + /* + * Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + return HPAGE_SHIFT != 0; +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 246192929a2d..c82290b9c1fc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1981,11 +1981,7 @@ static int __init hugetlb_init(void) { int i; - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -2112,6 +2108,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2165,6 +2164,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2190,6 +2192,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2206,6 +2210,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2220,6 +2226,9 @@ void hugetlb_show_meminfo(void) struct hstate *h; int nid; + if (!hugepages_supported()) + return; + for_each_node_state(nid, N_MEMORY) for_each_hstate(h) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", -- cgit v1.2.3 From 6b6751f7feba68d8f5c72b72cc69a1c5a625529c Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 6 May 2014 12:50:06 -0700 Subject: autofs: fix lockref lookup autofs needs to be able to see private data dentry flags for its dentrys that are being created but not yet hashed and for its dentrys that have been rmdir()ed but not yet freed. It needs to do this so it can block processes in these states until a status has been returned to indicate the given operation is complete. It does this by keeping two lists, active and expring, of dentrys in this state and uses ->d_release() to keep them stable while it checks the reference count to determine if they should be used. But with the recent lockref changes dentrys being freed sometimes don't transition to a reference count of 0 before being freed so autofs can occassionally use a dentry that is invalid which can lead to a panic. Signed-off-by: Ian Kent Cc: Al Viro Cc: Linus Torvalds Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e93..cc87c1abac97 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; -- cgit v1.2.3 From 1e2ee49f7f1b79f0b14884fe6a602f0411b39552 Mon Sep 17 00:00:00 2001 From: Will Woods Date: Tue, 6 May 2014 12:50:10 -0700 Subject: fanotify: fix -EOVERFLOW with large files on 64-bit On 64-bit systems, O_LARGEFILE is automatically added to flags inside the open() syscall (also openat(), blkdev_open(), etc). Userspace therefore defines O_LARGEFILE to be 0 - you can use it, but it's a no-op. Everything should be O_LARGEFILE by default. But: when fanotify does create_fd() it uses dentry_open(), which skips all that. And userspace can't set O_LARGEFILE in fanotify_init() because it's defined to 0. So if fanotify gets an event regarding a large file, the read() will just fail with -EOVERFLOW. This patch adds O_LARGEFILE to fanotify_init()'s event_f_flags on 64-bit systems, using the same test as open()/openat()/etc. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=696821 Signed-off-by: Will Woods Acked-by: Eric Paris Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e565c814309..732648b270dc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->overflow_event = &oevent->fse; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS spin_lock_init(&group->fanotify_data.access_lock); -- cgit v1.2.3 From d353efd02357a74753cd45f367a2d3d357fd6904 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 6 May 2014 12:50:11 -0700 Subject: fs/affs/super.c: bugfix / double free Commit 842a859db26b ("affs: use ->kill_sb() to simplify ->put_super() and failure exits of ->mount()") adds .kill_sb which frees sbi but doesn't remove sbi free in case of parse_options error causing double free+random crash. Signed-off-by: Fabian Frederick Cc: Alexander Viro Cc: [3.14.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d589f28bf9b..895ac7dc9dbf 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ -- cgit v1.2.3