From bf068ee266f9dbaa6dacb8433a366bb399e7ae5b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 19 Aug 2008 22:16:43 -0400 Subject: ext4: Handle unwritten extent properly with delayed allocation When using fallocate the buffer_heads are marked unwritten and unmapped. We need to map them in the writepages after a get_block. Otherwise we split the uninit extents, but never write the content to disk. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59fbbe899acc..a1c7d7623213 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1741,6 +1741,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1814,7 +1821,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) * If blocks are delayed marked, we need to * put actual blocknr and drop delayed bit */ - if (buffer_delay(lbh)) + if (buffer_delay(lbh) || buffer_unwritten(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); /* go for the remaining blocks */ @@ -1823,7 +1830,8 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) } } -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) /* * mpage_add_bh_to_extent - try to add one more block to extent of blocks -- cgit v1.2.3 From b4df2030858bde986cb6ff2e4b45945f84649e32 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 13 Aug 2008 21:44:34 -0400 Subject: 
ext4: Fix potential truncate BUG due to i_prealloc_list being non-empty We need to call ext4_discard_reservation() earlier in ext4_truncate(), to avoid a BUG() in ext4_mb_return_to_preallocation(), which is called (ultimately) by ext4_free_blocks(). So we must ditch the blocks on i_prealloc_list before we start freeing the data blocks. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a1c7d7623213..2d54c822c4c3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3494,6 +3494,9 @@ void ext4_truncate(struct inode *inode) * modify the block allocation tree. */ down_write(&ei->i_data_sem); + + ext4_discard_reservation(inode); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate @@ -3563,8 +3566,6 @@ do_indirects: ; } - ext4_discard_reservation(inode); - up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); -- cgit v1.2.3 From cd21322616c3af265d39bf15321d436e667a5dd1 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 19 Aug 2008 22:16:59 -0400 Subject: ext4: Fix delalloc release block reservation for truncate Ext4 will release the reserved blocks for delayed allocations when inode is truncated/unlinked. If there is no reserved block at all, we shouldn't need to do so. But the current code still tries to release the reserved blocks regardless of whether the counter's value is 0. Continuing to do that causes the later calculation to go wrong, which a kernel BUG_ON() then catches. This doesn't happen for extent-based files, as the calculation for 0 reserved blocks was already right for extent-based files. This patch fixes the kernel BUG() caused by the above. It adds checks for 0 to avoid the unnecessary release and fixes the calculation for non-extent files. 
Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2d54c822c4c3..5e17d5f22a7e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1005,6 +1005,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) */ static int ext4_calc_metadata_amount(struct inode *inode, int blocks) { + if (!blocks) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) return ext4_ext_calc_metadata_amount(inode, blocks); @@ -1559,7 +1562,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int total, mdb, mdb_free, release; + if (!to_free) + return; /* Nothing to release, exit */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + /* recalculate the number of metablocks still need to be reserved */ total = EXT4_I(inode)->i_reserved_data_blocks - to_free; mdb = ext4_calc_metadata_amount(inode, total); -- cgit v1.2.3 From d015641734cde55d2fce48a6db3983c8a029fe05 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 19 Aug 2008 21:57:43 -0400 Subject: ext4: Fix ext4_dx_readdir hash collision handling This fixes a bug where readdir() would return a directory entry twice if there was a hash collision in an hash tree indexed directory. 
Signed-off-by: Eugene Dashevsky Signed-off-by: Mike Snitzer Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d73c08b..ec8e33b45219 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -450,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { -- cgit v1.2.3 From 88aa3cff4e9a38b953de9fbc54c96e619a2bb9f9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 16 Aug 2008 07:57:35 -0400 Subject: ext4: Use ext4_discard_reservations instead of mballoc-specific call In ext4_ext_truncate(), we should use the more generic ext4_discard_reservations() call so we do the right thing when the filesystem is mounted with the nomballoc option. 
Signed-off-by: "Theodore Ts'o" Reviewed-by: Mingming Cao --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 612c3d2c3824..7212947a8ca3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2819,7 +2819,7 @@ void ext4_ext_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_reservation(inode); /* * TODO: optimization is possible here. -- cgit v1.2.3 From 37609fd5ae62db75026d9f53096a1fbc35e040d9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Aug 2008 22:13:41 -0400 Subject: ext4: don't try to resize if there are no reserved gdt blocks left When trying to resize an ext4 fs and you run out of reserved gdt blocks, you get an error that doesn't actually tell you what went wrong, it just says that the gdb it picked is not correct, which is the case since you don't have any reserved gdt blocks left. This patch adds a check to make sure you have reserved gdt blocks to use, and if not prints out a more relevant error. 
Signed-off-by: Josef Bacik Cc: Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0a9265164265..b3d35604ea18 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; -- cgit v1.2.3 From c001077f4003fa75793bb62979baa6241dd8eb19 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 19 Aug 2008 22:19:50 -0400 Subject: ext4: Fix bug where we return ENOSPC even though we have plenty of inodes The find_group_flex() function starts with best_flex as the parent_fbg_group, which happens to have 0 inodes free. Some of the flex groups searched have free blocks and free inodes, but the flex_freeb_ratio is < 10, so they're skipped. Then when a group is compared to the current "best" flex group, it does not have more free blocks than "best", so it is skipped as well. This continues until no flex group with free inodes is found which has a proper ratio or which has more free blocks than the "best" group, and we're left with a "best" group that has 0 inodes free, and we return -ENOSPC. We fix this by changing the logic so that if the current "best" flex group has no inodes free, and the current one does have room, it is promoted to the next "best." 
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 655e760212b8..f344834bbf58 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -351,7 +351,7 @@ find_close_to_parent: goto found_flexbg; } - if (best_flex < 0 || + if (flex_group[best_flex].free_inodes == 0 || (flex_group[i].free_blocks > flex_group[best_flex].free_blocks && flex_group[i].free_inodes)) -- cgit v1.2.3 From a02908f19c819aeec5e3dcf238adaa6deddd70b0 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 19 Aug 2008 22:16:07 -0400 Subject: ext4: journal credits calulation cleanup and fix for non-extent writepage When considering how many journal credits are needed for modifying a chunk of data, we need to account for the super block, inode block, quota blocks and xattr block, indirect/index blocks, and also the group bitmap and group descriptor blocks for new allocation (including data and indirect/index blocks). Many places in ext4 do the calculation on their own and often miss one or two meta blocks; they also often assume single block allocation and do not consider the multiple-chunk allocation case. This patch cleans up the current journal credit code and provides common helper functions to calculate the journal credits, to be used for writepage, writepages, DIO, fallocate, migration, defrag, and for both nonextent and extent files. This patch modified the writepage/write_begin credit calculation for nonextent files to use the new helper function. 
It also fixed the problem that writepage on nonextent files did not consider the case where the block size is smaller than 4k. Signed-off-by: Mingming Cao Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++ fs/ext4/ext4_jbd2.h | 8 ++++ fs/ext4/inode.c | 131 ++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 108 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6c7924d9e358..38e661b0ea88 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1072,6 +1072,7 @@ extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); @@ -1227,6 +1228,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3afe6e9..b455c685a98b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -51,6 +51,14 @@ EXT4_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +/* + * Define the number of metadata blocks we need to account to modify data. 
+ * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be * generous. We can grow the delete transaction later if necessary. */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5e17d5f22a7e..a27129065144 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4354,56 +4354,119 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, 0); + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +} /* - * How many blocks doth make a writepage()? + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. 
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction * - * With ordered or writeback data it's the same, less the N data blocks. + * This could be called via ext4_write_begin() or later + * ext4_da_writepages() in delalyed allocation case. 
* - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". + * In both case it's possible that we could allocating multiple + * chunks of blocks. We need to consider the worse case, when + * one new block per extent. * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * For Direct IO and fallocate, the journal credits reservation + * is based on one single extent allocation, so they could use + * EXT4_DATA_TRANS_BLOCKS to get the needed credit to log a single + * chunk of allocation needs. */ - int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, 0); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } - /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. 
-- cgit v1.2.3 From ee12b630687d510f6f4b6d4acdc4e267fd4adeda Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 19 Aug 2008 22:16:05 -0400 Subject: ext4: journal credits reservation fixes for extent file writepage This patch modified the writepage/write_begin credit calculation for extent files, to use the credits calculation helper function. The current calculation of how many index/leaf blocks should be accounted is too conservative: it always considered the worst case, where the tree level is 5, and in the case of multiple chunk allocations, it always assumed no blocks were dirtied in common across the allocations. This patch uses the accurate depth of the inode with some extras to calculate the index blocks, and is also less conservative in the case of multiple allocation accounting. Signed-off-by: Mingming Cao Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 4 +- fs/ext4/extents.c | 104 +++++++++++++++++++++---------------------------- fs/ext4/migrate.c | 3 +- 3 files changed, 49 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0a54b7..d33dc56d6986 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -216,7 +216,9 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7212947a8ca3..5c5dd3a1d657 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1747,54 +1747,61 
@@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; - } + < le16_to_cpu(path[depth].p_hdr->eh_max)) { - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; - - /* allocation of new data block(s) */ - needed = 2; + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. 
+ */ + /* 1 one bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + } + } - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + return ext4_meta_trans_blocks(inode, num, 1); +} - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. 
+ */ +int ext4_ext_index_trans_blocks(struct inode *inode, int num, int chunk) +{ + int index; + int depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (chunk) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1921,9 +1928,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif err = ext4_ext_journal_restart(handle, credits); if (err) @@ -2858,27 +2863,6 @@ out_stop: ext4_journal_stop(handle); } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ - int needed; - - needed = ext4_ext_calc_credits_for_insert(inode, NULL); - - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - - return needed; -} - static void ext4_falloc_update_inode(struct inode *inode, int mode, loff_t new_size, int update_ctime) { diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077ba07e9..46fc0b5b12ba 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, * credit. But below we try to not accumalate too much * of them by restarting the journal. 
*/ - needed = ext4_ext_calc_credits_for_insert(inode, path); + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); /* * Make sure the credit we accumalated is not really high -- cgit v1.2.3 From f3bd1f3fa8ca7ec70cfd87aa94dc5e1a260901f2 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 19 Aug 2008 22:16:03 -0400 Subject: ext4: journal credits reservation fixes for DIO, fallocate DIO and fallocate credit calculation is different than writepage, as they do start a new journal right for each call to ext4_get_blocks_wrap(). This patch uses the helper function in the DIO and fallocate cases, passing a flag indicating that the modified data are contiguous and thus need fewer indirect/index blocks accounted. This patch also fixed the journal credit reservation for direct I/O (DIO). Previously the estimated credits for DIO were calculated only for non-extent files, which was not enough if the file is extent-based. Also fixed was fallocate double-counting credits for modifying the superblock. 
Signed-off-by: Mingming Cao Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 11 +++++------ fs/ext4/inode.c | 45 ++++++++++++++++++++++++--------------------- 3 files changed, 30 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 38e661b0ea88..295003241d3d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1073,6 +1073,7 @@ extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5c5dd3a1d657..5596b70efa20 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1758,7 +1758,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, { if (path) { int depth = ext_depth(inode); - int ret; + int ret = 0; /* probably there is space in leaf? 
*/ if (le16_to_cpu(path[depth].p_hdr->eh_entries) @@ -1777,7 +1777,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, } } - return ext4_meta_trans_blocks(inode, num, 1); + return ext4_chunk_trans_blocks(inode, num); } /* @@ -2810,7 +2810,7 @@ void ext4_ext_truncate(struct inode *inode) /* * probably first extent we're gonna free will be last in block */ - err = ext4_writepage_trans_blocks(inode) + 3; + err = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, err); if (IS_ERR(handle)) return; @@ -2923,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - block; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. + * credits to insert 1 extent into extent tree */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a27129065144..ffc95ba48859 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1044,18 +1044,6 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 -/* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. - */ -#define DIO_CREDITS 25 - - /* * The ext4_get_blocks_wrap() function try to look up the requested blocks, * and returns if the blocks are already mapped. 
@@ -1167,19 +1155,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, return retval; } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + static int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); int ret = 0, started = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; if (create && !handle) { /* Direct IO write... */ if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; - handle = ext4_journal_start(inode, DIO_CREDITS + - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -2243,7 +2235,7 @@ static int ext4_da_writepage(struct page *page, * for DIO, writepages, and truncate */ #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS +#define EXT4_MAX_WRITEBACK_CREDITS 25 static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -4441,7 +4433,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) /* * Calulate the total number of credits to reserve to fit - * the modification of a single pages into a single transaction + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. * * This could be called via ext4_write_begin() or later * ext4_da_writepages() in delalyed allocation case. @@ -4449,11 +4442,6 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) * In both case it's possible that we could allocating multiple * chunks of blocks. We need to consider the worse case, when * one new block per extent. 
- * - * For Direct IO and fallocate, the journal credits reservation - * is based on one single extent allocation, so they could use - * EXT4_DATA_TRANS_BLOCKS to get the needed credit to log a single - * chunk of allocation needs. */ int ext4_writepage_trans_blocks(struct inode *inode) { @@ -4467,6 +4455,21 @@ int ext4_writepage_trans_blocks(struct inode *inode) ret += bpp; return ret; } + +/* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. -- cgit v1.2.3 From a1d6cc563bfdf1bf2829d3e6ce4d8b774251796b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 19 Aug 2008 21:55:02 -0400 Subject: ext4: Rework the ext4_da_writepages() function With the below changes we reserve credit needed to insert only one extent resulting from a call to single get_block. This makes sure we don't take too much journal credits during writeout. We also don't limit the pages to write. That means we loop through the dirty pages building largest possible contiguous block request. Then we issue a single get_block request. We may get less block that we requested. If so we would end up not mapping some of the buffer_heads. That means those buffer_heads are still marked delay. Later in the writepage callback via __mpage_writepage we redirty those pages. We should also not limit/throttle wbc->nr_to_write in the filesystem writepages callback. 
That causes wrong behaviour in generic_sync_sb_inodes due to wbc->nr_to_write being <= 0 Signed-off-by: Aneesh Kumar K.V Reviewed-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 201 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 113 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ffc95ba48859..8dd22eade42c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,8 @@ #include "acl.h" #include "ext4_extents.h" +#define MPAGE_DA_EXTENT_TAIL 0x01 + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -1626,11 +1628,13 @@ struct mpage_da_data { unsigned long first_page, next_page; /* extent of pages */ get_block_t *get_block; struct writeback_control *wbc; + int io_done; + long pages_written; }; /* * mpage_da_submit_io - walks through extent of pages and try to write - * them with __mpage_writepage() + * them with writepage() call back * * @mpd->inode: inode * @mpd->first_page: first page of the extent @@ -1645,18 +1649,11 @@ struct mpage_da_data { static int mpage_da_submit_io(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct mpage_data mpd_pp = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = mpd->get_block, - .use_writepage = 1, - }; int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); index = mpd->first_page; end = mpd->next_page - 1; @@ -1674,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) break; index++; - err = __mpage_writepage(page, mpd->wbc, &mpd_pp); - + err = mapping->a_ops->writepage(page, mpd->wbc); + if (!err) + mpd->pages_written++; /* * In error case, we have to continue because * remaining pages are still locked @@ -1686,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) } pagevec_release(&pvec); } - if (mpd_pp.bio) - 
mpage_bio_submit(WRITE, mpd_pp.bio); - return ret; } @@ -1711,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, int blocks = exbh->b_size >> inode->i_blkbits; sector_t pblock = exbh->b_blocknr, cur_logical; struct buffer_head *head, *bh; - unsigned long index, end; + pgoff_t index, end; struct pagevec pvec; int nr_pages, i; @@ -1796,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode, * * The function skips space we know is already mapped to disk blocks. * - * The function ignores errors ->get_block() returns, thus real - * error handling is postponed to __mpage_writepage() */ static void mpage_da_map_blocks(struct mpage_da_data *mpd) { + int err = 0; struct buffer_head *lbh = &mpd->lbh; - int err = 0, remain = lbh->b_size; sector_t next = lbh->b_blocknr; struct buffer_head new; @@ -1812,35 +1805,32 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) if (buffer_mapped(lbh) && !buffer_delay(lbh)) return; - while (remain) { - new.b_state = lbh->b_state; - new.b_blocknr = 0; - new.b_size = remain; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* - * Rather than implement own error handling - * here, we just leave remaining blocks - * unallocated and try again with ->writepage() - */ - break; - } - BUG_ON(new.b_size == 0); + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); + /* + * If we didn't accumulate anything + * to write simply return + */ + if (!new.b_size) + return; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) + return; + BUG_ON(new.b_size == 0); - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if (buffer_delay(lbh) || buffer_unwritten(lbh)) - mpage_put_bnr_to_bhs(mpd, next, &new); + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); - /* go for the remaining blocks */ - next += 
new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + return; } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ @@ -1886,13 +1876,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * need to flush current extent and start new one */ mpage_da_map_blocks(mpd); - - /* - * Now start a new extent - */ - lbh->b_size = bh->b_size; - lbh->b_state = bh->b_state & BH_FLAGS; - lbh->b_blocknr = logical; + mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; } /* @@ -1912,17 +1898,35 @@ static int __mpage_da_writepage(struct page *page, struct buffer_head *bh, *head, fake; sector_t logical; + if (mpd->io_done) { + /* + * Rest of the page in the page_vec + * redirty then and skip then. We will + * try to to write them again after + * starting a new transaction + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } /* * Can we merge this page to current extent? */ if (mpd->next_page != page->index) { /* * Nope, we can't. 
So, we map non-allocated blocks - * and start IO on them using __mpage_writepage() + * and start IO on them using writepage() */ if (mpd->next_page != mpd->first_page) { mpage_da_map_blocks(mpd); mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ + mpd->io_done = 1; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; } /* @@ -1953,6 +1957,8 @@ static int __mpage_da_writepage(struct page *page, set_buffer_dirty(bh); set_buffer_uptodate(bh); mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; } else { /* * Page with regular buffer heads, just add all dirty ones @@ -1961,8 +1967,12 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); - if (buffer_dirty(bh)) + if (buffer_dirty(bh) && + (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } logical++; } while ((bh = bh->b_this_page) != head); } @@ -1981,22 +1991,13 @@ static int __mpage_da_writepage(struct page *page, * * This is a library function, which implements the writepages() * address_space_operation. - * - * In order to avoid duplication of logic that deals with partial pages, - * multiple bio per page, etc, we find non-allocated blocks, allocate - * them with minimal calls to ->get_block() and re-use __mpage_writepage() - * - * It's important that we call __mpage_writepage() only once for each - * involved page, otherwise we'd have to implement more complicated logic - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. 
- * - * See comments to mpage_writepages() */ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { struct mpage_da_data mpd; + long to_write; int ret; if (!get_block) @@ -2010,17 +2011,22 @@ static int mpage_da_writepages(struct address_space *mapping, mpd.first_page = 0; mpd.next_page = 0; mpd.get_block = get_block; + mpd.io_done = 0; + mpd.pages_written = 0; + + to_write = wbc->nr_to_write; ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); /* * Handle last extent of pages */ - if (mpd.next_page != mpd.first_page) { + if (!mpd.io_done && mpd.next_page != mpd.first_page) { mpage_da_map_blocks(&mpd); mpage_da_submit_io(&mpd); } + wbc->nr_to_write = to_write - mpd.pages_written; return ret; } @@ -2238,7 +2244,7 @@ static int ext4_da_writepage(struct page *page, #define EXT4_MAX_WRITEBACK_CREDITS 25 static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = mapping->host; handle_t *handle = NULL; @@ -2246,42 +2252,53 @@ static int ext4_da_writepages(struct address_space *mapping, int ret = 0; long to_write; loff_t range_start = 0; + long pages_skipped = 0; /* * No pages to write? 
This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ - if (!mapping->nrpages) + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - /* - * Estimate the worse case needed credits to write out - * EXT4_MAX_BUF_BLOCKS pages - */ - needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; - - to_write = wbc->nr_to_write; - if (!wbc->range_cyclic) { + if (!wbc->range_cyclic) /* * If range_cyclic is not set force range_cont * and save the old writeback_index */ wbc->range_cont = 1; - range_start = wbc->range_start; - } - while (!ret && to_write) { + range_start = wbc->range_start; + pages_skipped = wbc->pages_skipped; + +restart_loop: + to_write = wbc->nr_to_write; + while (!ret && to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); + printk(KERN_EMERG "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + dump_stack(); goto out_writepages; } if (ext4_should_order_data(inode)) { /* * With ordered mode we need to add - * the inode to the journal handle + * the inode to the journal handl * when we do block allocation. 
*/ ret = ext4_jbd2_file_inode(handle, inode); @@ -2289,20 +2306,20 @@ static int ext4_da_writepages(struct address_space *mapping, ext4_journal_stop(handle); goto out_writepages; } - } - /* - * set the max dirty pages could be write at a time - * to fit into the reserved transaction credits - */ - if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) - wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; to_write -= wbc->nr_to_write; ret = mpage_da_writepages(mapping, wbc, - ext4_da_get_block_write); + ext4_da_get_block_write); ext4_journal_stop(handle); - if (wbc->nr_to_write) { + if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + to_write += wbc->nr_to_write; + ret = 0; + } else if (wbc->nr_to_write) { /* * There is no more writeout needed * or we requested for a noblocking writeout @@ -2314,10 +2331,18 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = to_write; } + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { + /* We skipped pages in this loop */ + wbc->range_start = range_start; + wbc->nr_to_write = to_write + + wbc->pages_skipped - pages_skipped; + wbc->pages_skipped = pages_skipped; + goto restart_loop; + } + out_writepages: wbc->nr_to_write = to_write; - if (range_start) - wbc->range_start = range_start; + wbc->range_start = range_start; return ret; } -- cgit v1.2.3 From 525f4ed8dcb72c71b306a78ecbf06f41d08fe441 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Tue, 19 Aug 2008 22:15:58 -0400 Subject: ext4: journal credit fix for the delayed allocation's writepages() function Previous delalloc writepages implementation started a new transaction outside of a loop which called get_block() to do the block allocation. Since we didn't know exactly how many blocks would need to be allocated, the estimated journal credits required was very conservative and caused many issues. 
With the reworked delayed allocation, a new transaction is created for each get_block(), thus we don't need to guess how many credits are needed for multiple chunks of allocation. We start every transaction with enough credits for inserting a single extent. When estimating the credits for indirect blocks to allocate a chunk of blocks, we need to know the number of data blocks to allocate. We use the total number of reserved delalloc datablocks; if that is too big, for non-extent files, we need to limit the number of blocks to EXT4_MAX_TRANS_DATA. Code cleanup from Aneesh. Signed-off-by: Mingming Cao Reviewed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 8 +++--- fs/ext4/inode.c | 74 ++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 58 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5596b70efa20..b24d3c53f20c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1753,7 +1753,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, * When pass the actual path, the caller should calculate credits * under i_data_sem. */ -int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { @@ -1772,12 +1772,12 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, * and other metadat blocks still need to be * accounted. */ - /* 1 one bitmap, 1 block group descriptor */ + /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); } } - return ext4_chunk_trans_blocks(inode, num); + return ext4_chunk_trans_blocks(inode, nrblocks); } /* @@ -1791,7 +1791,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, * If the nrblocks are discontiguous, they could cause * the whole tree split more than once, but this is really rare. 
*/ -int ext4_ext_index_trans_blocks(struct inode *inode, int num, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; int depth = ext_depth(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8dd22eade42c..d1906d9a22de 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1848,29 +1848,53 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *bh) { - struct buffer_head *lbh = &mpd->lbh; sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } /* * First block in the extent */ if (lbh->b_size == 0) { lbh->b_blocknr = logical; - lbh->b_size = bh->b_size; + lbh->b_size = b_size; lbh->b_state = bh->b_state & BH_FLAGS; return; } + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? 
*/ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += bh->b_size; + lbh->b_size += b_size; return; } +flush_it: /* * We couldn't merge the block to our extent, so we * need to flush current extent and start new one @@ -2231,17 +2255,29 @@ static int ext4_da_writepage(struct page *page, } /* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. */ -#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS 25 + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. 
So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2283,7 +2319,7 @@ restart_loop: * by delalloc */ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); @@ -4461,11 +4497,9 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) * the modification of a single pages into a single transaction, * which may include multiple chunks of block allocations. * - * This could be called via ext4_write_begin() or later - * ext4_da_writepages() in delalyed allocation case. + * This could be called via ext4_write_begin() * - * In both case it's possible that we could allocating multiple - * chunks of blocks. We need to consider the worse case, when + * We need to consider the worse case, when * one new block per extent. */ int ext4_writepage_trans_blocks(struct inode *inode) -- cgit v1.2.3 From 16eb72956496594d023a7d7cd14a86404ad195ad Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 19 Aug 2008 21:16:54 -0400 Subject: ext4: make sure ext4_has_free_blocks returns 0 for ENOSPC Fix ext4_has_free_blocks() to return 0 when we don't have enough space. 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ae5004e93fc..e9fa960ba6da 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1626,6 +1626,9 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, free_blocks = percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); #endif + if (free_blocks <= root_blocks) + /* we don't have free space */ + return 0; if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; return nblocks; -- cgit v1.2.3 From 91246c009094142f95ecc7573b7caed2bcef52c7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 19 Aug 2008 21:14:52 -0400 Subject: ext4: Initialize writeback_index to 0 when allocating a new inode The write_cache_pages() function uses the mapping->writeback_index as the starting index to write out when range_cyclic is set. Properly initialize writeback_index so that we start the writeout at index 0. This was found when debugging the small file fragmentation on ext4. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d5d77958b861..566344b926b7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -568,6 +568,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); -- cgit v1.2.3 From 5e745b041f2ccad63077118b40468521306f3962 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 18 Aug 2008 18:00:57 -0400 Subject: ext4: Fix small file fragmentation For small file block allocations, mballoc uses per cpu prealloc space. 
Use goal block when searching for the right prealloc space. Also make sure ext4_da_writepages tries to write all the pages for small files in single attempt Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 21 +++++++++++++++------ fs/ext4/mballoc.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1906d9a22de..7e91913e325b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2282,13 +2282,12 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct inode *inode = mapping->host; handle_t *handle = NULL; - int needed_blocks; - int ret = 0; - long to_write; loff_t range_start = 0; - long pages_skipped = 0; + struct inode *inode = mapping->host; + int needed_blocks, ret = 0, nr_to_writebump = 0; + long to_write, pages_skipped = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2297,6 +2296,16 @@ static int ext4_da_writepages(struct address_space *mapping, */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + /* + * Make sure nr_to_write is >= sbi->s_mb_stream_request + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. 
+ */ + if (wbc->nr_to_write < sbi->s_mb_stream_request) { + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } if (!wbc->range_cyclic) /* @@ -2377,7 +2386,7 @@ restart_loop: } out_writepages: - wbc->nr_to_write = to_write; + wbc->nr_to_write = to_write - nr_to_writebump; wbc->range_start = range_start; return ret; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 865e9ddb44d4..e0e3a5eb1ddb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3281,6 +3281,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); } +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance < new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + /* * search goal blocks in preallocated space */ @@ -3290,7 +3319,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3333,6 +3363,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; + 
goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + + ac->ac_g_ex.fe_start + + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], @@ -3340,17 +3377,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); - spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } return 0; } -- cgit v1.2.3 From 74c27c43ebd020fcb65364613503f6c08dc6f535 Mon Sep 17 00:00:00 2001 From: Takashi YOSHII Date: Mon, 11 Aug 2008 20:10:54 +0900 Subject: binfmt_flat: Stub in a FLAT_PLAT_INIT(). This provides a FLAT_PLAT_INIT() arch hook for platforms that need to set up specific register state prior to calling in to the process, as per ELF_PLAT_INIT(). 
Signed-off-by: Takashi YOSHII Signed-off-by: Paul Mundt --- fs/binfmt_flat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 56372ecf1690..dfc0197905ca 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -914,7 +914,9 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Stash our initial stack pointer into the mm structure */ current->mm->start_stack = (unsigned long )sp; - +#ifdef FLAT_PLAT_INIT + FLAT_PLAT_INIT(regs); +#endif DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", (int)regs, (int)start_addr, (int)current->mm->start_stack); -- cgit v1.2.3 From 2c731afb0d4ba16018b400c75665fbdb8feb2175 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 Aug 2008 22:28:53 +0000 Subject: [CIFS] if get root inode fails during mount, cleanup tree connection CC: Stable Kernel Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e8da4ee761b5..f50fc8728c94 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -174,6 +174,8 @@ out_no_root: cERROR(1, ("cifs_read_super: get root inode failed")); if (inode) iput(inode); + + cifs_umount(sb, cifs_sb); out_mount_failed: if (cifs_sb) { -- cgit v1.2.3 From 54b4602d5fe50571362e101138d24edb9cf82d29 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 Aug 2008 22:31:40 +0000 Subject: [CIFS] remove trailing whitespace Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f50fc8728c94..25ecbd5b0404 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -174,7 +174,7 @@ out_no_root: cERROR(1, ("cifs_read_super: get root inode failed")); if (inode) iput(inode); - + cifs_umount(sb, cifs_sb); out_mount_failed: -- cgit v1.2.3 From ce769caa50a3fc835b4fc1a6e1463ada127a2e8a Mon 
Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 18 Jul 2008 12:54:21 +0300 Subject: UBIFS: print volume name as well We encourage people to mount using volume name, not device numbers. So print the name of the mounted UBI volume, not just IDs. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ca1e2d4e03cc..43af934a7558 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1122,8 +1122,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; - ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, - c->vi.vol_id); + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", + c->vi.ubi_num, c->vi.vol_id, c->vi.name); if (mounted_read_only) ubifs_msg("mounted read-only"); x = (long long)c->main_lebs * c->leb_size; -- cgit v1.2.3 From 182854b46f9feb6f1b03abe747bb2beeebf2adb0 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 18 Jul 2008 18:54:29 +0300 Subject: UBIFS: fix budgeting calculations The 'ubifs_release_dirty_inode_budget()' was buggy and incorrectly freed the budget, which led to not freeing all dirty data budget. This patch fixes that. Also, this patch fixes ubifs_mkdir() which passed 1 in dirty_ino_d, which makes no sense. Well, it is harmless though. Also, add a few more useful assertions. And improve a few debugging messages. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 5 +++-- fs/ubifs/dir.c | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index d81fb9ed2b8e..12a1717db87c 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -686,9 +686,10 @@ void ubifs_convert_page_budget(struct ubifs_info *c) void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_inode *ui) { - struct ubifs_budget_req req = {.dd_growth = c->inode_budget, - .dirtied_ino_d = ui->data_len}; + struct ubifs_budget_req req; + memset(&req, 0, sizeof(struct ubifs_budget_req)); + req.dd_growth = c->inode_budget + ui->data_len; ubifs_release_budget(c, &req); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e90374be7d3b..a79e850fee6d 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -727,8 +727,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .dirtied_ino_d = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; /* * Budget request settings: new inode, new direntry and changing parent -- cgit v1.2.3 From 7d32c2bb143fa1ca3b0c420feb08a832d65395be Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 18 Jul 2008 18:54:29 +0300 Subject: UBIFS: improve debugging 1. Print inode mode in some of debugging messages 2. 
Add few more useful assertions Signed-off-by: Artem Bityutskiy --- fs/ubifs/file.c | 3 ++- fs/ubifs/super.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8565e586e533..01598f28020b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -941,7 +941,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; - dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); err = inode_change_ok(inode, attr); if (err) return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 43af934a7558..06e3b22a0c1b 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -299,7 +299,7 @@ static int ubifs_write_inode(struct inode *inode, int wait) return 0; } - dbg_gen("inode %lu", inode->i_ino); + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); err = ubifs_jnl_write_inode(c, inode, 0); if (err) ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); @@ -323,9 +323,10 @@ static void ubifs_delete_inode(struct inode *inode) */ goto out; - dbg_gen("inode %lu", inode->i_ino); + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); ubifs_assert(inode->i_nlink == 0); + ubifs_assert(!ubifs_inode(inode)->dirty); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1469,6 +1470,7 @@ static void ubifs_put_super(struct super_block *sb) */ ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); ubifs_assert(c->budg_idx_growth == 0); + ubifs_assert(c->budg_dd_growth == 0); ubifs_assert(c->budg_data_growth == 0); /* -- cgit v1.2.3 From 1e0f358e29cc91c8eb09e10cbf1f6bb58a62c795 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 21 Jul 2008 10:59:53 +0300 Subject: UBIFS: free budget in delete_inode as well Although the 
inode is marked as clean when it is being deleted, it might stay and be used as orphan, and be marked as dirty. So we have to free the budget when we delete it. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 06e3b22a0c1b..884beed1dcb8 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -314,8 +314,9 @@ static void ubifs_delete_inode(struct inode *inode) { int err; struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); - if (ubifs_inode(inode)->xattr) + if (ui->xattr) /* * Extended attribute inode deletions are fully handled in * 'ubifs_removexattr()'. These inodes are special and have @@ -326,13 +327,12 @@ static void ubifs_delete_inode(struct inode *inode) dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); ubifs_assert(inode->i_nlink == 0); - ubifs_assert(!ubifs_inode(inode)->dirty); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto out; - ubifs_inode(inode)->ui_size = inode->i_size = 0; + ui->ui_size = inode->i_size = 0; err = ubifs_jnl_write_inode(c, inode, 1); if (err) /* @@ -341,6 +341,8 @@ static void ubifs_delete_inode(struct inode *inode) */ ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); clear_inode(inode); } -- cgit v1.2.3 From 16dfd804b44ef7156d1c201f100bd0d9dc6b7c4b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 18 Jul 2008 16:47:41 +0300 Subject: UBIFS: fix error return in failure mode UBIFS recovery testing debug facility simulates media failures. When simulating an IO error, the error code returned must be -EIO but it was not always if the user switched off the debug recovery testing option at the same time. 
Signed-off-by: Adrian Hunter --- fs/ubifs/debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4e3aaeba4eca..0adfb29b8503 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, int offset, int len, int dtype) { - int err; + int err, failing; if (in_failure_mode(desc)) return -EIO; - if (do_fail(desc, lnum, 1)) + failing = do_fail(desc, lnum, 1); + if (failing) cut_data(buf, len); err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); if (err) return err; - if (in_failure_mode(desc)) + if (failing) return -EIO; return 0; } -- cgit v1.2.3 From 2fb42b11f61cbcef7dfc225c1d26c4511436583d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 18 Jul 2008 17:56:37 +0300 Subject: UBIFS: ensure UBIFS switches to read-only on error UBI transparently handles write errors by automatically copying and remapping the affected eraseblock. If UBI is unable to do that, for example its pool of eraseblocks reserved for bad block handling is empty, then the error is propagated to UBIFS. UBIFS must protect the media from falling into an inconsistent state by immediately switching to read-only mode. In the case of log updates, this was not being done. 
Signed-off-by: Adrian Hunter --- fs/ubifs/log.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36857b9ed59e..e14829e50693 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) return 0; out_unlock: + if (err != -EAGAIN) + ubifs_ro_mode(c, err); mutex_unlock(&c->log_mutex); kfree(ref); kfree(bud); -- cgit v1.2.3 From ff46d7b3e0870a70331b069372c36fbc43018c2d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 21 Jul 2008 15:39:05 +0300 Subject: UBIFS: make ubifs_ro_mode() not inline We use ubifs_ro_mode() quite a lot, and not in fast-path, so there is no reason to blow the code up by having it inlined. Also, we usually want R/O mode change to be seen to other CPUs as soon as possible, so when we make this a function call, we will automatically have a memory barrier. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/io.c | 14 ++++++++++++++ fs/ubifs/misc.h | 14 -------------- fs/ubifs/ubifs.h | 1 + 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3374f91b6709..054363f2b207 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -53,6 +53,20 @@ #include #include "ubifs.h" +/** + * ubifs_ro_mode - switch UBIFS to read read-only mode. + * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_media) { + c->ro_media = 1; + ubifs_warn("switched to read-only mode, error %d", err); + dbg_dump_stack(); + } +} + /** * ubifs_check_node - check node. 
* @c: UBIFS file-system description object diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4beccfc256d2..cd83ffc8101c 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -79,20 +79,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) return container_of(inode, struct ubifs_inode, vfs_inode); } -/** - * ubifs_ro_mode - switch UBIFS to read read-only mode. - * @c: UBIFS file-system description object - * @err: error code which is the reason of switching to R/O mode - */ -static inline void ubifs_ro_mode(struct ubifs_info *c, int err) -{ - if (!c->ro_media) { - c->ro_media = 1; - ubifs_warn("switched to read-only mode, error %d", err); - dbg_dump_stack(); - } -} - /** * ubifs_compr_present - check if compressor was compiled in. * @compr_type: compressor type to check diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e4f89f271827..c488d43b6359 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1346,6 +1346,7 @@ extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, int dtype); -- cgit v1.2.3 From fbfa6c884aae2aff479eb8c996c564b1a34eae30 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 22 Jul 2008 11:52:52 +0300 Subject: UBIFS: do not write orphans back Orphan inodes are deleted inodes which will disappear after FS re-mount. There is no need to write orphan inodes back, because they are not needed on the flash media. So optimize orphans a little by not writing them back. Just mark them as clean, free the budget, and report success to VFS. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 884beed1dcb8..13e90b0dd95d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -278,7 +278,7 @@ static void ubifs_destroy_inode(struct inode *inode) */ static int ubifs_write_inode(struct inode *inode, int wait) { - int err; + int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); @@ -299,10 +299,18 @@ static int ubifs_write_inode(struct inode *inode, int wait) return 0; } - dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - err = ubifs_jnl_write_inode(c, inode, 0); - if (err) - ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode, 0); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + } ui->dirty = 0; mutex_unlock(&ui->ui_mutex); -- cgit v1.2.3 From 1f28681ad34a0c7e51dc5070c84b53f7bd34f44c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 22 Jul 2008 12:06:13 +0300 Subject: UBIFS: remove unneeded function parameter Simplify 'ubifs_jnl_write_inode()' by removing the 'deletion' parameter which is not really needed because we may test inode->i_nlink and check whether this is a deletion or not. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/journal.c | 19 +++++++------------ fs/ubifs/super.c | 4 ++-- fs/ubifs/ubifs.h | 3 +-- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 283155abe5f5..666ad82ec51a 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -750,30 +750,25 @@ out_free: * ubifs_jnl_write_inode - flush inode to the journal. * @c: UBIFS file-system description object * @inode: inode to flush - * @deletion: inode has been deleted * * This function writes inode @inode to the journal. If the inode is * synchronous, it also synchronizes the write-buffer. Returns zero in case of * success and a negative error code in case of failure. */ -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, - int deletion) +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) { - int err, len, lnum, offs, sync = 0; + int err, lnum, offs; struct ubifs_ino_node *ino; struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; - dbg_jnl("ino %lu%s", inode->i_ino, - deletion ? " (last reference)" : ""); - if (deletion) - ubifs_assert(inode->i_nlink == 0); + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); - len = UBIFS_INO_NODE_SZ; /* * If the inode is being deleted, do not write the attached data. No * need to synchronize the write-buffer either. 
*/ - if (!deletion) { + if (!last_reference) { len += ui->data_len; sync = IS_SYNC(inode); } @@ -786,7 +781,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, inode, 1, deletion); + pack_inode(c, ino, inode, 1, last_reference); err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); if (err) goto out_release; @@ -795,7 +790,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, inode->i_ino); release_head(c, BASEHD); - if (deletion) { + if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) goto out_ro; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 13e90b0dd95d..cf1fb6cffa09 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -306,7 +306,7 @@ static int ubifs_write_inode(struct inode *inode, int wait) dbg_gen("inode %lu, mode %#x, nlink %u", inode->i_ino, (int)inode->i_mode, inode->i_nlink); if (inode->i_nlink) { - err = ubifs_jnl_write_inode(c, inode, 0); + err = ubifs_jnl_write_inode(c, inode); if (err) ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); @@ -341,7 +341,7 @@ static void ubifs_delete_inode(struct inode *inode) goto out; ui->ui_size = inode->i_size = 0; - err = ubifs_jnl_write_inode(c, inode, 1); + err = ubifs_jnl_write_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c488d43b6359..6ddd1de2ea64 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1400,8 +1400,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, int deletion, int xent); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, - int last_reference); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_rename(struct ubifs_info *c, const 
struct inode *old_dir, const struct dentry *old_dentry, const struct inode *new_dir, -- cgit v1.2.3 From fd6c6b51e3677937090314b20b00f2194900d81b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 22 Jul 2008 12:19:09 +0300 Subject: UBIFS: remove another unneeded function parameter The 'last_reference' parameter of 'pack_inode()' is not really needed because 'inode->i_nlink' may be tested instead. Zap it. Signed-off-by: Artem Bityutskiy --- fs/ubifs/journal.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 666ad82ec51a..3bc3fc947099 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -447,13 +447,11 @@ static int get_dent_type(int mode) * @ino: buffer in which to pack inode node * @inode: inode to pack * @last: indicates the last node of the group - * @last_reference: non-zero if this is a deletion inode */ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, - const struct inode *inode, int last, - int last_reference) + const struct inode *inode, int last) { - int data_len = 0; + int data_len = 0, last_reference = !inode->i_nlink; struct ubifs_inode *ui = ubifs_inode(inode); ino->ch.node_type = UBIFS_INO_NODE; @@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ubifs_prep_grp_node(c, dent, dlen, 0); ino = (void *)dent + aligned_dlen; - pack_inode(c, ino, inode, 0, last_reference); + pack_inode(c, ino, inode, 0); ino = (void *)ino + aligned_ilen; - pack_inode(c, ino, dir, 1, 0); + pack_inode(c, ino, dir, 1); if (last_reference) { err = ubifs_add_orphan(c, inode->i_ino); @@ -781,7 +779,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) if (err) goto out_free; - pack_inode(c, ino, inode, 1, last_reference); + pack_inode(c, ino, inode, 1); err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); if (err) goto out_release; @@ -912,16 +910,16 @@ int 
ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p = (void *)dent2 + aligned_dlen2; if (new_inode) { - pack_inode(c, p, new_inode, 0, last_reference); + pack_inode(c, p, new_inode, 0); p += ALIGN(ilen, 8); } if (!move) - pack_inode(c, p, old_dir, 1, 0); + pack_inode(c, p, old_dir, 1); else { - pack_inode(c, p, old_dir, 0, 0); + pack_inode(c, p, old_dir, 0); p += ALIGN(plen, 8); - pack_inode(c, p, new_dir, 1, 0); + pack_inode(c, p, new_dir, 1); } if (last_reference) { @@ -1126,7 +1124,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, inode, 0, 0); + pack_inode(c, ino, inode, 0); ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); if (dlen) ubifs_prep_grp_node(c, dn, dlen, 1); @@ -1246,9 +1244,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, ubifs_prep_grp_node(c, xent, xlen, 0); ino = (void *)xent + aligned_xlen; - pack_inode(c, ino, inode, 0, 1); + pack_inode(c, ino, inode, 0); ino = (void *)ino + UBIFS_INO_NODE_SZ; - pack_inode(c, ino, host, 1, 0); + pack_inode(c, ino, host, 1); err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); if (!sync && !err) @@ -1339,8 +1337,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, host, 0, 0); - pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); if (!sync && !err) { -- cgit v1.2.3 From 014eb04b03202dc75c1c749df4246d98045f5e69 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 21 Jul 2008 17:14:29 +0300 Subject: UBIFS: increment commit number earlier Increment the commit number at the beginning of the commit, instead of doing this after the commit. This is needed for further optimizations. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/commit.c | 3 ++- fs/ubifs/log.c | 2 +- fs/ubifs/orphan.c | 4 ++-- fs/ubifs/ubifs.h | 3 ++- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 3b516316c9b3..0a6aa2cc78f0 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + c->cmt_no += 1; err = ubifs_gc_start_commit(c); if (err) goto out_up; @@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c) goto out; mutex_lock(&c->mst_mutex); - c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); c->mst_node->root_offs = cpu_to_le32(zroot.offs); diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index e14829e50693..3e0aa7367556 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -412,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; - cs->cmt_no = cpu_to_le64(c->cmt_no + 1); + cs->cmt_no = cpu_to_le64(c->cmt_no); ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); /* diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 3afeb9242c6a..02d3462f4d3e 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic) c->cmt_orphans -= cnt; spin_unlock(&c->orphan_lock); if (c->cmt_orphans) - orph->cmt_no = cpu_to_le64(c->cmt_no + 1); + orph->cmt_no = cpu_to_le64(c->cmt_no); else /* Mark the last node of the commit */ - orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); ubifs_assert(c->ohead_offs + len <= c->leb_size); ubifs_assert(c->ohead_lnum >= c->orph_first); ubifs_assert(c->ohead_lnum <= c->orph_last); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 
6ddd1de2ea64..21502b6040f0 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -865,7 +865,8 @@ struct ubifs_mount_opts { * @highest_inum: highest used inode number * @vfs_gen: VFS inode generation counter * @max_sqnum: current global sequence number - * @cmt_no: commit number (last successfully completed commit) + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters * @fmt_version: UBIFS on-flash format version * @uuid: UUID from super block -- cgit v1.2.3 From de94eb558b542873d3f6f9ede1b8575fb5662248 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 22 Jul 2008 13:06:20 +0300 Subject: UBIFS: optimize deletions Every time anything is deleted, UBIFS writes the deletion inode node twice - once in 'ubifs_jnl_update()' and the second time in 'ubifs_jnl_write_inode()'. However, the second write is not needed if no commit happened after 'ubifs_jnl_update()'. This patch checks that condition and avoids writing the deletion inode for the second time. Signed-off-by: Artem Bityutskiy --- fs/ubifs/journal.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ubifs/super.c | 6 ++++-- fs/ubifs/ubifs.h | 12 ++++++++--- 3 files changed, 73 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 3bc3fc947099..0bcee7d221e8 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -604,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, release_head(c, BASEHD); goto out_finish; } + ui->del_cmtno = c->cmt_no; } err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); @@ -820,6 +821,64 @@ out_free: return err; } +/** + * ubifs_jnl_write_inode - delete an inode. 
+ * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function write corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might be not on the flash anymore, becouse it might + * have been garbage-collected already. And for optimization reasond UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. 
+ */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + up_read(&c->commit_sem); + return err; +} + /** * ubifs_jnl_rename - rename a directory entry. * @c: UBIFS file-system description object @@ -928,6 +987,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, release_head(c, BASEHD); goto out_finish; } + new_ui->del_cmtno = c->cmt_no; } err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index cf1fb6cffa09..6cc4175f23c1 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -341,13 +341,15 @@ static void ubifs_delete_inode(struct inode *inode) goto out; ui->ui_size = inode->i_size = 0; - err = ubifs_jnl_write_inode(c, inode); + err = ubifs_jnl_delete_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a * simple error message is ok here. */ - ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + out: if (ui->dirty) ubifs_release_dirty_inode_budget(c, ui); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 21502b6040f0..dfb4b93614ff 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -322,6 +322,8 @@ struct ubifs_gced_idx_leb { * struct ubifs_inode - UBIFS in-memory inode description. 
* @vfs_inode: VFS inode description object * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; * @xattr_size: summarized size of all extended attributes in bytes * @xattr_cnt: count of extended attributes this inode has * @xattr_names: sum of lengths of all extended attribute names belonging to @@ -372,7 +374,10 @@ struct ubifs_gced_idx_leb { */ struct ubifs_inode { struct inode vfs_inode; - unsigned long long creat_sqnum; + union { + unsigned long long creat_sqnum; + unsigned long long del_cmtno; + }; unsigned int xattr_size; unsigned int xattr_cnt; unsigned int xattr_names; @@ -779,7 +784,7 @@ struct ubifs_compressor { /** * struct ubifs_budget_req - budget requirements of an operation. * - * @fast: non-zero if the budgeting should try to aquire budget quickly and + * @fast: non-zero if the budgeting should try to acquire budget quickly and * should not try to call write-back * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields * have to be re-calculated @@ -860,7 +865,7 @@ struct ubifs_mount_opts { * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). 
* @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable readahead + * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number * @vfs_gen: VFS inode generation counter @@ -1402,6 +1407,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct dentry *old_dentry, const struct inode *new_dir, -- cgit v1.2.3 From bc813355c704e5916a86dd4b96fd226bfa3fc6ca Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 23 Jul 2008 15:23:11 +0300 Subject: UBIFS: do not union creat_sqnum and del_cmtno The values in these two fields need to be preserved independently and so a union cannot be used. 
Signed-off-by: Adrian Hunter --- fs/ubifs/ubifs.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index dfb4b93614ff..d342c6907244 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -374,10 +374,8 @@ struct ubifs_gced_idx_leb { */ struct ubifs_inode { struct inode vfs_inode; - union { - unsigned long long creat_sqnum; - unsigned long long del_cmtno; - }; + unsigned long long creat_sqnum; + unsigned long long del_cmtno; unsigned int xattr_size; unsigned int xattr_cnt; unsigned int xattr_names; -- cgit v1.2.3 From 7d62ff2c396470bb62a3853f14d3962eac1da974 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 23 Jul 2008 15:48:39 +0300 Subject: UBIFS: fix typos in comments Signed-off-by: Adrian Hunter --- fs/ubifs/journal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 0bcee7d221e8..25de6fde383f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -822,7 +822,7 @@ out_free: } /** - * ubifs_jnl_write_inode - delete an inode. + * ubifs_jnl_delete_inode - delete an inode. * @c: UBIFS file-system description object * @inode: inode to delete * @@ -831,21 +831,21 @@ out_free: * journal. * * When regular file inodes are unlinked or a directory inode is removed, the - * 'ubifs_jnl_update()' function write corresponding deletion inode and + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and * direntry to the media, and adds the inode to orphans. After this, when the * last reference to this inode has been dropped, this function is called. In * general, it has to write one more deletion inode to the media, because if * a commit happened between 'ubifs_jnl_update()' and * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal - * anymore, and in fact it might be not on the flash anymore, becouse it might - * have been garbage-collected already. 
And for optimization reasond UBIFS does + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does * not read the orphan area if it has been unmounted cleanly, so it would have * no indication in the journal that there is a deleted inode which has to be * removed from TNC. * * However, if there was no commit between 'ubifs_jnl_update()' and * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion - * inode to the media for the second time. And this is quite typical case. + * inode to the media for the second time. And this is quite a typical case. * * This function returns zero in case of success and a negative error code in * case of failure. -- cgit v1.2.3 From f769108424a19c7758546d1d7d19f098b1a33759 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 23 Jul 2008 16:55:55 +0300 Subject: UBIFS: correct orphan deletion order The debug function that checks orphans, does so using the TNC mutex. That means it will not see a correct picture if the inode is removed from the orphan tree before it is removed from TNC. 
Signed-off-by: Adrian Hunter --- fs/ubifs/journal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 25de6fde383f..acdae00aaa54 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -871,10 +871,11 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) return ubifs_jnl_write_inode(c, inode); } - ubifs_delete_orphan(c, inode->i_ino); err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); up_read(&c->commit_sem); return err; } -- cgit v1.2.3 From 547000da6412c45456ff2ff44a171d01027bd727 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 24 Jul 2008 14:42:05 +0300 Subject: UBIFS: improve budgeting checks Budgeting is a crucial UBIFS subsystem - add more assertions to improve requests checking. This is not compiled in when UBIFS debugging is disabled. Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 12 ++++++++++++ fs/ubifs/ubifs.h | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 12a1717db87c..f5afce5f37bd 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -543,6 +543,12 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) int err, idx_growth, data_growth, dd_growth; struct retries_info ri; + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); @@ -618,6 +624,12 @@ again: */ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) { + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + 
ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); if (!req->recalculate) { diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d342c6907244..565dca2ec0bd 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -812,17 +812,23 @@ struct ubifs_compressor { struct ubifs_budget_req { unsigned int fast:1; unsigned int recalculate:1; +#ifndef UBIFS_DEBUG unsigned int new_page:1; unsigned int dirtied_page:1; unsigned int new_dent:1; unsigned int mod_dent:1; unsigned int new_ino:1; unsigned int new_ino_d:13; -#ifndef UBIFS_DEBUG unsigned int dirtied_ino:4; unsigned int dirtied_ino_d:15; #else /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; unsigned int dirtied_ino; unsigned int dirtied_ino_d; #endif -- cgit v1.2.3 From dab4b4d2f915a65022343012a795f4ae4ae7e83c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 24 Jul 2008 14:52:45 +0300 Subject: UBIFS: align inode data to eight UBIFS aligns node lengths to 8, so budgeting has to do the same. Well, direntry, inode, and page budgets are already aligned, but not inode data budget (e.g., data in special devices or symlinks). Do this for inode data as well. Also, add corresponding debugging checks. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 10 +++++++++- fs/ubifs/dir.c | 10 ++++++---- fs/ubifs/file.c | 4 ++-- fs/ubifs/ubifs.h | 4 ++++ fs/ubifs/xattr.c | 4 ++-- 5 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index f5afce5f37bd..a3978ba4215e 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -551,6 +551,8 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); data_growth = calc_data_growth(c, req); dd_growth = calc_dd_growth(c, req); @@ -632,6 +634,8 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); if (!req->recalculate) { ubifs_assert(req->idx_growth >= 0); ubifs_assert(req->data_growth >= 0); @@ -659,7 +663,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) ubifs_assert(c->budg_idx_growth >= 0); ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); ubifs_assert(c->min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->budg_idx_growth & 7)); + ubifs_assert(!(c->budg_data_growth & 7)); + ubifs_assert(!(c->budg_dd_growth & 7)); spin_unlock(&c->space_lock); } @@ -701,7 +709,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_budget_req req; memset(&req, 0, sizeof(struct ubifs_budget_req)); - req.dd_growth = c->inode_budget + ui->data_len; + req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 
a79e850fee6d..eba3a8a7c333 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -525,7 +525,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; /* * Budget request settings: new direntry, changing the target inode, @@ -788,7 +788,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, devlen = 0; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = devlen, .dirtied_ino = 1 }; + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; /* * Budget request settings: new inode, new direntry and changing parent @@ -862,7 +863,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, int err, len = strlen(symname); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = len, .dirtied_ino = 1 }; + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; /* * Budget request settings: new inode, new direntry and changing parent @@ -1011,7 +1013,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, - .dirtied_ino_d = old_inode_ui->data_len }; + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 01598f28020b..9fecab2f30bc 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -890,7 +890,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, loff_t new_size = attr->ia_size; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = 
ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) @@ -1052,7 +1052,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode) if (mctime_update_needed(inode, &now)) { int err, release; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 565dca2ec0bd..73ca8a009798 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -808,6 +808,10 @@ struct ubifs_compressor { * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. */ struct ubifs_budget_req { unsigned int fast:1; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 1388a078e1a9..39e831d074ce 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -103,8 +103,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, struct inode *inode; struct ubifs_inode *ui, *host_ui = ubifs_inode(host); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = size, .dirtied_ino = 1, - .dirtied_ino_d = host_ui->data_len}; + .new_ino_d = size, .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8)}; if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) return -ENOSPC; -- cgit v1.2.3 From 1de9415906bccab51fb74c6adf575948610f0909 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 25 Jul 2008 12:58:38 +0300 Subject: UBIFS: print pid in dump function Useful when something fails and there are many processes racing. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0adfb29b8503..b9cb77473758 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req) void dbg_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); - printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", - lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, lst->total_dirty); @@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c) struct ubifs_gced_idx_leb *idx_gc; spin_lock(&dbg_lock); - printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " - "budg_dd_growth %lld, budg_idx_growth %lld\n", + printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " + "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, @@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c) struct ubifs_lprops lp; struct ubifs_lp_stats lst; - printk(KERN_DEBUG "Dumping LEB properties\n"); + printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); dbg_dump_lstats(&lst); @@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) if (dbg_failure_mode) return; - printk(KERN_DEBUG "Dumping LEB %d\n", lnum); + printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); if (IS_ERR(sleb)) { @@ -720,8 +720,8 @@ void dbg_dump_heap(struct 
ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; - printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", - cat, heap->cnt); + printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); for (i = 0; i < heap->cnt; i++) { struct ubifs_lprops *lprops = heap->arr[i]; @@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, { int i; - printk(KERN_DEBUG "Dumping pnode:\n"); + printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", @@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c) int level; printk(KERN_DEBUG "\n"); - printk(KERN_DEBUG "Dumping the TNC tree\n"); + printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); level = znode->level; printk(KERN_DEBUG "== Level %d ==\n", level); -- cgit v1.2.3 From b364b41aeb0289be402be83eebca92eb90bfcb8b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 25 Jul 2008 14:38:51 +0300 Subject: UBIFS: reserve more space for index At the moment UBIFS reserves twice the old index size of space for the index. But this is not enough in some cases, because if the indexing nodes are very fragmented and there are many small gaps, while the dirty index has big znodes - the in-the-gaps method would fail. Thus, reserve thrice as much, in which case we are guaranteed that we can commit in any case. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 8 ++++---- fs/ubifs/find.c | 9 +++++++-- fs/ubifs/misc.h | 2 +- fs/ubifs/ubifs-media.h | 4 ++-- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index a3978ba4215e..323d83a4d099 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have twice the index size of space reserved */ - idx_size <<= 1; + /* And make sure we have trice the index size of space reserved */ + idx_size = idx_size + (idx_size << 1); /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' @@ -388,11 +388,11 @@ static int can_use_rp(struct ubifs_info *c) * This function makes sure UBIFS has enough free eraseblocks for index growth * and data. * - * When budgeting index space, UBIFS reserves twice as more LEBs as the index + * When budgeting index space, UBIFS reserves trice as more LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of - * budgeted index space to the size of the current index, multiplies this by 2, + * budgeted index space to the size of the current index, multiplies this by 3, * and makes sure this does not exceed the amount of free eraseblocks. 
* * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 10394c548367..c70c7679c1bf 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -290,9 +290,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, idx_lp = idx_heap->arr[0]; sum = idx_lp->free + idx_lp->dirty; /* - * Since we reserve twice as more space for the index than it + * Since we reserve trice as more space for the index than it * actually takes, it does not make sense to pick indexing LEBs - * with less than half LEB of dirty space. + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. */ if (sum < min_space || sum < c->half_leb_size) idx_lp = NULL; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index cd83ffc8101c..87dabf9fe742 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -308,7 +308,7 @@ static inline long long ubifs_reported_space(const struct ubifs_info *c, { int divisor, factor; - divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); + divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3); factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; do_div(free, divisor); diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 0cc7da9bed47..bd2121f3426e 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -228,10 +228,10 @@ enum { /* Minimum number of orphan area logical eraseblocks */ #define UBIFS_MIN_ORPH_LEBS 1 /* - * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 * for GC, 1 for deletions, and at least 1 for committed data). 
*/ -#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) /* Minimum number of logical eraseblocks */ #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ -- cgit v1.2.3 From 0010f18afc5f8ba25e1d20e3165894c32a65af02 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 25 Jul 2008 16:39:44 +0300 Subject: UBIFS: minor tweaks in commit No functional changes, just lessen the amount of indentations. Signed-off-by: Artem Bityutskiy --- fs/ubifs/tnc_commit.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 8117e65ba2e9..8ac76b1c2d55 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; - if (err == -ENOSPC) { - if (!dbg_force_in_the_gaps_enabled) { - /* - * Do not print scary warnings if the - * debugging option which forces - * in-the-gaps is enabled. - */ - ubifs_err("out of space"); - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); - dbg_dump_lprops(c); - } - /* Try to commit anyway */ - err = 0; - break; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; } - kfree(c->gap_lebs); - c->gap_lebs = NULL; - return err; + if (!dbg_force_in_the_gaps_enabled) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. 
+ */ + ubifs_err("out of space"); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; } p++; cnt -= written; -- cgit v1.2.3 From 22bc7fa8c5da09805edc6a6199ce81373b2c207d Mon Sep 17 00:00:00 2001 From: Zoltan Sogor Date: Mon, 28 Jul 2008 16:28:49 +0200 Subject: UBIFS: support splice_write Signed-off-by: Zoltan Sogor Signed-off-by: Artem Bityutskiy --- fs/ubifs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 9fecab2f30bc..4071d1cae29f 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1271,6 +1271,7 @@ struct file_operations ubifs_file_operations = { .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif -- cgit v1.2.3 From 3a13252c6f3a029ac992a36910e945f361482797 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 30 Jul 2008 12:18:02 +0300 Subject: UBIFS: correct spelling of "thrice". Signed-off-by: Adrian Hunter --- fs/ubifs/budget.c | 4 ++-- fs/ubifs/find.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 323d83a4d099..154098157473 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -263,7 +263,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have trice the index size of space reserved */ + /* And make sure we have thrice the index size of space reserved */ idx_size = idx_size + (idx_size << 1); /* @@ -388,7 +388,7 @@ static int can_use_rp(struct ubifs_info *c) * This function makes sure UBIFS has enough free eraseblocks for index growth * and data. 
* - * When budgeting index space, UBIFS reserves trice as more LEBs as the index + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index c70c7679c1bf..adee7b5ddeab 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -290,7 +290,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, idx_lp = idx_heap->arr[0]; sum = idx_lp->free + idx_lp->dirty; /* - * Since we reserve trice as more space for the index than it + * Since we reserve thrice as much space for the index than it * actually takes, it does not make sense to pick indexing LEBs * with less than, say, half LEB of dirty space. May be half is * not the optimal boundary - this should be tested and -- cgit v1.2.3 From 81ffa38e1558f54db190e2d11e7260ab09c4acf2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 1 Aug 2008 15:35:08 +0300 Subject: UBIFS: always set i_generation to 0 UBIFS does not presently re-use inode numbers, so leaving i_generation zero is most appropriate for now. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/dir.c | 1 - fs/ubifs/super.c | 2 -- fs/ubifs/ubifs.h | 4 +--- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index eba3a8a7c333..0d1ab8967a4c 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, } inode->i_ino = ++c->highest_inum; - inode->i_generation = ++c->vfs_gen; /* * The creation sequence number remains with this inode for its * lifetime. 
All nodes for this inode have a greater sequence number, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6cc4175f23c1..2c268a476413 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1671,7 +1670,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->orph_new); c->highest_inum = UBIFS_FIRST_INO; - get_random_bytes(&c->vfs_gen, sizeof(int)); c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; ubi_get_volume_info(ubi, &c->vi); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 73ca8a009798..f2dd749d7989 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -876,11 +876,10 @@ struct ubifs_mount_opts { * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number - * @vfs_gen: VFS inode generation counter * @max_sqnum: current global sequence number * @cmt_no: commit number of the last successfully completed commit, protected * by @commit_sem - * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters + * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version * @uuid: UUID from super block * @@ -1117,7 +1116,6 @@ struct ubifs_info { struct backing_dev_info bdi; ino_t highest_inum; - unsigned int vfs_gen; unsigned long long max_sqnum; unsigned long long cmt_no; spinlock_t cnt_lock; -- cgit v1.2.3 From 840dc6b891d521f18bf081bd5a32e4a1f8110abc Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 1 Aug 2008 18:13:37 +0300 Subject: UBIFS: improve arguments checking in debugging messages Use "if (0) printk()" construct in debugging print macros to make the debugging messages be checked even if debugging is off. This patch also removes some unneeded spaces and blank lines. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.h | 143 +++++++++++++++++++++++++++---------------------------- 1 file changed, 69 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 3c4f1e93c9e0..50315fc57185 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -27,7 +27,7 @@ #define UBIFS_DBG(op) op -#define ubifs_assert(expr) do { \ +#define ubifs_assert(expr) do { \ if (unlikely(!(expr))) { \ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ @@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key); /* - * DBGKEY macros require dbg_lock to be held, which it is in the dbg message + * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message * macros. */ #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) /* General messages */ -#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) /* Additional journal messages */ -#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) /* Additional TNC messages */ -#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) /* Additional lprops messages */ -#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) /* Additional LEB find messages */ -#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) /* Additional mount messages */ -#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) 
dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) /* Additional I/O messages */ -#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) /* Additional commit messages */ -#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) /* Additional budgeting messages */ -#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) /* Additional log messages */ -#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) /* Additional gc messages */ -#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) /* Additional scan messages */ -#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) /* Additional recovery messages */ -#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) /* * Debugging message type flags (must match msg_type_names in debug.c). 
@@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c, struct ubifs_zbranch *zbr, void *priv); typedef int (*dbg_znode_callback)(struct ubifs_info *c, struct ubifs_znode *znode, void *priv); - int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); /* Checking functions */ int dbg_check_lprops(struct ubifs_info *c); - int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); - int dbg_check_cats(struct ubifs_info *c); - int dbg_check_ltab(struct ubifs_info *c); - int dbg_check_synced_i_size(struct inode *inode); - int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); - int dbg_check_tnc(struct ubifs_info *c, int extra); - int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); - int dbg_check_filesystem(struct ubifs_info *c); - void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, int add_pos); - int dbg_check_lprops(struct ubifs_info *c); int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, int row, int col); @@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #else /* !CONFIG_UBIFS_FS_DEBUG */ #define UBIFS_DBG(op) -#define ubifs_assert(expr) ({}) -#define ubifs_assert_cmt_locked(c) + +/* Use "if (0)" to make compiler check arguments even if debugging is off */ +#define ubifs_assert(expr) do { \ + if (0 && (expr)) \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ +} while (0) + +#define dbg_msg(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__); \ +} while (0) + #define dbg_dump_stack() -#define dbg_err(fmt, ...) ({}) -#define dbg_msg(fmt, ...) 
({}) -#define dbg_key(c, key, fmt, ...) ({}) - -#define dbg_gen(fmt, ...) ({}) -#define dbg_jnl(fmt, ...) ({}) -#define dbg_tnc(fmt, ...) ({}) -#define dbg_lp(fmt, ...) ({}) -#define dbg_find(fmt, ...) ({}) -#define dbg_mnt(fmt, ...) ({}) -#define dbg_io(fmt, ...) ({}) -#define dbg_cmt(fmt, ...) ({}) -#define dbg_budg(fmt, ...) ({}) -#define dbg_log(fmt, ...) ({}) -#define dbg_gc(fmt, ...) ({}) -#define dbg_scan(fmt, ...) ({}) -#define dbg_rcvry(fmt, ...) ({}) - -#define dbg_ntype(type) "" -#define dbg_cstate(cmt_state) "" -#define dbg_get_key_dump(c, key) ({}) -#define dbg_dump_inode(c, inode) ({}) -#define dbg_dump_node(c, node) ({}) -#define dbg_dump_budget_req(req) ({}) -#define dbg_dump_lstats(lst) ({}) -#define dbg_dump_budg(c) ({}) -#define dbg_dump_lprop(c, lp) ({}) -#define dbg_dump_lprops(c) ({}) -#define dbg_dump_leb(c, lnum) ({}) -#define dbg_dump_znode(c, znode) ({}) -#define dbg_dump_heap(c, heap, cat) ({}) -#define dbg_dump_pnode(c, pnode, parent, iip) ({}) -#define dbg_dump_tnc(c) ({}) -#define dbg_dump_index(c) ({}) +#define ubifs_assert_cmt_locked(c) -#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 +#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) 
dbg_msg(fmt, ##__VA_ARGS__) + +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + +#define dbg_ntype(type) "" +#define dbg_cstate(cmt_state) "" +#define dbg_get_key_dump(c, key) ({}) +#define dbg_dump_inode(c, inode) ({}) +#define dbg_dump_node(c, node) ({}) +#define dbg_dump_budget_req(req) ({}) +#define dbg_dump_lstats(lst) ({}) +#define dbg_dump_budg(c) ({}) +#define dbg_dump_lprop(c, lp) ({}) +#define dbg_dump_lprops(c) ({}) +#define dbg_dump_leb(c, lnum) ({}) +#define dbg_dump_znode(c, znode) ({}) +#define dbg_dump_heap(c, heap, cat) ({}) +#define dbg_dump_pnode(c, pnode, parent, iip) ({}) +#define dbg_dump_tnc(c) ({}) +#define dbg_dump_index(c) ({}) +#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 #define dbg_old_index_check_init(c, zroot) 0 #define dbg_check_old_index(c, zroot) 0 - #define dbg_check_cats(c) 0 - #define dbg_check_ltab(c) 0 - #define dbg_check_synced_i_size(inode) 0 - #define dbg_check_dir_size(c, dir) 0 - #define dbg_check_tnc(c, x) 0 - #define dbg_check_idx_size(c, idx_size) 0 - #define dbg_check_filesystem(c) 0 - #define dbg_check_heap(c, heap, cat, add_pos) ({}) - #define dbg_check_lprops(c) 0 #define dbg_check_lpt_nodes(c, cnode, row, col) 0 - #define dbg_force_in_the_gaps_enabled 0 #define dbg_force_in_the_gaps() 0 - #define dbg_failure_mode 0 #define dbg_failure_mode_registration(c) ({}) #define dbg_failure_mode_deregistration(c) ({}) -- cgit v1.2.3 From 5acd6ff8ac09eb71f3aef2ccccefab658be8aff4 Mon Sep 17 00:00:00 2001 From: Zoltan Sogor Date: Tue, 12 Aug 2008 13:54:54 +0300 Subject: UBIFS: fix budgeting request alignment in xattr code Data length has to be aligned in the budgeting request. Code in xattr.c did not do this. 
Signed-off-by: Zoltan Sogor Signed-off-by: Artem Bityutskiy --- fs/ubifs/xattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 39e831d074ce..6f493dea561e 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -103,8 +103,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, struct inode *inode; struct ubifs_inode *ui, *host_ui = ubifs_inode(host); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = size, .dirtied_ino = 1, - .dirtied_ino_d = ALIGN(host_ui->data_len, 8)}; + .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) return -ENOSPC; @@ -200,7 +200,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_budget_req req = { .dirtied_ino = 2, - .dirtied_ino_d = size + host_ui->data_len }; + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; ubifs_assert(ui->data_len == inode->i_size); err = ubifs_budget_space(c, &req); @@ -497,8 +497,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, int err; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); - struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, - .dirtied_ino_d = host_ui->data_len }; + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; ubifs_assert(ui->data_len == inode->i_size); -- cgit v1.2.3 From 0a883a05c54b326bcf99c0902af28dae0386be0a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 13 Aug 2008 14:13:26 +0300 Subject: UBIFS: few commentary fixes Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 8 ++++---- fs/ubifs/ubifs.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff 
--git a/fs/ubifs/super.c b/fs/ubifs/super.c index 2c268a476413..f71e6b8822c4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -148,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable readahead */ + /* Disable read-ahead */ inode->i_mapping->backing_dev_info = &c->bdi; switch (inode->i_mode & S_IFMT) { @@ -344,7 +344,7 @@ static void ubifs_delete_inode(struct inode *inode) if (err) /* * Worst case we have a lost orphan inode wasting space, so a - * simple error message is ok here. + * simple error message is OK here. */ ubifs_err("can't delete inode %lu, error %d", inode->i_ino, err); @@ -1683,10 +1683,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) } /* - * UBIFS provids 'backing_dev_info' in order to disable readahead. For + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in readpage, * which means the user would have to wait not just for their own I/O - * but the readahead I/O as well i.e. completely pointless. + * but the read-ahead I/O as well i.e. completely pointless. * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f2dd749d7989..d7f706f7a302 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -20,8 +20,6 @@ * Adrian Hunter */ -/* Implementation version 0.7 */ - #ifndef __UBIFS_H__ #define __UBIFS_H__ -- cgit v1.2.3 From 720b499c806200d06f4f22c668d46db784117089 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 13 Aug 2008 16:16:31 +0300 Subject: UBIFS: remove unneeded check Commit d70b67c8bc72ee23b55381bd6a884f4796692f77 fixed VFS and it never calls FS lookup function in deleted directories now. We may remove corresponding UBIFS check. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/dir.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 0d1ab8967a4c..5c96f1fb7016 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -219,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); if (err) { - /* - * Do not hash the direntry if parent 'i_nlink' is zero, because - * this has side-effects - '->delete_inode()' call will not be - * called for the parent orphan inode, because 'd_count' of its - * direntry will stay 1 (it'll be negative direntry I guess) - * and prevent 'iput_final()' until the dentry is destroyed due - * to unmount or memory pressure. - */ - if (err == -ENOENT && dir->i_nlink != 0) { + if (err == -ENOENT) { dbg_gen("not found"); goto done; } -- cgit v1.2.3 From ad661334b8ae421154b121ee6ad3b56807adbf11 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Aug 2008 14:14:40 +0000 Subject: [CIFS] mount of IPC$ breaks with iget patch In looking at network named pipe support on cifs, I noticed that David Howells' iget patch: iget: stop CIFS from using iget() and read_inode() broke mounts to IPC$ (the interprocess communication share), and doesn't handle the error case (when getting info on the root inode fails). Thanks to Gunter who noted a typo in a debug line in the original version of this patch.
CC: David Howells CC: Gunter Kukkukk CC: Stable Kernel Signed-off-by: Steve French --- fs/cifs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 28a22092d450..848286861c31 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -649,6 +649,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; + } else if (rc) { _FreeXid(xid); iget_failed(inode); return ERR_PTR(rc); -- cgit v1.2.3 From c78c7e35a4709b55d3126624662c8f6d7e3d1a5e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 12 Aug 2008 16:30:12 +0300 Subject: UBIFS: xattr bugfixes Xattr code has not been tested for a while and there were several bugs. One of them is using wrong inode in 'ubifs_jnl_change_xattr()'. The other is a deadlock in 'ubifs_setxattr()': the i_mutex is locked in 'cap_inode_need_killpriv()' path, so deadlock happens when 'ubifs_setxattr()' tries to lock it again. Thanks to Zoltan Sogor for finding these bugs.
Signed-off-by: Artem Bityutskiy --- fs/ubifs/journal.c | 2 +- fs/ubifs/xattr.c | 44 +++++++++++++++++--------------------------- 2 files changed, 18 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index acdae00aaa54..22993f867d19 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1374,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, const struct inode *host) { int err, len1, len2, aligned_len, aligned_len1, lnum, offs; - struct ubifs_inode *host_ui = ubifs_inode(inode); + struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_ino_node *ino; union ubifs_key key; int sync = IS_DIRSYNC(host); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6f493dea561e..649bec78b645 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -61,7 +61,7 @@ /* * Limit the number of extended attributes per inode so that the total size - * (xattr_size) is guaranteeded to fit in an 'unsigned int'. + * (@xattr_size) is guaranteeded to fit in an 'unsigned int'. */ #define MAX_XATTRS_PER_INODE 65535 @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, return -ENOSPC; /* * Linux limits the maximum size of the extended attribute names list - * to %XATTR_LIST_MAX. This means we should not allow creating more* + * to %XATTR_LIST_MAX. This means we should not allow creating more * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. 
*/ @@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, goto out_budg; } - mutex_lock(&host_ui->ui_mutex); /* Re-define all operations to be "nothing" */ inode->i_mapping->a_ops = &none_address_operations; inode->i_op = &none_inode_operations; @@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data = kmalloc(size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; - goto out_unlock; + goto out_free; } - memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(size); host_ui->xattr_names += nm->len; - /* - * We do not use i_size_write() because nobody can race with us as we - * are holding host @host->i_mutex - every xattr operation for this - * inode is serialized by it. - */ - inode->i_size = ui->ui_size = size; - ui->data_len = size; err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) goto out_cancel; @@ -172,8 +167,8 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); -out_unlock: mutex_unlock(&host_ui->ui_mutex); +out_free: make_bad_inode(inode); iput(inode); out_budg: @@ -207,22 +202,21 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); - host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_size += CALC_XATTR_BYTES(size); - kfree(ui->data); ui->data = kmalloc(size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; - goto out_unlock; + goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= 
CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + /* * It is important to write the host inode after the xattr inode * because if the host inode gets synchronized (via 'fsync()'), then @@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); - make_bad_inode(inode); -out_unlock: mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: ubifs_release_budget(c, &req); return err; } @@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + ubifs_assert(mutex_is_locked(&host->i_mutex)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (!xent) return -ENOMEM; - mutex_lock(&host->i_mutex); xent_key_init(c, &key, host->i_ino, &nm); err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); if (err) { @@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, out_iput: iput(inode); out_unlock: - mutex_unlock(&host->i_mutex); kfree(xent); return err; } @@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) return -ERANGE; lowest_xent_key(c, &key, host->i_ino); - - mutex_lock(&host->i_mutex); while (1) { int type; @@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) pxent = xent; key_read(c, &xent->key, &key); } - mutex_unlock(&host->i_mutex); kfree(pxent); if (err != -ENOENT) { -- cgit v1.2.3 From 7d455e0030eeab820773e7786605be4d9e56a04b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Aug 2008 00:40:43 -0700 Subject: fs/inode.c: properly init address_space->writeback_index write_cache_pages() uses i_mapping->writeback_index to pick up where 
it left off the last time a given inode was found by pdflush or balance_dirty_pages (or anyone else who sets wbc->range_cyclic). alloc_inode() should set it to a sane value so that writeback doesn't start in the middle of a file. It is somewhat difficult to notice the bug since write_cache_pages will loop around to the start of the file and the elevator helps hide the resulting seeks. For whatever reason, Btrfs hits this often. Unpatched, untarring 30 copies of the Linux kernel in series runs at 47MB/s on a single SATA drive. With this fix, it jumps to 62MB/s. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index b6726f644530..0487ddba1397 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; + mapping->writeback_index = 0; /* * If the block_device provides a backing_dev_info for client -- cgit v1.2.3 From c963343a1150106819773e828c9b237ed977615b Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 15 Aug 2008 00:40:46 -0700 Subject: omfs: fix potential oops when directory size is corrupted Testing with a modified fsfuzzer reveals a couple of locations in omfs where filesystem variables are ultimately used as loop counters with insufficient sanity checking. In this case, dir->i_size is used to compute the number of buckets in the directory hash. If too large, readdir will overrun a buffer. Since it's an invariant that dir->i_size is equal to the sysblock size, and we already sanity check that, just use that value instead.
This fixes the following oops: BUG: unable to handle kernel paging request at c978e004 IP: [] omfs_readdir+0x18e/0x32f Oops: 0000 [#1] PREEMPT DEBUG_PAGEALLOC Modules linked in: Pid: 4796, comm: ls Not tainted (2.6.27-rc2 #12) EIP: 0060:[] EFLAGS: 00010287 CPU: 0 EIP is at omfs_readdir+0x18e/0x32f EAX: c978d000 EBX: 00000000 ECX: cbfcfaf8 EDX: cb2cf100 ESI: 00001000 EDI: 00000800 EBP: cb2d3f68 ESP: cb2d3f0c DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 Process ls (pid: 4796, ti=cb2d3000 task=cb175f40 task.ti=cb2d3000) Stack: 00000002 00000000 00000000 c018a820 cb2d3f94 cb2cf100 cbfb0000 ffffff10 cbfb3b80 cbfcfaf8 000001c9 00000a09 00000000 00000000 00000000 cbfcfbc8 c9697000 cbfb3b80 22222222 00001000 c08e6cd0 cb2cf100 cbfb3b80 cb2d3f88 Call Trace: [] ? filldir64+0x0/0xcd [] ? vfs_readdir+0x56/0x82 [] ? filldir64+0x0/0xcd [] ? sys_getdents64+0x5e/0xa0 [] ? sysenter_do_call+0x12/0x31 ======================= Code: 00 89 f0 89 f3 0f ac f8 14 81 e3 ff ff 0f 00 48 8d 14 c5 b8 01 00 00 89 45 cc 89 55 f0 e9 8c 01 00 00 8b 4d c8 8b 75 f0 8b 41 18 <8b> 54 30 04 8b 04 30 31 f6 89 5d dc 89 d1 8b 55 b8 0f c8 0f c9 Reported-by: Eric Sesterhenn Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index a95fe5984f4b..d29047b1b9b0 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -232,8 +232,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; - inode->i_size = be32_to_cpu(oi->i_head.h_body_size) + - sizeof(struct omfs_header); + inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case OMFS_FILE: -- cgit v1.2.3 From 9419fc1c957d600093baaea247fef23cca3b4e93 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 15 Aug 2008 00:40:47 -0700 Subject: omfs: fix oops when 
file metadata is corrupted A fuzzed filesystem image failed with OMFS when the extent count was used in a loop without being checked against the max number of extents. It also provoked a signed division for an array index that was checked as if unsigned, leading to index by -1. omfsck will be updated to fix these cases; in the meantime, bail out gracefully. Reported-by: Eric Sesterhenn Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/bitmap.c | 5 +++-- fs/omfs/file.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 697663b01bae..e1c0ec0ae989 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -92,7 +92,7 @@ int omfs_allocate_block(struct super_block *sb, u64 block) struct buffer_head *bh; struct omfs_sb_info *sbi = OMFS_SB(sb); int bits_per_entry = 8 * sb->s_blocksize; - int map, bit; + unsigned int map, bit; int ret = 0; u64 tmp; @@ -176,7 +176,8 @@ int omfs_clear_range(struct super_block *sb, u64 block, int count) struct omfs_sb_info *sbi = OMFS_SB(sb); int bits_per_entry = 8 * sb->s_blocksize; u64 tmp; - int map, bit, ret; + unsigned int map, bit; + int ret; tmp = block; bit = do_div(tmp, bits_per_entry); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 7e2499053e4d..834b2331f6b3 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -26,6 +26,13 @@ static int omfs_sync_file(struct file *file, struct dentry *dentry, return err ?
-EIO : 0; } +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ + return (sbi->s_sys_blocksize - offset - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; +} + void omfs_make_empty_table(struct buffer_head *bh, int offset) { struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; @@ -45,6 +52,7 @@ int omfs_shrink_inode(struct inode *inode) struct buffer_head *bh; u64 next, last; u32 extent_count; + u32 max_extents; int ret; /* traverse extent table, freeing each entry that is greater @@ -62,15 +70,18 @@ int omfs_shrink_inode(struct inode *inode) goto out; oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); for (;;) { - if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) { - brelse(bh); - goto out; - } + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; extent_count = be32_to_cpu(oe->e_extent_count); + + if (extent_count > max_extents) + goto out_brelse; + last = next; next = be64_to_cpu(oe->e_next); entry = &oe->e_entry; @@ -98,10 +109,14 @@ int omfs_shrink_inode(struct inode *inode) if (!bh) goto out; oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); } ret = 0; out: return ret; +out_brelse: + brelse(bh); + return ret; } static void omfs_truncate(struct inode *inode) @@ -154,9 +169,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, goto out; } } - max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START - - sizeof(struct omfs_extent)) / - sizeof(struct omfs_extent_entry) + 1; + max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); /* TODO: add a continuation block here */ if (be32_to_cpu(oe->e_extent_count) > max_count-1) @@ -225,6 +238,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, sector_t next, offset; int ret; u64 new_block; + u32 max_extents; int extent_count; struct omfs_extent *oe; 
struct omfs_extent_entry *entry; @@ -238,6 +252,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, goto out; oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); next = inode->i_ino; for (;;) { @@ -249,6 +264,9 @@ static int omfs_get_block(struct inode *inode, sector_t block, next = be64_to_cpu(oe->e_next); entry = &oe->e_entry; + if (extent_count > max_extents) + goto out_brelse; + offset = find_block(inode, entry, block, extent_count, &remain); if (offset > 0) { ret = 0; @@ -266,6 +284,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, if (!bh) goto out; oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); } if (create) { ret = omfs_grow_extent(inode, oe, &new_block); -- cgit v1.2.3 From aab3a8c7a3a6a001dd439ed00d4db17a1059803e Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 19 Aug 2008 14:23:37 +0000 Subject: [CIFS] reindent misindented statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: Steve French --- fs/cifs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 848286861c31..9c548f110102 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode, if ((inode->i_mode & S_IWUGO) == 0 && (attr & ATTR_READONLY) == 0) inode->i_mode |= (S_IWUGO & default_mode); - inode->i_mode &= ~S_IFMT; + + inode->i_mode &= ~S_IFMT; } /* clear write bits if ATTR_READONLY is set */ if (attr & ATTR_READONLY) -- cgit v1.2.3 From cb7691b648bddbfaf6dd8d8068273dbb18d2484c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Aug 2008 15:41:05 -0400 Subject: cifs: add local server pointer to cifs_setup_session cifs_setup_session references pSesInfo->server several times. 
That pointer shouldn't change during the life of the function so grab it once and store it in a local var. This makes the code look a little cleaner too. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0711db65afe8..4c13bcdb92a5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, char ntlm_session_key[CIFS_SESS_KEY_SIZE]; bool ntlmv2_flag = false; int first_time = 0; + struct TCP_Server_Info *server = pSesInfo->server; /* what if server changes its buffer size after dropping the session? */ - if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { + if (server->maxBuf == 0) /* no need to send on reconnect */ { rc = CIFSSMBNegotiate(xid, pSesInfo); - if (rc == -EAGAIN) /* retry only once on 1st time connection */ { + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ rc = CIFSSMBNegotiate(xid, pSesInfo); if (rc == -EAGAIN) rc = -EHOSTDOWN; } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (pSesInfo->server->tcpStatus != CifsExiting) - pSesInfo->server->tcpStatus = CifsGood; + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; spin_unlock(&GlobalMid_Lock); @@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, goto ss_err_exit; pSesInfo->flags = 0; - pSesInfo->capabilities = pSesInfo->server->capabilities; + pSesInfo->capabilities = server->capabilities; if (linuxExtEnabled == 0) pSesInfo->capabilities &= (~CAP_UNIX); /* pSesInfo->sequence_number = 0;*/ cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", - pSesInfo->server->secMode, - pSesInfo->server->capabilities, - pSesInfo->server->timeAdj)); + server->secMode, server->capabilities, server->timeAdj)); 
+ if (experimEnabled < 2) rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == NTLMSSP)) { + && (server->secType == NTLMSSP)) { rc = -EOPNOTSUPP; } else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == RawNTLMSSP)) { + && (server->secType == RawNTLMSSP)) { cFYI(1, ("NTLMSSP sesssetup")); rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, nls_info); @@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, } else { SMBNTencrypt(pSesInfo->password, - pSesInfo->server->cryptKey, + server->cryptKey, ntlm_session_key); if (first_time) cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, + &server->mac_signing_key, ntlm_session_key, pSesInfo->password); } @@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, nls_info); } } else { /* old style NTLM 0.12 session setup */ - SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, + SMBNTencrypt(pSesInfo->password, server->cryptKey, ntlm_session_key); if (first_time) - cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, - ntlm_session_key, pSesInfo->password); + cifs_calculate_mac_key(&server->mac_signing_key, + ntlm_session_key, + pSesInfo->password); rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); } -- cgit v1.2.3 From c16fefa56334e8d0197492607e473fdbb813073f Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Aug 2008 19:35:33 +0000 Subject: [CIFS] distinguish between Kerberos and MSKerberos in upcall Properly handle MSKRB5 by passing sec=mskrb5 to the upcall so that the SPNEGO blob can be generated appropriately. Also, make decode_negTokenInit prefer whichever mechanism is first in the list.
Needed for some NetApp servers, and possibly some older versions of Windows which treat the two KRB5 mechanisms differently. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/asn1.c | 11 ++++++++--- fs/cifs/cifs_spnego.c | 4 +++- fs/cifs/cifsglob.h | 3 ++- fs/cifs/sess.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 5fabd2caf93c..1b09f1670061 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, unsigned int cls, con, tag, oidlen, rc; bool use_ntlmssp = false; bool use_kerberos = false; + bool use_mskerberos = false; *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ @@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length, *(oid + 1), *(oid + 2), *(oid + 3))); if (compare_oid(oid, oidlen, MSKRB5_OID, - MSKRB5_OID_LEN)) - use_kerberos = true; + MSKRB5_OID_LEN) && + !use_kerberos) + use_mskerberos = true; else if (compare_oid(oid, oidlen, KRB5_OID, - KRB5_OID_LEN)) + KRB5_OID_LEN) && + !use_mskerberos) use_kerberos = true; else if (compare_oid(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) @@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (use_kerberos) *secType = Kerberos; + else if (use_mskerberos) + *secType = MSKerberos; else if (use_ntlmssp) *secType = NTLMSSP; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 2434ab0e8791..117ef4bba68e 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -114,9 +114,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); - /* for now, only sec=krb5 is valid */ + /* for now, only sec=krb5 and sec=mskrb5 are valid */ if (server->secType == Kerberos) sprintf(dp, ";sec=krb5"); + else if (server->secType == MSKerberos) + sprintf(dp, ";sec=mskrb5"); else goto out; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h 
index 7e1cf262effe..8dfd6f24d488 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,7 +80,8 @@ enum securityEnum { NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO */ NTLMSSP, /* NTLMSSP via SPNEGO */ - Kerberos /* Kerberos via SPNEGO */ + Kerberos, /* Kerberos via SPNEGO */ + MSKerberos, /* MS Kerberos via SPNEGO */ }; enum protocolEnum { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ed150efbe27c..3188e4d9cddb 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -505,7 +505,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); } else ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { + } else if (type == Kerberos || type == MSKerberos) { #ifdef CONFIG_CIFS_UPCALL struct cifs_spnego_msg *msg; spnego_key = cifs_get_spnego_key(ses); -- cgit v1.2.3 From 3d2af3465e91335bd1dbf36b19e92079d901409f Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Aug 2008 20:51:09 +0000 Subject: [CIFS] Kerberos support not considered experimental anymore Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/Kconfig | 1 - fs/cifs/README | 30 ++++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index d3873583360b..f0427105a619 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1984,7 +1984,6 @@ config CIFS_EXPERIMENTAL config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL depends on KEYS help Enables an upcall mechanism for CIFS which accesses diff --git a/fs/cifs/README b/fs/cifs/README index 2bd6fe556f88..68b5c1169d9d 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -642,8 +642,30 @@ The statistics for the number of total SMBs and oplock breaks are different in that they represent all for that share, not just those for which the server returned success. 
-Also note that "cat /proc/fs/cifs/DebugData" will display information about +Also note that "cat /proc/fs/cifs/DebugData" will display information about the active sessions and the shares that are mounted. -Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is -on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and -LANMAN support do not require this helper. + +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file. The cifs.upcall helper program is from the Samba +project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +Enabling DFS support (used to access shares transparently in an MS-DFS +global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. 
In +addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to ip address, and the user space helper must also +be configured in the file /etc/request-key.conf + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + -- cgit v1.2.3 From 5f22ca9b13551debea77a407a8d06cd9c6f15238 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 20 Aug 2008 08:31:19 -0700 Subject: vfat: fix 'sync' mount deadlock due to BKL->lock_super conversion There was another FAT BKL conversion deadlock reported by Bart Trojanowski due to the BKL being used as a recursive lock by FAT, which was missed because it only triggers with 'sync' (or 'dirsync') mounts. The recursion worked for the BKL, but after the conversion to lock_super (which uses a mutex), it just deadlocks. Thanks to Bart for debugging this and testing the fix. 
The lock debugging information from the original report: ============================================= [ INFO: possible recursive locking detected ] 2.6.27-rc3-bisect-00448-ga7f5aaf #16 --------------------------------------------- mv/4020 is trying to acquire lock: (&type->s_lock_key#9){--..}, at: [] lock_super+0x1e/0x20 but task is already holding lock: (&type->s_lock_key#9){--..}, at: [] lock_super+0x1e/0x20 other info that might help us debug this: 3 locks held by mv/4020: #0: (&sb->s_type->i_mutex_key#9/1){--..}, at: [] do_unlinkat+0x66/0x140 #1: (&sb->s_type->i_mutex_key#9){--..}, at: [] vfs_unlink+0x84/0x110 #2: (&type->s_lock_key#9){--..}, at: [] lock_super+0x1e/0x20 stack backtrace: Pid: 4020, comm: mv Not tainted 2.6.27-rc3-bisect-00448-ga7f5aaf #16 [] validate_chain+0x984/0xea0 [] ? native_sched_clock+0x0/0xf0 [] __lock_acquire+0x2ec/0x9b0 [] lock_acquire+0x6f/0x90 [] ? lock_super+0x1e/0x20 [] mutex_lock_nested+0xad/0x300 [] ? lock_super+0x1e/0x20 [] ? lock_super+0x1e/0x20 [] lock_super+0x1e/0x20 [] fat_write_inode+0x60/0x2b0 [fat] [] ? _spin_unlock_irqrestore+0x48/0x80 [] ? fat_sync_inode+0x3/0x20 [fat] [] fat_sync_inode+0x12/0x20 [fat] [] fat_remove_entries+0xbe/0x120 [fat] [] vfat_unlink+0x5f/0x90 [vfat] [] ? vfat_unlink+0x0/0x90 [vfat] [] vfs_unlink+0x98/0x110 [] do_unlinkat+0x130/0x140 [] ? audit_syscall_entry+0x105/0x150 [] sys_unlinkat+0x3b/0x40 [] sysenter_do_call+0x12/0x3f ======================= where the deadlock is due to the nesting of lock_super from vfat_unlink to fat_write_inode: - do_unlinkat - vfs_unlink - vfat_unlink * lock_super - fat_remove_entries - fat_sync_inode - fat_write_inode * lock_super and the fix is to simply remove the use of lock_super() in fat_write_inode. 
The lock_super() there had been just an automatic conversion of the kernel lock to the superblock lock, but no locking was actually needed there, since the code in fat_write_inode already protected all relevant accesses with a spinlock (sbi->inode_hash_lock to be exact). The only code inside the BKL (and thus the superblock lock) was accesses to local variables or calls to functions that have long been SMP-safe (i.e. sb_bread, mark_buffer_dirty and brelse). Bart reports: "Looks good. I ran 10 parallel processes creating 1M files truncating them, writing to them again and then deleting them. This patch fixes the issue I ran into. Signed-off-by: Bart Trojanowski " Reported-and-tested-by: Bart Trojanowski Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6d266d793e2c..80ff3381fa21 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -562,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait) struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; - int err = 0; + int err; retry: i_pos = MSDOS_I(inode)->i_pos; if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) return 0; - lock_super(sb); bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { printk(KERN_ERR "FAT: unable to read inode block " "for updating (i_pos %lld)\n", i_pos); - err = -EIO; - goto out; + return -EIO; } spin_lock(&sbi->inode_hash_lock); if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); - unlock_super(sb); goto retry; } @@ -607,11 +604,10 @@ retry: } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); + err = 0; if (wait) err = sync_dirty_buffer(bh); brelse(bh); -out: - unlock_super(sb); return err; } -- cgit v1.2.3 From 1804dc6e145f3f24a8c94deddfc0a986d380a27f Mon Sep 17 00:00:00 2001 From: Clement Calmels Date: Wed, 20 Aug 2008 14:09:00 -0700 Subject: /proc/self/maps doesn't display the real
file offset This addresses http://bugzilla.kernel.org/show_bug.cgi?id=11318 In function show_map (file: fs/proc/task_mmu.c), if vma->vm_pgoff > 2^20 then (vma->vm_pgoff << PAGE_SHIFT) is greater than 2^32 (with PAGE_SIZE equal to 4096, i.e. 2^12). The next seq_printf uses an unsigned long for the conversion of (vma->vm_pgoff << PAGE_SHIFT); as a result the offset value displayed in /proc/self/maps is truncated if the page offset is greater than 2^20. A test that shows this issue: #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define PAGE_SIZE (getpagesize()) #if __i386__ # define U64_STR "%llx" #elif __x86_64 # define U64_STR "%lx" #else # error "Architecture Unsupported" #endif int main(int argc, char *argv[]) { int fd; char *addr; off64_t offset = 0x10000000; char *filename = "/dev/zero"; fd = open(filename, O_RDONLY); if (fd < 0) { perror("open"); return 1; } offset *= 0x10; printf("offset = " U64_STR "\n", offset); addr = (char*)mmap64(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE, fd, offset); if ((void*)addr == MAP_FAILED) { perror("mmap64"); return 1; } { FILE *fmaps; char *line = NULL; size_t len = 0; ssize_t read; size_t filename_len = strlen(filename); fmaps = fopen("/proc/self/maps", "r"); if (!fmaps) { perror("fopen"); return 1; } while ((read = getline(&line, &len, fmaps)) != -1) { if ((read > filename_len + 1) && (strncmp(&line[read - filename_len - 1], filename, filename_len) == 0)) printf("%s", line); } if (line) free(line); fclose(fmaps); } close(fd); return 0; } [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Clement Calmels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/nommu.c | 4 ++-- fs/proc/task_mmu.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 79ecd281d2cb..3f87d2632947 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m,
struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7546a918f790..73d1891ee625 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -219,14 +219,14 @@ static int show_map(struct seq_file *m, void *v) ino = inode->i_ino; } - seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); /* -- cgit v1.2.3 From ff9bc512f198eb47204f55b24c6fe3d36ed89592 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 20 Aug 2008 14:09:10 -0700 Subject: binfmt_misc: fix false -ENOEXEC when coupled with other binary handlers In case the binfmt_misc binary handler is registered *before* the e.g. script one (when for example being compiled as a module) the following situation may occur: 1. user launches a script, whose interpreter is a misc binary; 2. the load_misc_binary sets the misc_bang and returns -ENOEXEC, since the binary is a script; 3. the load_script_binary loads one and calls for search_binary_handler to run the interpreter; 4. the load_misc_binary is called again, but refuses to load the binary due to misc_bang bit set. The fix is to move the misc_bang setting lower - prior to the actual call to the search_binary_handler.
Caused by the commit 3a2e7f47 (binfmt_misc.c: avoid potential kernel stack overflow) Signed-off-by: Pavel Emelyanov Reported-by: Kirill A. Shutemov Tested-by: Kirill A. Shutemov Cc: [2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 756205314c24..8d7e88e02e0f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (bprm->misc_bang) goto _ret; - bprm->misc_bang = 1; - /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); @@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; + bprm->misc_bang = 1; + retval = search_binary_handler (bprm, regs); if (retval < 0) goto _error; -- cgit v1.2.3 From 2d70b68d42b5196a48ccb639e3797f097ef5bea3 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Wed, 20 Aug 2008 14:09:17 -0700 Subject: fix setpriority(PRIO_PGRP) thread iterator breakage When user calls sys_setpriority(PRIO_PGRP ...) on a NPTL style multi-LWP process, only the task leader of the process is affected, all other sibling LWP threads didn't receive the setting. The problem was that the iterator used in sys_setpriority() only iterates over one task for each process, ignoring all other sibling threads. Introduce a new macro do_each_pid_thread / while_each_pid_thread to walk each thread of a process. Convert 4 call sites in {set/get}priority and ioprio_{set/get}.
Signed-off-by: Ken Chen Cc: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioprio.c | 8 ++++---- include/linux/pid.h | 9 +++++++++ kernel/sys.c | 8 ++++---- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index c4a1c3c65aac..da3cc460d4df 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) break; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; @@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) diff --git a/include/linux/pid.h b/include/linux/pid.h index 22921ac4cfd9..d7e98ff8021e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -161,4 +161,13 @@ pid_t pid_vnr(struct pid *pid); } \ } while (0) +#define do_each_pid_thread(pid, type, task) \ + do_each_pid_task(pid, type, task) { \ + struct task_struct *tg___ = task; \ + do { + +#define while_each_pid_thread(pid, type, task) \ + } while_each_thread(tg___, task); \ + task = tg___; \ + } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 3dacb00a7f76..038a7bc0901d 100644 --- 
a/kernel/sys.c +++ b/kernel/sys.c @@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; -- cgit v1.2.3 From 82d63fc9e30687c055b97928942b8893ea65b0bb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Aug 2008 14:09:24 -0700 Subject: cramfs: fix named-pipe handling After commit a97c9bf33f4612e2aed6f000f6b1d268b6814f3c (fix cramfs making duplicate entries in inode cache) in kernel 2.6.14, named-pipe on cramfs does not work properly. It seems the commit make all named-pipe on cramfs share their inode (and named-pipe buffer). Make ..._test() refuse to merge inodes with ->i_ino == 1, take inode setup back to get_cramfs_inode() and make ->drop_inode() evict ones with ->i_ino == 1 immediately. 
Reported-by: Atsushi Nemoto Cc: Al Viro Cc: [2.6.14 and later] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cramfs/inode.c | 84 +++++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0c3b618c15b3..f40423eb1a14 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex); static int cramfs_iget5_test(struct inode *inode, void *opaque) { struct cramfs_inode *cramfs_inode = opaque; - - if (inode->i_ino != CRAMINO(cramfs_inode)) - return 0; /* does not match */ - - if (inode->i_ino != 1) - return 1; - - /* all empty directories, char, block, pipe, and sock, share inode #1 */ - - if ((inode->i_mode != cramfs_inode->mode) || - (inode->i_gid != cramfs_inode->gid) || - (inode->i_uid != cramfs_inode->uid)) - return 0; /* does not match */ - - if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) && - (inode->i_rdev != old_decode_dev(cramfs_inode->size))) - return 0; /* does not match */ - - return 1; /* matches */ + return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1; } static int cramfs_iget5_set(struct inode *inode, void *opaque) { - static struct timespec zerotime; struct cramfs_inode *cramfs_inode = opaque; - inode->i_mode = cramfs_inode->mode; - inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; - inode->i_gid = cramfs_inode->gid; - /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; inode->i_ino = CRAMINO(cramfs_inode); - /* inode->i_nlink is left 1 - arguably wrong for directories, - but it's the best we can do without reading the directory - contents. 1 yields the right result in GNU find, even - without -noleaf option. 
*/ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; - inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } return 0; } @@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), cramfs_iget5_test, cramfs_iget5_set, cramfs_inode); + static struct timespec zerotime; + if (inode && (inode->i_state & I_NEW)) { + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + inode->i_gid = cramfs_inode->gid; + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. 
*/ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + } else { + inode->i_size = 0; + inode->i_blocks = 0; + init_special_inode(inode, inode->i_mode, + old_decode_dev(cramfs_inode->size)); + } unlock_new_inode(inode); } return inode; } +static void cramfs_drop_inode(struct inode *inode) +{ + if (inode->i_ino == 1) + generic_delete_inode(inode); + else + generic_drop_inode(inode); +} + /* * We have our own block cache: don't fill up the buffer cache * with the rom-image, because the way the filesystem is set @@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = { .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, + .drop_inode = cramfs_drop_inode, }; static int cramfs_get_sb(struct file_system_type *fs_type, -- cgit v1.2.3 From 18496e80f729be5f536d0315751b3bbb95ca913e Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Aug 2008 00:11:12 +0300 Subject: [PATCH] ocfs2/cluster/tcp.c: make some functions static Commit 0f475b2abed6cbccee1da20a0bef2895eb2a0edd (ocfs2/net: Silence build warnings) made sense as far as it fixed compile warnings, but it was not required that it made the functions global. 
Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.c | 44 ++++++++++++++++++++++++++++++++++------- fs/ocfs2/cluster/tcp_internal.h | 32 ------------------------------ 2 files changed, 37 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a27d61581bd6..2bcf706d9dd3 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); #ifdef CONFIG_DEBUG_FS -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) { INIT_LIST_HEAD(&nst->st_net_debug_item); nst->st_task = task; @@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_sock_time); } -void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_send_time); } -void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_status_time); } -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { nst->st_sc = sc; } -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { nst->st_id = msg_id; } + +#else /* CONFIG_DEBUG_FS */ + +static inline void o2net_init_nst(struct 
o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ +} + #endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 18307ff81b77..8d58cfe410b1 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,42 +224,10 @@ struct o2net_send_tracking { struct timeval st_send_time; struct timeval st_status_time; }; - -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node); -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); -void o2net_set_nst_send_time(struct o2net_send_tracking *nst); -void o2net_set_nst_status_time(struct o2net_send_tracking *nst); -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc); -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); - #else struct o2net_send_tracking { u32 dummy; }; - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ -} -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container 
*sc) -{ -} -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) -{ -} #endif /* CONFIG_DEBUG_FS */ #endif /* O2CLUSTER_TCP_INTERNAL_H */ -- cgit v1.2.3 From a57a874b04e27cb530a0e18c244387452e73ccce Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Wed, 6 Aug 2008 00:50:41 +0400 Subject: [PATCH] ocfs2/cluster/netdebug.c: fix warning ocfs2/cluster/netdebug.c: fix warning fs/ocfs2/cluster/netdebug.c:154: warning: format '%lu' expects type 'long unsigned int', but argument 17 has type 'suseconds_t' Signed-off-by: Alexander Beregalov Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/netdebug.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index d8bfa0eb41b2..52276c02f710 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v) " message id: %d\n" " message type: %u\n" " message key: 0x%08x\n" - " sock acquiry: %lu.%lu\n" - " send start: %lu.%lu\n" - " wait start: %lu.%lu\n", + " sock acquiry: %lu.%ld\n" + " send start: %lu.%ld\n" + " wait start: %lu.%ld\n", nst, (unsigned long)nst->st_task->pid, (unsigned long)nst->st_task->tgid, nst->st_task->comm, nst->st_node, nst->st_sc, nst->st_id, nst->st_msg_type, nst->st_msg_key, nst->st_sock_time.tv_sec, - (unsigned long)nst->st_sock_time.tv_usec, + (long)nst->st_sock_time.tv_usec, nst->st_send_time.tv_sec, - (unsigned long)nst->st_send_time.tv_usec, + (long)nst->st_send_time.tv_usec, nst->st_status_time.tv_sec, - nst->st_status_time.tv_usec); + (long)nst->st_status_time.tv_usec); } spin_unlock(&o2net_debug_lock); @@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec 
static int sc_seq_show(struct seq_file *seq, void *v) { @@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v) " remote node: %s\n" " page off: %zu\n" " handshake ok: %u\n" - " timer: %lu.%lu\n" - " data ready: %lu.%lu\n" - " advance start: %lu.%lu\n" - " advance stop: %lu.%lu\n" - " func start: %lu.%lu\n" - " func stop: %lu.%lu\n" + " timer: %lu.%ld\n" + " data ready: %lu.%ld\n" + " advance start: %lu.%ld\n" + " advance stop: %lu.%ld\n" + " func start: %lu.%ld\n" + " func stop: %lu.%ld\n" " func key: %u\n" " func type: %u\n", sc, -- cgit v1.2.3 From a1af7d15a18d1e375b0a6fee93789a0bbfe088b4 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 19 Aug 2008 17:20:28 -0700 Subject: ocfs2: Fix sleep-with-spinlock recovery regression This fixes a bug introduced with 539d8264093560b917ee3afe4c7f74e5da09d6a5: [PATCH 2/2] ocfs2: Fix race between mount and recovery ocfs2_mark_dead_nodes() was reading journal inodes while holding the spinlock protecting our in-memory recovery state. The fix is very simple - the disk state is protected by a cluster lock that's already held, so we just move the spinlock down past the read. Reviewed-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7a37240f7a31..c47bc2a809c2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1418,13 +1418,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { unsigned int node_num; int status, i; + u32 gen; struct buffer_head *bh = NULL; struct ocfs2_dinode *di; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. 
*/ - spin_lock(&osb->osb_lock); for (i = 0; i < osb->max_slots; i++) { /* Read journal inode to get the recovery generation */ status = ocfs2_read_journal_inode(osb, i, &bh, NULL); @@ -1433,23 +1433,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) goto bail; } di = (struct ocfs2_dinode *)bh->b_data; - osb->slot_recovery_generations[i] = - ocfs2_get_recovery_generation(di); + gen = ocfs2_get_recovery_generation(di); brelse(bh); bh = NULL; + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + mlog(0, "Slot %u recovery generation is %u\n", i, osb->slot_recovery_generations[i]); - if (i == osb->slot_num) + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); continue; + } status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); - if (status == -ENOENT) + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); continue; + } - if (__ocfs2_recovery_map_test(osb, node_num)) + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); continue; + } spin_unlock(&osb->osb_lock); /* Ok, we have a slot occupied by another node which @@ -1465,10 +1473,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) mlog_errno(status); goto bail; } - - spin_lock(&osb->osb_lock); } - spin_unlock(&osb->osb_lock); status = 0; bail: -- cgit v1.2.3 From 83cab5338fa8c74f979223698c8d4cc88f2ab68e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 21 Aug 2008 14:14:27 +0800 Subject: ocfs2: Jump to correct label in ocfs2_expand_inline_dir() When we fail to insert extent in ocfs2_expand_inline_dir(), we should go to out_commit, not out. 
Signed-off-by: Tao Ma Signed-off-by: Mark Fasheh --- fs/ocfs2/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8a1875848080..8e9c4a47d819 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1310,7 +1310,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } ret = ocfs2_journal_dirty(handle, di_bh); @@ -1336,7 +1336,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, len, 0, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } } -- cgit v1.2.3 From 9780eb6cfaf7d2d5ccc061eaf94e7aec6a17791e Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 5 Aug 2008 11:32:46 -0700 Subject: ocfs2: correctly set i_blocks after inline dir gets expanded We were setting i_blocks based on allocation before the extent insert, which is wrong as the value is a calculation based on ip_clusters which gets updated as a result of the insert. This patch moves the line in question to just after the call to ocfs2_insert_extent(). Without this fix, inline directories were temporarily having an i_blocks value of zero immediately after expansion to extents. 
Reported-and-tested-by: Tristan Ye Signed-off-by: Mark Fasheh --- fs/ocfs2/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8e9c4a47d819..9cce563fd627 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1300,7 +1300,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); - dir->i_blocks = ocfs2_inode_sector_count(dir); /* * This should never fail as our extent list is empty and all @@ -1313,6 +1312,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + ret = ocfs2_journal_dirty(handle, di_bh); if (ret) { mlog_errno(ret); -- cgit v1.2.3 From de6bf18e9ce0df807dab08cff08751cac383429d Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Fri, 15 Aug 2008 12:37:23 -0700 Subject: [PATCH] configfs: Consolidate locking around configfs_detach_prep() in configfs_rmdir() It appears that configfs_rmdir() can protect configfs_detach_prep() retries with fewer calls to {spin,mutex}_{lock,unlock}, and cleaner code. This patch does not change any behavior, except that it removes two useless lock/unlock pairs having nothing inside to protect and providing a useless barrier.
Signed-off-by: Louis Rilling Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7a8db78a91d2..8e93341f3e82 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) * Ensure that no racing symlink() will make detach_prep() fail while * the new link is temporarily attached */ - mutex_lock(&configfs_symlink_mutex); - spin_lock(&configfs_dirent_lock); do { struct mutex *wait_mutex; + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); ret = configfs_detach_prep(dentry, &wait_mutex); - if (ret) { + if (ret) configfs_detach_rollback(dentry); - spin_unlock(&configfs_dirent_lock); - mutex_unlock(&configfs_symlink_mutex); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { if (ret != -EAGAIN) { config_item_put(parent_item); return ret; @@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) /* Wait until the racing operation terminates */ mutex_lock(wait_mutex); mutex_unlock(wait_mutex); - - mutex_lock(&configfs_symlink_mutex); - spin_lock(&configfs_dirent_lock); } } while (ret == -EAGAIN); - spin_unlock(&configfs_dirent_lock); - mutex_unlock(&configfs_symlink_mutex); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); -- cgit v1.2.3 From 7a8fc9b248e77a4eab0613acf30a6811799786b3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 17 Aug 2008 17:36:59 +0300 Subject: removed unused #include <linux/version.h>'s This patch lets the files using linux/version.h match the files that #include it.
Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- arch/arm/plat-omap/clock.c | 1 - arch/cris/arch-v32/kernel/fasttimer.c | 2 -- arch/mn10300/kernel/mn10300-serial.c | 1 - arch/powerpc/sysdev/bestcomm/gen_bd.c | 1 - arch/x86/mach-rdc321x/platform.c | 1 - drivers/atm/adummy.c | 1 - drivers/char/xilinx_hwicap/buffer_icap.h | 1 - drivers/char/xilinx_hwicap/fifo_icap.h | 1 - drivers/char/xilinx_hwicap/xilinx_hwicap.h | 1 - drivers/edac/edac_core.h | 1 - drivers/i2c/busses/i2c-at91.c | 1 - drivers/infiniband/hw/ehca/ehca_tools.h | 1 - drivers/infiniband/hw/ipath/ipath_fs.c | 1 - drivers/infiniband/hw/nes/nes.h | 1 - drivers/infiniband/ulp/iser/iser_verbs.c | 1 - drivers/input/keyboard/bf54x-keys.c | 1 - drivers/input/touchscreen/mainstone-wm97xx.c | 1 - drivers/mfd/asic3.c | 1 - drivers/misc/eeprom_93cx6.c | 1 - drivers/mtd/maps/amd76xrom.c | 1 - drivers/mtd/maps/ck804xrom.c | 1 - drivers/mtd/maps/esb2rom.c | 1 - drivers/mtd/nand/au1550nd.c | 1 - drivers/net/myri10ge/myri10ge.c | 1 - drivers/net/netxen/netxen_nic.h | 1 - drivers/net/netxen/netxen_nic_ethtool.c | 1 - drivers/net/netxen/netxen_nic_hdr.h | 2 -- drivers/net/tokenring/lanstreamer.c | 1 - drivers/net/tokenring/lanstreamer.h | 2 -- drivers/net/wireless/b43legacy/main.c | 1 - drivers/net/wireless/iwlwifi/iwl-3945-led.c | 1 - drivers/net/wireless/iwlwifi/iwl-led.c | 1 - drivers/net/wireless/iwlwifi/iwl-rfkill.c | 1 - drivers/rtc/rtc-max6902.c | 2 -- drivers/rtc/rtc-r9701.c | 1 - drivers/s390/net/ctcm_mpc.c | 1 - drivers/scsi/dpt/dpti_i2o.h | 1 - drivers/scsi/ips.c | 1 - drivers/scsi/ips.h | 1 - drivers/scsi/lpfc/lpfc_debugfs.c | 1 - drivers/scsi/nsp32.c | 1 - drivers/scsi/nsp32.h | 1 - drivers/scsi/pcmcia/nsp_cs.c | 1 - drivers/scsi/qla2xxx/qla_mid.c | 1 - drivers/usb/atm/ueagle-atm.c | 1 - drivers/usb/gadget/amd5536udc.c | 1 - drivers/usb/gadget/s3c2410_udc.c | 1 - drivers/usb/misc/iowarrior.c | 1 - drivers/usb/serial/garmin_gps.c | 2 -- drivers/video/arkfb.c | 1 - drivers/video/s3fb.c | 1 - 
drivers/video/vermilion/vermilion.h | 1 - drivers/video/vt8623fb.c | 1 - drivers/video/xilinxfb.c | 1 - fs/jffs2/jffs2_fs_i.h | 1 - fs/xfs/xfs_dmapi.h | 1 - include/asm-x86/xen/hypervisor.h | 1 - include/linux/fs_uart_pd.h | 1 - kernel/nsproxy.c | 1 - kernel/power/swap.c | 1 - kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - kernel/utsname_sysctl.c | 1 - sound/mips/au1x00.c | 1 - sound/soc/at91/eti_b1_wm8731.c | 1 - sound/soc/codecs/wm8753.c | 1 - sound/soc/codecs/wm9712.c | 1 - 67 files changed, 72 deletions(-) (limited to 'fs') diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c index 23a070599993..197974defbe4 100644 --- a/arch/arm/plat-omap/clock.c +++ b/arch/arm/plat-omap/clock.c @@ -10,7 +10,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/fasttimer.c b/arch/cris/arch-v32/kernel/fasttimer.c index 2de9d5849ef0..111caa1a2efb 100644 --- a/arch/cris/arch-v32/kernel/fasttimer.c +++ b/arch/cris/arch-v32/kernel/fasttimer.c @@ -19,8 +19,6 @@ #include #include -#include - #include #include #include diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c index 8b054e7a8ae8..aa07d0cd1905 100644 --- a/arch/mn10300/kernel/mn10300-serial.c +++ b/arch/mn10300/kernel/mn10300-serial.c @@ -17,7 +17,6 @@ static const char serial_revdate[] = "2007-11-06"; #define SUPPORT_SYSRQ #endif -#include #include #include #include diff --git a/arch/powerpc/sysdev/bestcomm/gen_bd.c b/arch/powerpc/sysdev/bestcomm/gen_bd.c index a3a134c35b0a..e0a53e3147b2 100644 --- a/arch/powerpc/sysdev/bestcomm/gen_bd.c +++ b/arch/powerpc/sysdev/bestcomm/gen_bd.c @@ -11,7 +11,6 @@ * */ -#include #include #include #include diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c index a037041817c7..4f4e50c3ad3b 100644 --- a/arch/x86/mach-rdc321x/platform.c +++ 
b/arch/x86/mach-rdc321x/platform.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/drivers/atm/adummy.c b/drivers/atm/adummy.c index 2ebd07f2ef81..5effec6f5458 100644 --- a/drivers/atm/adummy.c +++ b/drivers/atm/adummy.c @@ -3,7 +3,6 @@ */ #include -#include #include #include #include diff --git a/drivers/char/xilinx_hwicap/buffer_icap.h b/drivers/char/xilinx_hwicap/buffer_icap.h index c5b1840906b2..8b0252bf06e2 100644 --- a/drivers/char/xilinx_hwicap/buffer_icap.h +++ b/drivers/char/xilinx_hwicap/buffer_icap.h @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/char/xilinx_hwicap/fifo_icap.h b/drivers/char/xilinx_hwicap/fifo_icap.h index ffabd3ba2bd8..62bda453c90b 100644 --- a/drivers/char/xilinx_hwicap/fifo_icap.h +++ b/drivers/char/xilinx_hwicap/fifo_icap.h @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.h b/drivers/char/xilinx_hwicap/xilinx_hwicap.h index 1f9c8b082dbe..24d0d9b938fb 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.h +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.h @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index b27b13c5eb5a..4b55ec607a88 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -34,7 +34,6 @@ #include #include #include -#include #define EDAC_MC_LABEL_LEN 31 #define EDAC_DEVICE_NAME_LEN 31 diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c index c1adcdbf7979..9efb02137254 100644 --- a/drivers/i2c/busses/i2c-at91.c +++ b/drivers/i2c/busses/i2c-at91.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h index ec950bf8c479..21f7d06f14ad 100644 --- a/drivers/infiniband/hw/ehca/ehca_tools.h +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -54,7 +54,6 @@ #include 
#include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 23faba9d21eb..8bb5170b4e41 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 39bd897b40c6..8eb7ae96974d 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -43,7 +43,6 @@ #include #include #include -#include #include #include diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 63462ecca147..26ff6214a81f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "iscsi_iser.h" diff --git a/drivers/input/keyboard/bf54x-keys.c b/drivers/input/keyboard/bf54x-keys.c index 54ed8e2e1c02..6f227d3dbda1 100644 --- a/drivers/input/keyboard/bf54x-keys.c +++ b/drivers/input/keyboard/bf54x-keys.c @@ -29,7 +29,6 @@ */ #include -#include #include #include diff --git a/drivers/input/touchscreen/mainstone-wm97xx.c b/drivers/input/touchscreen/mainstone-wm97xx.c index 283f93a0cee2..37a555f37306 100644 --- a/drivers/input/touchscreen/mainstone-wm97xx.c +++ b/drivers/input/touchscreen/mainstone-wm97xx.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index c6408a62d95e..bc2a807f210d 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -16,7 +16,6 @@ * */ -#include #include #include #include diff --git a/drivers/misc/eeprom_93cx6.c b/drivers/misc/eeprom_93cx6.c index ea55654e5948..15b1780025c8 100644 --- a/drivers/misc/eeprom_93cx6.c +++ b/drivers/misc/eeprom_93cx6.c @@ -26,7 +26,6 @@ #include #include -#include #include #include diff --git 
a/drivers/mtd/maps/amd76xrom.c b/drivers/mtd/maps/amd76xrom.c index 948b86f35ef4..d1eec7d3243f 100644 --- a/drivers/mtd/maps/amd76xrom.c +++ b/drivers/mtd/maps/amd76xrom.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mtd/maps/ck804xrom.c b/drivers/mtd/maps/ck804xrom.c index effaf7cdefab..1a6feb4474de 100644 --- a/drivers/mtd/maps/ck804xrom.c +++ b/drivers/mtd/maps/ck804xrom.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mtd/maps/esb2rom.c b/drivers/mtd/maps/esb2rom.c index aa64a4752781..bbbcdd4c8d13 100644 --- a/drivers/mtd/maps/esb2rom.c +++ b/drivers/mtd/maps/esb2rom.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c index 761946ea45b1..92c334ff4508 100644 --- a/drivers/mtd/nand/au1550nd.c +++ b/drivers/mtd/nand/au1550nd.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 5d76cd09e246..54cd89cb0838 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h index ab871df6b1db..244ab49c4337 100644 --- a/drivers/net/netxen/netxen_nic.h +++ b/drivers/net/netxen/netxen_nic.h @@ -45,7 +45,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/netxen/netxen_nic_ethtool.c b/drivers/net/netxen/netxen_nic_ethtool.c index 4ad3e0844b99..b974ca0fc530 100644 --- a/drivers/net/netxen/netxen_nic_ethtool.c +++ b/drivers/net/netxen/netxen_nic_ethtool.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "netxen_nic.h" #include "netxen_nic_hw.h" diff --git a/drivers/net/netxen/netxen_nic_hdr.h b/drivers/net/netxen/netxen_nic_hdr.h index e8e8d73f6ed7..e80f9e3e5973 
100644 --- a/drivers/net/netxen/netxen_nic_hdr.h +++ b/drivers/net/netxen/netxen_nic_hdr.h @@ -32,8 +32,6 @@ #include #include -#include - #include #include #include diff --git a/drivers/net/tokenring/lanstreamer.c b/drivers/net/tokenring/lanstreamer.c index 47d84cd28097..59d1673f9387 100644 --- a/drivers/net/tokenring/lanstreamer.c +++ b/drivers/net/tokenring/lanstreamer.c @@ -119,7 +119,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/tokenring/lanstreamer.h b/drivers/net/tokenring/lanstreamer.h index e7bb3494afc7..13ccee6449c1 100644 --- a/drivers/net/tokenring/lanstreamer.h +++ b/drivers/net/tokenring/lanstreamer.h @@ -60,8 +60,6 @@ * */ -#include - /* MAX_INTR - the maximum number of times we can loop * inside the interrupt function before returning * control to the OS (maximum value is 256) diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c index 2541c81932f0..1cb77db5c292 100644 --- a/drivers/net/wireless/b43legacy/main.c +++ b/drivers/net/wireless/b43legacy/main.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-3945-led.c b/drivers/net/wireless/iwlwifi/iwl-3945-led.c index d3336966b6b5..705c65bed9fd 100644 --- a/drivers/net/wireless/iwlwifi/iwl-3945-led.c +++ b/drivers/net/wireless/iwlwifi/iwl-3945-led.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-led.c b/drivers/net/wireless/iwlwifi/iwl-led.c index cb11c4a4d691..4eee1b163cd2 100644 --- a/drivers/net/wireless/iwlwifi/iwl-led.c +++ b/drivers/net/wireless/iwlwifi/iwl-led.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-rfkill.c b/drivers/net/wireless/iwlwifi/iwl-rfkill.c index e5e5846e9f25..5d642298f04c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-rfkill.c +++ b/drivers/net/wireless/iwlwifi/iwl-rfkill.c @@ 
-27,7 +27,6 @@ *****************************************************************************/ #include #include -#include #include #include diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index 12f0310ae89c..78b2551fb19d 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -20,8 +20,6 @@ */ #include -#include - #include #include #include diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c index b35f9bfa2af4..395985b339c9 100644 --- a/drivers/rtc/rtc-r9701.c +++ b/drivers/rtc/rtc-r9701.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index 49ae1cd25caa..2de1e2fccbf9 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -19,7 +19,6 @@ #undef DEBUGDATA #undef DEBUGCCW -#include #include #include #include diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index 19406cea6d6a..179ad77f6cc9 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -21,7 +21,6 @@ #include -#include #include #include diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 7c615c70ec5c..bc9e6ddf41df 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -165,7 +165,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index e0657b6f009c..4e49fbcfe8af 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -50,7 +50,6 @@ #ifndef _IPS_H_ #define _IPS_H_ -#include #include #include #include diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 90272e65957a..094b47e94b29 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index edf9fdb3cb3c..22052bb7becb 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c @@ -23,7 
+23,6 @@ * 1.2: PowerPC (big endian) support. */ -#include #include #include #include diff --git a/drivers/scsi/nsp32.h b/drivers/scsi/nsp32.h index 6715ecb3bfca..9565acf1aa72 100644 --- a/drivers/scsi/nsp32.h +++ b/drivers/scsi/nsp32.h @@ -16,7 +16,6 @@ #ifndef _NSP32_H #define _NSP32_H -#include //#define NSP32_DEBUG 9 /* diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c index a221b6ef9fa9..24e6cb8396e3 100644 --- a/drivers/scsi/pcmcia/nsp_cs.c +++ b/drivers/scsi/pcmcia/nsp_cs.c @@ -25,7 +25,6 @@ ***********************************************************************/ -#include #include #include #include diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 50baf6a1d67c..93560cd72784 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -6,7 +6,6 @@ */ #include "qla_def.h" -#include #include #include #include diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index cb01b5106efd..b6483dd98acc 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index 1500e1b3c302..abf8192f89e8 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c index 538807384592..29d13ebe7500 100644 --- a/drivers/usb/gadget/s3c2410_udc.c +++ b/drivers/usb/gadget/s3c2410_udc.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index e6ca9979e3ae..a4ef77ef917d 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -19,7 +19,6 @@ #include #include #include -#include #include /* Version Information */ diff --git 
a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 2e663f1afd5e..d95382088075 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -38,8 +38,6 @@ #include #include -#include - /* the mode to be set when the port ist opened */ static int initial_mode = 1; diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c index 4bd569e479a7..314d18694b6a 100644 --- a/drivers/video/arkfb.c +++ b/drivers/video/arkfb.c @@ -11,7 +11,6 @@ * Code is based on s3fb */ -#include #include #include #include diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c index 8361bd0e3df1..4dcec48a1d78 100644 --- a/drivers/video/s3fb.c +++ b/drivers/video/s3fb.c @@ -11,7 +11,6 @@ * which is based on the code of neofb. */ -#include #include #include #include diff --git a/drivers/video/vermilion/vermilion.h b/drivers/video/vermilion/vermilion.h index c4aba59d4809..7491abfcf1fc 100644 --- a/drivers/video/vermilion/vermilion.h +++ b/drivers/video/vermilion/vermilion.h @@ -30,7 +30,6 @@ #define _VERMILION_H_ #include -#include #include #include #include diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c index 34aae7a2a62b..3df17dc8c3d7 100644 --- a/drivers/video/vt8623fb.c +++ b/drivers/video/vt8623fb.c @@ -12,7 +12,6 @@ * (http://davesdomain.org.uk/viafb/) */ -#include #include #include #include diff --git a/drivers/video/xilinxfb.c b/drivers/video/xilinxfb.c index 7b3a8423f485..5da3d2423cc0 100644 --- a/drivers/video/xilinxfb.c +++ b/drivers/video/xilinxfb.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f45fdde..4c41db91eaa4 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -12,7 +12,6 @@ #ifndef _JFFS2_FS_I #define _JFFS2_FS_I -#include #include #include #include diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index cdc2d3464a1a..2813cdd72375 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ 
-18,7 +18,6 @@ #ifndef __XFS_DMAPI_H__ #define __XFS_DMAPI_H__ -#include /* Values used to define the on-disk version of dm_attrname_t. All * on-disk attribute names start with the 8-byte string "SGI_DMI_". * diff --git a/include/asm-x86/xen/hypervisor.h b/include/asm-x86/xen/hypervisor.h index 8e15dd28c91f..04ee0610014a 100644 --- a/include/asm-x86/xen/hypervisor.h +++ b/include/asm-x86/xen/hypervisor.h @@ -35,7 +35,6 @@ #include #include -#include #include #include diff --git a/include/linux/fs_uart_pd.h b/include/linux/fs_uart_pd.h index 809bb9ffc788..36b61ff39277 100644 --- a/include/linux/fs_uart_pd.h +++ b/include/linux/fs_uart_pd.h @@ -12,7 +12,6 @@ #ifndef FS_UART_PD_H #define FS_UART_PD_H -#include #include enum fs_uart_id { diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 21575fc46d05..1d3ef29a2583 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,6 @@ */ #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0abf9a463f9..80ccac849e46 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9ab0596de44..532858fa5b88 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -6,7 +6,6 @@ */ #include -#include #include #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index 64d398f12444..815237a55af8 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index fe3a56c2256d..4ab9659d269e 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include static void *get_uts(ctl_table *table, int write) diff --git a/sound/mips/au1x00.c b/sound/mips/au1x00.c index ee0741f9eb53..fbef38a9604a 100644 --- 
a/sound/mips/au1x00.c +++ b/sound/mips/au1x00.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/soc/at91/eti_b1_wm8731.c b/sound/soc/at91/eti_b1_wm8731.c index b081e83766b7..b81d6b2cfa1d 100644 --- a/sound/soc/at91/eti_b1_wm8731.c +++ b/sound/soc/at91/eti_b1_wm8731.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/sound/soc/codecs/wm8753.c b/sound/soc/codecs/wm8753.c index 8604809f0c36..dc7b18fd2782 100644 --- a/sound/soc/codecs/wm8753.c +++ b/sound/soc/codecs/wm8753.c @@ -34,7 +34,6 @@ #include #include -#include #include #include #include diff --git a/sound/soc/codecs/wm9712.c b/sound/soc/codecs/wm9712.c index 1fb7f9a7aecd..2f1c91b1d556 100644 --- a/sound/soc/codecs/wm9712.c +++ b/sound/soc/codecs/wm9712.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From cc996099174dc05b35b7a29301026987990e7f8c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 2 Aug 2008 07:30:48 +0400 Subject: [PATCH] proc: inode number fixlet Ouch, if number taken from IDA is too big, the intent was to signal an error, not check for overflow and still do overflowing addition. One still needs 2^28 proc entries to notice this. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Al Viro --- fs/proc/generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4fb81e9c94e3..bca0f81eb687 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -330,6 +330,7 @@ retry: spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); + return 0; } return PROC_DYNAMIC_FIRST + i; } -- cgit v1.2.3 From 2d8a10cd1760e7ecc07a21e409485947c68a3291 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 11 Aug 2008 11:33:57 -0400 Subject: [PATCH] fix efs_lookup() it needs to use d_splice_alias(), not d_add() Signed-off-by: Al Viro --- fs/efs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 3a404e7fad53..291abb11e20e 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei } unlock_kernel(); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, -- cgit v1.2.3 From e45b590b976465c258f3e2a6cc84573fc19e16d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Aug 2008 23:49:07 +0200 Subject: [PATCH] change d_add_ci argument ordering As pointed out during review d_add_ci argument order should match d_add, so switch the dentry and inode arguments. 
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/dcache.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 2 +- include/linux/dcache.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 101663d15e9f..80e93956aced 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ -struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry, +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { int error; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 91bcd979242c..095d271f3434 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -355,7 +355,7 @@ xfs_vn_ci_lookup( /* else case-insensitive match... */ dname.name = ci_name.name; dname.len = ci_name.len; - dentry = d_add_ci(VFS_I(ip), dentry, &dname); + dentry = d_add_ci(dentry, VFS_I(ip), &dname); kmem_free(ci_name.name); return dentry; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 07aa198f19ed..efba1de629ac 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -230,7 +230,7 @@ extern void d_delete(struct dentry *); extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct inode *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); -extern struct dentry * d_add_ci(struct inode *, struct dentry *, struct qstr *); +extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void shrink_dcache_for_umount(struct super_block *); -- cgit v1.2.3 From 2690421743b03c9be05d8e44c3b827986d1329a7 Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig Date: Thu, 7 Aug 2008 23:50:21 +0200 Subject: [PATCH] ntfs: use d_add_ci d_add_ci was lifted 1:1 from ntfs. Change ntfs to use the common version. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ntfs/namei.c | 89 ++------------------------------------------------------- 1 file changed, 2 insertions(+), 87 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index e1781c8b1650..9e8a95be7a1e 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, // TODO: Consider moving this lot to a separate function! (AIA) handle_name: { - struct dentry *real_dent, *new_dent; MFT_RECORD *m; ntfs_attr_search_ctx *ctx; ntfs_inode *ni = NTFS_I(dent_inode); @@ -255,93 +254,9 @@ handle_name: } nls_name.hash = full_name_hash(nls_name.name, nls_name.len); - /* - * Note: No need for dent->d_lock lock as i_mutex is held on the - * parent inode. - */ - - /* Does a dentry matching the nls_name exist already? */ - real_dent = d_lookup(dent->d_parent, &nls_name); - /* If not, create it now. */ - if (!real_dent) { - real_dent = d_alloc(dent->d_parent, &nls_name); - kfree(nls_name.name); - if (!real_dent) { - err = -ENOMEM; - goto err_out; - } - new_dent = d_splice_alias(dent_inode, real_dent); - if (new_dent) - dput(real_dent); - else - new_dent = real_dent; - ntfs_debug("Done. (Created new dentry.)"); - return new_dent; - } + dent = d_add_ci(dent, dent_inode, &nls_name); kfree(nls_name.name); - /* Matching dentry exists, check if it is negative. */ - if (real_dent->d_inode) { - if (unlikely(real_dent->d_inode != dent_inode)) { - /* This can happen because bad inodes are unhashed. */ - BUG_ON(!is_bad_inode(dent_inode)); - BUG_ON(!is_bad_inode(real_dent->d_inode)); - } - /* - * Already have the inode and the dentry attached, decrement - * the reference count to balance the ntfs_iget() we did - * earlier on. 
We found the dentry using d_lookup() so it - * cannot be disconnected and thus we do not need to worry - * about any NFS/disconnectedness issues here. - */ - iput(dent_inode); - ntfs_debug("Done. (Already had inode and dentry.)"); - return real_dent; - } - /* - * Negative dentry: instantiate it unless the inode is a directory and - * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), - * in which case d_move() that in place of the found dentry. - */ - if (!S_ISDIR(dent_inode->i_mode)) { - /* Not a directory; everything is easy. */ - d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative file dentry.)"); - return real_dent; - } - spin_lock(&dcache_lock); - if (list_empty(&dent_inode->i_dentry)) { - /* - * Directory without a 'disconnected' dentry; we need to do - * d_instantiate() by hand because it takes dcache_lock which - * we already hold. - */ - list_add(&real_dent->d_alias, &dent_inode->i_dentry); - real_dent->d_inode = dent_inode; - spin_unlock(&dcache_lock); - security_d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative directory dentry.)"); - return real_dent; - } - /* - * Directory with a 'disconnected' dentry; get a reference to the - * 'disconnected' dentry. - */ - new_dent = list_entry(dent_inode->i_dentry.next, struct dentry, - d_alias); - dget_locked(new_dent); - spin_unlock(&dcache_lock); - /* Do security vodoo. */ - security_d_instantiate(real_dent, dent_inode); - /* Move new_dent in place of real_dent. */ - d_move(new_dent, real_dent); - /* Balance the ntfs_iget() we did above. */ - iput(dent_inode); - /* Throw away real_dent. */ - dput(real_dent); - /* Use new_dent as the actual dentry. */ - ntfs_debug("Done. (Already had negative, disconnected directory " - "dentry.)"); - return new_dent; + return dent; eio_err_out: ntfs_error(vol->sb, "Illegal file name attribute. 
Run chkdsk."); -- cgit v1.2.3 From 8f3f655da7288504c1013621090ecc940173ae1c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 12 Aug 2008 00:28:24 -0400 Subject: [PATCH] fix regular readdir() and friends Handling of -EOVERFLOW. Signed-off-by: Al Viro --- fs/compat.c | 8 ++++++-- fs/readdir.c | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index c9d1472e65c5..075d0509970d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen, if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) diff --git a/fs/readdir.c b/fs/readdir.c index 4e026e5407fb..93a7559bbfd8 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = 
-EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) -- cgit v1.2.3 From 59af1584bf33810639cb98d79856021253e2177c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2008 07:24:41 -0400 Subject: [PATCH] fix ->llseek() for a bunch of directories Signed-off-by: Al Viro --- fs/9p/vfs_dir.c | 1 + fs/adfs/dir.c | 1 + fs/affs/dir.c | 1 + fs/autofs4/root.c | 2 ++ fs/befs/linuxvfs.c | 1 + fs/xfs/linux-2.6/xfs_file.c | 1 + 6 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 88e3787c6ea9..e298fe194093 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index fc1a8dc64d78..85a30e929800 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,6 +197,7 @@ out: const struct file_operations adfs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = adfs_readdir, .fsync = file_fsync, }; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 6e3f282424b0..7b36904dbeac 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t); const struct file_operations affs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = affs_readdir, .fsync = file_fsync, }; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bcfb2dc0a61b..2a41c2a7fc52 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = { .release = dcache_dir_close, .read = generic_read_dir, .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, .ioctl = autofs4_root_ioctl, }; @@ -44,6 +45,7 @@ const struct 
file_operations autofs4_dir_operations = { .release = dcache_dir_close, .read = generic_read_dir, .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, }; const struct inode_operations autofs4_indirect_root_inode_operations = { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 02c6e62b72f8..740f53672a8a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, .readdir = befs_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations befs_dir_inode_operations = { diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5f60363b9343..5311c1acdd40 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = { const struct file_operations xfs_dir_file_operations = { .read = generic_read_dir, .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, -- cgit v1.2.3 From 4cdfe84b51420c9ac95c7133da2d4c8a191094af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2008 07:45:33 -0400 Subject: [PATCH] deal with the first call of ->show() generating no output seq_read() has a subtle bug - we want the first loop there to go until at least one *non-empty* record had fit entirely into buffer. 
Signed-off-by: Al Viro --- fs/seq_file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 5d54205e486b..bd20f7f5a933 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Done; } /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); while (1) { - pos = m->index; - p = m->op->start(m, &pos); err = PTR_ERR(p); if (!p || IS_ERR(p)) break; @@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) break; if (unlikely(err)) m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } if (m->count < m->size) goto Fill; m->op->stop(m, p); @@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Enomem; m->count = 0; m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); } m->op->stop(m, p); m->count = 0; -- cgit v1.2.3 From d6817cdbd143f87f9d7c59a4c3194091190eeb84 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 22 Aug 2008 14:30:10 -0700 Subject: ocfs2: Increment the reference count of an already-active stack. The ocfs2_stack_driver_request() function failed to increment the refcount of an already-active stack. It only did the increment on the first reference. Whoops. 
Signed-off-by: Joel Becker Tested-by: Marcos Matsunaga Signed-off-by: Mark Fasheh --- fs/ocfs2/stackglue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 10e149ae5e3a..07f348b8d721 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name, goto out; } - /* Ok, the stack is pinned */ - p->sp_count++; active_stack = p; - rc = 0; out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + spin_unlock(&ocfs2_stack_lock); return rc; } -- cgit v1.2.3 From 6ce5eecb9cd3ac97b952c50309b87c31488a45e9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 26 Aug 2008 00:37:14 +0000 Subject: [CIFS] check version in spnego upcall response Currently, we don't check the version in the SPNEGO upcall response even though one is provided. Jeff and Q have made the corresponding change to the Samba client (cifs.upcall). Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/CHANGES | 6 +++++- fs/cifs/cifs_spnego.h | 2 +- fs/cifs/sess.c | 9 +++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f5d0083e09fa..526041a52d35 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -4,7 +4,11 @@ Fix premature write failure on congested networks (we would give up on EAGAIN from the socket too quickly on large writes). Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. +architectures. Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba). 
Version 1.53 ------------ diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 05a34b17a1ab..e4041ec4d712 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -23,7 +23,7 @@ #ifndef _CIFS_SPNEGO_H #define _CIFS_SPNEGO_H -#define CIFS_SPNEGO_UPCALL_VERSION 1 +#define CIFS_SPNEGO_UPCALL_VERSION 2 /* * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3188e4d9cddb..b537fad3bf50 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -516,6 +516,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, } msg = spnego_key->payload.data; + /* check version field to make sure that cifs.upcall is + sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cERROR(1, ("incorrect version of cifs.upcall (expected" + " %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version)); + rc = -EKEYREJECTED; + goto ssetup_exit; + } /* bail out if key is too long */ if (msg->sesskey_len > sizeof(ses->server->mac_signing_key.data.krb5)) { -- cgit v1.2.3 From e9775843ecb039318dbc9ded6da9c762bff28a0b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 26 Aug 2008 18:22:50 +0000 Subject: [CIFS] Correct keys dependency for cifs kerberos support Must also depend on CIFS ... 
Signed-off-by: Steve French --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index f0427105a619..3fab3901e0ef 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1984,7 +1984,7 @@ config CIFS_EXPERIMENTAL config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" - depends on KEYS + depends on CIFS && KEYS help Enables an upcall mechanism for CIFS which accesses userspace helper utilities to provide SPNEGO packaged (RFC 4178) -- cgit v1.2.3 From 96c2a1137b9e00bcdbe3a95113ea8f42ca994f76 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 26 Aug 2008 18:32:28 +0000 Subject: [CIFS] Reorder cifs config item for better clarity Signed-off-by: Steve French --- fs/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 3fab3901e0ef..abccb5dab9a8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1930,6 +1930,16 @@ config CIFS_WEAK_PW_HASH If unsure, say N. +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + config CIFS_XATTR bool "CIFS extended attributes" depends on CIFS @@ -1982,16 +1992,6 @@ config CIFS_EXPERIMENTAL (which is disabled by default). See the file fs/cifs/README for more details. If unsure, say N. -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). 
If - unsure, say N. - config CIFS_DFS_UPCALL bool "DFS feature support (EXPERIMENTAL)" depends on CIFS_EXPERIMENTAL -- cgit v1.2.3 From 48fd4f93a00eac844678629f2f00518e146ed30d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 Aug 2008 10:00:36 +0200 Subject: block: submit_bh() inadvertently discards barrier flag on a sync write Reported by Milan Broz , commit 18ce3751 inadvertently made submit_bh() discard the barrier bit for a WRITE_SYNC request. Fix that up. Signed-off-by: Jens Axboe --- fs/buffer.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 38653e36e225..ac78d4c19b3b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); - if (buffer_ordered(bh) && (rw == WRITE)) - rw = WRITE_BARRIER; + /* + * Mask in barrier bit for a write (could be either a WRITE or a + * WRITE_SYNC + */ + if (buffer_ordered(bh) && (rw & WRITE)) + rw |= WRITE_BARRIER; /* - * Only clear out a write error when rewriting, should this - * include WRITE_SYNC as well? + * Only clear out a write error when rewriting */ - if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) + if (test_set_buffer_req(bh) && (rw & WRITE)) clear_buffer_write_io_error(bh); /* -- cgit v1.2.3 From 76029ff37f31dad64641489c610d98955217bb68 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 25 Aug 2008 20:36:08 +0200 Subject: bio: fix bio_copy_kern() handling of bio->bv_len The commit 68154e90c9d1492d570671ae181d9a8f8530da55 introduced bio_copy_kern() to add bounce support to blk_rq_map_kern. bio_copy_kern() uses bio->bv_len to copy data for READ commands after the completion but it doesn't work with a request that partially completed. SCSI always completes a PC request as a whole but seems some don't. This patch fixes bio_copy_kern to handle the above case. 
As bio_copy_user does, bio_copy_kern uses struct bio_map_data to store struct bio_vec. Signed-off-by: FUJITA Tomonori Reported-by: Nix Tested-by: Nix Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/bio.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 8000e2fa16cb..8b1f5ee6f83c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -469,20 +469,21 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, + gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); + struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); + bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); if (!bmd->iovecs) { kfree(bmd); return NULL; } - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); if (bmd->sgvecs) return bmd; @@ -596,7 +597,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, len += iov[i].iov_len; } - bmd = bio_alloc_map_data(nr_pages, iov_count); + bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL); if (!bmd) return ERR_PTR(-ENOMEM); @@ -942,19 +943,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; - char *p = bio->bi_private; + struct bio_map_data *bmd = bio->bi_private; int i; + char *p = bmd->sgvecs[0].iov_base; __bio_for_each_segment(bvec, bio, i, 0) { char *addr = page_address(bvec->bv_page); + int len = bmd->iovecs[i].bv_len; if (read && !err) - memcpy(p, addr, bvec->bv_len); + memcpy(p, addr, len); __free_page(bvec->bv_page); - p += bvec->bv_len; + p += len; } + 
bio_free_map_data(bmd); bio_put(bio); } @@ -978,11 +982,21 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, const int nr_pages = end - start; struct bio *bio; struct bio_vec *bvec; + struct bio_map_data *bmd; int i, ret; + struct sg_iovec iov; + + iov.iov_base = data; + iov.iov_len = len; + + bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + ret = -ENOMEM; bio = bio_alloc(gfp_mask, nr_pages); if (!bio) - return ERR_PTR(-ENOMEM); + goto out_bmd; while (len) { struct page *page; @@ -1016,14 +1030,18 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, } } - bio->bi_private = data; + bio->bi_private = bmd; bio->bi_end_io = bio_copy_kern_endio; + + bio_set_map_data(bmd, bio, &iov, 1); return bio; cleanup: bio_for_each_segment(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); +out_bmd: + bio_free_map_data(bmd); return ERR_PTR(ret); } -- cgit v1.2.3 From aefcc28a3a63ac33a298777aa50ba43641c75241 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 25 Aug 2008 20:36:08 +0200 Subject: bio: fix __bio_copy_iov() handling of bio->bv_len The commit c5dec1c3034f1ae3503efbf641ff3b0273b64797 introduced __bio_copy_iov() to add bounce support to blk_rq_map_user_iov. __bio_copy_iov() uses bio->bv_len to copy data for READ commands after the completion but it doesn't work with a request that partially completed. SCSI always completes a PC request as a whole but seems some don't. 
Signed-off-by: FUJITA Tomonori Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/bio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 8b1f5ee6f83c..3cba7ae34d75 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -492,8 +492,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, return NULL; } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, - int uncopy) +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, + struct sg_iovec *iov, int iov_count, int uncopy) { int ret = 0, i; struct bio_vec *bvec; @@ -503,7 +503,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, __bio_for_each_segment(bvec, bio, i, 0) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; + unsigned int bv_len = iovecs[i].bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -555,7 +555,7 @@ int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret; - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1); bio_free_map_data(bmd); bio_put(bio); @@ -634,7 +634,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, * success */ if (!write_to_vm) { - ret = __bio_copy_iov(bio, iov, iov_count, 0); + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0); if (ret) goto cleanup; } -- cgit v1.2.3 From 87ed1d65fb536a0cd4e84874c0b038f953e448aa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 Aug 2008 17:53:30 +0000 Subject: [CIFS] Add destroy routine for dns_resolver Otherwise, we're leaking the payload memory. 
CC: Stable Kernel Acked-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 ++- fs/cifs/dns_resolve.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 526041a52d35..f9e4ad97a79e 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -8,7 +8,8 @@ architectures. Fix problems with preserving timestamps on copying open files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit on parent directory when server supports Unix Extensions but not POSIX create. Update cifs.upcall version to handle new Kerberos sec flags -(this requires update of cifs.upcall program from Samba). +(this requires update of cifs.upcall program from Samba). Fix memory leak +on dns_upcall (resolving DFS referralls). Version 1.53 ------------ diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index f730ef35499e..a2e0673e1b08 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -47,11 +47,18 @@ static int dns_resolver_instantiate(struct key *key, const void *data, return rc; } +static void +dns_resolver_destroy(struct key *key) +{ + kfree(key->payload.data); +} + struct key_type key_type_dns_resolver = { .name = "dns_resolver", .def_datalen = sizeof(struct in_addr), .describe = user_describe, .instantiate = dns_resolver_instantiate, + .destroy = dns_resolver_destroy, .match = user_match, }; -- cgit v1.2.3 From bcc55c6664a90146149ba0fd93052adc94287b9f Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 27 Aug 2008 21:30:22 +0000 Subject: [CIFS] Fix plaintext authentication The last eight bytes of the password field were not cleared when doing lanman plaintext password authentication. This patch fixes that. I tested it with Samba by setting password encryption to no in the server's smb.conf. Other servers also can be configured to force plaintext authentication. 
Note that plaintext authentication requires setting /proc/fs/cifs/SecurityFlags to 0x30030 on the client (enabling both LANMAN and also plaintext password support). Also note that LANMAN support (and thus plaintext password support) requires CONFIG_CIFS_WEAK_PW_HASH to be enabled in menuconfig. CC: Jeff Layton CC: Stable Kernel Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 83fd40dc1ef0..bd5f13d38450 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) if (extended_security & CIFSSEC_MAY_PLNTXT) { + memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); return; -- cgit v1.2.3 From 838726c4756813576078203eb7e1e219db0da870 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 28 Aug 2008 07:54:59 -0400 Subject: cifs: fix O_APPEND on directio mounts The direct I/O write codepath for CIFS is done through cifs_user_write(). That function does not currently call generic_write_checks() so the file position isn't being properly set when the file is opened with O_APPEND. It's also not doing the other "normal" checks that should be done for a write call. The problem is currently that when you open a file with O_APPEND on a mount with the directio mount option, the file position is set to the beginning of the file. This makes any subsequent writes clobber the data in the file starting at the beginning. This seems to fix the problem in cursory testing. It is, however important to note that NFS disallows the combination of (O_DIRECT|O_APPEND). If my understanding is correct, the concern is races with multiple clients appending to a file clobbering each others' data.
Since the write model for CIFS and NFS is pretty similar in this regard, CIFS is probably subject to the same sort of races. What's unclear to me is why this is a particular problem with O_DIRECT and not with buffered writes... Regardless, disallowing O_APPEND on an entire mount is probably not reasonable, so we'll probably just have to deal with it and reevaluate this flag combination when we get proper support for O_DIRECT. In the meantime this patch at least fixes the existing problem. Signed-off-by: Jeff Layton Cc: Stable Tree Signed-off-by: Steve French --- fs/cifs/file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ff14d14903a0..cbefe1f1f9fe 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, return -EBADF; open_file = (struct cifsFileInfo *) file->private_data; + rc = generic_write_checks(file, poffset, &write_size, 0); + if (rc) + return rc; + xid = GetXid(); if (*poffset > file->f_path.dentry->d_inode->i_size) -- cgit v1.2.3 From 2e655021b8d50b5d90ce442f3de6bf3667729910 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 28 Aug 2008 15:30:06 +0000 Subject: [CIFS] update cifs change log Signed-off-by: Steve French --- fs/cifs/CHANGES | 5 ++++- fs/cifs/README | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f9e4ad97a79e..06e521a945c3 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -9,7 +9,10 @@ files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit on parent directory when server supports Unix Extensions but not POSIX create. Update cifs.upcall version to handle new Kerberos sec flags (this requires update of cifs.upcall program from Samba). Fix memory leak -on dns_upcall (resolving DFS referralls). +on dns_upcall (resolving DFS referralls). 
Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though). Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcediretio) mount. Version 1.53 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 68b5c1169d9d..bd2343d4c6a6 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and hashing mechanisms (as "must use") on the other hand does not make much sense. Default flags are 0x07007 - (NTLM, NTLMv2 and packet signing allowed). Maximum + (NTLM, NTLMv2 and packet signing allowed). The maximum allowable flags if you want to allow mounts to servers using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed): + plaintext, ntlm, ntlmv2, signing allowed). Some + SecurityFlags require the corresponding menuconfig + options to be enabled (lanman and plaintext require + CONFIG_CIFS_WEAK_PW_HASH for example). Enabling + plaintext authentication currently requires also + enabling lanman authentication in the security flags + because the cifs module only supports sending + laintext passwords using the older lanman dialect + form of the session setup SMB. (e.g. for authentication + using plain text passwords, set the SecurityFlags + to 0x30030): may use packet signing 0x00001 must use packet signing 0x01001 -- cgit v1.2.3 From c76da9da1fffa6de263486df54950eb328d58f71 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 28 Aug 2008 15:32:22 +0000 Subject: [CIFS] Turn off Unicode during session establishment for plaintext authentication LANMAN session setup did not support Unicode (after session setup, unicode can still be used though). 
Fixes samba bug# 5319 CC: Jeff Layton CC: Stable Kernel Signed-off-by: Steve French --- fs/cifs/sess.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index b537fad3bf50..252fdc0567f1 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, #ifdef CONFIG_CIFS_WEAK_PW_HASH char lnm_session_key[CIFS_SESS_KEY_SIZE]; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + /* no capabilities flags in old lanman negotiation */ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); -- cgit v1.2.3 From c228c24bf1138d4757dbe20615df655815446da3 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 21 Aug 2008 08:42:16 -0400 Subject: nfsd: fix compound state allocation error handling Move the cstate_alloc call so that if it fails, the response is setup to encode the NFS error. The out label now means that the nfsd4_compound_state has not been allocated. Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2e51adac65de..e5b51ffafc6c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, int slack_bytes; __be32 status; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; - resp->xbuf = &rqstp->rq_res; resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; @@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) goto out; + status = nfserr_resource; + cstate = cstate_alloc(); + if (cstate == NULL) + goto out; + status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; @@ -957,9 +957,9 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + cstate_free(cstate); out: nfsd4_release_compoundargs(args); - cstate_free(cstate); dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } -- cgit v1.2.3 From 91b80969ba466ba4b915a4a1d03add8c297add3f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 29 Aug 2008 19:18:45 -0400 Subject: nfsd: fix buffer overrun decoding NFSv4 acl The array we kmalloc() here is not large enough. Thanks to Johann Dahm and David Richter for bug report and testing. Signed-off-by: J. 
Bruce Fields Cc: David Richter Tested-by: Johann Dahm --- fs/nfsd/nfs4acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index b6ed38380ab8..54b8b4140c8f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt) * enough space for either: */ alloc = sizeof(struct posix_ace_state_array) - + cnt*sizeof(struct posix_ace_state); + + cnt*sizeof(struct posix_user_ace_state); state->users = kzalloc(alloc, GFP_KERNEL); if (!state->users) return -ENOMEM; -- cgit v1.2.3 From 169ccbd44eb20f5bb7e4352451eba25397e29749 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 2 Sep 2008 14:35:37 -0700 Subject: NTFS: update homepage Update the location of the NTFS homepage in several files. Signed-off-by: Adrian Bunk Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/ntfs.txt | 4 ++-- MAINTAINERS | 2 +- fs/ntfs/usnjrnl.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index e79ee2db183a..ac2a261c5f7d 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt @@ -40,7 +40,7 @@ Web site ======== There is plenty of additional information on the linux-ntfs web site -at http://linux-ntfs.sourceforge.net/ +at http://www.linux-ntfs.org/ The web site has a lot of additional information, such as a comprehensive FAQ, documentation on the NTFS on-disk format, information on the Linux-NTFS @@ -272,7 +272,7 @@ And you would know that /dev/hda2 has a size of 37768814 - 4209030 + 1 = For Win2k and later dynamic disks, you can for example use the ldminfo utility which is part of the Linux LDM tools (the latest version at the time of writing is linux-ldm-0.0.8.tar.bz2). 
You can download it from: - http://linux-ntfs.sourceforge.net/downloads.html + http://www.linux-ntfs.org/ Simply extract the downloaded archive (tar xvjf linux-ldm-0.0.8.tar.bz2), go into it (cd linux-ldm-0.0.8) and change to the test directory (cd test). You will find the precompiled (i386) ldminfo utility there. NOTE: You will not be diff --git a/MAINTAINERS b/MAINTAINERS index c4ca99cf80df..be83f3424f3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3051,7 +3051,7 @@ P: Anton Altaparmakov M: aia21@cantab.net L: linux-ntfs-dev@lists.sourceforge.net L: linux-kernel@vger.kernel.org -W: http://linux-ntfs.sf.net/ +W: http://www.linux-ntfs.org/ T: git kernel.org:/pub/scm/linux/kernel/git/aia21/ntfs-2.6.git S: Maintained diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 3a8af75351e8..4087fbdac327 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -113,7 +113,7 @@ typedef struct { * Reason flags (32-bit). Cumulative flags describing the change(s) to the * file since it was last opened. I think the names speak for themselves but * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * documentation: http://www.linux-ntfs.org/ */ enum { USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), @@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS; * Source info flags (32-bit). Information about the source of the change(s) * to the file. For detailed descriptions of what these mean, see the Linux * NTFS project NTFS documentation: - * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * http://www.linux-ntfs.org/ */ enum { USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), -- cgit v1.2.3 From 4b8561521dbaa3d766b198496b220e984e3bf756 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 2 Sep 2008 14:35:53 -0700 Subject: mm: show quicklist usage in /proc/meminfo Quicklists can consume several GB of memory. 
We should provide a means of monitoring this. After this patch is applied, /proc/meminfo will output the following: % cat /proc/meminfo MemTotal: 7715392 kB MemFree: 5401600 kB Buffers: 80384 kB Cached: 300800 kB SwapCached: 0 kB Active: 235584 kB Inactive: 262656 kB SwapTotal: 2031488 kB SwapFree: 2031488 kB Dirty: 3520 kB Writeback: 0 kB AnonPages: 117696 kB Mapped: 38528 kB Slab: 1589952 kB SReclaimable: 23104 kB SUnreclaim: 1566848 kB PageTables: 14656 kB NFS_Unstable: 0 kB Bounce: 0 kB WritebackTmp: 0 kB CommitLimit: 5889152 kB Committed_AS: 393152 kB VmallocTotal: 17592177655808 kB VmallocUsed: 29056 kB VmallocChunk: 17592177626432 kB Quicklists: 130944 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 262144 kB Signed-off-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Keiichiro Tokunaga Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 7 +++++-- include/linux/quicklist.h | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ded969862960..00f10a2dcf12 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -189,7 +190,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "VmallocChunk: %8lu kB\n" + "Quicklists: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), @@ -221,7 +223,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(committed), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, - vmi.largest_chunk >> 10 + vmi.largest_chunk >> 10, + K(quicklist_total_size()) ); len += hugetlb_report_meminfo(page + len); diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h index 39b66713a0bb..bd466439c588 100644 --- a/include/linux/quicklist.h +++ 
b/include/linux/quicklist.h @@ -80,6 +80,13 @@ void quicklist_trim(int nr, void (*dtor)(void *), unsigned long quicklist_total_size(void); +#else + +static inline unsigned long quicklist_total_size(void) +{ + return 0; +} + #endif #endif /* LINUX_QUICKLIST_H */ -- cgit v1.2.3