diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-06 17:17:13 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-06 17:17:13 -0800 |
commit | 27eb427bdc0960ad64b72da03e3596c801e7a9e9 (patch) | |
tree | 4170a265e99d455ca53d26a19e59330e3277fccd /fs/btrfs/file.c | |
parent | 713009809681e5a7871e96e6992692c805b4480b (diff) | |
parent | 2959a32a858a2c44bbbce83d19c158d54cc5998a (diff) |
Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
"We have a lot of subvolume quota improvements in here, along with big
piles of cleanups from Dave Sterba and Anand Jain and others.
Josef pitched in a batch of allocator fixes based on production use
here at FB. We found that mount -o ssd_spread greatly improved our
performance on hardware raid5/6, but it exposed some CPU bottlenecks
in the allocator. These patches make a huge difference"
* 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits)
Btrfs: fix hole punching when using the no-holes feature
Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state
btrfs: Fix a data space underflow warning
btrfs: qgroup: Fix a rebase bug which will cause qgroup double free
btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
btrfs: clear PF_NOFREEZE in cleaner_kthread()
btrfs: qgroup: Don't copy extent buffer to do qgroup rescan
btrfs: add balance filters limits, stripes and usage to supported mask
btrfs: extend balance filter usage to take minimum and maximum
btrfs: add balance filter for stripes
btrfs: extend balance filter limit to take minimum and maximum
btrfs: fix use after free iterating extrefs
btrfs: check unsupported filters in balance arguments
Btrfs: fix regression running delayed references when using qgroups
Btrfs: fix regression when running delayed references
Btrfs: don't do extra bitmap search in one bit case
Btrfs: keep track of largest extent in bitmaps
Btrfs: don't keep trying to build clusters if we are fragmented
Btrfs: cut down on loops through the allocator
Btrfs: don't continue setting up space cache when enospc
...
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 228 |
1 files changed, 162 insertions, 66 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8c6f247ba81d..6bd5ce9d75f0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -847,7 +847,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 1); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -925,7 +925,7 @@ delete_extent_item: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1204,7 +1204,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 1); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1231,7 +1231,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1248,7 +1248,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { @@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, u64 release_bytes = 0; u64 lockstart; u64 lockend; - unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; @@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!pages) return -ENOMEM; - first_index = pos >> PAGE_CACHE_SHIFT; - while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(i), @@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); - if (ret == -ENOSPC && - (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC))) { + + if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) { ret = check_can_nocow(inode, pos, &write_bytes); + if (ret < 0) + break; if (ret > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ only_release_metadata = true; /* * our prealloc extent may be smaller than @@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = 0; - } else { - ret = -ENOSPC; + goto reserve_metadata; } } - - if (ret) + ret = btrfs_check_data_free_space(inode, pos, write_bytes); + if (ret < 0) break; +reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, - reserve_bytes); + btrfs_free_reserved_data_space(inode, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1603,12 +1604,17 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - if (only_release_metadata) + if (only_release_metadata) { btrfs_delalloc_release_metadata(inode, release_bytes); - else - btrfs_delalloc_release_space(inode, + } else { + u64 __pos; + + __pos = round_down(pos, root->sectorsize) + + (dirty_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, __pos, release_bytes); + } } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; @@ -1660,7 +1666,7 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, release_bytes); + btrfs_delalloc_release_space(inode, pos, release_bytes); } } @@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - int rsv_count; + unsigned int rsv_count; bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; @@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= lockend) + drop_end = lockend + 1; + /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). @@ -2541,17 +2560,61 @@ out_only_mutex: return err; } +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *prev = NULL; + struct falloc_range *range = NULL; + + if (list_empty(head)) + goto insert; + + /* + * As fallocate iterate by bytenr order, we only need to check + * the last range. + */ + prev = list_entry(head->prev, struct falloc_range, list); + if (prev->start + prev->len == start) { + prev->len += len; + return 0; + } +insert: + range = kmalloc(sizeof(*range), GFP_NOFS); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct falloc_range *range; + struct falloc_range *tmp; + struct list_head reserve_list; u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; + u64 actual_end = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; @@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode, return btrfs_punch_hole(inode, offset, len); /* - * Make sure we have enough space before we do the - * allocation. + * Only trigger disk allocation, don't trigger qgroup reserve + * + * For qgroup space, it will be checked later. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); - if (ret) + ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + if (ret < 0) return ret; mutex_lock(&inode->i_mutex); @@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. + */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, i_size_read(inode), alloc_start); @@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode, } } + /* First, check if we exceed the qgroup limit */ + INIT_LIST_HEAD(&reserve_list); cur_offset = alloc_start; while (1) { - u64 actual_end; - em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { @@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - } else if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, - NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans, root); - else - ret = btrfs_end_transaction(trans, - root); + ret = add_falloc_range(&reserve_list, cur_offset, + last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); + break; } + ret = btrfs_qgroup_reserve_data(inode, cur_offset, + last_byte - cur_offset); + if (ret < 0) + break; } free_extent_map(em); - if (ret < 0) - break; - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; + if (cur_offset >= alloc_end) break; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, 1 << inode->i_blkbits, + offset + len, &alloc_hint); + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, root); } } +out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); out: + /* + * As we waited the extent range, the data_rsv_map must be empty + * in the range, as written data range will be released from it. + * And for prealloacted extent, it will also be released when + * its metadata is written. + * So this is completely used as cleanup. + */ + btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } |