summaryrefslogtreecommitdiff
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-06 17:17:13 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-06 17:17:13 -0800
commit27eb427bdc0960ad64b72da03e3596c801e7a9e9 (patch)
tree4170a265e99d455ca53d26a19e59330e3277fccd /fs/btrfs/file.c
parent713009809681e5a7871e96e6992692c805b4480b (diff)
parent2959a32a858a2c44bbbce83d19c158d54cc5998a (diff)
Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason: "We have a lot of subvolume quota improvements in here, along with big piles of cleanups from Dave Sterba and Anand Jain and others. Josef pitched in a batch of allocator fixes based on production use here at FB. We found that mount -o ssd_spread greatly improved our performance on hardware raid5/6, but it exposed some CPU bottlenecks in the allocator. These patches make a huge difference" * 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits) Btrfs: fix hole punching when using the no-holes feature Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state btrfs: Fix a data space underflow warning btrfs: qgroup: Fix a rebase bug which will cause qgroup double free btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans btrfs: clear PF_NOFREEZE in cleaner_kthread() btrfs: qgroup: Don't copy extent buffer to do qgroup rescan btrfs: add balance filters limits, stripes and usage to supported mask btrfs: extend balance filter usage to take minimum and maximum btrfs: add balance filter for stripes btrfs: extend balance filter limit to take minimum and maximum btrfs: fix use after free iterating extrefs btrfs: check unsupported filters in balance arguments Btrfs: fix regression running delayed references when using qgroups Btrfs: fix regression when running delayed references Btrfs: don't do extra bitmap search in one bit case Btrfs: keep track of largest extent in bitmaps Btrfs: don't keep trying to build clusters if we are fragmented Btrfs: cut down on loops through the allocator Btrfs: don't continue setting up space cache when enospc ...
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c228
1 files changed, 162 insertions, 66 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c6f247ba81d..6bd5ce9d75f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -847,7 +847,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 1);
+ start - extent_offset);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset, 0);
+ extent_offset);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 1);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1231,7 +1231,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
@@ -1248,7 +1248,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
- unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!pages)
return -ENOMEM;
- first_index = pos >> PAGE_CACHE_SHIFT;
-
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
- if (ret == -ENOSPC &&
- (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+ if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret < 0)
+ break;
if (ret > 0) {
+ /*
+ * For nodata cow case, no need to reserve
+ * data space.
+ */
only_release_metadata = true;
/*
* our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = 0;
- } else {
- ret = -ENOSPC;
+ goto reserve_metadata;
}
}
-
- if (ret)
+ ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ if (ret < 0)
break;
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
- reserve_bytes);
+ btrfs_free_reserved_data_space(inode, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1603,12 +1604,17 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- if (only_release_metadata)
+ if (only_release_metadata) {
btrfs_delalloc_release_metadata(inode,
release_bytes);
- else
- btrfs_delalloc_release_space(inode,
+ } else {
+ u64 __pos;
+
+ __pos = round_down(pos, root->sectorsize) +
+ (dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode, __pos,
release_bytes);
+ }
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1666,7 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, release_bytes);
+ btrfs_delalloc_release_space(inode, pos, release_bytes);
}
}
@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 drop_end;
int ret = 0;
int err = 0;
- int rsv_count;
+ unsigned int rsv_count;
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
@@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
/*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= lockend)
+ drop_end = lockend + 1;
+ /*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
@@ -2541,17 +2560,61 @@ out_only_mutex:
return err;
}
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+ struct falloc_range *prev = NULL;
+ struct falloc_range *range = NULL;
+
+ if (list_empty(head))
+ goto insert;
+
+ /*
+ * As fallocate iterate by bytenr order, we only need to check
+ * the last range.
+ */
+ prev = list_entry(head->prev, struct falloc_range, list);
+ if (prev->start + prev->len == start) {
+ prev->len += len;
+ return 0;
+ }
+insert:
+ range = kmalloc(sizeof(*range), GFP_NOFS);
+ if (!range)
+ return -ENOMEM;
+ range->start = start;
+ range->len = len;
+ list_add_tail(&range->list, head);
+ return 0;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct falloc_range *range;
+ struct falloc_range *tmp;
+ struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+ u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
/*
- * Make sure we have enough space before we do the
- * allocation.
+ * Only trigger disk allocation, don't trigger qgroup reserve
+ *
+ * For qgroup space, it will be checked later.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
- if (ret)
+ ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+ if (ret < 0)
return ret;
mutex_lock(&inode->i_mutex);
@@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ /*
+ * TODO: Move these two operations after we have checked
+ * accurate reserved space, or fallocate can still fail but
+ * with page truncated or size expanded.
+ *
+ * But that's a minor problem and won't do much harm BTW.
+ */
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
@@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
+ /* First, check if we exceed the qgroup limit */
+ INIT_LIST_HEAD(&reserve_list);
cur_offset = alloc_start;
while (1) {
- u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- } else if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end,
- NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans, root);
- else
- ret = btrfs_end_transaction(trans,
- root);
+ ret = add_falloc_range(&reserve_list, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
}
+ ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0)
+ break;
}
free_extent_map(em);
- if (ret < 0)
- break;
-
cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
+ if (cur_offset >= alloc_end)
break;
+ }
+
+ /*
+ * If ret is still 0, means we're OK to fallocate.
+ * Or just cleanup the list and exit.
+ */
+ list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+ if (!ret)
+ ret = btrfs_prealloc_file_range(inode, mode,
+ range->start,
+ range->len, 1 << inode->i_blkbits,
+ offset + len, &alloc_hint);
+ list_del(&range->list);
+ kfree(range);
+ }
+ if (ret < 0)
+ goto out_unlock;
+
+ if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size and the inode item.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans, root);
}
}
+out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
out:
+ /*
+ * As we waited the extent range, the data_rsv_map must be empty
+ * in the range, as written data range will be released from it.
+ * And for prealloacted extent, it will also be released when
+ * its metadata is written.
+ * So this is completely used as cleanup.
+ */
+ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - alloc_start);
return ret;
}