diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-26 14:48:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-26 14:48:55 -0700 |
commit | e2aed8dfa50bb061747eeb14e6af099554a03b76 (patch) | |
tree | 900c96a2dfe7195e56ec3c1f027418029d0a8444 /fs/btrfs/transaction.c | |
parent | 476525004ac7e2f990b6956efcd44d0780c2ab4c (diff) | |
parent | b24baf6917a376420d535548e1f88744028bcf24 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull large btrfs update from Chris Mason:
"This pull request is very large, and the two main features in here
have been under testing/devel for quite a while.
We have subvolume quotas from the strato developers. This enables
full tracking of how many blocks are allocated to each subvolume (and
all snapshots) and you can set limits on a per-subvolume basis. You
can also create quota groups and toss multiple subvolumes into a big
group. It's everything you need to be a web hosting company and give
each user their own subvolume.
The userland side of the quotas is being refreshed, they'll send out
details on where to grab it soon.
Next is the kernel side of btrfs send/receive from Alexander Block.
This leverages the same infrastructure as the quota code to figure out
relationships between blocks and their owners. It can then compute
the difference between two snapshots and sends the diffs in a neutral
format into userland.
The basic model:
create a snapshot
send that snapshot as the initial backup
make changes
create a second snapshot
send the incremental as a backup
delete the first snapshot
(use the second snapshot for the next incremental)
The receive portion is all in userland, and in the 'next' branch of my
btrfs-progs repo.
There's still some work to do in terms of optimizing the send side
from kernel to userland. The really important part is figuring out
how two snapshots are different, and this is where we are
concentrating right now. The initial send of a dataset is a little
slower than tar, but the incremental sends are dramatically faster
than what rsync can do.
On top of all of that, we have a nice queue of fixes, cleanups and
optimizations."
Fix up trivial modify/del conflict in fs/btrfs/ioctl.c
Also fix up semantic conflict in fs/btrfs/send.c: the interface to
dentry_open() changed in commit 765927b2d508 ("switch dentry_open() to
struct path, make it grab references itself"), and since it now grabs
whatever references it needs, we should no longer do the mntget() on the
mnt (and we need to dput() the dentry reference we took).
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (65 commits)
Btrfs: uninit variable fixes in send/receive
Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive
Btrfs: add btrfs_compare_trees function
Btrfs: introduce subvol uuids and times
Btrfs: make iref_to_path non static
Btrfs: add a barrier before a waitqueue_active check
Btrfs: call the ordered free operation without any locks held
Btrfs: Check INCOMPAT flags on remount and add helper function
Btrfs: add helper for tree enumeration
btrfs: allow cross-subvolume file clone
Btrfs: improve multi-thread buffer read
Btrfs: make btrfs's allocation smoothly with preallocation
Btrfs: lock the transition from dirty to writeback for an eb
Btrfs: fix potential race in extent buffer freeing
Btrfs: don't return true in releasepage unless we actually freed the eb
Btrfs: suppress printk() if all device I/O stats are zero
Btrfs: remove unwanted printk() for btrfs device I/O stats
Btrfs: rewrite BTRFS_SETGET_FUNCS
Btrfs: zero unused bytes in inode item
Btrfs: kill free_space pointer from inode structure
...
Conflicts:
fs/btrfs/ioctl.c
Diffstat (limited to 'fs/btrfs/transaction.c')
-rw-r--r-- | fs/btrfs/transaction.c | 101 |
1 files changed, 82 insertions, 19 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b72b068183ec..7ac7cdcc294e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -22,6 +22,7 @@ #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/uuid.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -38,7 +39,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -100,8 +100,8 @@ loop: kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = fs_info->running_transaction; goto loop; - } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - spin_unlock(&root->fs_info->trans_lock); + } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&fs_info->trans_lock); kmem_cache_free(btrfs_transaction_cachep, cur_trans); return -EROFS; } @@ -126,7 +126,6 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.seq = 1; /* * although the tree mod log is per file system and not per transaction, @@ -145,10 +144,8 @@ loop: } atomic_set(&fs_info->tree_mod_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -299,6 +296,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_transaction *cur_trans; u64 num_bytes = 0; int ret; + u64 qgroup_reserved = 0; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return ERR_PTR(-EROFS); @@ -317,6 +315,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, * the appropriate flushing if need be. */ if (num_items > 0 && root != root->fs_info->chunk_root) { + if (root->fs_info->quota_enabled && + is_fstree(root->root_key.objectid)) { + qgroup_reserved = num_items * root->leafsize; + ret = btrfs_qgroup_reserve(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); + } + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, @@ -349,11 +355,16 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; + h->adding_csums = 0; h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; + h->qgroup_reserved = qgroup_reserved; + h->delayed_ref_elem.seq = 0; + INIT_LIST_HEAD(&h->qgroup_ref_list); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -473,7 +484,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; int err; @@ -481,12 +491,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; - /* - * We need to do this in case we're deleting csums so the global block - * rsv get's used instead of the csum block rsv. - */ - trans->block_rsv = NULL; - updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { @@ -495,8 +499,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, return err; } - trans->block_rsv = rsv; - return should_end_transaction(trans, root); } @@ -513,8 +515,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + /* + * do the qgroup accounting as early as possible + */ + err = btrfs_delayed_refs_qgroup_accounting(trans, info); + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + /* + * the same root has to be passed to start_transaction and + * end_transaction. Subvolume quota depends on this. + */ + WARN_ON(trans->root != root); + + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -527,6 +545,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { @@ -567,6 +587,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { err = -EIO; } + assert_qgroups_uptodate(trans); memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -785,6 +806,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = btrfs_run_dev_stats(trans, root->fs_info); BUG_ON(ret); + ret = btrfs_run_qgroups(trans, root->fs_info); + BUG_ON(ret); + + /* run_qgroups might have added some more refs */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); @@ -926,11 +954,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; + struct timespec cur_time = CURRENT_TIME; int ret; u64 to_reserve = 0; u64 index = 0; u64 objectid; u64 root_flags; + uuid_le new_uuid; rsv = trans->block_rsv; @@ -957,6 +987,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } + ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, + objectid, pending->inherit); + kfree(pending->inherit); + if (ret) { + pending->error = ret; + goto fail; + } + key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1016,6 +1054,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; btrfs_set_root_flags(new_root_item, root_flags); + btrfs_set_root_generation_v2(new_root_item, + trans->transid); + uuid_le_gen(&new_uuid); + memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + memcpy(new_root_item->parent_uuid, root->root_item.uuid, + BTRFS_UUID_SIZE); + new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); + new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec); + btrfs_set_root_otransid(new_root_item, trans->transid); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); if (ret) { @@ -1269,9 +1321,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); - btrfs_trans_release_metadata(trans, root); - trans->block_rsv = NULL; - if (cur_trans->aborted) goto cleanup_transaction; @@ -1282,6 +1331,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + cur_trans = trans->transaction; /* @@ -1330,7 +1382,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); } - if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + if (!btrfs_test_opt(root, SSD) && + (now < cur_trans->start_time || now - cur_trans->start_time < 1)) should_grow = 1; do { @@ -1352,6 +1405,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; /* + * running the delayed items may have added new refs. account + * them now so that they hinder processing of more delayed refs + * as little as possible. + */ + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + + /* * rename don't use btrfs_join_transaction, so, once we * set the transaction to blocked above, we aren't going * to get any new ordered operations. We can safely run @@ -1463,6 +1523,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->chunk_root->node); switch_commit_root(root->fs_info->chunk_root); + assert_qgroups_uptodate(trans); update_super_roots(root); if (!root->fs_info->log_root_recovering) { @@ -1532,6 +1593,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; cleanup_transaction: + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); // WARN_ON(1); if (current->journal_info == trans) |