From 0b86a832a1f38abec695864ec2eaedc9d2383f1b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 24 Mar 2008 15:01:56 -0400 Subject: Btrfs: Add support for multiple devices per filesystem Signed-off-by: Chris Mason --- fs/btrfs/volumes.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 fs/btrfs/volumes.h (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 000000000000..4a47dcb00a73 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ +struct btrfs_device { + struct list_head dev_list; + struct btrfs_root *dev_root; + struct btrfs_key dev_key; + + struct block_device *bdev; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* the kernel device number */ + u64 rdev; + + /* type and info about this device */ + u64 type; + + /* partition number, 0 for whole dev */ + int partition; + + /* length of the name data at the end of the item */ + int name_len; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_DEV_UUID_SIZE]; + + char *name; +}; + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 owner, u64 num_bytes, u64 *start); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 *phys, u64 *length, + struct btrfs_device **dev); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 *start, + u64 *num_bytes, u32 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); +#endif -- cgit v1.2.3 From 6324fbf334f4586325057197da7752f4ffa409d3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 24 Mar 2008 15:01:59 -0400 Subject: Btrfs: Dynamic chunk and block group allocation Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 12 ++- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 208 ++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/inode.c | 9 ++- fs/btrfs/volumes.c | 87 +++++++++++++-------- 
fs/btrfs/volumes.h | 2 +- 6 files changed, 263 insertions(+), 57 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1453d995fef6..a877105f5c47 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -431,9 +431,19 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_space_info { + u64 flags; + u64 total_bytes; + u64 bytes_used; + u64 bytes_pinned; + int full; + struct list_head list; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; + struct btrfs_space_info *space_info; u64 pinned; u64 flags; int cached; @@ -490,7 +500,7 @@ struct btrfs_fs_info { struct list_head dirty_cowonly_roots; struct list_head devices; - struct list_head *last_device; + struct list_head space_info; spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e37fa120cc8..2a239ae49f78 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -700,8 +700,8 @@ struct btrfs_root *open_ctree(struct super_block *sb) fs_info->dev_root = dev_root; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->devices); + INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); - fs_info->last_device = &fs_info->devices; fs_info->sb = sb; fs_info->throttles = 0; fs_info->mount_opt = 0; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2cd957d6e8d8..15082b1087be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); static int del_pending_extents(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_tree, u64 chunk_objectid, + u64 size); static int cache_block_group(struct 
btrfs_root *root, @@ -168,16 +172,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { - if ((bits & BLOCK_GROUP_DATA) && - (cache->flags & BTRFS_BLOCK_GROUP_DATA)) - return 1; - if ((bits & BLOCK_GROUP_METADATA) && - (cache->flags & BTRFS_BLOCK_GROUP_METADATA)) - return 1; - if ((bits & BLOCK_GROUP_SYSTEM) && - (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) - return 1; - return 0; + return (cache->flags & bits); } static int noinline find_search_start(struct btrfs_root *root, @@ -276,6 +271,18 @@ static u64 div_factor(u64 num, int factor) return num; } +static int block_group_state_bits(u64 flags) +{ + int bits = 0; + if (flags & BTRFS_BLOCK_GROUP_DATA) + bits |= BLOCK_GROUP_DATA; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + bits |= BLOCK_GROUP_METADATA; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + bits |= BLOCK_GROUP_SYSTEM; + return bits; +} + struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *hint, u64 search_start, @@ -304,7 +311,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, if (!owner) factor = 8; - bit = data; + bit = block_group_state_bits(data); if (search_start && search_start < total_fs_bytes) { struct btrfs_block_group_cache *shint; @@ -358,10 +365,15 @@ again: free_check = cache->key.offset; else free_check = div_factor(cache->key.offset, factor); + if (used + cache->pinned < free_check) { found_group = cache; goto found; } + if (full_search) { + printk("failed on cache %Lu used %Lu total %Lu\n", + cache->key.objectid, used, cache->key.offset); + } cond_resched(); } if (!full_search) { @@ -983,6 +995,58 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return werr; } +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct list_head *cur; + struct 
btrfs_space_info *found; + list_for_each(cur, head) { + found = list_entry(cur, struct btrfs_space_info, list); + if (found->flags == flags) + return found; + } + return NULL; + +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags) +{ + struct btrfs_space_info *space_info; + u64 thresh; + u64 start; + u64 num_bytes; + int ret; + + space_info = __find_space_info(extent_root->fs_info, flags); + BUG_ON(!space_info); + + if (space_info->full) + return 0; + + thresh = div_factor(space_info->total_bytes, 7); + if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) < + thresh) + return 0; + + ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags); + if (ret == -ENOSPC) { +printk("space info full %Lu\n", flags); + space_info->full = 1; + return 0; + } + + BUG_ON(ret); + + ret = btrfs_make_block_group(trans, extent_root, 0, flags, + extent_root->fs_info->chunk_root->root_key.objectid, + start, num_bytes); + BUG_ON(ret); + return 0; +} + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, @@ -1012,8 +1076,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { old_val += num_bytes; + cache->space_info->bytes_used += num_bytes; } else { old_val -= num_bytes; + cache->space_info->bytes_used -= num_bytes; if (mark_free) { set_extent_dirty(&info->free_space_cache, bytenr, bytenr + num_bytes - 1, @@ -1026,6 +1092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, } return 0; } + static int update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin) { @@ -1047,9 +1114,11 @@ static int update_pinned_extents(struct btrfs_root *root, (bytenr - cache->key.objectid)); if (pin) { cache->pinned += len; + cache->space_info->bytes_pinned += len; fs_info->total_pinned += len; } else { 
cache->pinned -= len; + cache->space_info->bytes_pinned -= len; fs_info->total_pinned -= len; } bytenr += len; @@ -1472,7 +1541,7 @@ check_failed: goto new_group; } - if (!(data & BLOCK_GROUP_DATA)) { + if (!(data & BTRFS_BLOCK_GROUP_DATA)) { block_group = btrfs_lookup_block_group(info, ins->objectid); if (block_group) trans->block_group = block_group; @@ -1532,12 +1601,25 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key keys[2]; - if (data) - data = BLOCK_GROUP_DATA; - else if (root == root->fs_info->chunk_root) - data = BLOCK_GROUP_SYSTEM; - else - data = BLOCK_GROUP_METADATA; + if (data) { + data = BTRFS_BLOCK_GROUP_DATA; + } else if (root == root->fs_info->chunk_root) { + data = BTRFS_BLOCK_GROUP_SYSTEM; + } else { + data = BTRFS_BLOCK_GROUP_METADATA; + } + + if (root->ref_cows) { + if (data != BTRFS_BLOCK_GROUP_METADATA) { + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, + BTRFS_BLOCK_GROUP_METADATA); + BUG_ON(ret); + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data); + BUG_ON(ret); + } new_hint = max(hint_byte, root->fs_info->alloc_start); if (new_hint < btrfs_super_total_bytes(&info->super_copy)) @@ -2490,6 +2572,34 @@ error: return ret; } +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + + found = __find_space_info(info, flags); + if (found) { + found->total_bytes += total_bytes; + found->bytes_used += bytes_used; + WARN_ON(found->total_bytes < found->bytes_used); + *space_info = found; + return 0; + } + found = kmalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + list_add(&found->list, &info->space_info); + found->flags = flags; + found->total_bytes = total_bytes; + found->bytes_used = bytes_used; + found->bytes_pinned = 0; + found->full = 0; + *space_info = found; + return 0; +} + int 
btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -2497,6 +2607,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) int bit; struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; struct extent_io_tree *block_group_cache; struct btrfs_key key; struct btrfs_key found_key; @@ -2547,6 +2658,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) bit = BLOCK_GROUP_METADATA; } + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + /* use EXTENT_LOCKED to prevent merging */ set_extent_bits(block_group_cache, found_key.objectid, found_key.objectid + found_key.offset - 1, @@ -2563,3 +2680,58 @@ error: btrfs_free_path(path); return ret; } + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_tree, u64 chunk_objectid, + u64 size) +{ + int ret; + int bit = 0; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + struct extent_io_tree *block_group_cache; + + extent_root = root->fs_info->extent_root; + block_group_cache = &root->fs_info->block_group_cache; + + cache = kmalloc(sizeof(*cache), GFP_NOFS); + BUG_ON(!cache); + cache->key.objectid = chunk_objectid; + cache->key.offset = size; + cache->cached = 0; + cache->pinned = 0; + btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY); + memset(&cache->item, 0, sizeof(cache->item)); + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_tree(&cache->item, chunk_tree); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + + if (type & BTRFS_BLOCK_GROUP_DATA) { + bit = 
BLOCK_GROUP_DATA; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + bit = BLOCK_GROUP_SYSTEM; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + bit = BLOCK_GROUP_METADATA; + } + set_extent_bits(block_group_cache, chunk_objectid, + chunk_objectid + size - 1, + bit | EXTENT_LOCKED, GFP_NOFS); + set_state_private(block_group_cache, chunk_objectid, + (unsigned long)cache); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + finish_current_insert(trans, extent_root); + ret = del_pending_extents(trans, extent_root); + BUG_ON(ret); + return 0; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17063cd2cb73..109576b57f69 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1527,6 +1527,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, { struct inode *inode; struct btrfs_inode_item *inode_item; + struct btrfs_block_group_cache *new_inode_group; struct btrfs_key *location; struct btrfs_path *path; struct btrfs_inode_ref *ref; @@ -1553,9 +1554,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 0; else owner = 1; - group = btrfs_find_block_group(root, group, 0, + new_inode_group = btrfs_find_block_group(root, group, 0, BTRFS_BLOCK_GROUP_METADATA, owner); - BTRFS_I(inode)->block_group = group; + if (!new_inode_group) { + printk("find_block group failed\n"); + new_inode_group = group; + } + BTRFS_I(inode)->block_group = new_inode_group; BTRFS_I(inode)->flags = 0; key[0].objectid = objectid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 90a8d45dc6d7..a52a13f365d6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -131,7 +131,7 @@ check_pending: btrfs_release_path(root, path); BUG_ON(*start < search_start); - if (*start + num_bytes >= search_end) { + if (*start + num_bytes > search_end) { ret = -ENOSPC; goto error; } @@ -159,8 +159,9 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, return -ENOMEM; ret = 
find_free_dev_extent(trans, device, path, num_bytes, start); - if (ret) + if (ret) { goto err; + } key.objectid = device->devid; key.offset = *start; @@ -214,22 +215,6 @@ error: return ret; } -static struct btrfs_device *next_device(struct list_head *head, - struct list_head *last) -{ - struct list_head *next = last->next; - struct btrfs_device *dev; - - if (list_empty(head)) - return NULL; - - if (next == head) - next = next->next; - - dev = list_entry(next, struct btrfs_device, dev_list); - return dev; -} - static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path, u64 *objectid) { @@ -397,31 +382,63 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 *start, - u64 *num_bytes, u32 type) + u64 *num_bytes, u64 type) { u64 dev_offset; struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; struct btrfs_stripe *stripes; struct btrfs_device *device = NULL; struct btrfs_chunk *chunk; + struct list_head private_devs; struct list_head *dev_list = &extent_root->fs_info->devices; - struct list_head *last_dev = extent_root->fs_info->last_device; + struct list_head *cur; struct extent_map_tree *em_tree; struct map_lookup *map; struct extent_map *em; u64 physical; u64 calc_size = 1024 * 1024 * 1024; - int num_stripes; + u64 avail; + u64 max_avail = 0; + int num_stripes = 1; + int looped = 0; int ret; - int index = 0; + int index; struct btrfs_key key; + if (list_empty(dev_list)) + return -ENOSPC; +again: + INIT_LIST_HEAD(&private_devs); + cur = dev_list->next; + index = 0; + /* build a private list of devices we will allocate from */ + while(index < num_stripes) { + device = list_entry(cur, struct btrfs_device, dev_list); + avail = device->total_bytes - device->bytes_used; + cur = cur->next; + if (avail > max_avail) + max_avail = avail; + if (avail >= calc_size) { + list_move_tail(&device->dev_list, &private_devs); + index++; + } + if (cur == 
dev_list) + break; + } + if (index < num_stripes) { + list_splice(&private_devs, dev_list); + if (!looped && max_avail > 0) { + looped = 1; + calc_size = max_avail; + goto again; + } + return -ENOSPC; + } ret = find_next_chunk(chunk_root, &key.objectid); if (ret) return ret; - num_stripes = 1; chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS); if (!chunk) return -ENOMEM; @@ -429,11 +446,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripes = &chunk->stripe; *num_bytes = calc_size; + index = 0; while(index < num_stripes) { - device = next_device(dev_list, last_dev); - BUG_ON(!device); - last_dev = &device->dev_list; - extent_root->fs_info->last_device = last_dev; + BUG_ON(list_empty(&private_devs)); + cur = private_devs.next; + device = list_entry(cur, struct btrfs_device, dev_list); + list_move_tail(&device->dev_list, dev_list); ret = btrfs_alloc_dev_extent(trans, device, key.objectid, @@ -449,6 +467,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, physical = dev_offset; index++; } + BUG_ON(!list_empty(&private_devs)); /* key.objectid was set above */ key.offset = *num_bytes; @@ -692,17 +711,17 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key, int ret; devid = btrfs_device_id(leaf, dev_item); - if (btrfs_find_device(root, devid)) - return 0; - - device = kmalloc(sizeof(*device), GFP_NOFS); - if (!device) - return -ENOMEM; + device = btrfs_find_device(root, devid); + if (!device) { + device = kmalloc(sizeof(*device), GFP_NOFS); + if (!device) + return -ENOMEM; + list_add(&device->dev_list, &root->fs_info->devices); + } fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->bdev = root->fs_info->sb->s_bdev; - list_add(&device->dev_list, &root->fs_info->devices); memcpy(&device->dev_key, key, sizeof(*key)); ret = 0; #if 0 diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4a47dcb00a73..4ab9891878a8 100644 --- a/fs/btrfs/volumes.h +++ 
b/fs/btrfs/volumes.h @@ -71,7 +71,7 @@ int btrfs_read_sys_array(struct btrfs_root *root); int btrfs_read_chunk_tree(struct btrfs_root *root); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 *start, - u64 *num_bytes, u32 type); + u64 *num_bytes, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); -- cgit v1.2.3 From 0d81ba5dbedef0c3970d6aa318aa84920943e6e3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 24 Mar 2008 15:02:07 -0400 Subject: Btrfs: Move device information into the super block so it can be scanned Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 21 ++---------------- fs/btrfs/disk-io.c | 4 +++- fs/btrfs/print-tree.c | 15 ++----------- fs/btrfs/volumes.c | 61 +++++++++++++-------------------------------------- fs/btrfs/volumes.h | 13 +---------- 5 files changed, 23 insertions(+), 91 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 285fb7e46106..96a493217860 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -150,21 +150,11 @@ struct btrfs_dev_item { /* minimal io size for this device */ __le32 sector_size; - /* the kernel device number */ - __le64 rdev; - /* type and info about this device */ __le64 type; - /* partition number, 0 for whole dev */ - __le32 partition; - - /* length of the name data at the end of the item */ - __le16 name_len; - - /* physical drive uuid (or lvm uuid) */ + /* btrfs generated uuid for this device */ u8 uuid[BTRFS_DEV_UUID_SIZE]; - /* name goes here */ } __attribute__ ((__packed__)); struct btrfs_stripe { @@ -255,6 +245,7 @@ struct btrfs_super_block { __le32 sys_chunk_array_size; u8 root_level; u8 chunk_root_level; + struct btrfs_dev_item dev_item; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; } __attribute__ ((__packed__)); @@ -685,20 +676,12 @@ 
BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64); -BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32); -BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16); static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) { return (char *)d + offsetof(struct btrfs_dev_item, uuid); } -static inline char *btrfs_device_name(struct btrfs_dev_item *d) -{ - return (char *)(d + 1); -} - BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2a239ae49f78..26185d46712c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -370,7 +370,6 @@ static int close_all_devices(struct btrfs_fs_info *fs_info) next = list->next; list_del(next); device = list_entry(next, struct btrfs_device, dev_list); - kfree(device->name); kfree(device); } return 0; @@ -800,6 +799,9 @@ struct btrfs_root *open_ctree(struct super_block *sb) } mutex_lock(&fs_info->fs_mutex); + ret = btrfs_read_super_device(tree_root, fs_info->sb_buffer); + BUG_ON(ret); + ret = btrfs_read_sys_array(tree_root); BUG_ON(ret); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 9c1335dad40c..ee0de112cf5a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -37,22 +37,11 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) static void print_dev_item(struct extent_buffer *eb, struct btrfs_dev_item *dev_item) { - char *name; - int name_len; - - name_len = btrfs_device_name_len(eb, dev_item); - name = 
kmalloc(name_len, GFP_NOFS); - if (name) { - read_extent_buffer(eb, name, - (unsigned long)btrfs_device_name(dev_item), - name_len); - } - printk("\t\tdev item name %.*s devid %llu " - "total_bytes %llu bytes used %Lu\n", name_len, name, + printk("\t\tdev item devid %llu " + "total_bytes %llu bytes used %Lu\n", (unsigned long long)btrfs_device_id(eb, dev_item), (unsigned long long)btrfs_device_total_bytes(eb, dev_item), (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); - kfree(name); } void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a52a13f365d6..ae22d01ecf54 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, key.offset = free_devid; ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*dev_item) + device->name_len); + sizeof(*dev_item)); if (ret) goto out; @@ -290,15 +290,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_rdev(leaf, dev_item, device->rdev); - btrfs_set_device_partition(leaf, dev_item, device->partition); - btrfs_set_device_name_len(leaf, dev_item, device->name_len); btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); - ptr = (unsigned long)btrfs_device_name(dev_item); - write_extent_buffer(leaf, device->name, ptr, device->name_len); - ptr = (unsigned long)btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); @@ -345,8 +339,6 @@ int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, 
device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_rdev(leaf, dev_item, device->rdev); - btrfs_set_device_partition(leaf, dev_item, device->partition); btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -676,7 +668,6 @@ static int fill_device_from_item(struct extent_buffer *leaf, struct btrfs_device *device) { unsigned long ptr; - char *name; device->devid = btrfs_device_id(leaf, dev_item); device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); @@ -685,24 +676,14 @@ static int fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); - device->rdev = btrfs_device_rdev(leaf, dev_item); - device->partition = btrfs_device_partition(leaf, dev_item); - device->name_len = btrfs_device_name_len(leaf, dev_item); ptr = (unsigned long)btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE); - name = kmalloc(device->name_len + 1, GFP_NOFS); - if (!name) - return -ENOMEM; - device->name = name; - ptr = (unsigned long)btrfs_device_name(dev_item); - read_extent_buffer(leaf, name, ptr, device->name_len); - name[device->name_len] = '\0'; return 0; } -static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key, +static int read_one_dev(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { @@ -722,7 +703,6 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->bdev = root->fs_info->sb->s_bdev; - memcpy(&device->dev_key, key, sizeof(*key)); ret = 0; #if 0 ret = btrfs_open_device(device); @@ -733,12 +713,20 @@ static int read_one_dev(struct 
btrfs_root *root, struct btrfs_key *key, return ret; } +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct btrfs_dev_item *dev_item; + + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + return read_one_dev(root, buf, dev_item); +} + int btrfs_read_sys_array(struct btrfs_root *root) { struct btrfs_super_block *super_copy = &root->fs_info->super_copy; struct extent_buffer *sb = root->fs_info->sb_buffer; struct btrfs_disk_key *disk_key; - struct btrfs_dev_item *dev_item; struct btrfs_chunk *chunk; struct btrfs_key key; u32 num_stripes; @@ -748,7 +736,6 @@ int btrfs_read_sys_array(struct btrfs_root *root) unsigned long sb_ptr; u32 cur; int ret; - int dev_only = 1; array_size = btrfs_super_sys_array_size(super_copy); @@ -757,7 +744,6 @@ int btrfs_read_sys_array(struct btrfs_root *root) * once for all of the chunks. This way there are device * structs filled in for every chunk */ -again: ptr = super_copy->sys_chunk_array; sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); cur = 0; @@ -771,22 +757,10 @@ again: sb_ptr += len; cur += len; - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID && - key.type == BTRFS_DEV_ITEM_KEY) { - dev_item = (struct btrfs_dev_item *)sb_ptr; - if (dev_only) { - ret = read_one_dev(root, &key, sb, dev_item); - BUG_ON(ret); - } - len = sizeof(*dev_item); - len += btrfs_device_name_len(sb, dev_item); - } else if (key.type == BTRFS_CHUNK_ITEM_KEY) { - + if (key.type == BTRFS_CHUNK_ITEM_KEY) { chunk = (struct btrfs_chunk *)sb_ptr; - if (!dev_only) { - ret = read_one_chunk(root, &key, sb, chunk); - BUG_ON(ret); - } + ret = read_one_chunk(root, &key, sb, chunk); + BUG_ON(ret); num_stripes = btrfs_chunk_num_stripes(sb, chunk); len = btrfs_chunk_item_size(num_stripes); } else { @@ -796,10 +770,6 @@ again: sb_ptr += len; cur += len; } - if (dev_only == 1) { - dev_only = 0; - goto again; - } return 0; } @@ -846,8 +816,7 @@ again: struct btrfs_dev_item *dev_item; dev_item 
= btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(root, &found_key, leaf, - dev_item); + ret = read_one_dev(root, leaf, dev_item); BUG_ON(ret); } } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4ab9891878a8..77fa6efd79cf 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,7 +21,6 @@ struct btrfs_device { struct list_head dev_list; struct btrfs_root *dev_root; - struct btrfs_key dev_key; struct block_device *bdev; @@ -43,22 +42,11 @@ struct btrfs_device { /* minimal io size for this device */ u32 sector_size; - /* the kernel device number */ - u64 rdev; - /* type and info about this device */ u64 type; - /* partition number, 0 for whole dev */ - int partition; - - /* length of the name data at the end of the item */ - int name_len; - /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_DEV_UUID_SIZE]; - - char *name; }; int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -75,4 +63,5 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); #endif -- cgit v1.2.3 From 239b14b32dc39232ebf9cce29ff77c4c564355fd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 24 Mar 2008 15:02:07 -0400 Subject: Btrfs: Bring back mount -o ssd optimizations Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/extent_io.c | 2 ++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/inode.c | 29 +++++++++++++++++++++++++ fs/btrfs/volumes.c | 5 +++++ fs/btrfs/volumes.h | 3 +++ 8 files changed, 103 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h 
index 96a493217860..acf22ad6115c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1405,6 +1405,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 isize); /* inode.c */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio); + static inline void dec_i_blocks(struct inode *inode, u64 dec) { dec = dec >> 9; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 26185d46712c..4890151cd68d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1103,4 +1103,6 @@ int btrfs_read_buffer(struct extent_buffer *buf) static struct extent_io_ops btree_extent_io_ops = { .writepage_io_hook = btree_writepage_io_hook, .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 14eb8fc87015..e9ef644ff56f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1473,13 +1473,31 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root * root = orig_root->fs_info->extent_root; struct btrfs_fs_info *info = root->fs_info; u64 total_needed = num_bytes; + u64 *last_ptr = NULL; struct btrfs_block_group_cache *block_group; int full_scan = 0; int wrapped = 0; + int empty_cluster = 2 * 1024 * 1024; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + if (data & BTRFS_BLOCK_GROUP_METADATA) { + last_ptr = &root->fs_info->last_alloc; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->last_data_alloc; + } + + if (last_ptr) { + if (*last_ptr) + hint_byte = *last_ptr; + else { + empty_size += empty_cluster; + } + } + if (search_end == (u64)-1) search_end = btrfs_super_total_bytes(&info->super_copy); @@ -1489,11 +1507,14 @@ static int noinline find_free_extent(struct btrfs_trans_handle 
*trans, hint_byte = search_start; block_group = btrfs_find_block_group(root, block_group, hint_byte, data, 1); + if (last_ptr && *last_ptr == 0 && block_group) + hint_byte = block_group->key.objectid; } else { block_group = btrfs_find_block_group(root, trans->block_group, search_start, data, 1); } + search_start = max(search_start, hint_byte); total_needed += empty_size; @@ -1506,9 +1527,36 @@ check_failed: } ret = find_search_start(root, &block_group, &search_start, total_needed, data); + if (ret == -ENOSPC && last_ptr && *last_ptr) { + *last_ptr = 0; + block_group = btrfs_lookup_block_group(info, + orig_search_start); + search_start = orig_search_start; + ret = find_search_start(root, &block_group, &search_start, + total_needed, data); + } + if (ret == -ENOSPC) + goto enospc; if (ret) goto error; + if (last_ptr && *last_ptr && search_start != *last_ptr) { + *last_ptr = 0; + if (!empty_size) { + empty_size += empty_cluster; + total_needed += empty_size; + } + block_group = btrfs_lookup_block_group(info, + orig_search_start); + search_start = orig_search_start; + ret = find_search_start(root, &block_group, + &search_start, total_needed, data); + if (ret == -ENOSPC) + goto enospc; + if (ret) + goto error; + } + search_start = stripe_align(root, search_start); ins->objectid = search_start; ins->offset = num_bytes; @@ -1547,6 +1595,13 @@ check_failed: trans->block_group = block_group; } ins->offset = num_bytes; + if (last_ptr) { + *last_ptr = ins->objectid + ins->offset; + if (*last_ptr == + btrfs_super_total_bytes(&root->fs_info->super_copy)) { + *last_ptr = 0; + } + } return 0; new_group: @@ -1612,12 +1667,12 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, if (root->ref_cows) { if (data != BTRFS_BLOCK_GROUP_METADATA) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, + 2 * 1024 * 1024, BTRFS_BLOCK_GROUP_METADATA); BUG_ON(ret); } ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data); + num_bytes + 2 * 1024 * 1024, 
data); BUG_ON(ret); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7e3a1ebde9fc..6dab664529c1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1730,6 +1730,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret && *bio_ret) { bio = *bio_ret; if (bio->bi_sector + (bio->bi_size >> 9) != sector || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, size, bio)) || bio_add_page(bio, page, size, offset) < size) { ret = submit_one_bio(rw, bio); bio = NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0dca89328f98..8b5319db2516 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,6 +29,8 @@ struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio); + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 109576b57f69..5140d6801846 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -296,6 +296,34 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, return 0; } +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + u64 logical = bio->bi_sector << 9; + u64 physical; + u64 length = 0; + u64 map_length; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + length += bvec->bv_len; + } + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, logical, &physical, 
&map_length, &dev); + if (map_length < length + size) { + printk("merge bio hook logical %Lu bio len %Lu physical %Lu " + "len %Lu\n", logical, length, physical, map_length); + return 1; + } + return 0; +} + int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3033,6 +3061,7 @@ static struct file_operations btrfs_dir_file_operations = { static struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, .readpage_io_hook = btrfs_readpage_io_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, .set_bit_hook = btrfs_set_bit_hook, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ae22d01ecf54..16fb6bbe6e28 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -578,6 +578,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) map_tree = &root->fs_info->mapping_tree; map_length = length; ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev); + if (map_length < length) { + printk("mapping failed logical %Lu bio len %Lu physical %Lu " + "len %Lu\n", logical, length, physical, map_length); + BUG(); + } BUG_ON(map_length < length); bio->bi_sector = physical >> 9; bio->bi_bdev = dev->bdev; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 77fa6efd79cf..20259128152e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -64,4 +64,7 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 *phys, u64 *length, + struct btrfs_device **dev); #endif -- cgit v1.2.3 From 8a4b83cc8bd75fca29ac68615896d9e92820e7c2 Mon Sep 17 00:00:00 2001 From: Chris 
Mason Date: Mon, 24 Mar 2008 15:02:07 -0400 Subject: Btrfs: Add support for device scanning and detection ioctls Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 21 ++++- fs/btrfs/disk-io.c | 24 ++++-- fs/btrfs/disk-io.h | 4 +- fs/btrfs/ioctl.h | 6 +- fs/btrfs/super.c | 61 ++++++++++---- fs/btrfs/volumes.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/volumes.h | 25 ++++++ 7 files changed, 333 insertions(+), 44 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index acf22ad6115c..7556f8319c60 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -37,7 +37,7 @@ extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; -#define BTRFS_MAGIC "_B4RfS_M" +#define BTRFS_MAGIC "_B5RfS_M" #define BTRFS_MAX_LEVEL 8 @@ -238,6 +238,7 @@ struct btrfs_super_block { __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; + __le64 num_devices; __le32 sectorsize; __le32 nodesize; __le32 leafsize; @@ -440,6 +441,7 @@ struct btrfs_block_group_cache { }; struct btrfs_device; +struct btrfs_fs_devices; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; struct btrfs_root *extent_root; @@ -489,7 +491,7 @@ struct btrfs_fs_info { u64 total_pinned; struct list_head dirty_cowonly_roots; - struct list_head devices; + struct btrfs_fs_devices *fs_devices; struct list_head space_info; spinlock_t delalloc_lock; spinlock_t new_trans_lock; @@ -677,6 +679,19 @@ BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 
64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); + static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) { return (char *)d + offsetof(struct btrfs_dev_item, uuid); @@ -1106,6 +1121,8 @@ BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4890151cd68d..f971a29e4f20 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -365,12 +365,12 @@ static int close_all_devices(struct btrfs_fs_info *fs_info) struct list_head *next; struct btrfs_device *device; - list = &fs_info->devices; - while(!list_empty(list)) { - next = list->next; - list_del(next); + list = &fs_info->fs_devices->devices; + list_for_each(next, list) { device = list_entry(next, struct btrfs_device, dev_list); - kfree(device); + if (device->bdev && device->bdev != fs_info->sb->s_bdev) + close_bdev_excl(device->bdev); + device->bdev = NULL; } return 0; } @@ -655,7 +655,8 @@ static int add_hasher(struct btrfs_fs_info *info, char *type) { return 0; } #endif -struct btrfs_root *open_ctree(struct super_block *sb) +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -697,8 +698,8 @@ struct btrfs_root *open_ctree(struct super_block *sb) fs_info->extent_root = extent_root; fs_info->chunk_root = chunk_root; fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; 
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); - INIT_LIST_HEAD(&fs_info->devices); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); fs_info->sb = sb; @@ -779,6 +780,12 @@ struct btrfs_root *open_ctree(struct super_block *sb) if (!btrfs_super_root(disk_super)) goto fail_sb_buffer; + if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) { + printk("Btrfs: wanted %llu devices, but found %llu\n", + (unsigned long long)btrfs_super_num_devices(disk_super), + (unsigned long long)fs_devices->num_devices); + goto fail_sb_buffer; + } nodesize = btrfs_super_nodesize(disk_super); leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); @@ -799,8 +806,6 @@ struct btrfs_root *open_ctree(struct super_block *sb) } mutex_lock(&fs_info->fs_mutex); - ret = btrfs_read_super_device(tree_root, fs_info->sb_buffer); - BUG_ON(ret); ret = btrfs_read_sys_array(tree_root); BUG_ON(ret); @@ -859,6 +864,7 @@ fail_sb_buffer: fail_iput: iput(fs_info->btree_inode); fail: + close_all_devices(fs_info); kfree(extent_root); kfree(tree_root); kfree(fs_info); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 206cb48638f7..b7cbc58a5553 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,6 +21,7 @@ #define BTRFS_SUPER_INFO_OFFSET (16 * 1024) struct btrfs_device; +struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); @@ -29,7 +30,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -struct btrfs_root *open_ctree(struct super_block *sb); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/ioctl.h 
b/fs/btrfs/ioctl.h index 8c6290665d49..4551e82013c8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,8 +22,10 @@ #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_PATH_NAME_MAX 4095 + struct btrfs_ioctl_vol_args { - char name[BTRFS_VOL_NAME_MAX + 1]; + char name[BTRFS_PATH_NAME_MAX + 1]; }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ @@ -32,4 +34,6 @@ struct btrfs_ioctl_vol_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 67ed216df475..9624923a33dc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -44,6 +44,7 @@ #include "ioctl.h" #include "print-tree.h" #include "xattr.h" +#include "volumes.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -216,7 +217,9 @@ static int parse_options (char * options, return 1; } -static int btrfs_fill_super(struct super_block * sb, void * data, int silent) +static int btrfs_fill_super(struct super_block * sb, + struct btrfs_fs_devices *fs_devices, + void * data, int silent) { struct inode * inode; struct dentry * root_dentry; @@ -231,7 +234,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; - tree_root = open_ctree(sb); + tree_root = open_ctree(sb, fs_devices); if (!tree_root || IS_ERR(tree_root)) { printk("btrfs: open_ctree failed\n"); @@ -334,18 +337,23 @@ static int test_bdev_super(struct super_block *s, void *data) int btrfs_get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int), struct vfsmount *mnt, const char *subvol) { struct block_device *bdev = NULL; struct super_block *s; struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; int error = 0; - bdev = 
open_bdev_excl(dev_name, flags, fs_type); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices); + if (error) + return error; + error = btrfs_open_devices(fs_devices, flags, fs_type); + if (error) + return error; + + bdev = fs_devices->lowest_bdev; /* * once the super is inserted into the list by sget, s_umount * will protect the lockfs code from trying to start a snapshot @@ -372,7 +380,8 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type, s->s_flags = flags; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); if (error) { up_write(&s->s_umount); deactivate_super(s); @@ -408,7 +417,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_excl(bdev); + btrfs_close_devices(fs_devices); error: return error; } @@ -421,8 +430,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, char *subvol_name = NULL; parse_options((char *)data, NULL, &subvol_name); - ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, - btrfs_fill_super, mnt, + ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt, subvol_name ? 
subvol_name : "default"); if (subvol_name) kfree(subvol_name); @@ -445,13 +453,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static long btrfs_control_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - printk("btrfs control ioctl %d\n", cmd); - return 0; -} - static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", @@ -460,6 +461,31 @@ static struct file_system_type btrfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret; + int len; + + vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { + ret = -EFAULT; + goto out; + } + len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, MS_RDONLY, + &btrfs_fs_type, &fs_devices); + break; + } +out: + kfree(vol); + return 0; +} + static void btrfs_write_super_lockfs(struct super_block *sb) { struct btrfs_root *root = btrfs_sb(sb); @@ -567,6 +593,7 @@ static void __exit exit_btrfs_fs(void) btrfs_interface_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 16fb6bbe6e28..263f01cc3db4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -17,6 +17,7 @@ */ #include #include +#include #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -28,6 +29,215 @@ struct map_lookup { struct btrfs_device *dev; u64 physical; }; +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + struct list_head *uuid_cur; + struct list_head *devices_cur; + struct btrfs_device *dev; + + list_for_each(uuid_cur, &fs_uuids) { + fs_devices = 
list_entry(uuid_cur, struct btrfs_fs_devices, + list); + while(!list_empty(&fs_devices->devices)) { + devices_cur = fs_devices->devices.next; + dev = list_entry(devices_cur, struct btrfs_device, + dev_list); + printk("uuid cleanup finds %s\n", dev->name); + if (dev->bdev) { + printk("closing\n"); + close_bdev_excl(dev->bdev); + } + list_del(&dev->dev_list); + kfree(dev); + } + } + return 0; +} + +static struct btrfs_device *__find_device(struct list_head *head, u64 devid) +{ + struct btrfs_device *dev; + struct list_head *cur; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid) + return dev; + } + return NULL; +} + +static struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct list_head *cur; + struct btrfs_fs_devices *fs_devices; + + list_for_each(cur, &fs_uuids) { + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +static int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + fs_devices->lowest_devid = (u64)-1; + fs_devices->num_devices = 0; + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid); + } + if (!device) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = 
devid; + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + if (fs_devices->lowest_devid > devid) { + fs_devices->lowest_devid = devid; + printk("lowest devid now %Lu\n", devid); + } + *fs_devices_ret = fs_devices; + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); + list_for_each(cur, head) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) { + close_bdev_excl(device->bdev); + printk("close devices closes %s\n", device->name); + } + device->bdev = NULL; + } + mutex_unlock(&uuid_mutex); + return 0; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + int flags, void *holder) +{ + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + int ret; + + mutex_lock(&uuid_mutex); + list_for_each(cur, head) { + device = list_entry(cur, struct btrfs_device, dev_list); + bdev = open_bdev_excl(device->name, flags, holder); +printk("opening %s devid %Lu\n", device->name, device->devid); + if (IS_ERR(bdev)) { + printk("open %s failed\n", device->name); + ret = PTR_ERR(bdev); + goto fail; + } + if (device->devid == fs_devices->latest_devid) + fs_devices->latest_bdev = bdev; + if (device->devid == fs_devices->lowest_devid) { + fs_devices->lowest_bdev = bdev; +printk("lowest bdev %s\n", device->name); + } + device->bdev = bdev; + } + mutex_unlock(&uuid_mutex); + return 0; +fail: + mutex_unlock(&uuid_mutex); + btrfs_close_devices(fs_devices); + return ret; +} + +int btrfs_scan_one_device(const char *path, int flags, void *holder, 
+ struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + + mutex_lock(&uuid_mutex); + + printk("scan one opens %s\n", path); + bdev = open_bdev_excl(path, flags, holder); + + if (IS_ERR(bdev)) { + printk("open failed\n"); + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk("no btrfs found on %s\n", path); + ret = -ENOENT; + goto error_brelse; + } + devid = le64_to_cpu(disk_super->dev_item.devid); + printk("found device %Lu on %s\n", devid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + +error_brelse: + brelse(bh); +error_close: + close_bdev_excl(bdev); + printk("scan one closes bdev %s\n", path); +error: + mutex_unlock(&uuid_mutex); + return ret; +} /* * this uses a pretty simple search, the expectation is that it is @@ -56,6 +266,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans, /* FIXME use last free of some kind */ + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max((u64)1024 * 1024, search_start); key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; @@ -285,6 +499,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + device->devid = free_devid; btrfs_set_device_id(leaf, dev_item, device->devid); btrfs_set_device_type(leaf, dev_item, device->type); btrfs_set_device_io_align(leaf, dev_item, device->io_align); @@ -382,7 +597,7 @@ int btrfs_alloc_chunk(struct 
btrfs_trans_handle *trans, struct btrfs_device *device = NULL; struct btrfs_chunk *chunk; struct list_head private_devs; - struct list_head *dev_list = &extent_root->fs_info->devices; + struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices; struct list_head *cur; struct extent_map_tree *em_tree; struct map_lookup *map; @@ -449,7 +664,7 @@ again: key.objectid, calc_size, &dev_offset); BUG_ON(ret); - +printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid); device->bytes_used += calc_size; ret = btrfs_update_device(trans, device); BUG_ON(ret); @@ -592,17 +807,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid) { - struct btrfs_device *dev; - struct list_head *cur = root->fs_info->devices.next; - struct list_head *head = &root->fs_info->devices; + struct list_head *head = &root->fs_info->fs_devices->devices; - while(cur != head) { - dev = list_entry(cur, struct btrfs_device, dev_list); - if (dev->devid == devid) - return dev; - cur = cur->next; - } - return NULL; + return __find_device(head, devid); } static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, @@ -699,15 +906,16 @@ static int read_one_dev(struct btrfs_root *root, devid = btrfs_device_id(leaf, dev_item); device = btrfs_find_device(root, devid); if (!device) { + printk("warning devid %Lu not found already\n", devid); device = kmalloc(sizeof(*device), GFP_NOFS); if (!device) return -ENOMEM; - list_add(&device->dev_list, &root->fs_info->devices); + list_add(&device->dev_list, + &root->fs_info->fs_devices->devices); } fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; - device->bdev = root->fs_info->sb->s_bdev; ret = 0; #if 0 ret = btrfs_open_device(device); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 20259128152e..12f297eb0559 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -24,6 +24,8 @@ struct 
btrfs_device { struct block_device *bdev; + char *name; + /* the internal btrfs device id */ u64 devid; @@ -49,6 +51,20 @@ struct btrfs_device { u8 uuid[BTRFS_DEV_UUID_SIZE]; }; +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent copy of the super */ + u64 latest_devid; + u64 latest_trans; + u64 lowest_devid; + u64 num_devices; + struct block_device *latest_bdev; + struct block_device *lowest_bdev; + struct list_head devices; + struct list_head list; +}; + int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 owner, u64 num_bytes, u64 *start); @@ -67,4 +83,13 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, u64 logical, u64 *phys, u64 *length, struct btrfs_device **dev); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + int flags, void *holder); +int btrfs_scan_one_device(const char *path, int flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_cleanup_fs_uuids(void); #endif -- cgit v1.2.3 From 8790d502e4401a4a3a4175b83a3a47e8d595c771 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Apr 2008 16:29:03 -0400 Subject: Btrfs: Add support for mirroring across drives Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 ++- fs/btrfs/disk-io.c | 20 +++---- fs/btrfs/extent-tree.c | 93 +++++++++++++++-------------- fs/btrfs/inode.c | 4 +- fs/btrfs/volumes.c | 154 ++++++++++++++++++++++++++++++++++++++++--------- fs/btrfs/volumes.h | 11 ++-- 6 files changed, 198 insertions(+), 93 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67d533cf8f47..0a207861472e 100644 --- a/fs/btrfs/ctree.h +++ 
b/fs/btrfs/ctree.h @@ -418,6 +418,7 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) #define BTRFS_BLOCK_GROUP_METADATA (1 << 2) #define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) struct btrfs_block_group_item { @@ -504,8 +505,12 @@ struct btrfs_fs_info { u64 delalloc_bytes; u64 last_alloc; u64 last_data_alloc; - int extra_data_alloc_bits; - int extra_alloc_bits; + int avail_data_alloc_bits; + int avail_metadata_alloc_bits; + int avail_system_alloc_bits; + int data_alloc_profile; + int metadata_alloc_profile; + int system_alloc_profile; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e40fb318ad99..ff75ad586767 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -735,7 +735,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, GFP_NOFS); struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info), + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), GFP_NOFS); struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); @@ -744,6 +744,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, int ret; int err = -EINVAL; struct btrfs_super_block *disk_super; + if (!extent_root || !tree_root || !fs_info) { err = -ENOMEM; goto fail; @@ -756,11 +757,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); - memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj)); init_completion(&fs_info->kobj_unregister); sb_set_blocksize(sb, 4096); - fs_info->running_transaction = NULL; - fs_info->last_trans_committed = 0; fs_info->tree_root = tree_root; fs_info->extent_root = extent_root; fs_info->chunk_root = chunk_root; @@ -770,11 +768,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); fs_info->sb = sb; - fs_info->throttles = 
0; - fs_info->mount_opt = 0; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - fs_info->delalloc_bytes = 0; setup_bdi(fs_info, &fs_info->bdi); fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; @@ -802,12 +797,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->extent_ins, fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; - fs_info->closing = 0; - fs_info->total_pinned = 0; - fs_info->last_alloc = 0; - fs_info->last_data_alloc = 0; - fs_info->extra_alloc_bits = 0; - fs_info->extra_data_alloc_bits = 0; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info); @@ -923,6 +912,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_read_block_groups(extent_root); fs_info->generation = btrfs_super_generation(disk_super) + 1; + if (btrfs_super_num_devices(disk_super) > 0) { + fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0; + fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1; + fs_info->system_alloc_profile = BTRFS_BLOCK_GROUP_RAID0; + } mutex_unlock(&fs_info->fs_mutex); return tree_root; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a34c289aec21..4ab98d8b73fa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -230,9 +230,13 @@ again: goto new_group; if (start + num > total_fs_bytes) goto new_group; + if (!block_group_bits(cache, data)) { + printk("block group bits don't match %Lu %Lu\n", cache->flags, data); + } *start_ret = start; return 0; - } out: + } +out: cache = btrfs_lookup_block_group(root->fs_info, search_start); if (!cache) { printk("Unable to find block group for %Lu\n", search_start); @@ -365,14 +369,17 @@ again: if (cache->key.objectid > total_fs_bytes) break; - if (full_search) - free_check = cache->key.offset; - else - free_check = div_factor(cache->key.offset, factor); + if (block_group_bits(cache, data)) { + if (full_search) + free_check = 
cache->key.offset; + else + free_check = div_factor(cache->key.offset, + factor); - if (used + cache->pinned < free_check) { - found_group = cache; - goto found; + if (used + cache->pinned < free_check) { + found_group = cache; + goto found; + } } cond_resched(); } @@ -1038,6 +1045,19 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return 0; } +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -1060,7 +1080,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, if (space_info->full) return 0; - thresh = div_factor(space_info->total_bytes, 7); + thresh = div_factor(space_info->total_bytes, 6); if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) < thresh) return 0; @@ -1079,16 +1099,7 @@ printk("space info full %Lu\n", flags); start, num_bytes); BUG_ON(ret); - if (flags & BTRFS_BLOCK_GROUP_RAID0) { - if (flags & BTRFS_BLOCK_GROUP_DATA) { - extent_root->fs_info->extra_data_alloc_bits = - BTRFS_BLOCK_GROUP_RAID0; - } - if (flags & BTRFS_BLOCK_GROUP_METADATA) { - extent_root->fs_info->extra_alloc_bits = - BTRFS_BLOCK_GROUP_RAID0; - } - } + set_avail_alloc_bits(extent_root->fs_info, flags); return 0; } @@ -1529,6 +1540,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, if (data & BTRFS_BLOCK_GROUP_METADATA) { last_ptr = &root->fs_info->last_alloc; + empty_cluster = 256 * 1024; } if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { @@ -1693,6 +1705,7 @@ int 
btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 root_used; u64 search_start = 0; u64 new_hint; + u64 alloc_profile; u32 sizes[2]; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; @@ -1700,31 +1713,32 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_extent_ref *ref; struct btrfs_path *path; struct btrfs_key keys[2]; - int extra_chunk_alloc_bits = 0; if (data) { - data = BTRFS_BLOCK_GROUP_DATA | info->extra_data_alloc_bits; + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; } else if (root == root->fs_info->chunk_root) { - data = BTRFS_BLOCK_GROUP_SYSTEM; + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; } else { - data = BTRFS_BLOCK_GROUP_METADATA | info->extra_alloc_bits; + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; } - if (btrfs_super_num_devices(&info->super_copy) > 1 && - !(data & BTRFS_BLOCK_GROUP_SYSTEM)) - extra_chunk_alloc_bits = BTRFS_BLOCK_GROUP_RAID0; if (root->ref_cows) { if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2 * 1024 * 1024, BTRFS_BLOCK_GROUP_METADATA | - info->extra_alloc_bits | - extra_chunk_alloc_bits); + (info->metadata_alloc_profile & + info->avail_metadata_alloc_bits)); BUG_ON(ret); } ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data | - extra_chunk_alloc_bits); + num_bytes + 2 * 1024 * 1024, data); BUG_ON(ret); } @@ -2046,12 +2060,12 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans, if (!next || !btrfs_buffer_uptodate(next)) { free_extent_buffer(next); reada_walk_down(root, cur, path->slots[*level]); + + mutex_unlock(&root->fs_info->fs_mutex); next = read_tree_block(root, bytenr, blocksize); + 
mutex_lock(&root->fs_info->fs_mutex); - /* we used to drop the lock above, keep the - * code to double check so that we won't forget - * when we drop the lock again in the future - */ + /* we've dropped the lock, double check */ ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs); BUG_ON(ret); @@ -2739,16 +2753,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) } else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) { bit = BLOCK_GROUP_METADATA; } - if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) { - if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { - info->extra_data_alloc_bits = - BTRFS_BLOCK_GROUP_RAID0; - } - if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) { - info->extra_alloc_bits = - BTRFS_BLOCK_GROUP_RAID0; - } - } + set_avail_alloc_bits(info, cache->flags); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fa7cf227f1a..a8ae68c6fbb8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -306,6 +306,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, u64 physical; u64 length = 0; u64 map_length; + int total_devs; struct bio_vec *bvec; int i; int ret; @@ -315,7 +316,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, } map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev); + ret = btrfs_map_block(map_tree, READ, 0, logical, &physical, + &map_length, &dev, &total_devs); if (map_length < length + size) { return 1; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 169be0f7285b..bc3c0b97588e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -31,6 +31,13 @@ struct stripe { u64 physical; }; +struct multi_bio { + atomic_t stripes; + bio_end_io_t *end_io; + void *private; + int error; +}; + struct map_lookup { u64 type; int io_align; @@ -632,12 +639,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if 
(list_empty(dev_list)) return -ENOSPC; - if (type & BTRFS_BLOCK_GROUP_RAID0) + if (type & (BTRFS_BLOCK_GROUP_RAID0)) num_stripes = btrfs_super_num_devices(&info->super_copy); - if (type & BTRFS_BLOCK_GROUP_DATA) - stripe_len = 64 * 1024; - if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) - stripe_len = 32 * 1024; + if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + num_stripes = min_t(u64, 2, + btrfs_super_num_devices(&info->super_copy)); + } again: INIT_LIST_HEAD(&private_devs); cur = dev_list->next; @@ -682,7 +689,11 @@ again: stripes = &chunk->stripe; - *num_bytes = calc_size * num_stripes; + if (type & BTRFS_BLOCK_GROUP_RAID1) + *num_bytes = calc_size; + else + *num_bytes = calc_size * num_stripes; + index = 0; while(index < num_stripes) { BUG_ON(list_empty(&private_devs)); @@ -694,7 +705,7 @@ again: key.objectid, calc_size, &dev_offset); BUG_ON(ret); -printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid); +printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type); device->bytes_used += calc_size; ret = btrfs_update_device(trans, device); BUG_ON(ret); @@ -774,9 +785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, - u64 logical, u64 *phys, u64 *length, - struct btrfs_device **dev) +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + int dev_nr, u64 logical, u64 *phys, u64 *length, + struct btrfs_device **dev, int *total_devs) { struct extent_map *em; struct map_lookup *map; @@ -808,19 +819,39 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - /* - * after this do_div call, stripe_nr is the number of stripes - * on this device we have to walk to find the data, and - * stripe_index is the number of our device in the stripe array - */ - stripe_index = do_div(stripe_nr, 
map->num_stripes); - + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + stripe_index = dev_nr; + if (rw & (1 << BIO_RW)) + *total_devs = map->num_stripes; + else { + int i; + u64 least = (u64)-1; + struct btrfs_device *cur; + + for (i = 0; i < map->num_stripes; i++) { + cur = map->stripes[i].dev; + spin_lock(&cur->io_lock); + if (cur->total_ios < least) { + least = cur->total_ios; + stripe_index = i; + } + spin_unlock(&cur->io_lock); + } + *total_devs = 1; + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } BUG_ON(stripe_index >= map->num_stripes); - *phys = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); @@ -833,33 +864,98 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, return 0; } +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_bio_multi_stripe(struct bio *bio, int err) +#else +static int end_bio_multi_stripe(struct bio *bio, + unsigned int bytes_done, int err) +#endif +{ + struct multi_bio *multi = bio->bi_private; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + if (bio->bi_size) + return 1; +#endif + if (err) + multi->error = err; + + if (atomic_dec_and_test(&multi->stripes)) { + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + + if (!err && multi->error) + err = multi->error; + kfree(multi); + + bio_endio(bio, err); + } else { + bio_put(bio); + } +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + return 0; +#endif +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) { struct 
btrfs_mapping_tree *map_tree; struct btrfs_device *dev; + struct bio *first_bio = bio; u64 logical = bio->bi_sector << 9; u64 physical; u64 length = 0; u64 map_length; struct bio_vec *bvec; + struct multi_bio *multi = NULL; int i; int ret; + int dev_nr = 0; + int total_devs = 1; bio_for_each_segment(bvec, bio, i) { length += bvec->bv_len; } + map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev); - if (map_length < length) { - printk("mapping failed logical %Lu bio len %Lu physical %Lu " - "len %Lu\n", logical, length, physical, map_length); - BUG(); + while(dev_nr < total_devs) { + ret = btrfs_map_block(map_tree, rw, dev_nr, logical, + &physical, &map_length, &dev, + &total_devs); + if (map_length < length) { + printk("mapping failed logical %Lu bio len %Lu physical %Lu " + "len %Lu\n", logical, length, physical, map_length); + BUG(); + } + BUG_ON(map_length < length); + if (total_devs > 1) { + if (!multi) { + multi = kmalloc(sizeof(*multi), GFP_NOFS); + atomic_set(&multi->stripes, 1); + multi->end_io = bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->error = 0; + } else { + atomic_inc(&multi->stripes); + } + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = end_bio_multi_stripe; + } + bio->bi_sector = physical >> 9; + bio->bi_bdev = dev->bdev; + spin_lock(&dev->io_lock); + dev->total_ios++; + spin_unlock(&dev->io_lock); + submit_bio(rw, bio); + dev_nr++; } - BUG_ON(map_length < length); - bio->bi_sector = physical >> 9; - bio->bi_bdev = dev->bdev; - submit_bio(rw, bio); return 0; } @@ -982,6 +1078,8 @@ static int read_one_dev(struct btrfs_root *root, return -ENOMEM; list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + device->total_ios = 0; + spin_lock_init(&device->io_lock); } fill_device_from_item(leaf, dev_item, device); diff --git 
a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 12f297eb0559..35dec3efd78c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -18,12 +18,16 @@ #ifndef __BTRFS_VOLUMES_ #define __BTRFS_VOLUMES_ + struct btrfs_device { struct list_head dev_list; struct btrfs_root *dev_root; + spinlock_t io_lock; struct block_device *bdev; + u64 total_ios; + char *name; /* the internal btrfs device id */ @@ -68,9 +72,9 @@ struct btrfs_fs_devices { int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 owner, u64 num_bytes, u64 *start); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_nr, u64 logical, u64 *phys, u64 *length, - struct btrfs_device **dev); + struct btrfs_device **dev, int *total_stripes); int btrfs_read_sys_array(struct btrfs_root *root); int btrfs_read_chunk_tree(struct btrfs_root *root); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, @@ -80,9 +84,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, - u64 logical, u64 *phys, u64 *length, - struct btrfs_device **dev); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags, void *holder); int btrfs_scan_one_device(const char *path, int flags, void *holder, -- cgit v1.2.3 From cea9e4452ebaf18dd0951e90dc84d82a5dee40b2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 9 Apr 2008 16:28:12 -0400 Subject: Change btrfs_map_block to return a structure with mappings for all stripes Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 +- fs/btrfs/inode.c | 8 ++-- fs/btrfs/volumes.c | 135 +++++++++++++++++++++++++++++------------------------ fs/btrfs/volumes.h | 25 ++++++++-- 4 files changed, 103 
insertions(+), 69 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3c7891cf0d72..c92c6b0ee582 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2025,8 +2025,10 @@ again: root->root_key.objectid, root_gen, disk_key.objectid, 0, l->start, 0); - if (IS_ERR(right)) + if (IS_ERR(right)) { + BUG_ON(1); return PTR_ERR(right); + } memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8ae68c6fbb8..7ae677d8a6de 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -301,12 +301,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_mapping_tree *map_tree; - struct btrfs_device *dev; u64 logical = bio->bi_sector << 9; - u64 physical; u64 length = 0; u64 map_length; - int total_devs; struct bio_vec *bvec; int i; int ret; @@ -316,8 +313,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, } map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, READ, 0, logical, &physical, - &map_length, &dev, &total_devs); + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL); + if (map_length < length + size) { return 1; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b9294e3c05f0..008d3640e8c2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -26,18 +26,6 @@ #include "print-tree.h" #include "volumes.h" -struct stripe { - struct btrfs_device *dev; - u64 physical; -}; - -struct multi_bio { - atomic_t stripes; - bio_end_io_t *end_io; - void *private; - int error; -}; - struct map_lookup { u64 type; int io_align; @@ -45,11 +33,11 @@ struct map_lookup { int stripe_len; int sector_size; int num_stripes; - struct stripe stripes[]; + struct btrfs_bio_stripe stripes[]; }; #define map_lookup_size(n) (sizeof(struct map_lookup) + \ - 
(sizeof(struct stripe) * (n))) + (sizeof(struct btrfs_bio_stripe) * (n))) static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -801,8 +789,8 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, - int dev_nr, u64 logical, u64 *phys, u64 *length, - struct btrfs_device **dev, int *total_devs) + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret) { struct extent_map *em; struct map_lookup *map; @@ -810,8 +798,21 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 offset; u64 stripe_offset; u64 stripe_nr; + int stripes_allocated = 8; int stripe_index; + int i; + struct btrfs_multi_bio *multi = NULL; + if (multi_ret && !(rw & (1 << BIO_RW))) { + stripes_allocated = 1; + } +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + } spin_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -821,6 +822,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; + /* if our multi bio struct is too small, back off and try again */ + if (multi_ret && (rw & (1 << BIO_RW)) && + stripes_allocated < map->num_stripes && + ((map->type & BTRFS_BLOCK_GROUP_RAID1) || + (map->type & BTRFS_BLOCK_GROUP_DUP))) { + stripes_allocated = map->num_stripes; + spin_unlock(&em_tree->lock); + free_extent_map(em); + kfree(multi); + goto again; + } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -834,10 +846,22 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, 
em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + if (!multi_ret) + goto out; + + multi->num_stripes = 1; + stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - stripe_index = dev_nr; if (rw & (1 << BIO_RW)) - *total_devs = map->num_stripes; + multi->num_stripes = map->num_stripes; else { int i; u64 least = (u64)-1; @@ -852,16 +876,10 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, } spin_unlock(&cur->io_lock); } - *total_devs = 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw == WRITE) { - *total_devs = map->num_stripes; - stripe_index = dev_nr; - } else { - stripe_index = 0; - *total_devs = 1; - } + if (rw & (1 << BIO_RW)) + multi->num_stripes = map->num_stripes; } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -871,18 +889,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, stripe_index = do_div(stripe_nr, map->num_stripes); } BUG_ON(stripe_index >= map->num_stripes); - *phys = map->stripes[stripe_index].physical + stripe_offset + - stripe_nr * map->stripe_len; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - /* we limit the length of each bio to what fits in a stripe */ - *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); - } else { - *length = em->len - offset; + BUG_ON(stripe_index != 0 && multi->num_stripes > 1); + + for (i = 0; i < multi->num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + stripe_offset + + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + stripe_index++; } - *dev = map->stripes[stripe_index].dev; + *multi_ret = multi; +out: free_extent_map(em); spin_unlock(&em_tree->lock); return 0; @@ -895,7 +912,7 @@ static int end_bio_multi_stripe(struct bio *bio, unsigned int bytes_done, int err) #endif { - struct multi_bio *multi = bio->bi_private; + struct 
btrfs_multi_bio *multi = bio->bi_private; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) @@ -904,7 +921,7 @@ static int end_bio_multi_stripe(struct bio *bio, if (err) multi->error = err; - if (atomic_dec_and_test(&multi->stripes)) { + if (atomic_dec_and_test(&multi->stripes_pending)) { bio->bi_private = multi->private; bio->bi_end_io = multi->end_io; @@ -927,11 +944,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) struct btrfs_device *dev; struct bio *first_bio = bio; u64 logical = bio->bi_sector << 9; - u64 physical; u64 length = 0; u64 map_length; struct bio_vec *bvec; - struct multi_bio *multi = NULL; + struct btrfs_multi_bio *multi = NULL; int i; int ret; int dev_nr = 0; @@ -943,26 +959,22 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) map_tree = &root->fs_info->mapping_tree; map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk("mapping failed logical %Lu bio len %Lu " + "len %Lu\n", logical, length, map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + atomic_set(&multi->stripes_pending, multi->num_stripes); + while(dev_nr < total_devs) { - ret = btrfs_map_block(map_tree, rw, dev_nr, logical, - &physical, &map_length, &dev, - &total_devs); - if (map_length < length) { - printk("mapping failed logical %Lu bio len %Lu physical %Lu " - "len %Lu\n", logical, length, physical, map_length); - BUG(); - } - BUG_ON(map_length < length); if (total_devs > 1) { - if (!multi) { - multi = kmalloc(sizeof(*multi), GFP_NOFS); - atomic_set(&multi->stripes, 1); - multi->end_io = bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->error = 0; - } else { - atomic_inc(&multi->stripes); - } if (dev_nr < total_devs - 1) { bio = bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); @@ -972,7 +984,8 @@ int btrfs_map_bio(struct 
btrfs_root *root, int rw, struct bio *bio) bio->bi_private = multi; bio->bi_end_io = end_bio_multi_stripe; } - bio->bi_sector = physical >> 9; + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; bio->bi_bdev = dev->bdev; spin_lock(&dev->io_lock); dev->total_ios++; @@ -980,6 +993,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) submit_bio(rw, bio); dev_nr++; } + if (total_devs == 1) + kfree(multi); return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 35dec3efd78c..10ca0104750c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -19,6 +19,8 @@ #ifndef __BTRFS_VOLUMES_ #define __BTRFS_VOLUMES_ +#include + struct btrfs_device { struct list_head dev_list; struct btrfs_root *dev_root; @@ -69,12 +71,29 @@ struct btrfs_fs_devices { struct list_head list; }; +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + void *private; + int error; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 owner, u64 num_bytes, u64 *start); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_nr, - u64 logical, u64 *phys, u64 *length, - struct btrfs_device **dev, int *total_stripes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret); int btrfs_read_sys_array(struct btrfs_root *root); int btrfs_read_chunk_tree(struct btrfs_root *root); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From f188591e987e21b6f7f8864c66a02858b95b530e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 9 Apr 2008 16:28:12 -0400 Subject: Btrfs: Retry metadata reads in the face of 
checksum failures Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 +++++++- fs/btrfs/disk-io.c | 72 ++++++++++++++++++++++++++++++++++++++-------------- fs/btrfs/extent_io.c | 50 +++++++++++++++++++++--------------- fs/btrfs/extent_io.h | 5 ++-- fs/btrfs/inode.c | 7 ++--- fs/btrfs/volumes.c | 39 +++++++++++++++++++++++++--- fs/btrfs/volumes.h | 6 +++-- 7 files changed, 138 insertions(+), 52 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index efce173a935c..ff4e9c6859d1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -569,7 +569,16 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, static int noinline check_block(struct btrfs_root *root, struct btrfs_path *path, int level) { - return 0; + u64 found_start; + if (btrfs_header_level(path->nodes[level]) != level) + printk("warning: bad level %Lu wanted %d found %d\n", + path->nodes[level]->start, level, + btrfs_header_level(path->nodes[level])); + found_start = btrfs_header_bytenr(path->nodes[level]); + if (found_start != path->nodes[level]->start) { + printk("warning: bad bytentr %Lu found %Lu\n", + path->nodes[level]->start, found_start); + } #if 0 struct extent_buffer *buf = path->nodes[level]; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 94ff87d0eae4..59bdf0474be3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -156,7 +156,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, BTRFS_CRC32_SIZE); read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE); - WARN_ON(1); printk("btrfs: %s checksum verify failed on %llu " "wanted %X found %X from_this_trans %d " "level %d\n", @@ -171,6 +170,40 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, return 0; } +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int 
mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret) { + if (mirror_num) +printk("good read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies); + return ret; + } + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); +printk("failed to read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies); + if (num_copies == 1) { +printk("reading %Lu failed only one copy\n", eb->start); + return ret; + } + mirror_num++; + if (mirror_num > num_copies) { +printk("bailing at mirror %d of %d\n", mirror_num, num_copies); + return ret; + } + } +printk("read extent buffer page last\n"); + return -EIO; +} int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { @@ -180,6 +213,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) int found_level; unsigned long len; struct extent_buffer *eb; + int ret; + tree = &BTRFS_I(page->mapping->host)->io_tree; if (page->private == EXTENT_PAGE_PRIVATE) @@ -191,8 +226,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); } eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); - read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1, - btree_get_extent); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE); + BUG_ON(ret); btrfs_clear_buffer_defrag(eb); found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -240,7 +275,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - int ret; + int ret = 0; tree = &BTRFS_I(page->mapping->host)->io_tree; if (page->private == EXTENT_PAGE_PRIVATE) @@ -252,25 +287,26 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, WARN_ON(1); } eb = alloc_extent_buffer(tree, start, len, 
page, GFP_NOFS); - read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1, - btree_get_extent); + btrfs_clear_buffer_defrag(eb); found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n", - start, found_start, len); - WARN_ON(1); +printk("bad start on %Lu found %Lu\n", eb->start, found_start); + ret = -EIO; goto err; } if (eb->first_page != page) { printk("bad first page %lu %lu\n", eb->first_page->index, page->index); WARN_ON(1); + ret = -EIO; goto err; } found_level = btrfs_header_level(eb); ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; @@ -278,7 +314,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, err: free_extent_buffer(eb); out: - return 0; + return ret; } #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) @@ -329,7 +365,8 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 offset; @@ -338,7 +375,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) offset = bio->bi_sector << 9; if (rw & (1 << BIO_RW)) { - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio); + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); } ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); @@ -349,7 +386,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) submit_bio(rw, bio); return 0; } - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio); + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); } static int btree_writepage(struct page *page, struct writeback_control *wbc) @@ -459,7 +496,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 
blocksize) if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent); + buf, 0, 0, btree_get_extent, 0); free_extent_buffer(buf); return ret; } @@ -522,8 +559,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, if (!buf) return NULL; - ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, - 1, btree_get_extent); + ret = btree_read_extent_buffer_pages(root, buf, 0); if (ret == 0) { buf->flags |= EXTENT_UPTODATE; @@ -1366,10 +1402,8 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf) int btrfs_read_buffer(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; int ret; - ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 1, btree_get_extent); + ret = btree_read_extent_buffer_pages(root, buf, 0); if (ret == 0) { buf->flags |= EXTENT_UPTODATE; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cfc383c17a3a..2f159375c878 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1546,7 +1546,7 @@ static int end_bio_extent_readpage(struct bio *bio, !(state->state & EXTENT_LOCKED)) state = NULL; } - if (!state) { + if (!state && uptodate) { spin_unlock_irqrestore(&tree->lock, flags); set_extent_uptodate(tree, start, end, GFP_ATOMIC); @@ -1567,8 +1567,10 @@ static int end_bio_extent_readpage(struct bio *bio, } else { state = NULL; } - set_state_cb(tree, clear, EXTENT_UPTODATE); - clear->state |= EXTENT_UPTODATE; + if (uptodate) { + set_state_cb(tree, clear, EXTENT_UPTODATE); + clear->state |= EXTENT_UPTODATE; + } clear_state_bit(tree, clear, EXTENT_LOCKED, 1, 0); if (cur == start) @@ -1685,7 +1687,7 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio) +static int submit_one_bio(int rw, struct bio *bio, int mirror_num) { u64 maxsector; int 
ret = 0; @@ -1722,7 +1724,8 @@ static int submit_one_bio(int rw, struct bio *bio) WARN_ON(1); } if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(page->mapping->host, rw, bio); + tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) @@ -1737,7 +1740,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, struct block_device *bdev, struct bio **bio_ret, unsigned long max_pages, - bio_end_io_t end_io_func) + bio_end_io_t end_io_func, + int mirror_num) { int ret = 0; struct bio *bio; @@ -1749,7 +1753,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, (tree->ops && tree->ops->merge_bio_hook && tree->ops->merge_bio_hook(page, offset, size, bio)) || bio_add_page(bio, page, size, offset) < size) { - ret = submit_one_bio(rw, bio); + ret = submit_one_bio(rw, bio, mirror_num); bio = NULL; } else { return 0; @@ -1769,7 +1773,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) { *bio_ret = bio; } else { - ret = submit_one_bio(rw, bio); + ret = submit_one_bio(rw, bio, mirror_num); } return ret; @@ -1798,7 +1802,7 @@ void set_page_extent_head(struct page *page, unsigned long len) static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, - struct bio **bio) + struct bio **bio, int mirror_num) { struct inode *inode = page->mapping->host; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; @@ -1901,7 +1905,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, bdev, bio, nr, - end_bio_extent_readpage); + end_bio_extent_readpage, mirror_num); } if (ret) SetPageError(page); @@ -1923,9 +1927,9 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, struct bio *bio = NULL; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio); + ret = 
__extent_read_full_page(tree, page, get_extent, &bio, 0); if (bio) - submit_one_bio(READ, bio); + submit_one_bio(READ, bio, 0); return ret; } EXPORT_SYMBOL(extent_read_full_page); @@ -2077,7 +2081,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = submit_extent_page(WRITE, tree, page, sector, iosize, page_offset, bdev, &epd->bio, max_nr, - end_bio_extent_writepage); + end_bio_extent_writepage, 0); if (ret) SetPageError(page); } @@ -2244,7 +2248,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd); if (epd.bio) { - submit_one_bio(WRITE, epd.bio); + submit_one_bio(WRITE, epd.bio, 0); } return ret; } @@ -2265,7 +2269,7 @@ int extent_writepages(struct extent_io_tree *tree, ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd); if (epd.bio) { - submit_one_bio(WRITE, epd.bio); + submit_one_bio(WRITE, epd.bio, 0); } return ret; } @@ -2297,7 +2301,8 @@ int extent_readpages(struct extent_io_tree *tree, page_cache_get(page); if (!pagevec_add(&pvec, page)) __pagevec_lru_add(&pvec); - __extent_read_full_page(tree, page, get_extent, &bio); + __extent_read_full_page(tree, page, get_extent, + &bio, 0); } page_cache_release(page); } @@ -2305,7 +2310,7 @@ int extent_readpages(struct extent_io_tree *tree, __pagevec_lru_add(&pvec); BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio); + submit_one_bio(READ, bio, 0); return 0; } EXPORT_SYMBOL(extent_readpages); @@ -2430,7 +2435,7 @@ int extent_prepare_write(struct extent_io_tree *tree, ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, - end_bio_extent_preparewrite); + end_bio_extent_preparewrite, 0); iocount++; block_start = block_start + iosize; } else { @@ -2696,6 +2701,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, mark_page_accessed(page0); set_page_extent_mapped(page0); set_page_extent_head(page0, len); + 
uptodate = PageUptodate(page0); } else { i = 0; } @@ -3006,7 +3012,7 @@ EXPORT_SYMBOL(extent_buffer_uptodate); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { unsigned long i; unsigned long start_i; @@ -3062,8 +3068,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (!PageUptodate(page)) { if (start_i == 0) inc_all_pages = 1; + ClearPageError(page); err = __extent_read_full_page(tree, page, - get_extent, &bio); + get_extent, &bio, + mirror_num); if (err) { ret = err; } @@ -3073,7 +3081,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) - submit_one_bio(READ, bio); + submit_one_bio(READ, bio, mirror_num); if (ret || !wait) { return ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9d2991d1d3ce..8d6b8a14cc30 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -27,7 +27,8 @@ struct extent_state; struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); - int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio); + int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio, + int mirror_num); int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); @@ -172,7 +173,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, void free_extent_buffer(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); static inline void extent_buffer_get(struct extent_buffer *eb) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e1ef1acdb350..8c2d5d036bd6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ 
-314,7 +314,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_tree = &root->fs_info->mapping_tree; map_length = length; ret = btrfs_map_block(map_tree, READ, logical, - &map_length, NULL); + &map_length, NULL, 0); if (map_length < length + size) { return 1; @@ -322,7 +322,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, return 0; } -int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) +int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -347,7 +348,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) BUG_ON(ret); mutex_unlock(&root->fs_info->fs_mutex); mapit: - return btrfs_map_bio(root, rw, bio); + return btrfs_map_bio(root, rw, bio, mirror_num); } int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 008d3640e8c2..3b927f698320 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -788,9 +788,31 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } } +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else + ret = 1; + free_extent_map(em); + spin_unlock(&em_tree->lock); + return ret; +} + int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret) + struct btrfs_multi_bio **multi_ret, int mirror_num) { struct extent_map *em; struct map_lookup *map; @@ 
-822,6 +844,9 @@ again: map = (struct map_lookup *)em->bdev; offset = logical - em->start; + if (mirror_num > map->num_stripes) + mirror_num = 0; + /* if our multi bio struct is too small, back off and try again */ if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < map->num_stripes && @@ -862,7 +887,9 @@ again: if (map->type & BTRFS_BLOCK_GROUP_RAID1) { if (rw & (1 << BIO_RW)) multi->num_stripes = map->num_stripes; - else { + else if (mirror_num) { + stripe_index = mirror_num - 1; + } else { int i; u64 least = (u64)-1; struct btrfs_device *cur; @@ -880,6 +907,8 @@ again: } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { if (rw & (1 << BIO_RW)) multi->num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -938,7 +967,8 @@ static int end_bio_multi_stripe(struct bio *bio, #endif } -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num) { struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; @@ -960,7 +990,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio) map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi); + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); BUG_ON(ret); total_devs = multi->num_stripes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 10ca0104750c..3d5d0a9cb827 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -93,7 +93,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 owner, u64 num_bytes, u64 *start); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret); + struct btrfs_multi_bio **multi_ret, int mirror_num); int btrfs_read_sys_array(struct btrfs_root *root); int 
btrfs_read_chunk_tree(struct btrfs_root *root); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, @@ -101,7 +101,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 *num_bytes, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num); int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags, void *holder); @@ -112,4 +113,5 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); int btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); #endif -- cgit v1.2.3 From f29844623de29a12358d7fba35d0959465b64adf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 10 Apr 2008 16:19:33 -0400 Subject: Btrfs: Write out all super blocks on commit, and bring back proper barrier support Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/disk-io.h | 1 + fs/btrfs/volumes.c | 8 ++-- fs/btrfs/volumes.h | 3 ++ 4 files changed, 122 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59bdf0474be3..cf1de75f088a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -382,7 +382,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, BUG_ON(ret); if (offset == BTRFS_SUPER_INFO_OFFSET) { - bio->bi_bdev = root->fs_info->sb->s_bdev; + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; submit_bio(rw, bio); return 0; } @@ -988,7 +988,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->new_trans_lock); 
init_completion(&fs_info->kobj_unregister); - sb_set_blocksize(sb, 4096); + sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE); fs_info->tree_root = tree_root; fs_info->extent_root = extent_root; fs_info->chunk_root = chunk_root; @@ -1169,14 +1169,121 @@ fail: return ERR_PTR(err); } +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +int write_all_supers(struct btrfs_root *root) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct extent_buffer *sb; + struct btrfs_dev_item *dev_item; + struct buffer_head *bh; + int ret; + int do_barriers; + + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = root->fs_info->sb_buffer; + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + btrfs_set_device_type(sb, dev_item, dev->type); + btrfs_set_device_id(sb, dev_item, dev->devid); + btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes); + btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used); + btrfs_set_device_io_align(sb, dev_item, dev->io_align); + btrfs_set_device_io_width(sb, dev_item, dev->io_width); + btrfs_set_device_sector_size(sb, dev_item, dev->sector_size); + write_extent_buffer(sb, dev->uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_DEV_UUID_SIZE); + + btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN); + csum_tree_block(root, sb, 0); + + bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / + root->fs_info->sb->s_blocksize, + BTRFS_SUPER_INFO_SIZE); + + read_extent_buffer(sb, 
bh->b_data, 0, BTRFS_SUPER_INFO_SIZE); + dev->pending_io = bh; + + get_bh(bh); + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + + if (do_barriers && dev->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + dev->name); + set_buffer_uptodate(bh); + dev->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + BUG_ON(ret); + } + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + BUG_ON(!dev->pending_io); + bh = dev->pending_io; + wait_on_buffer(bh); + if (!buffer_uptodate(dev->pending_io)) { + if (do_barriers && dev->barriers) { + printk("btrfs: disabling barriers on dev %s\n", + dev->name); + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + dev->barriers = 0; + ret = submit_bh(WRITE, bh); + BUG_ON(ret); + wait_on_buffer(bh); + BUG_ON(!buffer_uptodate(bh)); + } else { + BUG(); + } + + } + dev->pending_io = NULL; + brelse(bh); + } + return 0; +} + int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - struct extent_buffer *super = root->fs_info->sb_buffer; - struct inode *btree_inode = root->fs_info->btree_inode; - struct super_block *sb = root->fs_info->sb; + ret = write_all_supers(root); +#if 0 if (!btrfs_test_opt(root, NOBARRIER)) blkdev_issue_flush(sb->s_bdev, NULL); set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super); @@ -1184,6 +1291,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root super->start, super->len); if (!btrfs_test_opt(root, NOBARRIER)) blkdev_issue_flush(sb->s_bdev, NULL); +#endif return ret; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4fac0ccbf8f8..60b01902db79 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -20,6 +20,7 @@ #define __DISKIO__ #define BTRFS_SUPER_INFO_OFFSET (16 * 1024) +#define 
BTRFS_SUPER_INFO_SIZE 4096 struct btrfs_device; struct btrfs_fs_devices; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3b927f698320..07d43553141c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,6 +125,7 @@ static int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->barriers = 1; device->name = kstrdup(path, GFP_NOFS); if (!device->name) { kfree(device); @@ -208,6 +209,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder, struct buffer_head *bh; int ret; u64 devid; + u64 transid; mutex_lock(&uuid_mutex); @@ -236,14 +238,14 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder, goto error_brelse; } devid = le64_to_cpu(disk_super->dev_item.devid); - printk("found device %Lu on %s\n", devid, path); + transid = btrfs_super_generation(disk_super); + printk("found device %Lu transid %Lu on %s\n", devid, transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); error_brelse: brelse(bh); error_close: close_bdev_excl(bdev); - printk("scan one closes bdev %s\n", path); error: mutex_unlock(&uuid_mutex); return ret; @@ -1143,7 +1145,7 @@ static int read_one_dev(struct btrfs_root *root, device = btrfs_find_device(root, devid); if (!device) { printk("warning devid %Lu not found already\n", devid); - device = kmalloc(sizeof(*device), GFP_NOFS); + device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) return -ENOMEM; list_add(&device->dev_list, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3d5d0a9cb827..89548837a1cc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,9 +21,12 @@ #include +struct buffer_head; struct btrfs_device { struct list_head dev_list; struct btrfs_root *dev_root; + struct buffer_head *pending_io; + int barriers; spinlock_t io_lock; struct block_device *bdev; -- cgit v1.2.3 From e17cade25ff8074101d653557a78df09c16ca276 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 15 Apr 2008 15:41:47 -0400 Subject: 
Btrfs: Add chunk uuids and update multi-device back references Block headers now store the chunk tree uuid Chunk items records the device uuid for each stripes Device extent items record better back refs to the chunk tree Block groups record better back refs to the chunk tree The chunk tree format has also changed. The objectid of BTRFS_CHUNK_ITEM_KEY used to be the logical offset of the chunk. Now it is a chunk tree id, with the logical offset being stored in the offset field of the key. This allows a single chunk tree to record multiple logical address spaces, upping the number of bytes indexed by a chunk tree from 2^64 to 2^128. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 12 ++++++++ fs/btrfs/ctree.h | 80 ++++++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/disk-io.c | 6 +++- fs/btrfs/extent-tree.c | 22 +++++--------- fs/btrfs/print-tree.c | 20 +++++++++---- fs/btrfs/volumes.c | 76 +++++++++++++++++++++++++++++++---------------- fs/btrfs/volumes.h | 6 ++-- 7 files changed, 160 insertions(+), 62 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e8bf6c221e4e..618e526c9046 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1382,6 +1382,11 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans, write_extent_buffer(c, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(c), BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); lower_gen = btrfs_header_generation(lower); @@ -1513,6 +1518,9 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root write_extent_buffer(split, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(split), BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); mid 
= (c_nritems + 1) / 2; @@ -2043,6 +2051,10 @@ again: write_extent_buffer(right, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(right), BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + space_needed > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 09d614fcafb1..82d67c3db8bc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -70,6 +70,7 @@ extern struct kmem_cache *btrfs_path_cachep; * All files have objectids higher than this. */ #define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL /* @@ -131,7 +132,7 @@ struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -#define BTRFS_DEV_UUID_SIZE 16 +#define BTRFS_UUID_SIZE 16 struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; @@ -154,17 +155,32 @@ struct btrfs_dev_item { /* type and info about this device */ __le64 type; + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_DEV_UUID_SIZE]; + u8 uuid[BTRFS_UUID_SIZE]; } __attribute__ ((__packed__)); struct btrfs_stripe { __le64 devid; __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; } __attribute__ ((__packed__)); struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ __le64 owner; + __le64 stripe_len; __le64 type; @@ -199,10 +215,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) * every tree block (leaf or node) starts with this header. 
*/ struct btrfs_header { + /* these first four must match the super block */ u8 csum[BTRFS_CSUM_SIZE]; u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ __le64 bytenr; /* which block this node is supposed to live in */ __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; __le64 generation; __le64 owner; __le32 nritems; @@ -235,6 +255,8 @@ struct btrfs_super_block { u8 fsid[16]; /* FS specific uuid */ __le64 bytenr; /* this block number */ __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ __le64 magic; __le64 generation; __le64 root; @@ -323,14 +345,16 @@ struct btrfs_extent_ref { /* dev extents record free space on individual devices. The owner * field points back to the chunk allocation mapping tree that allocated - * the extent + * the extent. The chunk tree uuid field is a way to double check the owner */ struct btrfs_dev_extent { - __le64 owner; + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; } __attribute__ ((__packed__)); - struct btrfs_inode_ref { __le16 name_len; /* name goes here */ @@ -424,7 +448,6 @@ struct btrfs_csum_item { struct btrfs_block_group_item { __le64 used; - __le64 chunk_tree; __le64 chunk_objectid; __le64 flags; } __attribute__ ((__packed__)); @@ -451,6 +474,7 @@ struct btrfs_device; struct btrfs_fs_devices; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; @@ -697,6 +721,9 @@ BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); 
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, @@ -710,12 +737,19 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) { return (char *)d + offsetof(struct btrfs_dev_item, uuid); } +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); @@ -726,6 +760,12 @@ BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); @@ -781,13 +821,10 @@ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_FUNCS(disk_block_group_used, 
struct btrfs_block_group_item, used, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item, - chunk_tree, 64); BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(disk_block_group_chunk_objecitd, + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item, flags, 64); @@ -850,9 +887,20 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); /* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + /* struct btrfs_extent_ref */ BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); @@ -1087,6 +1135,12 @@ static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) return (u8 *)ptr; } +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) { unsigned long ptr = offsetof(struct btrfs_super_block, fsid); @@ -1311,7 
+1365,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, - u64 type, u64 chunk_tree, u64 chunk_objectid, + u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); /* ctree.c */ int btrfs_previous_item(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9c94dddde704..79c284c87286 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1125,6 +1125,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize); BUG_ON(!chunk_root->node); + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + ret = btrfs_read_chunk_tree(chunk_root); BUG_ON(ret); @@ -1229,7 +1233,7 @@ int write_all_supers(struct btrfs_root *root) btrfs_set_device_sector_size(sb, dev_item, dev->sector_size); write_extent_buffer(sb, dev->uuid, (unsigned long)btrfs_device_uuid(dev_item), - BTRFS_DEV_UUID_SIZE); + BTRFS_UUID_SIZE); btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN); csum_tree_block(root, sb, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e49147e767df..71f045c63493 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,10 +35,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); static int del_pending_extents(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root); -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytes_used, - u64 type, u64 chunk_tree, u64 chunk_objectid, - u64 size); static int cache_block_group(struct btrfs_root *root, @@ -980,7 +976,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, ret = get_state_private(block_group_cache, start, &ptr); if (ret) break; - cache = (struct btrfs_block_group_cache 
*)(unsigned long)ptr; err = write_one_cache_group(trans, root, path, cache); @@ -1094,8 +1089,7 @@ printk("space info full %Lu\n", flags); BUG_ON(ret); ret = btrfs_make_block_group(trans, extent_root, 0, flags, - extent_root->fs_info->chunk_root->root_key.objectid, - start, num_bytes); + BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); BUG_ON(ret); return 0; @@ -2782,7 +2776,7 @@ error: int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, - u64 type, u64 chunk_tree, u64 chunk_objectid, + u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size) { int ret; @@ -2796,14 +2790,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache = kmalloc(sizeof(*cache), GFP_NOFS); BUG_ON(!cache); - cache->key.objectid = chunk_objectid; + cache->key.objectid = chunk_offset; cache->key.offset = size; cache->cached = 0; cache->pinned = 0; + btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY); memset(&cache->item, 0, sizeof(cache->item)); btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_tree(&cache->item, chunk_tree); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); @@ -2813,12 +2807,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, BUG_ON(ret); bit = block_group_state_bits(type); - set_extent_bits(block_group_cache, chunk_objectid, - chunk_objectid + size - 1, + set_extent_bits(block_group_cache, chunk_offset, + chunk_offset + size - 1, bit | EXTENT_LOCKED, GFP_NOFS); - set_state_private(block_group_cache, chunk_objectid, - (unsigned long)cache); + set_state_private(block_group_cache, chunk_offset, + (unsigned long)cache); ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, sizeof(cache->item)); BUG_ON(ret); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index ee0de112cf5a..e99f3249d05a 100644 --- a/fs/btrfs/print-tree.c +++ 
b/fs/btrfs/print-tree.c @@ -24,7 +24,8 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); int i; - printk("\t\tchunk owner %llu type %llu num_stripes %d\n", + printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), (unsigned long long)btrfs_chunk_owner(eb, chunk), (unsigned long long)btrfs_chunk_type(eb, chunk), num_stripes); @@ -140,17 +141,24 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DEV_EXTENT_KEY: dev_extent = btrfs_item_ptr(l, i, struct btrfs_dev_extent); - printk("\t\tdev extent owner %llu length %llu\n", - (unsigned long long)btrfs_dev_extent_owner(l, dev_extent), - (unsigned long long)btrfs_dev_extent_length(l, dev_extent)); + printk("\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); }; } } void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) { - int i; - u32 nr; + int i; u32 nr; struct btrfs_key key; int level; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f81519f0e4a7..23ebd95b25e0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -180,7 +180,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, list_for_each(cur, head) { device = list_entry(cur, struct btrfs_device, dev_list); bdev = open_bdev_excl(device->name, flags, holder); -printk("opening %s devid %Lu\n", device->name, device->devid); + if (IS_ERR(bdev)) { printk("open %s failed\n", device->name); ret = PTR_ERR(bdev); @@ -190,7 +190,6 @@ printk("opening %s devid %Lu\n", device->name, device->devid); fs_devices->latest_bdev = bdev; 
if (device->devid == fs_devices->lowest_devid) { fs_devices->lowest_bdev = bdev; -printk("lowest bdev %s\n", device->name); } device->bdev = bdev; } @@ -372,7 +371,9 @@ error: int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 owner, u64 num_bytes, u64 *start) + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, + u64 num_bytes, u64 *start) { int ret; struct btrfs_path *path; @@ -400,7 +401,14 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); - btrfs_set_dev_extent_owner(leaf, extent, owner); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); err: @@ -408,17 +416,18 @@ err: return ret; } -static int find_next_chunk(struct btrfs_root *root, u64 *objectid) +static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset) { struct btrfs_path *path; int ret; struct btrfs_key key; + struct btrfs_chunk *chunk; struct btrfs_key found_key; path = btrfs_alloc_path(); BUG_ON(!path); - key.objectid = (u64)-1; + key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -430,11 +439,18 @@ static int find_next_chunk(struct btrfs_root *root, u64 *objectid) ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { - *objectid = 0; + *offset = 0; } else { btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - *objectid = found_key.objectid + found_key.offset; + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct 
btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } } ret = 0; error: @@ -520,9 +536,12 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); ptr = (unsigned long)btrfs_device_uuid(dev_item); - write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -674,7 +693,10 @@ again: return -ENOSPC; } - ret = find_next_chunk(chunk_root, &key.objectid); + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &key.offset); if (ret) return ret; @@ -696,8 +718,9 @@ again: *num_bytes = calc_size * num_stripes; index = 0; -printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes); +printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes); while(index < num_stripes) { + struct btrfs_stripe *stripe; BUG_ON(list_empty(&private_devs)); cur = private_devs.next; device = list_entry(cur, struct btrfs_device, dev_list); @@ -708,26 +731,28 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes list_move_tail(&device->dev_list, dev_list); ret = btrfs_alloc_dev_extent(trans, device, - key.objectid, - calc_size, &dev_offset); + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset, + calc_size, &dev_offset); BUG_ON(ret); -printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type); +printk("alloc chunk start %Lu size %Lu from dev %Lu type 
%Lu\n", key.offset, calc_size, device->devid, type); device->bytes_used += calc_size; ret = btrfs_update_device(trans, device); BUG_ON(ret); map->stripes[index].dev = device; map->stripes[index].physical = dev_offset; - btrfs_set_stack_stripe_devid(stripes + index, device->devid); - btrfs_set_stack_stripe_offset(stripes + index, dev_offset); + stripe = stripes + index; + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); physical = dev_offset; index++; } BUG_ON(!list_empty(&private_devs)); - /* key.objectid was set above */ - key.offset = *num_bytes; - key.type = BTRFS_CHUNK_ITEM_KEY; + /* key was set above */ + btrfs_set_stack_chunk_length(chunk, *num_bytes); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); btrfs_set_stack_chunk_stripe_len(chunk, stripe_len); btrfs_set_stack_chunk_type(chunk, type); @@ -745,14 +770,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c ret = btrfs_insert_item(trans, chunk_root, &key, chunk, btrfs_chunk_item_size(num_stripes)); BUG_ON(ret); - *start = key.objectid; + *start = key.offset;; em = alloc_extent_map(GFP_NOFS); if (!em) return -ENOMEM; em->bdev = (struct block_device *)map; - em->start = key.objectid; - em->len = key.offset; + em->start = key.offset; + em->len = *num_bytes; em->block_start = 0; kfree(chunk); @@ -1056,8 +1081,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, int ret; int i; - logical = key->objectid; - length = key->offset; + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); spin_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); spin_unlock(&map_tree->map_tree.lock); @@ -1131,7 +1156,7 @@ static int fill_device_from_item(struct extent_buffer *leaf, device->sector_size = btrfs_device_sector_size(leaf, dev_item); ptr = (unsigned 
long)btrfs_device_uuid(dev_item); - read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); return 0; } @@ -1143,7 +1168,6 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_device *device; u64 devid; int ret; - devid = btrfs_device_id(leaf, dev_item); device = btrfs_find_device(root, devid); if (!device) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 89548837a1cc..f9cae3072171 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -57,7 +57,7 @@ struct btrfs_device { u64 type; /* physical drive uuid (or lvm uuid) */ - u8 uuid[BTRFS_DEV_UUID_SIZE]; + u8 uuid[BTRFS_UUID_SIZE]; }; struct btrfs_fs_devices { @@ -93,7 +93,9 @@ struct btrfs_multi_bio { int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 owner, u64 num_bytes, u64 *start); + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, + u64 num_bytes, u64 *start); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, struct btrfs_multi_bio **multi_ret, int mirror_num); -- cgit v1.2.3 From f2d8d74d7874f8f81222363cd6459a365796e35a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 21 Apr 2008 10:03:05 -0400 Subject: Btrfs: Make an unplug function that doesn't unplug every spindle Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 26 ++++++++++-------- fs/btrfs/inode.c | 6 +---- fs/btrfs/volumes.c | 79 +++++++++++++++++++++++++++++++++++++++--------------- fs/btrfs/volumes.h | 2 ++ 4 files changed, 75 insertions(+), 38 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c829612c797e..7f5aca35494d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -913,18 +913,22 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { - struct list_head *cur; - struct btrfs_device *device; - 
struct btrfs_fs_info *info; + struct inode *inode = page->mapping->host; + struct extent_map_tree *em_tree; + struct extent_map *em; + u64 offset = page_offset(page); - info = (struct btrfs_fs_info *)bdi->unplug_io_data; - list_for_each(cur, &info->fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) { - bdi->unplug_io_fn(bdi, page); - } - } + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) + return; + + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); } static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b1514e58de4..e875c7c8a647 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -313,13 +313,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, u64 logical = bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct bio_vec *bvec; - int i; int ret; - bio_for_each_segment(bvec, bio, i) { - length += bvec->bv_len; - } + length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; ret = btrfs_map_block(map_tree, READ, logical, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4fc92cf813d8..460240706505 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "ctree.h" #include "extent_map.h" @@ -930,9 +931,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) return ret; } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, - u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 
logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num, struct page *unplug_page) { struct extent_map *em; struct map_lookup *map; @@ -944,6 +946,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripes_required = 1; int stripe_index; int i; + int num_stripes; struct btrfs_multi_bio *multi = NULL; if (multi_ret && !(rw & (1 << BIO_RW))) { @@ -960,10 +963,14 @@ again: spin_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); spin_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; + if (!em) { printk("unable to find logical %Lu\n", logical); + BUG(); } - BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; @@ -1010,14 +1017,15 @@ again: } else { *length = em->len - offset; } - if (!multi_ret) + + if (!multi_ret && !unplug_page) goto out; - multi->num_stripes = 1; + num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (1 << BIO_RW)) - multi->num_stripes = map->num_stripes; + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->num_stripes; else if (mirror_num) { stripe_index = mirror_num - 1; } else { @@ -1037,7 +1045,7 @@ again: } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { if (rw & (1 << BIO_RW)) - multi->num_stripes = map->num_stripes; + num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { @@ -1047,8 +1055,8 @@ again: stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (rw & (1 << BIO_RW)) - multi->num_stripes = map->sub_stripes; + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; else @@ -1063,19 +1071,50 @@ again: } BUG_ON(stripe_index >= map->num_stripes); - for (i = 0; i < multi->num_stripes; i++) { - multi->stripes[i].physical = - map->stripes[stripe_index].physical + stripe_offset + - 
stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + for (i = 0; i < num_stripes; i++) { + if (unplug_page) { + struct btrfs_device *device; + struct backing_dev_info *bdi; + + device = map->stripes[stripe_index].dev; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) { + bdi->unplug_io_fn(bdi, unplug_page); + } + } else { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + } stripe_index++; } - *multi_ret = multi; + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + } out: free_extent_map(em); return 0; } +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num, NULL); +} + +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page) +{ + u64 length = PAGE_CACHE_SIZE; + return __btrfs_map_block(map_tree, READ, logical, &length, + NULL, 0, page); +} + + #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) static void end_bio_multi_stripe(struct bio *bio, int err) #else @@ -1122,16 +1161,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct bio_vec *bvec; struct btrfs_multi_bio *multi = NULL; - int i; int ret; int dev_nr = 0; int total_devs = 1; - bio_for_each_segment(bvec, bio, i) { - length += bvec->bv_len; - } + length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f9cae3072171..dfe7e8a9507c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -119,4 +119,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); int 
btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page); #endif -- cgit v1.2.3 From b30757178dad19a0388d958ff9eea66e674d39ed Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 22 Apr 2008 09:22:07 -0400 Subject: Btrfs: Add a special device list for chunk allocations This allows other code that needs to walk every device in the FS to do so without locking against allocations. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 15 ++++++++++----- fs/btrfs/volumes.h | 7 +++++++ 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 107fc74c3ab8..5619e50583e3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -111,6 +111,7 @@ static int device_list_add(const char *path, if (!fs_devices) return -ENOMEM; INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); list_add(&fs_devices->list, &fs_uuids); memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; @@ -139,6 +140,7 @@ static int device_list_add(const char *path, return -ENOMEM; } list_add(&device->dev_list, &fs_devices->devices); + list_add(&device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->num_devices++; } @@ -660,7 +662,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_device *device = NULL; struct btrfs_chunk *chunk; struct list_head private_devs; - struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices; + struct list_head *dev_list; struct list_head *cur; struct extent_map_tree *em_tree; struct map_lookup *map; @@ -682,6 +684,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int stripe_len = 64 * 1024; struct btrfs_key key; + dev_list = &extent_root->fs_info->fs_devices->alloc_list; if (list_empty(dev_list)) return -ENOSPC; @@ -752,12 +755,12 
@@ again: /* build a private list of devices we will allocate from */ while(index < num_stripes) { - device = list_entry(cur, struct btrfs_device, dev_list); + device = list_entry(cur, struct btrfs_device, dev_alloc_list); avail = device->total_bytes - device->bytes_used; cur = cur->next; if (avail >= min_free) { - list_move_tail(&device->dev_list, &private_devs); + list_move_tail(&device->dev_alloc_list, &private_devs); index++; if (type & BTRFS_BLOCK_GROUP_DUP) index++; @@ -812,12 +815,12 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes); struct btrfs_stripe *stripe; BUG_ON(list_empty(&private_devs)); cur = private_devs.next; - device = list_entry(cur, struct btrfs_device, dev_list); + device = list_entry(cur, struct btrfs_device, dev_alloc_list); /* loop over this device again if we're doing a dup group */ if (!(type & BTRFS_BLOCK_GROUP_DUP) || (index == num_stripes - 1)) - list_move_tail(&device->dev_list, dev_list); + list_move_tail(&device->dev_alloc_list, dev_list); ret = btrfs_alloc_dev_extent(trans, device, info->chunk_root->root_key.objectid, @@ -1329,6 +1332,8 @@ static int read_one_dev(struct btrfs_root *root, return -ENOMEM; list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); device->barriers = 1; spin_lock_init(&device->io_lock); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index dfe7e8a9507c..a2660d2b6b34 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -24,9 +24,12 @@ struct buffer_head; struct btrfs_device { struct list_head dev_list; + struct list_head dev_alloc_list; struct btrfs_root *dev_root; struct buffer_head *pending_io; + int barriers; + spinlock_t io_lock; struct block_device *bdev; @@ -70,7 +73,11 @@ struct btrfs_fs_devices { u64 num_devices; struct block_device *latest_bdev; struct block_device *lowest_bdev; + /* all of the devices in the FS */ struct list_head devices; + + /* devices not 
currently being allocated */ + struct list_head alloc_list; struct list_head list; }; -- cgit v1.2.3 From 8f18cf13396caae5a3d7ae91201cfb15181a9642 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 25 Apr 2008 16:53:30 -0400 Subject: Btrfs: Make the resizer work based on shrinking and growing devices Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 4 + fs/btrfs/extent-tree.c | 145 +++++++++------------- fs/btrfs/inode.c | 36 ++++-- fs/btrfs/volumes.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 5 + 6 files changed, 407 insertions(+), 108 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f00c4be59ad6..ac7106ec5357 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -474,6 +474,7 @@ struct btrfs_block_group_cache { u64 pinned; u64 flags; int cached; + int ro; }; struct btrfs_device; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 71838264ca6b..a9ce491d279b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -818,6 +818,10 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->tree_root; if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)location->objectid); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c49592c5127a..6540095544e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -187,6 +187,7 @@ static int noinline find_search_start(struct btrfs_root *root, if (!cache) goto out; + total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); free_space_cache = &root->fs_info->free_space_cache; @@ -196,7 +197,7 @@ again: goto out; last = max(search_start, cache->key.objectid); - if 
(!block_group_bits(cache, data)) { + if (!block_group_bits(cache, data) || cache->ro) { goto new_group; } @@ -221,6 +222,8 @@ again: continue; } spin_unlock_irq(&free_space_cache->lock); + if (cache->ro) + goto new_group; if (start + num > cache->key.objectid + cache->key.offset) goto new_group; if (start + num > total_fs_bytes) @@ -319,7 +322,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, if (search_start && search_start < total_fs_bytes) { struct btrfs_block_group_cache *shint; shint = btrfs_lookup_block_group(info, search_start); - if (shint && block_group_bits(shint, data)) { + if (shint && block_group_bits(shint, data) && !shint->ro) { used = btrfs_block_group_used(&shint->item); if (used + shint->pinned < div_factor(shint->key.offset, factor)) { @@ -327,7 +330,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, } } } - if (hint && block_group_bits(hint, data) && + if (hint && !hint->ro && block_group_bits(hint, data) && hint->key.objectid < total_fs_bytes) { used = btrfs_block_group_used(&hint->item); if (used + hint->pinned < @@ -364,7 +367,7 @@ again: if (cache->key.objectid > total_fs_bytes) break; - if (block_group_bits(cache, data)) { + if (!cache->ro && block_group_bits(cache, data)) { if (full_search) free_check = cache->key.offset; else @@ -1020,6 +1023,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (found) { found->total_bytes += total_bytes; found->bytes_used += bytes_used; + found->full = 0; WARN_ON(found->total_bytes < found->bytes_used); *space_info = found; return 0; @@ -1700,7 +1704,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 super_used; u64 root_used; u64 search_start = 0; - u64 new_hint; u64 alloc_profile; u32 sizes[2]; struct btrfs_fs_info *info = root->fs_info; @@ -1724,7 +1727,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; } again: - if (root->ref_cows) { 
+ if (root != root->fs_info->extent_root) { if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2 * 1024 * 1024, @@ -1738,10 +1741,6 @@ again: BUG_ON(ret); } - new_hint = max(hint_byte, root->fs_info->alloc_start); - if (new_hint < btrfs_super_total_bytes(&info->super_copy)) - hint_byte = new_hint; - WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, search_start, search_end, hint_byte, ins, @@ -2473,15 +2472,16 @@ out: return ret; } -int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size) +int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_path *path; u64 cur_byte; u64 total_found; + u64 shrink_last_byte; + struct btrfs_block_group_cache *shrink_block_group; struct btrfs_fs_info *info = root->fs_info; - struct extent_io_tree *block_group_cache; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -2489,17 +2489,29 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size) int ret; int progress = 0; - btrfs_set_super_total_bytes(&info->super_copy, new_size); - clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1, - GFP_NOFS); - block_group_cache = &info->block_group_cache; + shrink_block_group = btrfs_lookup_block_group(root->fs_info, + shrink_start); + BUG_ON(!shrink_block_group); + + shrink_last_byte = shrink_start + shrink_block_group->key.offset; + + shrink_block_group->space_info->total_bytes -= + shrink_block_group->key.offset; +printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags); path = btrfs_alloc_path(); root = root->fs_info->extent_root; path->reada = 2; again: + trans = btrfs_start_transaction(root, 1); + do_chunk_alloc(trans, root->fs_info->extent_root, + btrfs_block_group_used(&shrink_block_group->item) + + 2 * 1024 * 
1024, shrink_block_group->flags); + btrfs_end_transaction(trans, root); + shrink_block_group->ro = 1; + total_found = 0; - key.objectid = new_size; + key.objectid = shrink_start; key.offset = 0; key.type = 0; cur_byte = key.objectid; @@ -2511,10 +2523,12 @@ again: ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); if (ret < 0) goto out; + if (ret == 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid + found_key.offset > new_size) { + if (found_key.objectid + found_key.offset > shrink_start && + found_key.objectid < shrink_last_byte) { cur_byte = found_key.objectid; key.objectid = cur_byte; } @@ -2543,6 +2557,9 @@ next: btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid >= shrink_last_byte) + break; + if (progress && need_resched()) { memcpy(&key, &found_key, sizeof(key)); mutex_unlock(&root->fs_info->fs_mutex); @@ -2583,68 +2600,31 @@ next: goto again; } + /* + * we've freed all the extents, now remove the block + * group item from the tree + */ trans = btrfs_start_transaction(root, 1); - key.objectid = new_size; - key.offset = 0; - key.type = 0; - while(1) { - u64 ptr; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); -bg_next: - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - break; - if (ret == 1) { - ret = 0; - break; - } - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + memcpy(&key, &shrink_block_group->key, sizeof(key)); - /* - * btrfs_next_leaf doesn't cow buffers, we have to - * do the search again - */ - memcpy(&key, &found_key, sizeof(key)); - btrfs_release_path(root, path); - goto resched_check; - } + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if 
(btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) { - printk("shrinker found key %Lu %u %Lu\n", - found_key.objectid, found_key.type, - found_key.offset); - path->slots[0]++; - goto bg_next; - } - ret = get_state_private(&info->block_group_cache, - found_key.objectid, &ptr); - if (!ret) - kfree((void *)(unsigned long)ptr); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + kfree(shrink_block_group); - clear_extent_bits(&info->block_group_cache, found_key.objectid, - found_key.objectid + found_key.offset - 1, - (unsigned int)-1, GFP_NOFS); + clear_extent_bits(&info->block_group_cache, found_key.objectid, + found_key.objectid + found_key.offset - 1, + (unsigned int)-1, GFP_NOFS); - key.objectid = found_key.objectid + 1; - btrfs_del_item(trans, root, path); - btrfs_release_path(root, path); -resched_check: - if (need_resched()) { - mutex_unlock(&root->fs_info->fs_mutex); - cond_resched(); - mutex_lock(&root->fs_info->fs_mutex); - } - } - clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1, + btrfs_del_item(trans, root, path); + clear_extent_dirty(&info->free_space_cache, + shrink_start, shrink_last_byte - 1, GFP_NOFS); btrfs_commit_transaction(trans, root); out: @@ -2652,13 +2632,6 @@ out: return ret; } -int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 new_size) -{ - btrfs_set_super_total_bytes(&root->fs_info->super_copy, new_size); - return 0; -} - int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) { @@ -2726,7 +2699,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kmalloc(sizeof(*cache), GFP_NOFS); + cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) { ret = -ENOMEM; break; @@ -2736,8 +2709,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_item_ptr_offset(leaf, 
path->slots[0]), sizeof(cache->item)); memcpy(&cache->key, &found_key, sizeof(found_key)); - cache->cached = 0; - cache->pinned = 0; key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); @@ -2789,12 +2760,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; block_group_cache = &root->fs_info->block_group_cache; - cache = kmalloc(sizeof(*cache), GFP_NOFS); + cache = kzalloc(sizeof(*cache), GFP_NOFS); BUG_ON(!cache); cache->key.objectid = chunk_offset; cache->key.offset = size; - cache->cached = 0; - cache->pinned = 0; btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY); memset(&cache->item, 0, sizeof(cache->item)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b31f52d4f2ca..4d12aa532c5b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include #include #include #include @@ -2887,9 +2888,12 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) { u64 new_size; u64 old_size; + u64 devid = 1; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; char *sizestr; + char *devstr = NULL; int ret = 0; int namelen; int mod = 0; @@ -2909,9 +2913,25 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) goto out; } + mutex_lock(&root->fs_info->fs_mutex); sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); +printk("resizing devid %Lu\n", devid); + } + device = btrfs_find_device(root, devid, NULL); + if (!device) { + printk("resizer unable to find device %Lu\n", devid); + ret = -EINVAL; + goto out_unlock; + } if (!strcmp(sizestr, "max")) - new_size = root->fs_info->sb->s_bdev->bd_inode->i_size; + new_size = device->bdev->bd_inode->i_size; else { if (sizestr[0] == '-') { mod = 
-1; @@ -2923,12 +2943,11 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) new_size = btrfs_parse_size(sizestr); if (new_size == 0) { ret = -EINVAL; - goto out; + goto out_unlock; } } - mutex_lock(&root->fs_info->fs_mutex); - old_size = btrfs_super_total_bytes(&root->fs_info->super_copy); + old_size = device->total_bytes; if (mod < 0) { if (new_size > old_size) { @@ -2944,7 +2963,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) ret = -EINVAL; goto out_unlock; } - if (new_size > root->fs_info->sb->s_bdev->bd_inode->i_size) { + if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; goto out_unlock; } @@ -2952,13 +2971,14 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) do_div(new_size, root->sectorsize); new_size *= root->sectorsize; -printk("new size is %Lu\n", new_size); +printk("new size for %s is %llu\n", device->name, (unsigned long long)new_size); + if (new_size > old_size) { trans = btrfs_start_transaction(root, 1); - ret = btrfs_grow_extent_tree(trans, root, new_size); + ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { - ret = btrfs_shrink_extent_tree(root, new_size); + ret = btrfs_shrink_device(device, new_size); } out_unlock: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c63a982e31d0..a2c56de1548a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -77,7 +77,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid, list_for_each(cur, head) { dev = list_entry(cur, struct btrfs_device, dev_list); if (dev->devid == devid && - !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) { + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { return dev; } } @@ -293,6 +293,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans, * so we make sure to start at an offset of at least 1MB */ search_start = max((u64)1024 * 1024, search_start); + + if (root->fs_info->alloc_start + 
num_bytes <= device->total_bytes) + search_start = max(root->fs_info->alloc_start, search_start); + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; @@ -380,6 +384,33 @@ error: return ret; } +int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} + int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 chunk_tree, u64 chunk_objectid, @@ -560,6 +591,7 @@ out: btrfs_free_path(path); return ret; } + int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { @@ -606,6 +638,254 @@ out: return ret; } +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + return btrfs_update_device(trans, device); +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = 
btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + + +int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_shrink_extent_tree(extent_root, chunk_offset); + BUG_ON(ret); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + 
spin_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + goto out; + } + + + + spin_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + spin_unlock(&em_tree->lock); + +out: + /* once for us */ + free_extent_map(em); + + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. + * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = device->total_bytes - new_size; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + + path->reada = 2; + + device->total_bytes = new_size; + ret = btrfs_update_device(trans, device); + if (ret) { + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - 
diff); + btrfs_end_transaction(trans, root); + + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + goto done; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) + goto done; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) + goto done; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(root, path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret) + goto done; + } + +done: + btrfs_free_path(path); + return ret; +} + int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, @@ -658,6 +938,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 dev_offset; struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_path *path; struct btrfs_stripe *stripes; struct btrfs_device *device = NULL; struct btrfs_chunk *chunk; @@ -724,6 +1005,10 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, min_stripe_size = 1 * 1024 * 1024; } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + /* we don't want a chunk larger than 10% of the FS */ percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1); max_chunk_size = min(percent_max, max_chunk_size); @@ -759,11 +1044,19 @@ again: avail = device->total_bytes - device->bytes_used; cur = cur->next; + if (avail >= min_free) { - 
list_move_tail(&device->dev_alloc_list, &private_devs); - index++; - if (type & BTRFS_BLOCK_GROUP_DUP) + u64 ignored_start = 0; + ret = find_free_dev_extent(trans, device, path, + min_free, + &ignored_start); + if (ret == 0) { + list_move_tail(&device->dev_alloc_list, + &private_devs); index++; + if (type & BTRFS_BLOCK_GROUP_DUP) + index++; + } } else if (avail > max_avail) max_avail = avail; if (cur == dev_list) @@ -785,30 +1078,37 @@ again: calc_size = max_avail; goto again; } + btrfs_free_path(path); return -ENOSPC; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.type = BTRFS_CHUNK_ITEM_KEY; ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, &key.offset); - if (ret) + if (ret) { + btrfs_free_path(path); return ret; + } chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS); - if (!chunk) + if (!chunk) { + btrfs_free_path(path); return -ENOMEM; + } map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { kfree(chunk); + btrfs_free_path(path); return -ENOMEM; } + btrfs_free_path(path); + path = NULL; stripes = &chunk->stripe; *num_bytes = chunk_bytes_by_type(type, calc_size, num_stripes, sub_stripes); - index = 0; printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes); while(index < num_stripes) { @@ -874,6 +1174,11 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal em->len = *num_bytes; em->block_start = 0; + if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, + chunk, btrfs_chunk_item_size(num_stripes)); + BUG_ON(ret); + } kfree(chunk); em_tree = &extent_root->fs_info->mapping_tree.map_tree; @@ -1376,11 +1681,6 @@ int btrfs_read_sys_array(struct btrfs_root *root) array_size = btrfs_super_sys_array_size(super_copy); - /* - * we do this loop twice, once for the device items and - * once for all of the chunks. 
This way there are device - * structs filled in for every chunk - */ ptr = super_copy->sys_chunk_array; sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); cur = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a2660d2b6b34..6fe8440b37a7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -128,4 +128,9 @@ int btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, u64 logical, struct page *page); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); #endif -- cgit v1.2.3 From 788f20eb5affef584e75ea84bb80a4c3352a2c0e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 28 Apr 2008 15:29:42 -0400 Subject: Btrfs: Add new ioctl to add devices Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/inode.c | 23 +++++++++++++++++ fs/btrfs/ioctl.h | 6 +++++ fs/btrfs/super.c | 1 + fs/btrfs/volumes.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 6 files changed, 108 insertions(+) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ac7106ec5357..b9257b37bb96 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -551,6 +551,8 @@ struct btrfs_fs_info { u64 data_alloc_profile; u64 metadata_alloc_profile; u64 system_alloc_profile; + + void *bdev_holder; }; /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ab707c0930df..1a74b5018699 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3070,6 +3070,27 @@ static int btrfs_ioctl_defrag(struct file *file) return 0; } +long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); 
+ + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3082,6 +3103,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(root, (void __user *)arg); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, (void __user *)arg); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 4551e82013c8..8ad35fc4ba56 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -36,4 +36,10 @@ struct btrfs_ioctl_vol_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 892864906880..7153dfaa3404 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -388,6 +388,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type, goto error; } + btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a2c56de1548a..b93c15aa17db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "ctree.h" #include "extent_map.h" @@ -592,6 +593,80 @@ out: return ret; } +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *cur; + struct list_head *devices; + u64 total_bytes; + int ret = 0; 
+ + + bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder); + if (!bdev) { + return -EIO; + } + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 1); + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev == bdev) { + ret = -EEXIST; + goto out; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto out_close_bdev; + } + + device->barriers = 1; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + goto out_close_bdev; + } + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + + ret = btrfs_add_device(trans, root, device); + if (ret) + goto out_close_bdev; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; +out: + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + return ret; + +out_close_bdev: + close_bdev_excl(bdev); + goto out; +} + int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6fe8440b37a7..6f173450378b 100644 --- a/fs/btrfs/volumes.h +++ 
b/fs/btrfs/volumes.h @@ -133,4 +133,5 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); #endif -- cgit v1.2.3 From ec44a35cbeb26ab2da84cb280d778260f2312feb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 28 Apr 2008 15:29:52 -0400 Subject: Btrfs: Add balance ioctl to restripe the chunks Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 106 +++++++++++++++++++++++++++++++++++++-------- fs/btrfs/inode.c | 11 +++++ fs/btrfs/volumes.c | 115 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/volumes.h | 1 + 5 files changed, 208 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9257b37bb96..73b92dd150ff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1364,7 +1364,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner, u64 owner_offset, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, int data); + u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fe4fe709c312..95aee5a29375 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -17,6 +17,7 @@ */ #include #include +#include #include "hash.h" #include "crc32c.h" #include "ctree.h" @@ -1058,6 +1059,26 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } } +static u64 reduce_alloc_profile(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) + flags &= 
~BTRFS_BLOCK_GROUP_DUP; + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags) @@ -1068,6 +1089,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 num_bytes; int ret; + flags = reduce_alloc_profile(flags); + space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, @@ -1684,6 +1707,7 @@ enospc: error: return ret; } + /* * finds a free extent and does all the dirty work required for allocation * returns the key for the extent through ins, and a tree buffer for @@ -1697,7 +1721,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 ref_generation, u64 owner, u64 owner_offset, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, int data) + u64 search_end, struct btrfs_key *ins, u64 data) { int ret; int pending_ret; @@ -1727,6 +1751,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; } again: + data = reduce_alloc_profile(data); if (root->ref_cows) { if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, @@ -1752,6 +1777,9 @@ again: num_bytes = max(num_bytes, min_alloc_size); goto again; } + if (ret) { + printk("allocation failed flags %Lu\n", data); + } BUG_ON(ret); if (ret) return ret; @@ -2274,8 +2302,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, { u64 page_start; u64 page_end; - u64 delalloc_start; - u64 existing_delalloc; unsigned long last_index; unsigned long i; struct page *page; @@ 
-2293,7 +2319,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages; file_ra_state_init(ra, inode->i_mapping); - kfree(ra); for (; i <= last_index; i++) { if (total_read % ra_pages == 0) { @@ -2313,26 +2338,30 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, goto out_unlock; } } +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + ClearPageDirty(page); +#else + cancel_dirty_page(page, PAGE_CACHE_SIZE); +#endif + wait_on_page_writeback(page); + set_page_extent_mapped(page); page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; lock_extent(io_tree, page_start, page_end, GFP_NOFS); - delalloc_start = page_start; - existing_delalloc = count_range_bits(io_tree, - &delalloc_start, page_end, - PAGE_CACHE_SIZE, EXTENT_DELALLOC); - + set_page_dirty(page); set_extent_delalloc(io_tree, page_start, page_end, GFP_NOFS); unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - set_page_dirty(page); unlock_page(page); page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); } out_unlock: + kfree(ra); mutex_unlock(&inode->i_mutex); return 0; } @@ -2397,8 +2426,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root, goto out; } relocate_inode_pages(inode, ref_offset, extent_key->offset); - /* FIXME, data=ordered will help get rid of this */ - filemap_fdatawrite(inode->i_mapping); iput(inode); mutex_lock(&extent_root->fs_info->fs_mutex); } else { @@ -2486,6 +2513,47 @@ out: return ret; } +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy); + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + 
if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & BTRFS_BLOCK_GROUP_RAID1)) { + } + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) { struct btrfs_trans_handle *trans; @@ -2494,6 +2562,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) u64 cur_byte; u64 total_found; u64 shrink_last_byte; + u64 new_alloc_flags; struct btrfs_block_group_cache *shrink_block_group; struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; @@ -2511,17 +2580,20 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) shrink_block_group->space_info->total_bytes -= shrink_block_group->key.offset; -printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags); path = btrfs_alloc_path(); root = root->fs_info->extent_root; path->reada = 2; again: - trans = btrfs_start_transaction(root, 1); - do_chunk_alloc(trans, root->fs_info->extent_root, + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + trans = btrfs_start_transaction(root, 1); + new_alloc_flags = update_block_group_flags(root, + shrink_block_group->flags); + do_chunk_alloc(trans, root->fs_info->extent_root, btrfs_block_group_used(&shrink_block_group->item) + - 2 * 1024 * 1024, shrink_block_group->flags); - btrfs_end_transaction(trans, root); + 2 * 1024 * 1024, 
new_alloc_flags); + btrfs_end_transaction(trans, root); + } shrink_block_group->ro = 1; total_found = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1a74b5018699..994834474590 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2864,6 +2864,15 @@ int btrfs_defrag_file(struct file *file) { goto out_unlock; } } + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + ClearPageDirty(page); +#else + cancel_dirty_page(page, PAGE_CACHE_SIZE); +#endif + wait_on_page_writeback(page); + set_page_extent_mapped(page); + page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -3105,6 +3114,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_resize(root, (void __user *)arg); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, (void __user *)arg); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); } return -ENOTTY; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b93c15aa17db..6476ecbf132e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -869,6 +869,107 @@ out: return 0; } +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *cur; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + + dev_root = dev_root->fs_info->dev_root; + + mutex_lock(&dev_root->fs_info->fs_mutex); + /* step one make some room on all the devices */ + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free 
= min(size_to_free, (u64)1 * 1024 * 1024); + if (device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 1); + BUG_ON(!trans); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while(1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) { + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); + key.offset = found_key.offset; + /* chunk zero is special */ + if (key.offset == 0) + break; + + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + btrfs_release_path(chunk_root, path); + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->fs_mutex); + return ret; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. 
@@ -985,15 +1086,6 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, return 0; } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes, int sub_stripes) { @@ -1040,6 +1132,11 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int stripe_len = 64 * 1024; struct btrfs_key key; + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } dev_list = &extent_root->fs_info->fs_devices->alloc_list; if (list_empty(dev_list)) return -ENOSPC; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6f173450378b..3f9a17f2e41b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -134,4 +134,5 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); #endif -- cgit v1.2.3 From a236aed14ccb0661611d4416f6b573d892bdc60a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 29 Apr 2008 09:38:00 -0400 Subject: Btrfs: Deal with failed writes in mirrored configurations Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 17 +++++++++++++++-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/volumes.c | 17 ++++++++++++++--- fs/btrfs/volumes.h | 3 ++- 4 files changed, 33 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c7d84aff864..e35e70165b53 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1385,7 +1385,10 @@ int write_all_supers(struct btrfs_root *root) struct buffer_head *bh; int ret; int do_barriers; + int max_errors; + int total_errors = 0; + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, 
NOBARRIER); sb = root->fs_info->sb_buffer; @@ -1433,8 +1436,14 @@ int write_all_supers(struct btrfs_root *root) } else { ret = submit_bh(WRITE, bh); } - BUG_ON(ret); + if (ret) + total_errors++; } + if (total_errors > max_errors) { + printk("btrfs: %d errors while writing supers\n", total_errors); + BUG(); + } + total_errors = 0; list_for_each(cur, head) { dev = list_entry(cur, struct btrfs_device, dev_list); @@ -1454,13 +1463,17 @@ int write_all_supers(struct btrfs_root *root) wait_on_buffer(bh); BUG_ON(!buffer_uptodate(bh)); } else { - BUG(); + total_errors++; } } dev->pending_io = NULL; brelse(bh); } + if (total_errors > max_errors) { + printk("btrfs: %d errors while writing supers\n", total_errors); + BUG(); + } return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 95aee5a29375..f94794a99329 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -315,8 +315,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root, block_group_cache = &info->block_group_cache; total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - if (!owner) - factor = 10; + if (data & BTRFS_BLOCK_GROUP_METADATA) + factor = 9; bit = block_group_state_bits(data); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9a7241134560..57ab755aca76 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1425,6 +1425,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_index; int i; int num_stripes; + int max_errors = 0; struct btrfs_multi_bio *multi = NULL; if (multi_ret && !(rw & (1 << BIO_RW))) { @@ -1436,6 +1437,8 @@ again: GFP_NOFS); if (!multi) return -ENOMEM; + + atomic_set(&multi->error, 0); } spin_lock(&em_tree->lock); @@ -1462,8 +1465,10 @@ again: if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { stripes_required = map->num_stripes; + max_errors = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripes_required = map->sub_stripes; + 
max_errors = 1; } } if (multi_ret && rw == WRITE && @@ -1561,6 +1566,7 @@ again: if (multi_ret) { *multi_ret = multi; multi->num_stripes = num_stripes; + multi->max_errors = max_errors; } out: free_extent_map(em); @@ -1598,14 +1604,19 @@ static int end_bio_multi_stripe(struct bio *bio, return 1; #endif if (err) - multi->error = err; + atomic_inc(&multi->error); if (atomic_dec_and_test(&multi->stripes_pending)) { bio->bi_private = multi->private; bio->bi_end_io = multi->end_io; - if (!err && multi->error) - err = multi->error; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) + err = -EIO; + else + err = 0; kfree(multi); #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3f9a17f2e41b..a9663e92bb14 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -90,7 +90,8 @@ struct btrfs_multi_bio { atomic_t stripes_pending; bio_end_io_t *end_io; void *private; - int error; + atomic_t error; + int max_errors; int num_stripes; struct btrfs_bio_stripe stripes[]; }; -- cgit v1.2.3 From a061fc8da7b990faa41ca503e66faef3ecdeead0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 7 May 2008 11:43:44 -0400 Subject: Btrfs: Add support for online device removal This required a few structural changes to the code that manages bdev pointers: The VFS super block now gets an anon-bdev instead of a pointer to the lowest bdev. This allows us to avoid swapping the super block bdev pointer around at run time. The code to read in the super block no longer goes through the extent buffer interface. Things got ugly keeping the mapping constant. 
Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +- fs/btrfs/disk-io.c | 101 ++++++++++------------ fs/btrfs/extent-tree.c | 139 +++++++++++++++++++++++++------ fs/btrfs/inode.c | 27 +++++- fs/btrfs/super.c | 35 ++------ fs/btrfs/transaction.c | 5 +- fs/btrfs/volumes.c | 221 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 3 + 8 files changed, 412 insertions(+), 122 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3b6f8524a4ad..33ab165591c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -505,7 +505,7 @@ struct btrfs_fs_info { u64 alloc_start; struct btrfs_transaction *running_transaction; struct btrfs_super_block super_copy; - struct extent_buffer *sb_buffer; + struct btrfs_super_block super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -1208,6 +1208,7 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fabc31b334b6..9d5424ad01a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -78,9 +78,13 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, spin_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - spin_unlock(&em_tree->lock); - if (em) + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); goto out; + } + spin_unlock(&em_tree->lock); em = alloc_extent_map(GFP_NOFS); if (!em) { @@ -90,7 +94,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, em->start = 0; 
em->len = (u64)-1; em->block_start = 0; - em->bdev = inode->i_sb->s_bdev; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -435,11 +439,6 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); BUG_ON(ret); - if (offset == BTRFS_SUPER_INFO_OFFSET) { - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; - submit_bio(rw, bio); - return 0; - } return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); } @@ -587,8 +586,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info) list = &fs_info->fs_devices->devices; list_for_each(next, list) { device = list_entry(next, struct btrfs_device, dev_list); - if (device->bdev && device->bdev != fs_info->sb->s_bdev) - close_bdev_excl(device->bdev); + close_bdev_excl(device->bdev); device->bdev = NULL; } return 0; @@ -1118,6 +1116,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, u32 leafsize; u32 blocksize; u32 stripesize; + struct buffer_head *bh; struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), @@ -1153,7 +1152,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->new_trans_lock); init_completion(&fs_info->kobj_unregister); - sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE); fs_info->tree_root = tree_root; fs_info->extent_root = extent_root; fs_info->chunk_root = chunk_root; @@ -1170,6 +1168,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + /* * we set the i_size on the btree inode to the max possible int. 
* the real end of the address space is determined by all of @@ -1229,19 +1230,16 @@ struct btrfs_root *open_ctree(struct super_block *sb, __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - fs_info->sb_buffer = read_tree_block(tree_root, - BTRFS_SUPER_INFO_OFFSET, - 4096); - if (!fs_info->sb_buffer) + bh = __bread(fs_devices->latest_bdev, + BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) goto fail_iput; - read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0, - sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + brelse(bh); - read_extent_buffer(fs_info->sb_buffer, fs_info->fsid, - (unsigned long)btrfs_super_fsid(fs_info->sb_buffer), - BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); disk_super = &fs_info->super_copy; if (!btrfs_super_root(disk_super)) @@ -1263,7 +1261,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; tree_root->stripesize = stripesize; - sb_set_blocksize(sb, sectorsize); + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, sizeof(disk_super->magic))) { @@ -1339,7 +1339,6 @@ fail_tree_root: fail_sys_array: mutex_unlock(&fs_info->fs_mutex); fail_sb_buffer: - free_extent_buffer(fs_info->sb_buffer); extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); fail_iput: iput(fs_info->btree_inode); @@ -1380,41 +1379,44 @@ int write_all_supers(struct btrfs_root *root) struct list_head *cur; struct list_head *head = &root->fs_info->fs_devices->devices; struct btrfs_device *dev; - struct extent_buffer *sb; + struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; struct buffer_head *bh; int ret; int do_barriers; int max_errors; int total_errors = 0; + u32 crc; + u64 flags; max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 
do_barriers = !btrfs_test_opt(root, NOBARRIER); - sb = root->fs_info->sb_buffer; - dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, - dev_item); + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; list_for_each(cur, head) { dev = list_entry(cur, struct btrfs_device, dev_list); - btrfs_set_device_type(sb, dev_item, dev->type); - btrfs_set_device_id(sb, dev_item, dev->devid); - btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes); - btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used); - btrfs_set_device_io_align(sb, dev_item, dev->io_align); - btrfs_set_device_io_width(sb, dev_item, dev->io_width); - btrfs_set_device_sector_size(sb, dev_item, dev->sector_size); - write_extent_buffer(sb, dev->uuid, - (unsigned long)btrfs_device_uuid(dev_item), - BTRFS_UUID_SIZE); - - btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN); - csum_tree_block(root, sb, 0); - - bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / - root->fs_info->sb->s_blocksize, + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + + crc = ~(u32)0; + crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096, BTRFS_SUPER_INFO_SIZE); - read_extent_buffer(sb, bh->b_data, 0, BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); dev->pending_io = bh; get_bh(bh); @@ -1483,15 +1485,6 @@ int 
write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root int ret; ret = write_all_supers(root); -#if 0 - if (!btrfs_test_opt(root, NOBARRIER)) - blkdev_issue_flush(sb->s_bdev, NULL); - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super); - ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping, - super->start, super->len); - if (!btrfs_test_opt(root, NOBARRIER)) - blkdev_issue_flush(sb->s_bdev, NULL); -#endif return ret; } @@ -1570,8 +1563,6 @@ int close_ctree(struct btrfs_root *root) if (root->fs_info->dev_root->node); free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(fs_info->sb_buffer); - btrfs_free_block_groups(root->fs_info); del_fs_roots(fs_info); @@ -1652,7 +1643,7 @@ void btrfs_throttle(struct btrfs_root *root) { struct backing_dev_info *bdi; - bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = &root->fs_info->bdi; if (root->fs_info->throttles && bdi_write_congested(bdi)) { #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18) congestion_wait(WRITE, HZ/20); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f94794a99329..c0e67bde8428 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -147,6 +147,8 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct u64 end; int ret; + bytenr = max_t(u64, bytenr, + BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE); block_group_cache = &info->block_group_cache; ret = find_first_extent_bit(block_group_cache, bytenr, &start, &end, @@ -1059,16 +1061,25 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } } -static u64 reduce_alloc_profile(u64 flags) +static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags) { + u64 num_devices = root->fs_info->fs_devices->num_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + if ((flags & BTRFS_BLOCK_GROUP_DUP) && (flags & 
(BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) + BTRFS_BLOCK_GROUP_RAID10))) { flags &= ~BTRFS_BLOCK_GROUP_DUP; + } if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) + (flags & BTRFS_BLOCK_GROUP_RAID10)) { flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | @@ -1078,7 +1089,6 @@ static u64 reduce_alloc_profile(u64 flags) return flags; } - static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags) @@ -1089,7 +1099,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 num_bytes; int ret; - flags = reduce_alloc_profile(flags); + flags = reduce_alloc_profile(extent_root, flags); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { @@ -1169,6 +1179,21 @@ static int update_block_group(struct btrfs_trans_handle *trans, return 0; } +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + u64 start; + u64 end; + int ret; + ret = find_first_extent_bit(&root->fs_info->block_group_cache, + search_start, &start, &end, + BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA | + BLOCK_GROUP_SYSTEM); + if (ret) + return 0; + return start; +} + + static int update_pinned_extents(struct btrfs_root *root, u64 bytenr, u64 num, int pin) { @@ -1185,16 +1210,25 @@ static int update_pinned_extents(struct btrfs_root *root, } while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); - WARN_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); + if (!cache) { + u64 first = first_logical_byte(root, bytenr); + WARN_ON(first < bytenr); + len = min(first - bytenr, num); + } else { + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + } if (pin) { - cache->pinned += len; - cache->space_info->bytes_pinned += len; + if (cache) { + cache->pinned += len; + cache->space_info->bytes_pinned += len; + } fs_info->total_pinned += len; 
} else { - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; + if (cache) { + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + } fs_info->total_pinned -= len; } bytenr += len; @@ -1547,7 +1581,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, int data) { int ret; - u64 orig_search_start = search_start; + u64 orig_search_start; struct btrfs_root * root = orig_root->fs_info->extent_root; struct btrfs_fs_info *info = root->fs_info; u64 total_needed = num_bytes; @@ -1577,6 +1611,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans, } } + search_start = max(search_start, first_logical_byte(root, 0)); + orig_search_start = search_start; + if (search_end == (u64)-1) search_end = btrfs_super_total_bytes(&info->super_copy); @@ -1751,7 +1788,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; } again: - data = reduce_alloc_profile(data); + data = reduce_alloc_profile(root, data); if (root->ref_cows) { if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, @@ -2309,6 +2346,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, struct file_ra_state *ra; unsigned long total_read = 0; unsigned long ra_pages; + struct btrfs_trans_handle *trans; ra = kzalloc(sizeof(*ra), GFP_NOFS); @@ -2326,9 +2364,13 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, calc_ra(i, last_index, ra_pages)); } total_read++; + if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size) + goto truncate_racing; + page = grab_cache_page(inode->i_mapping, i); - if (!page) + if (!page) { goto out_unlock; + } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -2350,20 +2392,33 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start, lock_extent(io_tree, page_start, page_end, GFP_NOFS); - set_page_dirty(page); set_extent_delalloc(io_tree, page_start, 
page_end, GFP_NOFS); + set_page_dirty(page); unlock_extent(io_tree, page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); } + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + total_read); out_unlock: kfree(ra); + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (trans) { + btrfs_add_ordered_inode(inode); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + mark_inode_dirty(inode); + } mutex_unlock(&inode->i_mutex); return 0; + +truncate_racing: + vmtruncate(inode, inode->i_size); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + total_read); + goto out_unlock; } /* @@ -2466,6 +2521,27 @@ out: return 0; } +static int noinline del_extent_zero(struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + int ret; + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(extent_root, 1); + ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); + if (ret > 0) { + ret = -EIO; + goto out; + } + if (ret < 0) + goto out; + ret = btrfs_del_item(trans, extent_root, path); +out: + btrfs_end_transaction(trans, extent_root); + return ret; +} + static int noinline relocate_one_extent(struct btrfs_root *extent_root, struct btrfs_path *path, struct btrfs_key *extent_key) @@ -2477,6 +2553,10 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root, u32 item_size; int ret = 0; + if (extent_key->objectid == 0) { + ret = del_extent_zero(extent_root, path, extent_key); + goto out; + } key.objectid = extent_key->objectid; key.type = BTRFS_EXTENT_REF_KEY; key.offset = 0; @@ -2490,15 +2570,24 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root, ret = 0; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); - if (path->slots[0] == nritems) - goto out; + if (path->slots[0] == nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret > 0) { + ret = 0; + goto out; + } 
+ if (ret < 0) + goto out; + } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != extent_key->objectid) + if (found_key.objectid != extent_key->objectid) { break; + } - if (found_key.type != BTRFS_EXTENT_REF_KEY) + if (found_key.type != BTRFS_EXTENT_REF_KEY) { break; + } key.offset = found_key.offset + 1; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -2519,7 +2608,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy); + num_devices = root->fs_info->fs_devices->num_devices; if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -2535,9 +2624,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } else { /* they already had raid on here, just return */ - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & BTRFS_BLOCK_GROUP_RAID1)) { - } if (flags & stripped) return flags; @@ -2570,7 +2656,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start) struct extent_buffer *leaf; u32 nritems; int ret; - int progress = 0; + int progress; shrink_block_group = btrfs_lookup_block_group(root->fs_info, shrink_start); @@ -2597,6 +2683,7 @@ again: shrink_block_group->ro = 1; total_found = 0; + progress = 0; key.objectid = shrink_start; key.offset = 0; key.type = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f7beb9b0d37a..b437d3bdf95e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2194,6 +2194,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, again: spin_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; spin_unlock(&em_tree->lock); if (em) { @@ -2212,7 +2214,7 @@ again: em->start = EXTENT_MAP_HOLE; em->len = (u64)-1; - em->bdev = 
inode->i_sb->s_bdev; + em->bdev = root->fs_info->fs_devices->latest_bdev; ret = btrfs_lookup_file_extent(trans, root, path, objectid, start, trans != NULL); if (ret < 0) { @@ -3101,6 +3103,27 @@ out: return ret; } +long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + int dup_item_to_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -3294,6 +3317,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_resize(root, (void __user *)arg); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, (void __user *)arg); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, (void __user *)arg); case BTRFS_IOC_BALANCE: return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_CLONE: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7153dfaa3404..020e5a83e31f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -315,24 +315,12 @@ static void btrfs_write_super(struct super_block *sb) sb->s_dirt = 0; } -/* - * This is almost a copy of get_sb_bdev in fs/super.c. - * We need the local copy to allow direct mounting of - * subvolumes, but this could be easily integrated back - * into the generic version. 
--hch - */ - -/* start copy & paste */ -static int set_bdev_super(struct super_block *s, void *data) +static int btrfs_test_super(struct super_block *s, void *data) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - return 0; -} + struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *root = btrfs_sb(s); -static int test_bdev_super(struct super_block *s, void *data) -{ - return (void *)s->s_bdev == data; + return root->fs_info->fs_devices == test_fs_devices; } int btrfs_get_sb_bdev(struct file_system_type *fs_type, @@ -354,14 +342,9 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type, return error; bdev = fs_devices->lowest_bdev; - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - down(&bdev->bd_mount_sem); - s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); - up(&bdev->bd_mount_sem); + btrfs_lock_volumes(); + s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + btrfs_unlock_volumes(); if (IS_ERR(s)) goto error_s; @@ -373,13 +356,11 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type, goto error_bdev; } - close_bdev_excl(bdev); } else { char b[BDEVNAME_SIZE]; s->s_flags = flags; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - sb_set_blocksize(s, block_size(bdev)); error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 
1 : 0); if (error) { @@ -458,7 +439,7 @@ static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .get_sb = btrfs_get_sb, - .kill_sb = kill_block_super, + .kill_sb = kill_anon_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9826942fa18a..57746c11eae3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -738,9 +738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, chunk_root->node->start); btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, btrfs_header_level(chunk_root->node)); - write_extent_buffer(root->fs_info->sb_buffer, - &root->fs_info->super_copy, 0, - sizeof(root->fs_info->super_copy)); + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); btrfs_copy_pinned(root, pinned_copy); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b38187573108..55da5f0c56e3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -45,6 +45,16 @@ struct map_lookup { static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +void btrfs_lock_volumes(void) +{ + mutex_lock(&uuid_mutex); +} + +void btrfs_unlock_volumes(void) +{ + mutex_unlock(&uuid_mutex); +} + int btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -193,12 +203,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, ret = PTR_ERR(bdev); goto fail; } + set_blocksize(bdev, 4096); if (device->devid == fs_devices->latest_devid) fs_devices->latest_bdev = bdev; if (device->devid == fs_devices->lowest_devid) { fs_devices->lowest_bdev = bdev; } device->bdev = bdev; + } mutex_unlock(&uuid_mutex); return 0; @@ -393,6 +405,9 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_root *root = device->dev_root; struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; path = 
btrfs_alloc_path(); if (!path) @@ -403,8 +418,25 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_EXTENT_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + BUG_ON(ret); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + ret = 0; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } BUG_ON(ret); + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); ret = btrfs_del_item(trans, root, path); BUG_ON(ret); @@ -593,6 +625,170 @@ out: return ret; } +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct block_device *bdev = device->bdev; + struct btrfs_device *next_dev; + struct btrfs_key key; + u64 total_bytes; + struct btrfs_fs_devices *fs_devices; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* + * at this point, the device is zero sized. 
We want to + * remove it from the devices list and zero out the old super + */ + list_del_init(&device->dev_list); + list_del_init(&device->dev_alloc_list); + fs_devices = root->fs_info->fs_devices; + + next_dev = list_entry(fs_devices->devices.next, struct btrfs_device, + dev_list); + if (bdev == fs_devices->lowest_bdev) + fs_devices->lowest_bdev = next_dev->bdev; + if (bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_dev->bdev; + if (bdev == fs_devices->latest_bdev) + fs_devices->latest_bdev = next_dev->bdev; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes - device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes - 1); +out: + btrfs_free_path(path); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct block_device *bdev; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 all_avail; + u64 devid; + int ret = 0; + + mutex_lock(&root->fs_info->fs_mutex); + mutex_lock(&uuid_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->num_devices <= 4) { + printk("btrfs: unable to go below four devices on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->num_devices <= 2) { + printk("btrfs: unable to go below two devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) { + ret = 
-EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + ret = -ENOENT; + goto error_brelse; + } + if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) { + ret = -ENOENT; + goto error_brelse; + } + devid = le64_to_cpu(disk_super->dev_item.devid); + device = btrfs_find_device(root, devid, NULL); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + + root->fs_info->fs_devices->num_devices--; + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_brelse; + + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_brelse; + + /* make sure this device isn't detected as part of the FS anymore */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + + brelse(bh); + + /* one close for the device struct or super_block */ + close_bdev_excl(device->bdev); + + /* one close for us */ + close_bdev_excl(device->bdev); + + kfree(device->name); + kfree(device); + ret = 0; + goto out; + +error_brelse: + brelse(bh); +error_close: + close_bdev_excl(bdev); +out: + mutex_unlock(&uuid_mutex); + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + int btrfs_init_new_device(struct btrfs_root *root, char *device_path) { struct btrfs_trans_handle *trans; @@ -831,13 +1027,17 @@ int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); spin_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset); + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, map->stripes[i].physical); BUG_ON(ret); + + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); } ret = btrfs_free_chunk(trans, root, chunk_tree, 
chunk_objectid, chunk_offset); @@ -847,11 +1047,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root, if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); BUG_ON(ret); - goto out; } - - spin_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); kfree(map); @@ -861,7 +1058,6 @@ int btrfs_relocate_chunk(struct btrfs_root *root, free_extent_map(em); spin_unlock(&em_tree->lock); -out: /* once for us */ free_extent_map(em); @@ -1449,7 +1645,7 @@ again: return 0; if (!em) { - printk("unable to find logical %Lu\n", logical); + printk("unable to find logical %Lu len %Lu\n", logical, *length); BUG(); } @@ -1712,6 +1908,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + spin_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); spin_unlock(&map_tree->map_tree.lock); @@ -1845,7 +2042,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) int btrfs_read_sys_array(struct btrfs_root *root) { struct btrfs_super_block *super_copy = &root->fs_info->super_copy; - struct extent_buffer *sb = root->fs_info->sb_buffer; + struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -1857,6 +2054,12 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 cur; struct btrfs_key key; + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; @@ -1867,8 +2070,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) disk_key = (struct btrfs_disk_key *)ptr; btrfs_disk_key_to_cpu(&key, disk_key); - len = sizeof(*disk_key); - ptr += len; + len = sizeof(*disk_key); ptr += len; sb_ptr += len; cur += 
len; @@ -1887,6 +2089,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_ptr += len; cur += len; } + free_extent_buffer(sb); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a9663e92bb14..0f94a69e6eb6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -125,6 +125,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); int btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, @@ -136,4 +137,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_root *dev_root); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); #endif -- cgit v1.2.3 From dfe25020689bb2d318782d2c9c7141203583fc70 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 May 2008 13:46:40 -0400 Subject: Btrfs: Add mount -o degraded to allow mounts to continue with missing devices Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 + fs/btrfs/disk-io.c | 49 ++++++---- fs/btrfs/disk-io.h | 3 +- fs/btrfs/super.c | 25 +++-- fs/btrfs/volumes.c | 279 ++++++++++++++++++++++++++++++++++++++--------------- fs/btrfs/volumes.h | 3 + 6 files changed, 253 insertions(+), 109 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6c2c2c4e4d2a..a9377fb16c37 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -657,6 +657,7 @@ struct btrfs_root { #define BTRFS_MOUNT_NODATACOW (1 << 1) #define BTRFS_MOUNT_NOBARRIER (1 << 2) #define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) #define btrfs_clear_opt(o, opt) ((o) &= 
~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1606,4 +1607,6 @@ int btrfs_delete_xattrs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); /* super.c */ u64 btrfs_parse_size(char *str); +int btrfs_parse_options(char *options, struct btrfs_root *root, + char **subvol_name); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 574b1245964e..38b0d9ecda6a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -614,21 +614,6 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, return ret; } -static int close_all_devices(struct btrfs_fs_info *fs_info) -{ - struct list_head *list; - struct list_head *next; - struct btrfs_device *device; - - list = &fs_info->fs_devices->devices; - list_for_each(next, list) { - device = list_entry(next, struct btrfs_device, dev_list); - close_bdev_excl(device->bdev); - device->bdev = NULL; - } - return 0; -} - struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { @@ -927,6 +912,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) list_for_each(cur, &info->fs_devices->devices) { device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; bdi = blk_get_backing_dev_info(device->bdev); if (bdi && bdi_congested(bdi, bdi_bits)) { ret = 1; @@ -1140,7 +1127,8 @@ static void btrfs_async_submit_work(struct work_struct *work) } struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices) + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -1276,12 +1264,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_super_root(disk_super)) goto fail_sb_buffer; - if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) { + btrfs_parse_options(options, tree_root, NULL); + + if (btrfs_super_num_devices(disk_super) > fs_devices->num_devices) { printk("Btrfs: wanted %llu 
devices, but found %llu\n", (unsigned long long)btrfs_super_num_devices(disk_super), (unsigned long long)fs_devices->num_devices); - goto fail_sb_buffer; + if (btrfs_test_opt(tree_root, DEGRADED)) + printk("continuing in degraded mode\n"); + else { + goto fail_sb_buffer; + } } + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); nodesize = btrfs_super_nodesize(disk_super); @@ -1329,6 +1324,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); BUG_ON(ret); + btrfs_close_extra_devices(fs_devices); + blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); @@ -1374,7 +1371,7 @@ fail_sb_buffer: fail_iput: iput(fs_info->btree_inode); fail: - close_all_devices(fs_info); + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); kfree(extent_root); @@ -1429,6 +1426,13 @@ int write_all_supers(struct btrfs_root *root) dev_item = &sb->dev_item; list_for_each(cur, head) { dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata) + continue; + btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); @@ -1482,6 +1486,11 @@ int write_all_supers(struct btrfs_root *root) list_for_each(cur, head) { dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata) + continue; + BUG_ON(!dev->pending_io); bh = dev->pending_io; wait_on_buffer(bh); @@ -1631,7 +1640,7 @@ int close_ctree(struct btrfs_root *root) kfree(hasher); } #endif - close_all_devices(fs_info); + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 30d1ed293c25..2bc64fefe6ea 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -33,7 
+33,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices); + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 020e5a83e31f..273a5b511f50 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -65,11 +65,13 @@ static void btrfs_put_super (struct super_block * sb) } enum { - Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, - Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_err, + Opt_degraded, Opt_subvol, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_err, }; static match_table_t tokens = { + {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, {Opt_nodatasum, "nodatasum"}, {Opt_nodatacow, "nodatacow"}, @@ -106,9 +108,8 @@ u64 btrfs_parse_size(char *str) return res; } -static int parse_options (char * options, - struct btrfs_root *root, - char **subvol_name) +int btrfs_parse_options(char *options, struct btrfs_root *root, + char **subvol_name) { char * p; struct btrfs_fs_info *info = NULL; @@ -135,6 +136,12 @@ static int parse_options (char * options, token = match_token(p, tokens, args); switch (token) { + case Opt_degraded: + if (info) { + printk("btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + } + break; case Opt_subvol: if (subvol_name) { *subvol_name = match_strdup(&args[0]); @@ -234,7 +241,7 @@ static int btrfs_fill_super(struct super_block * sb, sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; - tree_root = open_ctree(sb, fs_devices); + tree_root = open_ctree(sb, fs_devices, (char *)data); if (IS_ERR(tree_root)) { 
printk("btrfs: open_ctree failed\n"); @@ -267,8 +274,6 @@ static int btrfs_fill_super(struct super_block * sb, goto fail_close; } - parse_options((char *)data, tree_root, NULL); - /* this does the super kobj at the same time */ err = btrfs_sysfs_add_super(tree_root->fs_info); if (err) @@ -341,7 +346,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type, if (error) return error; - bdev = fs_devices->lowest_bdev; + bdev = fs_devices->latest_bdev; btrfs_lock_volumes(); s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); btrfs_unlock_volumes(); @@ -411,7 +416,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int ret; char *subvol_name = NULL; - parse_options((char *)data, NULL, &subvol_name); + btrfs_parse_options((char *)data, NULL, &subvol_name); ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt, subvol_name ? subvol_name : "default"); if (subvol_name) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5fc7fb481474..43f74d17bcea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -73,6 +73,7 @@ int btrfs_cleanup_fs_uuids(void) close_bdev_excl(dev->bdev); } list_del(&dev->dev_list); + kfree(dev->name); kfree(dev); } } @@ -127,7 +128,6 @@ static int device_list_add(const char *path, memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; - fs_devices->lowest_devid = (u64)-1; fs_devices->num_devices = 0; device = NULL; } else { @@ -159,13 +159,35 @@ static int device_list_add(const char *path, fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; } - if (fs_devices->lowest_devid > devid) { - fs_devices->lowest_devid = devid; - } *fs_devices_ret = fs_devices; return 0; } +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); +again: + list_for_each(cur, head) { + 
device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->in_fs_metadata) { +printk("getting rid of extra dev %s\n", device->name); + if (device->bdev) + close_bdev_excl(device->bdev); + list_del(&device->dev_list); + list_del(&device->dev_alloc_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + goto again; + } + } + mutex_unlock(&uuid_mutex); + return 0; +} int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct list_head *head = &fs_devices->devices; @@ -179,6 +201,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) close_bdev_excl(device->bdev); } device->bdev = NULL; + device->in_fs_metadata = 0; } mutex_unlock(&uuid_mutex); return 0; @@ -199,6 +222,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (device->bdev) continue; + if (!device->name) + continue; + bdev = open_bdev_excl(device->name, flags, holder); if (IS_ERR(bdev)) { @@ -209,10 +235,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); if (device->devid == fs_devices->latest_devid) fs_devices->latest_bdev = bdev; - if (device->devid == fs_devices->lowest_devid) { - fs_devices->lowest_bdev = bdev; - } device->bdev = bdev; + device->in_fs_metadata = 0; } mutex_unlock(&uuid_mutex); @@ -439,7 +463,8 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); ret = btrfs_del_item(trans, root, path); BUG_ON(ret); @@ -460,6 +485,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; + WARN_ON(!device->in_fs_metadata); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -674,8 +700,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, next_dev = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); - if (bdev == 
fs_devices->lowest_bdev) - fs_devices->lowest_bdev = next_dev->bdev; if (bdev == root->fs_info->sb->s_bdev) root->fs_info->sb->s_bdev = next_dev->bdev; if (bdev == fs_devices->latest_bdev) @@ -698,7 +722,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) { struct btrfs_device *device; struct block_device *bdev; - struct buffer_head *bh; + struct buffer_head *bh = NULL; struct btrfs_super_block *disk_super; u64 all_avail; u64 devid; @@ -712,47 +736,73 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_metadata_alloc_bits; if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { + btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) { printk("btrfs: unable to go below four devices on raid10\n"); ret = -EINVAL; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { + btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) { printk("btrfs: unable to go below two devices on raid1\n"); ret = -EINVAL; goto out; } - bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto out; - } + if (strcmp(device_path, "missing") == 0) { + struct list_head *cur; + struct list_head *devices; + struct btrfs_device *tmp; - bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096); - if (!bh) { - ret = -EIO; - goto error_close; - } - disk_super = (struct btrfs_super_block *)bh->b_data; - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { - ret = -ENOENT; - goto error_brelse; - } - if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) { - ret = -ENOENT; - goto error_brelse; - } - devid = le64_to_cpu(disk_super->dev_item.devid); - device = btrfs_find_device(root, devid, NULL); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } + device = NULL; + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + 
tmp = list_entry(cur, struct btrfs_device, dev_list); + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk("btrfs: no missing devices found to remove\n"); + goto out; + } + + } else { + bdev = open_bdev_excl(device_path, 0, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + ret = -ENOENT; + goto error_brelse; + } + if (memcmp(disk_super->fsid, root->fs_info->fsid, + BTRFS_FSID_SIZE)) { + ret = -ENOENT; + goto error_brelse; + } + devid = le64_to_cpu(disk_super->dev_item.devid); + device = btrfs_find_device(root, devid, NULL); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } root->fs_info->fs_devices->num_devices--; ret = btrfs_shrink_device(device, 0); @@ -764,19 +814,25 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_brelse; - /* make sure this device isn't detected as part of the FS anymore */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - - brelse(bh); - - /* one close for the device struct or super_block */ - close_bdev_excl(device->bdev); + if (bh) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); - /* one close for us */ - close_bdev_excl(device->bdev); + brelse(bh); + } + if (device->bdev) { + /* one close for the device struct or super_block */ + close_bdev_excl(device->bdev); + } + if (bdev) { + /* one close for us */ + close_bdev_excl(bdev); + } kfree(device->name); kfree(device); ret = 0; @@ -785,7 +841,8 @@ int 
btrfs_rm_device(struct btrfs_root *root, char *device_path) error_brelse: brelse(bh); error_close: - close_bdev_excl(bdev); + if (bdev) + close_bdev_excl(bdev); out: mutex_unlock(&uuid_mutex); mutex_unlock(&root->fs_info->fs_mutex); @@ -839,6 +896,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->total_bytes = i_size_read(bdev->bd_inode); device->dev_root = root->fs_info->dev_root; device->bdev = bdev; + device->in_fs_metadata = 1; ret = btrfs_add_device(trans, root, device); if (ret) @@ -1041,8 +1099,10 @@ int btrfs_relocate_chunk(struct btrfs_root *root, map->stripes[i].physical); BUG_ON(ret); - ret = btrfs_update_device(trans, map->stripes[i].dev); - BUG_ON(ret); + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } } ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, chunk_offset); @@ -1415,10 +1475,13 @@ again: while(index < num_stripes) { device = list_entry(cur, struct btrfs_device, dev_alloc_list); - avail = device->total_bytes - device->bytes_used; + if (device->total_bytes > device->bytes_used) + avail = device->total_bytes - device->bytes_used; + else + avail = 0; cur = cur->next; - if (avail >= min_free) { + if (device->in_fs_metadata && avail >= min_free) { u64 ignored_start = 0; ret = find_free_dev_extent(trans, device, path, min_free, @@ -1430,7 +1493,7 @@ again: if (type & BTRFS_BLOCK_GROUP_DUP) index++; } - } else if (avail > max_avail) + } else if (device->in_fs_metadata && avail > max_avail) max_avail = avail; if (cur == dev_list) break; @@ -1610,6 +1673,22 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) return ret; } +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. 
Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, struct btrfs_multi_bio **multi_ret, @@ -1712,8 +1791,11 @@ again: num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; - else - stripe_index = current->pid % map->num_stripes; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { if (rw & (1 << BIO_RW)) @@ -1731,8 +1813,11 @@ again: num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; - else - stripe_index += current->pid % map->sub_stripes; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes); + } } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -1749,9 +1834,11 @@ again: struct backing_dev_info *bdi; device = map->stripes[stripe_index].dev; - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) { - bdi->unplug_io_fn(bdi, unplug_page); + if (device->bdev) { + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) { + bdi->unplug_io_fn(bdi, unplug_page); + } } } else { multi->stripes[i].physical = @@ -1880,12 +1967,21 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } bio->bi_sector = multi->stripes[dev_nr].physical >> 9; dev = multi->stripes[dev_nr].dev; - - bio->bi_bdev = dev->bdev; - spin_lock(&dev->io_lock); - dev->total_ios++; - spin_unlock(&dev->io_lock); - submit_bio(rw, bio); + if (dev && dev->bdev) { + bio->bi_bdev = dev->bdev; + spin_lock(&dev->io_lock); + dev->total_ios++; + spin_unlock(&dev->io_lock); + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + 
bio_endio(bio, bio->bi_size, -EIO); +#else + bio_endio(bio, -EIO); +#endif + } dev_nr++; } if (total_devs == 1) @@ -1901,6 +1997,27 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, return __find_device(head, devid, uuid); } +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + list_add(&device->dev_list, + &fs_devices->devices); + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + device->barriers = 1; + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + fs_devices->num_devices++; + spin_lock_init(&device->io_lock); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + + static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -1965,11 +2082,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(root, devid, uuid); - if (!map->stripes[i].dev) { + + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { kfree(map); free_extent_map(em); return -EIO; } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; } spin_lock(&map_tree->map_tree.lock); @@ -2016,20 +2144,15 @@ static int read_one_dev(struct btrfs_root *root, BTRFS_UUID_SIZE); device = btrfs_find_device(root, devid, dev_uuid); if (!device) { - printk("warning devid %Lu not found already\n", devid); - device = kzalloc(sizeof(*device), GFP_NOFS); + printk("warning devid %Lu missing\n", devid); + device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; - 
list_add(&device->dev_list, - &root->fs_info->fs_devices->devices); - list_add(&device->dev_alloc_list, - &root->fs_info->fs_devices->alloc_list); - device->barriers = 1; - spin_lock_init(&device->io_lock); } fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; ret = 0; #if 0 ret = btrfs_open_device(device); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 0f94a69e6eb6..454fe8103329 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -27,8 +27,10 @@ struct btrfs_device { struct list_head dev_alloc_list; struct btrfs_root *dev_root; struct buffer_head *pending_io; + u64 generation; int barriers; + int in_fs_metadata; spinlock_t io_lock; @@ -122,6 +124,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, int flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); -- cgit v1.2.3 From a0af469b58944f6e8c5c8ecbebb42997baf0cb9e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 13 May 2008 16:03:06 -0400 Subject: Fix btrfs_open_devices to deal with changes since the scan ioctls Devices can change after the scan ioctls are done, and btrfs_open_devices needs to be able to verify them as they are opened and used by the FS. 
Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/volumes.c | 70 +++++++++++++++++++++++++++++++++++++++++++++--------- fs/btrfs/volumes.h | 4 ++-- 3 files changed, 63 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 38b0d9ecda6a..264f297260f8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1266,10 +1266,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_parse_options(options, tree_root, NULL); - if (btrfs_super_num_devices(disk_super) > fs_devices->num_devices) { + if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) { printk("Btrfs: wanted %llu devices, but found %llu\n", (unsigned long long)btrfs_super_num_devices(disk_super), - (unsigned long long)fs_devices->num_devices); + (unsigned long long)fs_devices->open_devices); if (btrfs_test_opt(tree_root, DEGRADED)) printk("continuing in degraded mode\n"); else { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 43f74d17bcea..501d23d3ebfd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -71,6 +71,7 @@ int btrfs_cleanup_fs_uuids(void) dev_list); if (dev->bdev) { close_bdev_excl(dev->bdev); + fs_devices->open_devices--; } list_del(&dev->dev_list); kfree(dev->name); @@ -174,9 +175,10 @@ again: list_for_each(cur, head) { device = list_entry(cur, struct btrfs_device, dev_list); if (!device->in_fs_metadata) { -printk("getting rid of extra dev %s\n", device->name); - if (device->bdev) + if (device->bdev) { close_bdev_excl(device->bdev); + fs_devices->open_devices--; + } list_del(&device->dev_list); list_del(&device->dev_alloc_list); fs_devices->num_devices--; @@ -188,6 +190,7 @@ printk("getting rid of extra dev %s\n", device->name); mutex_unlock(&uuid_mutex); return 0; } + int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct list_head *head = &fs_devices->devices; @@ -199,10 +202,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) device 
= list_entry(cur, struct btrfs_device, dev_list); if (device->bdev) { close_bdev_excl(device->bdev); + fs_devices->open_devices--; } device->bdev = NULL; device->in_fs_metadata = 0; } + fs_devices->mounted = 0; mutex_unlock(&uuid_mutex); return 0; } @@ -214,9 +219,19 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct list_head *head = &fs_devices->devices; struct list_head *cur; struct btrfs_device *device; - int ret; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 transid; + u64 devid; + int ret = 0; mutex_lock(&uuid_mutex); + if (fs_devices->mounted) + goto out; + list_for_each(cur, head) { device = list_entry(cur, struct btrfs_device, dev_list); if (device->bdev) @@ -229,21 +244,52 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (IS_ERR(bdev)) { printk("open %s failed\n", device->name); - ret = PTR_ERR(bdev); - goto fail; + goto error; } set_blocksize(bdev, 4096); - if (device->devid == fs_devices->latest_devid) - fs_devices->latest_bdev = bdev; + + bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) + goto error_close; + + disk_super = (struct btrfs_super_block *)bh->b_data; + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) + goto error_brelse; + + devid = le64_to_cpu(disk_super->dev_item.devid); + if (devid != device->devid) + goto error_brelse; + + transid = btrfs_super_generation(disk_super); + if (transid > latest_transid) { + latest_devid = devid; + latest_transid = transid; + latest_bdev = bdev; + } + device->bdev = bdev; device->in_fs_metadata = 0; + fs_devices->open_devices++; + continue; +error_brelse: + brelse(bh); +error_close: + close_bdev_excl(bdev); +error: + continue; } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->mounted = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = 
latest_devid; + fs_devices->latest_trans = latest_transid; +out: mutex_unlock(&uuid_mutex); - return 0; -fail: - mutex_unlock(&uuid_mutex); - btrfs_close_devices(fs_devices); return ret; } @@ -828,6 +874,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { /* one close for the device struct or super_block */ close_bdev_excl(device->bdev); + root->fs_info->fs_devices->open_devices--; } if (bdev) { /* one close for us */ @@ -914,6 +961,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; out: btrfs_end_transaction(trans, root); mutex_unlock(&root->fs_info->fs_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 454fe8103329..4df6b1608f91 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -71,16 +71,16 @@ struct btrfs_fs_devices { /* the device with this id has the most recent coyp of the super */ u64 latest_devid; u64 latest_trans; - u64 lowest_devid; u64 num_devices; + u64 open_devices; struct block_device *latest_bdev; - struct block_device *lowest_bdev; /* all of the devices in the FS */ struct list_head devices; /* devices not currently being allocated */ struct list_head alloc_list; struct list_head list; + int mounted; }; struct btrfs_bio_stripe { -- cgit v1.2.3 From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 11 Jun 2008 16:50:36 -0400 Subject: Btrfs: Add async worker threads for pre and post IO checksumming Btrfs has been using workqueues to spread the checksumming load across other CPUs in the system. But, workqueues only schedule work on the same CPU that queued the work, giving them a limited benefit for systems with higher CPU counts. This code adds a generic facility to schedule work with pools of kthreads, and changes the bio submission code to queue bios up. 
The queueing is important to make sure large numbers of procs on the system don't turn streaming workloads into random workloads by sending IO down concurrently. The end result of all of this is much higher performance (and CPU usage) when doing checksumming on large machines. Two worker pools are created, one for writes and one for endio processing. The two could deadlock if we tried to service both from a single pool. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/async-thread.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/async-thread.h | 78 +++++++++++++ fs/btrfs/ctree.h | 14 ++- fs/btrfs/disk-io.c | 200 ++++++++++++++------------------- fs/btrfs/inode.c | 4 +- fs/btrfs/volumes.c | 162 ++++++++++++++++++++++++++- fs/btrfs/volumes.h | 10 +- 8 files changed, 626 insertions(+), 132 deletions(-) create mode 100644 fs/btrfs/async-thread.c create mode 100644 fs/btrfs/async-thread.h (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index e5fc3cfea0ab..9dcfc2fe3332 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o + extent_io.o volumes.o async-thread.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 000000000000..2911b67bd6f7 --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/freezer.h> +#include "async-thread.h" + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + /* protects the pending list.
*/ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; +}; + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while(!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(0, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + spin_lock_irq(&worker->lock); + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + while(!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + workers->last = NULL; + spin_lock_init(&workers->lock); + workers->max_workers = max; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. 
+ */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, "btrfs"); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->worker_list); + workers->last = worker; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + struct list_head *start; + int enforce_min = workers->num_workers < workers->max_workers; + + /* start with the last thread if it isn't busy */ + worker = workers->last; + if (atomic_read(&worker->num_pending) < 64) + goto done; + + next = worker->worker_list.next; + start = &worker->worker_list; + + /* + * check all the workers for someone that is bored. 
FIXME, do + * something smart here + */ + while(next != start) { + if (next == &workers->worker_list) { + next = workers->worker_list.next; + continue; + } + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + if (atomic_read(&worker->num_pending) < 64 || !enforce_min) + goto done; + next = next->next; + } + /* + * nobody was bored, if we're already at the max thread count, + * use the last thread + */ + if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) { + return workers->last; + } + return NULL; +done: + workers->last = worker; + return worker; +} + +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + /* + * we have failed to find any workers, just + * return the first one + */ + worker = list_entry(workers->worker_list.next, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others.
+ */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(0, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + spin_unlock_irqrestore(&worker->lock, flags); +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(0, &work->flags)) + goto out; + + worker = find_worker(workers); + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 000000000000..52fc9da0f9e7 --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * only func should be set to the function you want called + * your work struct is passed as the only arg + */ + void (*func)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. 
changed by btrfs_start_workers */ + int max_workers; + + /* list with all the work threads */ + struct list_head worker_list; + + /* the last worker thread to have something queued */ + struct btrfs_worker_thread *last; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 49cbc62b42f9..6c91a510c965 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include "bit-radix.h" #include "extent_io.h" #include "extent_map.h" +#include "async-thread.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -518,13 +519,20 @@ struct btrfs_fs_info { struct list_head hashers; struct list_head dead_roots; struct list_head end_io_work_list; - struct list_head async_submit_work_list; struct work_struct end_io_work; - struct work_struct async_submit_work; spinlock_t end_io_work_lock; - spinlock_t async_submit_work_lock; atomic_t nr_async_submits; + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. 
+ */ + struct btrfs_workers workers; + struct btrfs_workers endio_workers; + #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) struct work_struct trans_work; #else diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b9a53646ceb2..98ff4fbcb386 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,6 +31,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" +#include "async-thread.h" #if 0 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -46,8 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) #endif static struct extent_io_ops btree_extent_io_ops; -static struct workqueue_struct *end_io_workqueue; -static struct workqueue_struct *async_submit_workqueue; +static void end_workqueue_fn(struct btrfs_work *work); struct end_io_wq { struct bio *bio; @@ -57,6 +57,7 @@ struct end_io_wq { int error; int metadata; struct list_head list; + struct btrfs_work work; }; struct async_submit_bio { @@ -66,6 +67,7 @@ struct async_submit_bio { extent_submit_bio_hook_t *submit_bio_hook; int rw; int mirror_num; + struct btrfs_work work; }; struct extent_map *btree_get_extent(struct inode *inode, struct page *page, @@ -389,7 +391,6 @@ static int end_workqueue_bio(struct bio *bio, { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; - unsigned long flags; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) @@ -397,11 +398,10 @@ static int end_workqueue_bio(struct bio *bio, #endif fs_info = end_io_wq->info; - spin_lock_irqsave(&fs_info->end_io_work_lock, flags); end_io_wq->error = err; - list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list); - spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags); - queue_work(end_io_workqueue, &fs_info->end_io_work); + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work); #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 
return 0; @@ -428,6 +428,19 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } +static void run_one_async_submit(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + atomic_dec(&fs_info->nr_async_submits); + async->submit_bio_hook(async->inode, async->rw, async->bio, + async->mirror_num); + kfree(async); +} + int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, extent_submit_bio_hook_t *submit_bio_hook) @@ -443,13 +456,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_hook = submit_bio_hook; - - spin_lock(&fs_info->async_submit_work_lock); - list_add_tail(&async->list, &fs_info->async_submit_work_list); + async->work.func = run_one_async_submit; + async->work.flags = 0; atomic_inc(&fs_info->nr_async_submits); - spin_unlock(&fs_info->async_submit_work_lock); - - queue_work(async_submit_workqueue, &fs_info->async_submit_work); + btrfs_queue_worker(&fs_info->workers, &async->work); return 0; } @@ -462,19 +472,32 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, offset = bio->bi_sector << 9; + /* + * when we're called for a write, we're already in the async + * submission context. 
Just jump into btrfs_map_bio + */ if (rw & (1 << BIO_RW)) { - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); BUG_ON(ret); - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num); + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num) { + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ if (!(rw & (1 << BIO_RW))) { return __btree_submit_bio_hook(inode, rw, bio, mirror_num); } @@ -1036,95 +1059,40 @@ static int bio_ready_for_csum(struct bio *bio) return ret; } -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) -static void btrfs_end_io_csum(void *p) -#else -static void btrfs_end_io_csum(struct work_struct *work) -#endif +/* + * called by the kthread helper functions to finally call the bio end_io + * functions.
This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) { -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - struct btrfs_fs_info *fs_info = p; -#else - struct btrfs_fs_info *fs_info = container_of(work, - struct btrfs_fs_info, - end_io_work); -#endif - unsigned long flags; - struct end_io_wq *end_io_wq; struct bio *bio; - struct list_head *next; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; int error; - int was_empty; - while(1) { - spin_lock_irqsave(&fs_info->end_io_work_lock, flags); - if (list_empty(&fs_info->end_io_work_list)) { - spin_unlock_irqrestore(&fs_info->end_io_work_lock, - flags); - return; - } - next = fs_info->end_io_work_list.next; - list_del(next); - spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags); - - end_io_wq = list_entry(next, struct end_io_wq, list); - - bio = end_io_wq->bio; - if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { - spin_lock_irqsave(&fs_info->end_io_work_lock, flags); - was_empty = list_empty(&fs_info->end_io_work_list); - list_add_tail(&end_io_wq->list, - &fs_info->end_io_work_list); - spin_unlock_irqrestore(&fs_info->end_io_work_lock, - flags); - if (was_empty) - return; - continue; - } - error = end_io_wq->error; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - kfree(end_io_wq); -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) - bio_endio(bio, bio->bi_size, error); -#else - bio_endio(bio, error); -#endif - } -} + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) -static void btrfs_async_submit_work(void *p) -#else -static void btrfs_async_submit_work(struct work_struct *work) -#endif -{ -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - struct btrfs_fs_info *fs_info = p; + /* metadata bios are special because the whole tree block must + * be checksummed at once. 
This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + bio_endio(bio, bio->bi_size, error); #else - struct btrfs_fs_info *fs_info = container_of(work, - struct btrfs_fs_info, - async_submit_work); + bio_endio(bio, error); #endif - struct async_submit_bio *async; - struct list_head *next; - - while(1) { - spin_lock(&fs_info->async_submit_work_lock); - if (list_empty(&fs_info->async_submit_work_list)) { - spin_unlock(&fs_info->async_submit_work_lock); - return; - } - next = fs_info->async_submit_work_list.next; - list_del(next); - atomic_dec(&fs_info->nr_async_submits); - spin_unlock(&fs_info->async_submit_work_lock); - - async = list_entry(next, struct async_submit_bio, list); - async->submit_bio_hook(async->inode, async->rw, async->bio, - async->mirror_num); - kfree(async); - } } struct btrfs_root *open_ctree(struct super_block *sb, @@ -1155,19 +1123,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -ENOMEM; goto fail; } - end_io_workqueue = create_workqueue("btrfs-end-io"); - BUG_ON(!end_io_workqueue); - async_submit_workqueue = create_workqueue("btrfs-async-submit"); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); - INIT_LIST_HEAD(&fs_info->end_io_work_list); - INIT_LIST_HEAD(&fs_info->async_submit_work_list); spin_lock_init(&fs_info->hash_lock); - spin_lock_init(&fs_info->end_io_work_lock); - spin_lock_init(&fs_info->async_submit_work_lock); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); 
@@ -1222,13 +1182,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->do_barriers = 1; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info); - INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work, - fs_info); INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info); #else - INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum); - INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work); INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner); #endif BTRFS_I(fs_info->btree_inode)->root = tree_root; @@ -1240,6 +1195,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->fs_mutex); + /* we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time. The endio + * workers don't normally start IO, so some number of them <= the + * number of cpus is fine. They handle checksumming after a read. + * + * The other worker threads do start IO, so the max is larger than + * the number of CPUs. 
FIXME, tune this for huge machines + */ + btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2); + btrfs_init_workers(&fs_info->endio_workers, num_online_cpus()); + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->endio_workers, num_online_cpus()); + #if 0 ret = add_hasher(fs_info, "crc32c"); if (ret) { @@ -1375,6 +1343,8 @@ fail_sb_buffer: extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); fail_iput: iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); fail: btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -1623,16 +1593,10 @@ int close_ctree(struct btrfs_root *root) extent_io_tree_empty_lru(&fs_info->extent_ins); extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); - flush_workqueue(async_submit_workqueue); - flush_workqueue(end_io_workqueue); - truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); - flush_workqueue(async_submit_workqueue); - destroy_workqueue(async_submit_workqueue); - - flush_workqueue(end_io_workqueue); - destroy_workqueue(end_io_workqueue); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); iput(fs_info->btree_inode); #if 0 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0f14697becef..7daef8d37006 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -359,7 +359,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, kfree(sums); - return btrfs_map_bio(root, rw, bio, mirror_num); + return btrfs_map_bio(root, rw, bio, mirror_num, 1); } int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, @@ -383,7 +383,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, inode, rw, bio, mirror_num, __btrfs_submit_bio_hook); mapit: - return btrfs_map_bio(root, rw, bio, mirror_num); + return btrfs_map_bio(root, rw, bio, mirror_num, 0); } int btrfs_readpage_io_hook(struct page 
*page, u64 start, u64 end) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 722eb4550154..c57458ce6339 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include "transaction.h" #include "print-tree.h" #include "volumes.h" +#include "async-thread.h" struct map_lookup { u64 type; @@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the scheduler's ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run = 0; + + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; +loop: + spin_lock(&device->io_lock); + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + pending = device->pending_bios; + tail = device->pending_bio_tail; + WARN_ON(pending && !tail); + device->pending_bios = NULL; + device->pending_bio_tail = NULL; + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code.
+ */ + if (pending) { + again = 1; + device->running_pending = 1; + } else { + again = 0; + device->running_pending = 0; + } + spin_unlock(&device->io_lock); + + while(pending) { + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&device->dev_root->fs_info->nr_async_submits); + submit_bio(cur->bi_rw, cur); + num_run++; + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && num_run && bdi_write_congested(bdi)) { + struct bio *old_head; + + spin_lock(&device->io_lock); + old_head = device->pending_bios; + device->pending_bios = pending; + if (device->pending_bio_tail) + tail->bi_next = old_head; + else + device->pending_bio_tail = tail; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + if (again) + goto loop; +done: + return 0; +} + +void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + static int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) @@ -141,6 +237,7 @@ static int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); device->barriers = 1; @@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->barriers = 1; + device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); spin_lock_init(&device->io_lock); device->name = kstrdup(device_path, GFP_NOFS); @@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio, #endif } +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are 
collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +int schedule_bio(struct btrfs_root *root, struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + submit_bio(rw, bio); + return 0; + } + + /* + * nr_async_submits allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_submits); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + + if (device->pending_bio_tail) + device->pending_bio_tail->bi_next = bio; + + device->pending_bio_tail = bio; + if (!device->pending_bios) + device->pending_bios = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->workers, &device->work); + return 0; +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, - int mirror_num) + int mirror_num, int async_submit) { struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; @@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, dev = multi->stripes[dev_nr].dev; if (dev && dev->bdev) { bio->bi_bdev = dev->bdev; - spin_lock(&dev->io_lock); - dev->total_ios++; - spin_unlock(&dev->io_lock); - submit_bio(rw, bio); + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; @@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, device->barriers = 1; device->dev_root = root->fs_info->dev_root; device->devid = devid; + device->work.func =
pending_bios_fn; fs_devices->num_devices++; spin_lock_init(&device->io_lock); memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4df6b1608f91..48a44f7a9385 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,6 +20,7 @@ #define __BTRFS_VOLUMES_ #include +#include "async-thread.h" struct buffer_head; struct btrfs_device { @@ -27,6 +28,9 @@ struct btrfs_device { struct list_head dev_alloc_list; struct btrfs_root *dev_root; struct buffer_head *pending_io; + struct bio *pending_bios; + struct bio *pending_bio_tail; + int running_pending; u64 generation; int barriers; @@ -36,8 +40,6 @@ struct btrfs_device { struct block_device *bdev; - u64 total_ios; - char *name; /* the internal btrfs device id */ @@ -63,6 +65,8 @@ struct btrfs_device { /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; }; struct btrfs_fs_devices { @@ -117,7 +121,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, - int mirror_num); + int mirror_num, int async_submit); int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags, void *holder); -- cgit v1.2.3 From 7d2b4daa67379960477568abda62b8ba9ee3a8aa Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 5 Aug 2008 10:13:57 -0400 Subject: Btrfs: Fix the multi-bio code to save the original bio for completion The multi-bio code is responsible for duplicating blocks in raid1 and single spindle duplication. It has counters to make sure all of the locations for a given extent are properly written before io completion is returned to the higher layers. But, it didn't always complete the same bio it was given, sometimes a clone was completed instead. 
This led to problems with the async work queues because they saved a pointer to the bio in a struct off bi_private. The fix is to remember the original bio and only complete that one. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 11 ++++++++++- fs/btrfs/volumes.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.h') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0de51e37e789..09311b3066df 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2070,6 +2070,7 @@ static int end_bio_multi_stripe(struct bio *bio, #endif { struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) if (bio->bi_size) @@ -2078,7 +2079,14 @@ static int end_bio_multi_stripe(struct bio *bio, if (err) atomic_inc(&multi->error); + if (bio == multi->orig_bio) + is_orig_bio = 1; + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } bio->bi_private = multi->private; bio->bi_end_io = multi->end_io; /* only send an error to the higher layers if it is @@ -2101,7 +2109,7 @@ static int end_bio_multi_stripe(struct bio *bio, #else bio_endio(bio, err); #endif - } else { + } else if (!is_orig_bio) { bio_put(bio); } #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) @@ -2196,6 +2204,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } multi->end_io = first_bio->bi_end_io; multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; atomic_set(&multi->stripes_pending, multi->num_stripes); while(dev_nr < total_devs) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 48a44f7a9385..c50e50580b51 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -95,6 +95,7 @@ struct btrfs_bio_stripe { struct btrfs_multi_bio { atomic_t stripes_pending; bio_end_io_t *end_io; + struct bio *orig_bio; void *private; atomic_t error; int max_errors; -- cgit v1.2.3