diff options
author | Jens Axboe <axboe@kernel.dk> | 2013-05-01 09:23:05 +0200 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2013-05-01 09:23:05 +0200 |
commit | f50efd2fdbd9b35b11f5778ed85beb764184bda9 (patch) | |
tree | ed92b40995d60136fb387d210886e4aae2c37231 | |
parent | 0821e904057505c7e25d72e1a282105d023b26c9 (diff) | |
parent | ee66850642efda91d04179cae2414310675a1f73 (diff) |
Merge branch 'bcache-for-upstream' of http://evilpiepirate.org/git/linux-bcache into for-3.10/drivers
Kent writes:
Hey Jens, this is everything I've got ready for 3.10 - there's _still_
one more bug I'm trying to track down.
Andrew - I've got patches that rip out the pkey() and pbtree() macros,
but they're somewhat tied up with some other nontrivial refactorings so
I think I'm going to wait a bit on those.
-rw-r--r-- | drivers/md/bcache/alloc.c | 72 |
-rw-r--r-- | drivers/md/bcache/bcache.h | 47 |
-rw-r--r-- | drivers/md/bcache/btree.c | 3 |
-rw-r--r-- | drivers/md/bcache/io.c | 35 |
-rw-r--r-- | drivers/md/bcache/request.c | 2 |
-rw-r--r-- | drivers/md/bcache/super.c | 166 |
6 files changed, 213 insertions, 112 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 2879487d036a..048f2947e08b 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -243,31 +243,37 @@ static void invalidate_buckets_lru(struct cache *ca) ca->heap.used = 0; for_each_bucket(b, ca) { + /* + * If we fill up the unused list, if we then return before + * adding anything to the free_inc list we'll skip writing + * prios/gens and just go back to allocating from the unused + * list: + */ + if (fifo_full(&ca->unused)) + return; + if (!can_invalidate_bucket(ca, b)) continue; - if (!GC_SECTORS_USED(b)) { - if (!bch_bucket_add_unused(ca, b)) - return; - } else { - if (!heap_full(&ca->heap)) - heap_add(&ca->heap, b, bucket_max_cmp); - else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { - ca->heap.data[0] = b; - heap_sift(&ca->heap, 0, bucket_max_cmp); - } + if (!GC_SECTORS_USED(b) && + bch_bucket_add_unused(ca, b)) + continue; + + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_max_cmp); } } - if (ca->heap.used * 2 < ca->heap.size) - bch_queue_gc(ca->set); - for (i = ca->heap.used / 2 - 1; i >= 0; --i) heap_sift(&ca->heap, i, bucket_min_cmp); while (!fifo_full(&ca->free_inc)) { if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { - /* We don't want to be calling invalidate_buckets() + /* + * We don't want to be calling invalidate_buckets() * multiple times when it can't do anything */ ca->invalidate_needs_gc = 1; @@ -343,15 +349,22 @@ static void invalidate_buckets(struct cache *ca) invalidate_buckets_random(ca); break; } + + pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", + fifo_used(&ca->free), ca->free.size, + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->unused), ca->unused.size); } #define allocator_wait(ca, cond) \ do { \ DEFINE_WAIT(__wait); \ \ - while (!(cond)) { \ + while (1) { \ 
prepare_to_wait(&ca->set->alloc_wait, \ &__wait, TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ @@ -360,7 +373,6 @@ do { \ } \ \ schedule(); \ - __set_current_state(TASK_RUNNING); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ \ @@ -374,6 +386,11 @@ void bch_allocator_thread(struct closure *cl) mutex_lock(&ca->set->bucket_lock); while (1) { + /* + * First, we pull buckets off of the unused and free_inc lists, + * possibly issue discards to them, then we add the bucket to + * the free list: + */ while (1) { long bucket; @@ -398,17 +415,26 @@ void bch_allocator_thread(struct closure *cl) } } - allocator_wait(ca, ca->set->gc_mark_valid); - invalidate_buckets(ca); + /* + * We've run out of free buckets, we need to find some buckets + * we can invalidate. First, invalidate them in memory and add + * them to the free_inc list: + */ - allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) || - !CACHE_SYNC(&ca->set->sb)); + allocator_wait(ca, ca->set->gc_mark_valid && + (ca->need_save_prio > 64 || + !ca->invalidate_needs_gc)); + invalidate_buckets(ca); + /* + * Now, we write their new gens to disk so we can start writing + * new stuff to them: + */ + allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); if (CACHE_SYNC(&ca->set->sb) && (!fifo_empty(&ca->free_inc) || - ca->need_save_prio > 64)) { + ca->need_save_prio > 64)) bch_prio_write(ca); - } } } @@ -475,7 +501,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) { struct bucket *b = PTR_BUCKET(c, k, i); - SET_GC_MARK(b, 0); + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); SET_GC_SECTORS_USED(b, 0); bch_bucket_add_unused(PTR_CACHE(c, k, i), b); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index f05723565f17..340146d7c17f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -223,11 +223,17 @@ struct bkey { #define BKEY_PADDED(key) \ 
union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } -/* Version 1: Backing device +/* Version 0: Cache device + * Version 1: Backing device * Version 2: Seed pointer into btree node checksum - * Version 3: New UUID format + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset */ -#define BCACHE_SB_VERSION 3 +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 #define SB_SECTOR 8 #define SB_SIZE 4096 @@ -236,13 +242,12 @@ struct bkey { /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ #define MAX_CACHES_PER_SET 8 -#define BDEV_DATA_START 16 /* sectors */ +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ struct cache_sb { uint64_t csum; uint64_t offset; /* sector where this sb was written */ uint64_t version; -#define CACHE_BACKING_DEV 1 uint8_t magic[16]; @@ -257,12 +262,28 @@ struct cache_sb { uint64_t seq; uint64_t pad[8]; - uint64_t nbuckets; /* device size */ - uint16_t block_size; /* sectors */ - uint16_t bucket_size; /* sectors */ + union { + struct { + /* Cache devices */ + uint64_t nbuckets; /* device size */ + + uint16_t block_size; /* sectors */ + uint16_t bucket_size; /* sectors */ - uint16_t nr_in_set; - uint16_t nr_this_dev; + uint16_t nr_in_set; + uint16_t nr_this_dev; + }; + struct { + /* Backing devices */ + uint64_t data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; uint32_t last_mount; /* time_t */ @@ -861,6 +882,12 @@ static inline bool key_merging_disabled(struct cache_set *c) #endif } +static inline bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + struct bbio { unsigned submit_time_us; union 
{ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 852340793777..7a5658f04e62 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -984,7 +984,7 @@ static void btree_node_free(struct btree *b, struct btree_op *op) if (b->prio_blocked && !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - closure_wake_up(&b->c->bucket_wait); + wake_up(&b->c->alloc_wait); b->prio_blocked = 0; @@ -1548,7 +1548,6 @@ static void bch_btree_gc(struct closure *cl) trace_bcache_gc_end(c->sb.set_uuid); wake_up(&c->alloc_wait); - closure_wake_up(&c->bucket_wait); continue_at(cl, bch_moving_gc, bch_gc_wq); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index f565512f6fac..48efd4dea645 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -38,6 +38,15 @@ static void bch_generic_make_request_hack(struct bio *bio) bio = clone; } + /* + * Hack, since drivers that clone bios clone up to bi_max_vecs, but our + * bios might have had more than that (before we split them per device + * limitations). + * + * To be taken out once immutable bvec stuff is in. 
+ */ + bio->bi_max_vecs = bio->bi_vcnt; + generic_make_request(bio); } @@ -149,34 +158,32 @@ static unsigned bch_bio_max_sectors(struct bio *bio) { unsigned ret = bio_sectors(bio); struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, + queue_max_segments(q)); struct bio_vec *bv, *end = bio_iovec(bio) + - min_t(int, bio_segments(bio), queue_max_segments(q)); - - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = 0, - .bi_rw = bio->bi_rw, - }; + min_t(int, bio_segments(bio), max_segments); if (bio->bi_rw & REQ_DISCARD) return min(ret, q->limits.max_discard_sectors); - if (bio_segments(bio) > queue_max_segments(q) || + if (bio_segments(bio) > max_segments || q->merge_bvec_fn) { ret = 0; for (bv = bio_iovec(bio); bv < end; bv++) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = ret << 9, + .bi_rw = bio->bi_rw, + }; + if (q->merge_bvec_fn && q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) break; - ret += bv->bv_len >> 9; - bvm.bi_size += bv->bv_len; + ret += bv->bv_len >> 9; } - - if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9) - return (BIO_MAX_PAGES * PAGE_SIZE) >> 9; } ret = min(ret, queue_max_sectors(q)); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 83731dc36f34..e5ff12e52d5b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1220,7 +1220,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) part_stat_unlock(); bio->bi_bdev = dc->bdev; - bio->bi_sector += BDEV_DATA_START; + bio->bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { s = search_alloc(bio, d); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5fa3cd2d9ff0..c8046bc4aa57 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -110,15 +110,7 @@ static const char *read_super(struct cache_sb *sb, struct 
block_device *bdev, sb->flags = le64_to_cpu(s->flags); sb->seq = le64_to_cpu(s->seq); - - sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->block_size = le16_to_cpu(s->block_size); - sb->bucket_size = le16_to_cpu(s->bucket_size); - - sb->nr_in_set = le16_to_cpu(s->nr_in_set); - sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); sb->last_mount = le32_to_cpu(s->last_mount); - sb->first_bucket = le16_to_cpu(s->first_bucket); sb->keys = le16_to_cpu(s->keys); @@ -147,53 +139,81 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (bch_is_zero(sb->uuid, 16)) goto err; - err = "Unsupported superblock version"; - if (sb->version > BCACHE_SB_VERSION) - goto err; + sb->block_size = le16_to_cpu(s->block_size); - err = "Bad block/bucket size"; - if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS || - !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS) + err = "Superblock block size smaller than device block size"; + if (sb->block_size << 9 < bdev_logical_block_size(bdev)) goto err; - err = "Too many buckets"; - if (sb->nbuckets > LONG_MAX) - goto err; + switch (sb->version) { + case BCACHE_SB_VERSION_BDEV: + sb->data_offset = BDEV_DATA_START_DEFAULT; + break; + case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + sb->data_offset = le64_to_cpu(s->data_offset); - err = "Not enough buckets"; - if (sb->nbuckets < 1 << 7) - goto err; + err = "Bad data offset"; + if (sb->data_offset < BDEV_DATA_START_DEFAULT) + goto err; - err = "Invalid superblock: device too small"; - if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) - goto err; + break; + case BCACHE_SB_VERSION_CDEV: + case BCACHE_SB_VERSION_CDEV_WITH_UUID: + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->block_size = le16_to_cpu(s->block_size); + sb->bucket_size = le16_to_cpu(s->bucket_size); - if (sb->version == CACHE_BACKING_DEV) - goto out; + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); - err = "Bad UUID"; - if 
(bch_is_zero(sb->set_uuid, 16)) - goto err; + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; - err = "Bad cache device number in set"; - if (!sb->nr_in_set || - sb->nr_in_set <= sb->nr_this_dev || - sb->nr_in_set > MAX_CACHES_PER_SET) - goto err; + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; - err = "Journal buckets not sequential"; - for (i = 0; i < sb->keys; i++) - if (sb->d[i] != sb->first_bucket + i) + err = "Bad block/bucket size"; + if (!is_power_of_2(sb->block_size) || + sb->block_size > PAGE_SECTORS || + !is_power_of_2(sb->bucket_size) || + sb->bucket_size < PAGE_SECTORS) goto err; - err = "Too many journal buckets"; - if (sb->first_bucket + sb->keys > sb->nbuckets) - goto err; + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; - err = "Invalid superblock: first bucket comes before end of super"; - if (sb->first_bucket * sb->bucket_size < 16) + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + break; + default: + err = "Unsupported superblock version"; goto err; -out: + } + sb->last_mount = get_seconds(); err = NULL; @@ -286,7 +306,7 @@ void bcache_write_super(struct cache_set *c) for_each_cache(ca, c, i) { struct bio *bio = &ca->sb_bio; - ca->sb.version = BCACHE_SB_VERSION; + ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; ca->sb.seq = c->sb.seq; ca->sb.last_mount = c->sb.last_mount; @@ -641,6 +661,35 @@ 
void bcache_device_stop(struct bcache_device *d) closure_queue(&d->cl); } +static void bcache_device_unlink(struct bcache_device *d) +{ + unsigned i; + struct cache *ca; + + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); +} + +static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + const char *name) +{ + unsigned i; + struct cache *ca; + + for_each_cache(ca, d->c, i) + bd_link_disk_holder(ca->bdev, d->disk); + + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + + WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || + sysfs_create_link(&c->kobj, &d->kobj, d->name), + "Couldn't create device <-> cache set symlinks"); +} + static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -656,6 +705,8 @@ static void bcache_device_detach(struct bcache_device *d) atomic_set(&d->detaching, 0); } + bcache_device_unlink(d); + d->c->devices[d->id] = NULL; closure_put(&d->c->caching); d->c = NULL; @@ -673,17 +724,6 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, closure_get(&c->caching); } -static void bcache_device_link(struct bcache_device *d, struct cache_set *c, - const char *name) -{ - snprintf(d->name, BCACHEDEVNAME_SIZE, - "%s%u", name, d->id); - - WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || - sysfs_create_link(&c->kobj, &d->kobj, d->name), - "Couldn't create device <-> cache set symlinks"); -} - static void bcache_device_free(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -784,6 +824,7 @@ void bch_cached_dev_run(struct cached_dev *dc) } add_disk(d->disk); + bd_link_disk_holder(dc->bdev, dc->disk.disk); #if 0 char *env[] = { "SYMLINK=label" , NULL }; kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); @@ -803,9 +844,6 @@ static void cached_dev_detach_finish(struct work_struct *w) 
BUG_ON(!atomic_read(&dc->disk.detaching)); BUG_ON(atomic_read(&dc->count)); - sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name); - sysfs_remove_link(&dc->disk.kobj, "cache"); - mutex_lock(&bch_register_lock); memset(&dc->sb.set_uuid, 0, 16); @@ -920,7 +958,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } bcache_device_attach(&dc->disk, c, u - c->uuids); - bcache_device_link(&dc->disk, c, "bdev"); list_move(&dc->list, &c->cached_devs); calc_cached_dev_sectors(c); @@ -938,6 +975,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } bch_cached_dev_run(dc); + bcache_device_link(&dc->disk, c, "bdev"); pr_info("Caching %s as %s on set %pU", bdevname(dc->bdev, buf), dc->disk.disk->disk_name, @@ -961,6 +999,7 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); list_del(&dc->list); @@ -1049,7 +1088,11 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, g = dc->disk.disk; - set_capacity(g, dc->bdev->bd_part->nr_sects - 16); + set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + + g->queue->backing_dev_info.ra_pages = + max(g->queue->backing_dev_info.ra_pages, + bdev->bd_queue->backing_dev_info.ra_pages); bch_cached_dev_request_init(dc); @@ -1099,8 +1142,7 @@ static void flash_dev_flush(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); + bcache_device_unlink(d); kobject_del(&d->kobj); continue_at(cl, flash_dev_free, system_wq); } @@ -1802,7 +1844,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (err) goto err_close; - if (sb->version == CACHE_BACKING_DEV) { + if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); err = register_bdev(sb, sb_page, bdev, dc); |