summary | refs | log | tree | commit | diff
path: root/drivers/md
diff options
context:
space:
mode:
author Mark Brown <broonie@linaro.org> 2013-08-22 11:09:03 +0100
committer Mark Brown <broonie@linaro.org> 2013-08-22 11:09:03 +0100
commit 64393c6e647c8a791243063d282c787b46721be7 (patch)
tree 89ca12bd5d52e8df24d2bc13a36d30479a4e825d /drivers/md
parent a2388a498ad2f85be01aca29e364abf427d9b53c (diff)
parent 741a509f34d8d702f70d0ad99b8152c57d76961e (diff)
Merge remote-tracking branch 'asoc/topic/ac97' into asoc-fsl
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/bcache/alloc.c 46
-rw-r--r-- drivers/md/bcache/bcache.h 61
-rw-r--r-- drivers/md/bcache/bset.c 56
-rw-r--r-- drivers/md/bcache/bset.h 4
-rw-r--r-- drivers/md/bcache/btree.c 451
-rw-r--r-- drivers/md/bcache/btree.h 35
-rw-r--r-- drivers/md/bcache/closure.c 6
-rw-r--r-- drivers/md/bcache/debug.c 178
-rw-r--r-- drivers/md/bcache/debug.h 11
-rw-r--r-- drivers/md/bcache/io.c 68
-rw-r--r-- drivers/md/bcache/journal.c 25
-rw-r--r-- drivers/md/bcache/movinggc.c 24
-rw-r--r-- drivers/md/bcache/request.c 197
-rw-r--r-- drivers/md/bcache/request.h 2
-rw-r--r-- drivers/md/bcache/super.c 171
-rw-r--r-- drivers/md/bcache/sysfs.c 68
-rw-r--r-- drivers/md/bcache/trace.c 47
-rw-r--r-- drivers/md/bcache/util.c 17
-rw-r--r-- drivers/md/bcache/util.h 6
-rw-r--r-- drivers/md/bcache/writeback.c 133
-rw-r--r-- drivers/md/bcache/writeback.h 64
-rw-r--r-- drivers/md/md.c 14
-rw-r--r-- drivers/md/raid1.c 53
-rw-r--r-- drivers/md/raid10.c 19
-rw-r--r-- drivers/md/raid5.c 15
-rw-r--r-- drivers/md/raid5.h 1
26 files changed, 926 insertions, 846 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 048f2947e08b..e45f5575fd4d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,7 +63,10 @@
#include "bcache.h"
#include "btree.h"
+#include <linux/freezer.h>
+#include <linux/kthread.h>
#include <linux/random.h>
+#include <trace/events/bcache.h>
#define MAX_IN_FLIGHT_DISCARDS 8U
@@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w)
mutex_unlock(&ca->set->bucket_lock);
closure_wake_up(&ca->set->bucket_wait);
- wake_up(&ca->set->alloc_wait);
+ wake_up_process(ca->alloc_thread);
closure_put(&ca->set->cl);
}
@@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca)
break;
}
- pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
- fifo_used(&ca->free), ca->free.size,
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->unused), ca->unused.size);
+ trace_bcache_alloc_invalidate(ca);
}
#define allocator_wait(ca, cond) \
do { \
- DEFINE_WAIT(__wait); \
- \
while (1) { \
- prepare_to_wait(&ca->set->alloc_wait, \
- &__wait, TASK_INTERRUPTIBLE); \
+ set_current_state(TASK_INTERRUPTIBLE); \
if (cond) \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
- if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
- finish_wait(&ca->set->alloc_wait, &__wait); \
- closure_return(cl); \
- } \
+ if (kthread_should_stop()) \
+ return 0; \
\
+ try_to_freeze(); \
schedule(); \
mutex_lock(&(ca)->set->bucket_lock); \
} \
- \
- finish_wait(&ca->set->alloc_wait, &__wait); \
+ __set_current_state(TASK_RUNNING); \
} while (0)
-void bch_allocator_thread(struct closure *cl)
+static int bch_allocator_thread(void *arg)
{
- struct cache *ca = container_of(cl, struct cache, alloc);
+ struct cache *ca = arg;
mutex_lock(&ca->set->bucket_lock);
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
{
long r = -1;
again:
- wake_up(&ca->set->alloc_wait);
+ wake_up_process(ca->alloc_thread);
if (fifo_used(&ca->free) > ca->watermark[watermark] &&
fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
return r;
}
- pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
- atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
- fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+ trace_bcache_alloc_fail(ca);
if (cl) {
closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
/* Init */
+int bch_cache_allocator_start(struct cache *ca)
+{
+ struct task_struct *k = kthread_run(bch_allocator_thread,
+ ca, "bcache_allocator");
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ ca->alloc_thread = k;
+ return 0;
+}
+
void bch_cache_allocator_exit(struct cache *ca)
{
struct discard *d;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4ab..b39f6f0b45f2 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -178,7 +178,6 @@
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
#include <linux/bio.h>
-#include <linux/blktrace_api.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
struct keybuf {
- keybuf_pred_fn *key_predicate;
-
struct bkey last_scanned;
spinlock_t lock;
@@ -437,9 +434,12 @@ struct bcache_device {
/* If nonzero, we're detaching/unregistering from cache set */
atomic_t detaching;
+ int flush_done;
+
+ uint64_t nr_stripes;
+ unsigned stripe_size_bits;
+ atomic_t *stripe_sectors_dirty;
- atomic_long_t sectors_dirty;
- unsigned long sectors_dirty_gc;
unsigned long sectors_dirty_last;
long sectors_dirty_derivative;
@@ -531,6 +531,7 @@ struct cached_dev {
unsigned sequential_merge:1;
unsigned verify:1;
+ unsigned partial_stripes_expensive:1;
unsigned writeback_metadata:1;
unsigned writeback_running:1;
unsigned char writeback_percent;
@@ -565,8 +566,7 @@ struct cache {
unsigned watermark[WATERMARK_MAX];
- struct closure alloc;
- struct workqueue_struct *alloc_workqueue;
+ struct task_struct *alloc_thread;
struct closure prio;
struct prio_set *disk_buckets;
@@ -664,13 +664,9 @@ struct gc_stat {
* CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
* we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
* flushing dirty data).
- *
- * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
- * the allocation thread.
*/
#define CACHE_SET_UNREGISTERING 0
#define CACHE_SET_STOPPING 1
-#define CACHE_SET_STOPPING_2 2
struct cache_set {
struct closure cl;
@@ -703,9 +699,6 @@ struct cache_set {
/* For the btree cache */
struct shrinker shrink;
- /* For the allocator itself */
- wait_queue_head_t alloc_wait;
-
/* For the btree cache and anything allocation related */
struct mutex bucket_lock;
@@ -823,10 +816,9 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - this is a single element mempool for btree_read_work()
+ * on the stack - have to dynamically allocate them
*/
- struct mutex fill_lock;
- struct btree_iter *fill_iter;
+ mempool_t *fill_iter;
/*
* btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +826,7 @@ struct cache_set {
*/
struct mutex sort_lock;
struct bset *sort;
+ unsigned sort_crit_factor;
/* List of buckets we're currently writing data to */
struct list_head data_buckets;
@@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void)
return local_clock() >> 10;
}
-#define MAX_BSETS 4U
-
#define BTREE_PRIO USHRT_MAX
#define INITIAL_PRIO 32768
@@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}
-/* Blktrace macros */
-
-#define blktrace_msg(c, fmt, ...) \
-do { \
- struct request_queue *q = bdev_get_queue(c->bdev); \
- if (q) \
- blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define blktrace_msg_all(s, fmt, ...) \
-do { \
- struct cache *_c; \
- unsigned i; \
- for_each_cache(_c, (s), i) \
- blktrace_msg(_c, fmt, ##__VA_ARGS__); \
-} while (0)
-
static inline void cached_dev_put(struct cached_dev *dc)
{
if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
static struct kobj_attribute ksysfs_##n = \
__ATTR(n, S_IWUSR|S_IRUSR, show, store)
-/* Forward declarations */
+static inline void wake_up_allocators(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ wake_up_process(ca->alloc_thread);
+}
-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *, unsigned);
+/* Forward declarations */
void bch_count_io_errors(struct cache *, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int);
bool bch_bucket_add_unused(struct cache *, struct bucket *);
-void bch_allocator_thread(struct closure *);
long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *);
struct cache_set *bch_cache_set_alloc(struct cache_sb *);
void bch_btree_cache_free(struct cache_set *);
int bch_btree_cache_alloc(struct cache_set *);
-void bch_cached_dev_writeback_init(struct cached_dev *);
void bch_moving_init_cache_set(struct cache_set *);
+int bch_cache_allocator_start(struct cache *ca);
void bch_cache_allocator_exit(struct cache *ca);
int bch_cache_allocator_init(struct cache *ca);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 1d27d3af3251..8010eed06a51 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
{
unsigned i;
+ char buf[80];
if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
return false;
bad:
- cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+ bch_bkey_to_text(buf, sizeof(buf), k);
+ cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
return true;
}
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
#ifdef CONFIG_BCACHE_EDEBUG
bug:
mutex_unlock(&b->c->bucket_lock);
- btree_bug(b,
+
+ {
+ char buf[80];
+
+ bch_bkey_to_text(buf, sizeof(buf), k);
+ btree_bug(b,
"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
- pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
- g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+ buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+ }
return true;
#endif
}
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
new->sets->size = 0;
}
+#define SORT_CRIT (4096 / sizeof(uint64_t))
+
void bch_btree_sort_lazy(struct btree *b)
{
- if (b->nsets) {
- unsigned i, j, keys = 0, total;
+ unsigned crit = SORT_CRIT;
+ int i;
- for (i = 0; i <= b->nsets; i++)
- keys += b->sets[i].data->keys;
-
- total = keys;
+ /* Don't sort if nothing to do */
+ if (!b->nsets)
+ goto out;
- for (j = 0; j < b->nsets; j++) {
- if (keys * 2 < total ||
- keys < 1000) {
- bch_btree_sort_partial(b, j);
- return;
- }
+ /* If not a leaf node, always sort */
+ if (b->level) {
+ bch_btree_sort(b);
+ return;
+ }
- keys -= b->sets[j].data->keys;
- }
+ for (i = b->nsets - 1; i >= 0; --i) {
+ crit *= b->c->sort_crit_factor;
- /* Must sort if b->nsets == 3 or we'll overflow */
- if (b->nsets >= (MAX_BSETS - 1) - b->level) {
- bch_btree_sort(b);
+ if (b->sets[i].data->keys < crit) {
+ bch_btree_sort_partial(b, i);
return;
}
}
+ /* Sort if we'd overflow */
+ if (b->nsets + 1 == MAX_BSETS) {
+ bch_btree_sort(b);
+ return;
+ }
+
+out:
bset_build_written_tree(b);
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 57a9cff41546..ae115a253d73 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,6 +1,8 @@
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H
+#include <linux/slab.h>
+
/*
* BKEYS:
*
@@ -142,6 +144,8 @@
/* Btree key comparison/iteration */
+#define MAX_BSETS 4U
+
struct btree_iter {
size_t size, used;
struct btree_iter_set {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..ee372884c405 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -24,6 +24,7 @@
#include "btree.h"
#include "debug.h"
#include "request.h"
+#include "writeback.h"
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
return crc ^ 0xffffffffffffffffULL;
}
-static void btree_bio_endio(struct bio *bio, int error)
+static void bch_btree_node_read_done(struct btree *b)
{
- struct closure *cl = bio->bi_private;
- struct btree *b = container_of(cl, struct btree, io.cl);
-
- if (error)
- set_btree_node_io_error(b);
-
- bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
- ? "writing btree" : "reading btree");
- closure_put(cl);
-}
-
-static void btree_bio_init(struct btree *b)
-{
- BUG_ON(b->bio);
- b->bio = bch_bbio_alloc(b->c);
-
- b->bio->bi_end_io = btree_bio_endio;
- b->bio->bi_private = &b->io.cl;
-}
-
-void bch_btree_read_done(struct closure *cl)
-{
- struct btree *b = container_of(cl, struct btree, io.cl);
- struct bset *i = b->sets[0].data;
- struct btree_iter *iter = b->c->fill_iter;
const char *err = "bad btree header";
- BUG_ON(b->nsets || b->written);
-
- bch_bbio_free(b->bio, b->c);
- b->bio = NULL;
+ struct bset *i = b->sets[0].data;
+ struct btree_iter *iter;
- mutex_lock(&b->c->fill_lock);
+ iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
+ iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
- if (btree_node_io_error(b) ||
- !i->seq)
+ if (!i->seq)
goto err;
for (;
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
if (b->written < btree_blocks(b))
bch_bset_init_next(b);
out:
-
- mutex_unlock(&b->c->fill_lock);
-
- spin_lock(&b->c->btree_read_time_lock);
- bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
- spin_unlock(&b->c->btree_read_time_lock);
-
- smp_wmb(); /* read_done is our write lock */
- set_btree_node_read_done(b);
-
- closure_return(cl);
+ mempool_free(iter, b->c->fill_iter);
+ return;
err:
set_btree_node_io_error(b);
bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
@@ -247,48 +212,69 @@ err:
goto out;
}
-void bch_btree_read(struct btree *b)
+static void btree_node_read_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ closure_put(cl);
+}
+
+void bch_btree_node_read(struct btree *b)
{
- BUG_ON(b->nsets || b->written);
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct bio *bio;
+
+ trace_bcache_btree_read(b);
+
+ closure_init_stack(&cl);
+
+ bio = bch_bbio_alloc(b->c);
+ bio->bi_rw = REQ_META|READ_SYNC;
+ bio->bi_size = KEY_SIZE(&b->key) << 9;
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = &cl;
+
+ bch_bio_map(bio, b->sets[0].data);
+
+ bch_submit_bbio(bio, b->c, &b->key, 0);
+ closure_sync(&cl);
- if (!closure_trylock(&b->io.cl, &b->c->cl))
- BUG();
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_btree_node_io_error(b);
- b->io_start_time = local_clock();
+ bch_bbio_free(bio, b->c);
- btree_bio_init(b);
- b->bio->bi_rw = REQ_META|READ_SYNC;
- b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+ if (btree_node_io_error(b))
+ goto err;
- bch_bio_map(b->bio, b->sets[0].data);
+ bch_btree_node_read_done(b);
- pr_debug("%s", pbtree(b));
- trace_bcache_btree_read(b->bio);
- bch_submit_bbio(b->bio, b->c, &b->key, 0);
+ spin_lock(&b->c->btree_read_time_lock);
+ bch_time_stats_update(&b->c->btree_read_time, start_time);
+ spin_unlock(&b->c->btree_read_time_lock);
- continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+ return;
+err:
+ bch_cache_set_error(b->c, "io error reading bucket %lu",
+ PTR_BUCKET_NR(b->c, &b->key, 0));
}
static void btree_complete_write(struct btree *b, struct btree_write *w)
{
if (w->prio_blocked &&
!atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
- wake_up(&b->c->alloc_wait);
+ wake_up_allocators(b->c);
if (w->journal) {
atomic_dec_bug(w->journal);
__closure_wake_up(&b->c->journal.wait);
}
- if (w->owner)
- closure_put(w->owner);
-
w->prio_blocked = 0;
w->journal = NULL;
- w->owner = NULL;
}
-static void __btree_write_done(struct closure *cl)
+static void __btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io.cl);
struct btree_write *w = btree_prev_write(b);
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
closure_return(cl);
}
-static void btree_write_done(struct closure *cl)
+static void btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io.cl);
struct bio_vec *bv;
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
__bio_for_each_segment(bv, b->bio, n, 0)
__free_page(bv->bv_page);
- __btree_write_done(cl);
+ __btree_node_write_done(cl);
}
-static void do_btree_write(struct btree *b)
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct btree *b = container_of(cl, struct btree, io.cl);
+
+ if (error)
+ set_btree_node_io_error(b);
+
+ bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+ closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
{
struct closure *cl = &b->io.cl;
struct bset *i = b->sets[b->nsets].data;
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
i->version = BCACHE_BSET_VERSION;
i->csum = btree_csum_set(b, i);
- btree_bio_init(b);
- b->bio->bi_rw = REQ_META|WRITE_SYNC;
- b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+ BUG_ON(b->bio);
+ b->bio = bch_bbio_alloc(b->c);
+
+ b->bio->bi_end_io = btree_node_write_endio;
+ b->bio->bi_private = &b->io.cl;
+ b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
+ b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
bch_bio_map(b->bio, i);
+ /*
+ * If we're appending to a leaf node, we don't technically need FUA -
+ * this write just needs to be persisted before the next journal write,
+ * which will be marked FLUSH|FUA.
+ *
+ * Similarly if we're writing a new btree root - the pointer is going to
+ * be in the next journal entry.
+ *
+ * But if we're writing a new btree node (that isn't a root) or
+ * appending to a non leaf btree node, we need either FUA or a flush
+ * when we write the parent with the new pointer. FUA is cheaper than a
+ * flush, and writes appending to leaf nodes aren't blocking anything so
+ * just make all btree node writes FUA to keep things sane.
+ */
+
bkey_copy(&k.key, &b->key);
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
- if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+ if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
memcpy(page_address(bv->bv_page),
base + j * PAGE_SIZE, PAGE_SIZE);
- trace_bcache_btree_write(b->bio);
bch_submit_bbio(b->bio, b->c, &k.key, 0);
- continue_at(cl, btree_write_done, NULL);
+ continue_at(cl, btree_node_write_done, NULL);
} else {
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
- trace_bcache_btree_write(b->bio);
bch_submit_bbio(b->bio, b->c, &k.key, 0);
closure_sync(cl);
- __btree_write_done(cl);
+ __btree_node_write_done(cl);
}
}
-static void __btree_write(struct btree *b)
+void bch_btree_node_write(struct btree *b, struct closure *parent)
{
struct bset *i = b->sets[b->nsets].data;
+ trace_bcache_btree_write(b);
+
BUG_ON(current->bio_list);
+ BUG_ON(b->written >= btree_blocks(b));
+ BUG_ON(b->written && !i->keys);
+ BUG_ON(b->sets->data->seq != i->seq);
+ bch_check_key_order(b, i);
- closure_lock(&b->io, &b->c->cl);
cancel_delayed_work(&b->work);
+ /* If caller isn't waiting for write, parent refcount is cache set */
+ closure_lock(&b->io, parent ?: &b->c->cl);
+
clear_bit(BTREE_NODE_dirty, &b->flags);
change_bit(BTREE_NODE_write_idx, &b->flags);
- bch_check_key_order(b, i);
- BUG_ON(b->written && !i->keys);
-
- do_btree_write(b);
-
- pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+ do_btree_node_write(b);
b->written += set_blocks(i, b->c);
atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
bch_bset_init_next(b);
}
-static void btree_write_work(struct work_struct *w)
+static void btree_node_write_work(struct work_struct *w)
{
struct btree *b = container_of(to_delayed_work(w), struct btree, work);
- down_write(&b->lock);
+ rw_lock(true, b, b->level);
if (btree_node_dirty(b))
- __btree_write(b);
- up_write(&b->lock);
+ bch_btree_node_write(b, NULL);
+ rw_unlock(true, b);
}
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
{
struct bset *i = b->sets[b->nsets].data;
struct btree_write *w = btree_current_write(b);
- BUG_ON(b->written &&
- (b->written >= btree_blocks(b) ||
- i->seq != b->sets[0].data->seq ||
- !i->keys));
+ BUG_ON(!b->written);
+ BUG_ON(!i->keys);
- if (!btree_node_dirty(b)) {
- set_btree_node_dirty(b);
- queue_delayed_work(btree_io_wq, &b->work,
- msecs_to_jiffies(30000));
- }
+ if (!btree_node_dirty(b))
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
- w->prio_blocked += b->prio_blocked;
- b->prio_blocked = 0;
+ set_btree_node_dirty(b);
- if (op && op->journal && !b->level) {
+ if (op && op->journal) {
if (w->journal &&
journal_pin_cmp(b->c, w, op)) {
atomic_dec_bug(w->journal);
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
}
}
- if (current->bio_list)
- return;
-
/* Force write if set is too big */
- if (now ||
- b->level ||
- set_bytes(i) > PAGE_SIZE - 48) {
- if (op && now) {
- /* Must wait on multiple writes */
- BUG_ON(w->owner);
- w->owner = &op->cl;
- closure_get(&op->cl);
- }
-
- __btree_write(b);
- }
- BUG_ON(!b->written);
+ if (set_bytes(i) > PAGE_SIZE - 48 &&
+ !current->bio_list)
+ bch_btree_node_write(b, NULL);
}
/*
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
init_rwsem(&b->lock);
lockdep_set_novalidate_class(&b->lock);
INIT_LIST_HEAD(&b->list);
- INIT_DELAYED_WORK(&b->work, btree_write_work);
+ INIT_DELAYED_WORK(&b->work, btree_node_write_work);
b->c = c;
closure_init_unlocked(&b->io);
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
if (cl && btree_node_dirty(b))
- bch_btree_write(b, true, NULL);
+ bch_btree_node_write(b, NULL);
if (cl)
closure_wait_event_async(&b->io.wait, cl,
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
else if (!mutex_trylock(&c->bucket_lock))
return -1;
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
nr /= c->btree_pages;
nr = min_t(unsigned long, nr, mca_can_free(c));
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
int ret = -ENOMEM;
struct btree *i;
+ trace_bcache_btree_cache_cannibalize(c);
+
if (!cl)
return ERR_PTR(-ENOMEM);
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
return ERR_PTR(-EAGAIN);
}
- /* XXX: tracepoint */
c->try_harder = cl;
c->try_harder_start = local_clock();
retry:
@@ -905,6 +912,9 @@ retry:
b = mca_find(c, k);
if (!b) {
+ if (current->bio_list)
+ return ERR_PTR(-EAGAIN);
+
mutex_lock(&c->bucket_lock);
b = mca_alloc(c, k, level, &op->cl);
mutex_unlock(&c->bucket_lock);
@@ -914,7 +924,7 @@ retry:
if (IS_ERR(b))
return b;
- bch_btree_read(b);
+ bch_btree_node_read(b);
if (!write)
downgrade_write(&b->lock);
@@ -937,15 +947,12 @@ retry:
for (; i <= b->nsets; i++)
prefetch(b->sets[i].data);
- if (!closure_wait_event(&b->io.wait, &op->cl,
- btree_node_read_done(b))) {
- rw_unlock(write, b);
- b = ERR_PTR(-EAGAIN);
- } else if (btree_node_io_error(b)) {
+ if (btree_node_io_error(b)) {
rw_unlock(write, b);
- b = ERR_PTR(-EIO);
- } else
- BUG_ON(!b->written);
+ return ERR_PTR(-EIO);
+ }
+
+ BUG_ON(!b->written);
return b;
}
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
mutex_unlock(&c->bucket_lock);
if (!IS_ERR_OR_NULL(b)) {
- bch_btree_read(b);
+ bch_btree_node_read(b);
rw_unlock(true, b);
}
}
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
{
unsigned i;
+ trace_bcache_btree_node_free(b);
+
/*
* The BUG_ON() in btree_node_get() implies that we must have a write
* lock on parent to free or even invalidate a node
*/
BUG_ON(op->lock <= b->level);
BUG_ON(b == b->c->root);
- pr_debug("bucket %s", pbtree(b));
if (btree_node_dirty(b))
btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags);
- if (b->prio_blocked &&
- !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
- wake_up(&b->c->alloc_wait);
-
- b->prio_blocked = 0;
-
cancel_delayed_work(&b->work);
mutex_lock(&b->c->bucket_lock);
@@ -1028,17 +1030,20 @@ retry:
goto retry;
}
- set_btree_node_read_done(b);
b->accessed = 1;
bch_bset_init_next(b);
mutex_unlock(&c->bucket_lock);
+
+ trace_bcache_btree_node_alloc(b);
return b;
err_free:
bch_bucket_free(c, &k.key);
__bkey_put(c, &k.key);
err:
mutex_unlock(&c->bucket_lock);
+
+ trace_bcache_btree_node_alloc_fail(b);
return b;
}
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
gc->nkeys++;
gc->data += KEY_SIZE(k);
- if (KEY_DIRTY(k)) {
+ if (KEY_DIRTY(k))
gc->dirty += KEY_SIZE(k);
- if (d)
- d->sectors_dirty_gc += KEY_SIZE(k);
- }
}
for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
if (!IS_ERR_OR_NULL(n)) {
swap(b, n);
+ __bkey_put(b->c, &b->key);
memcpy(k->ptr, b->key.ptr,
sizeof(uint64_t) * KEY_PTRS(&b->key));
- __bkey_put(b->c, &b->key);
- atomic_inc(&b->c->prio_blocked);
- b->prio_blocked++;
-
btree_node_free(n, op);
up_write(&n->lock);
}
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
btree_node_free(r->b, op);
up_write(&r->b->lock);
- pr_debug("coalesced %u nodes", nodes);
+ trace_bcache_btree_gc_coalesce(nodes);
gc->nodes--;
nodes--;
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
void write(struct btree *r)
{
if (!r->written)
- bch_btree_write(r, true, op);
- else if (btree_node_dirty(r)) {
- BUG_ON(btree_current_write(r)->owner);
- btree_current_write(r)->owner = writes;
- closure_get(writes);
-
- bch_btree_write(r, true, NULL);
- }
+ bch_btree_node_write(r, &op->cl);
+ else if (btree_node_dirty(r))
+ bch_btree_node_write(r, writes);
up_write(&r->lock);
}
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
ret = btree_gc_recurse(b, op, writes, gc);
if (!b->written || btree_node_dirty(b)) {
- atomic_inc(&b->c->prio_blocked);
- b->prio_blocked++;
- bch_btree_write(b, true, n ? op : NULL);
+ bch_btree_node_write(b, n ? &op->cl : NULL);
}
if (!IS_ERR_OR_NULL(n)) {
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
{
struct cache *ca;
struct bucket *b;
- struct bcache_device **d;
unsigned i;
if (!c->gc_mark_valid)
@@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c)
for_each_cache(ca, c, i)
for_each_bucket(b, ca) {
b->gc_gen = b->gen;
- if (!atomic_read(&b->pin))
+ if (!atomic_read(&b->pin)) {
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+ SET_GC_SECTORS_USED(b, 0);
+ }
}
- for (d = c->devices;
- d < c->devices + c->nr_uuids;
- d++)
- if (*d)
- (*d)->sectors_dirty_gc = 0;
-
mutex_unlock(&c->bucket_lock);
}
@@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
size_t available = 0;
struct bucket *b;
struct cache *ca;
- struct bcache_device **d;
unsigned i;
mutex_lock(&c->bucket_lock);
@@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
}
}
- for (d = c->devices;
- d < c->devices + c->nr_uuids;
- d++)
- if (*d) {
- unsigned long last =
- atomic_long_read(&((*d)->sectors_dirty));
- long difference = (*d)->sectors_dirty_gc - last;
-
- pr_debug("sectors dirty off by %li", difference);
-
- (*d)->sectors_dirty_last += difference;
-
- atomic_long_set(&((*d)->sectors_dirty),
- (*d)->sectors_dirty_gc);
- }
-
mutex_unlock(&c->bucket_lock);
return available;
}
@@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl)
struct gc_stat stats;
struct closure writes;
struct btree_op op;
-
uint64_t start_time = local_clock();
- trace_bcache_gc_start(c->sb.set_uuid);
- blktrace_msg_all(c, "Starting gc");
+
+ trace_bcache_gc_start(c);
memset(&stats, 0, sizeof(struct gc_stat));
closure_init_stack(&writes);
@@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl)
btree_gc_start(c);
+ atomic_inc(&c->prio_blocked);
+
ret = btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&op.cl);
closure_sync(&writes);
if (ret) {
- blktrace_msg_all(c, "Stopped gc");
pr_warn("gc failed!");
-
continue_at(cl, bch_btree_gc, bch_gc_wq);
}
@@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl)
available = bch_btree_gc_finish(c);
+ atomic_dec(&c->prio_blocked);
+ wake_up_allocators(c);
+
bch_time_stats_update(&c->btree_gc_time, start_time);
stats.key_bytes *= sizeof(uint64_t);
@@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl)
stats.data <<= 9;
stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
- blktrace_msg_all(c, "Finished gc");
- trace_bcache_gc_end(c->sb.set_uuid);
- wake_up(&c->alloc_wait);
+ trace_bcache_gc_end(c);
continue_at(cl, bch_moving_gc, bch_gc_wq);
}
@@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b,
struct btree_iter *iter,
struct btree_op *op)
{
- void subtract_dirty(struct bkey *k, int sectors)
+ void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
{
- struct bcache_device *d = b->c->devices[KEY_INODE(k)];
-
- if (KEY_DIRTY(k) && d)
- atomic_long_sub(sectors, &d->sectors_dirty);
+ if (KEY_DIRTY(k))
+ bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+ offset, -sectors);
}
+ uint64_t old_offset;
unsigned old_size, sectors_found = 0;
while (1) {
@@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b,
if (bkey_cmp(k, &START_KEY(insert)) <= 0)
continue;
+ old_offset = KEY_START(k);
old_size = KEY_SIZE(k);
/*
@@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b,
struct bkey *top;
- subtract_dirty(k, KEY_SIZE(insert));
+ subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
if (bkey_written(b, k)) {
/*
@@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b,
}
}
- subtract_dirty(k, old_size - KEY_SIZE(k));
+ subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
}
check_failed:
@@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
{
struct bset *i = b->sets[b->nsets].data;
struct bkey *m, *prev;
- const char *status = "insert";
+ unsigned status = BTREE_INSERT_STATUS_INSERT;
BUG_ON(bkey_cmp(k, &b->key) > 0);
BUG_ON(b->level && !KEY_PTRS(k));
@@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
goto insert;
/* prev is in the tree, if we merge we're done */
- status = "back merging";
+ status = BTREE_INSERT_STATUS_BACK_MERGE;
if (prev &&
bch_bkey_try_merge(b, prev, k))
goto merged;
- status = "overwrote front";
+ status = BTREE_INSERT_STATUS_OVERWROTE;
if (m != end(i) &&
KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
goto copy;
- status = "front merge";
+ status = BTREE_INSERT_STATUS_FRONT_MERGE;
if (m != end(i) &&
bch_bkey_try_merge(b, k, m))
goto copy;
@@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
insert: shift_keys(b, m, k);
copy: bkey_copy(m, k);
merged:
- bch_check_keys(b, "%s for %s at %s: %s", status,
- op_type(op), pbtree(b), pkey(k));
- bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
- op_type(op), pbtree(b), pkey(k));
+ if (KEY_DIRTY(k))
+ bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+ KEY_START(k), KEY_SIZE(k));
+
+ bch_check_keys(b, "%u for %s", status, op_type(op));
if (b->level && !KEY_OFFSET(k))
- b->prio_blocked++;
+ btree_current_write(b)->prio_blocked++;
- pr_debug("%s for %s at %s: %s", status,
- op_type(op), pbtree(b), pkey(k));
+ trace_bcache_btree_insert_key(b, k, op->type, status);
return true;
}
-bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
{
bool ret = false;
struct bkey *k;
@@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
should_split(b))
goto out;
- op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
+ op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
SET_KEY_PTRS(&op->replace, 1);
get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
@@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
BUG_ON(op->type != BTREE_INSERT);
BUG_ON(!btree_insert_key(b, op, &tmp.k));
- bch_btree_write(b, false, NULL);
ret = true;
out:
downgrade_write(&b->lock);
@@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
- pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
- pbtree(b), n1->sets[0].data->keys);
-
if (split) {
unsigned keys = 0;
+ trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+
n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
if (IS_ERR(n2))
goto err_free1;
@@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
bkey_copy_key(&n2->key, &b->key);
bch_keylist_add(&op->keys, &n2->key);
- bch_btree_write(n2, true, op);
+ bch_btree_node_write(n2, &op->cl);
rw_unlock(true, n2);
- } else
+ } else {
+ trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+
bch_btree_insert_keys(n1, op);
+ }
bch_keylist_add(&op->keys, &n1->key);
- bch_btree_write(n1, true, op);
+ bch_btree_node_write(n1, &op->cl);
if (n3) {
bkey_copy_key(&n3->key, &MAX_KEY);
bch_btree_insert_keys(n3, op);
- bch_btree_write(n3, true, op);
+ bch_btree_node_write(n3, &op->cl);
closure_sync(&op->cl);
bch_btree_set_root(n3);
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
BUG_ON(write_block(b) != b->sets[b->nsets].data);
- if (bch_btree_insert_keys(b, op))
- bch_btree_write(b, false, op);
+ if (bch_btree_insert_keys(b, op)) {
+ if (!b->level)
+ bch_btree_leaf_dirty(b, op);
+ else
+ bch_btree_node_write(b, &op->cl);
+ }
}
return 0;
@@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
void bch_btree_set_root(struct btree *b)
{
unsigned i;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ trace_bcache_btree_set_root(b);
BUG_ON(!b->written);
@@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b)
b->c->root = b;
__bkey_put(b->c, &b->key);
- bch_journal_meta(b->c, NULL);
- pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
+ bch_journal_meta(b->c, &cl);
+ closure_sync(&cl);
}
/* Cache lookup */
@@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
KEY_OFFSET(k) - bio->bi_sector);
n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
- if (!n)
- return -EAGAIN;
-
if (n == bio)
op->lookup_done = true;
@@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
n->bi_end_io = bch_cache_read_endio;
n->bi_private = &s->cl;
- trace_bcache_cache_hit(n);
__bch_submit_bbio(n, b->c);
}
@@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
struct btree_iter iter;
bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
- pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
- (uint64_t) bio->bi_sector);
-
do {
k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
if (!k) {
@@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
}
static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
- struct keybuf *buf, struct bkey *end)
+ struct keybuf *buf, struct bkey *end,
+ keybuf_pred_fn *pred)
{
struct btree_iter iter;
bch_btree_iter_init(b, &iter, &buf->last_scanned);
@@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
if (bkey_cmp(&buf->last_scanned, end) >= 0)
break;
- if (buf->key_predicate(buf, k)) {
+ if (pred(buf, k)) {
struct keybuf_key *w;
- pr_debug("%s", pkey(k));
-
spin_lock(&buf->lock);
w = array_alloc(&buf->freelist);
@@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
if (!k)
break;
- btree(refill_keybuf, k, b, op, buf, end);
+ btree(refill_keybuf, k, b, op, buf, end, pred);
/*
* Might get an error here, but can't really do anything
* and it'll get logged elsewhere. Just read what we
@@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
}
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
- struct bkey *end)
+ struct bkey *end, keybuf_pred_fn *pred)
{
struct bkey start = buf->last_scanned;
struct btree_op op;
@@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
cond_resched();
- btree_root(refill_keybuf, c, &op, buf, end);
+ btree_root(refill_keybuf, c, &op, buf, end, pred);
closure_sync(&op.cl);
pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
struct keybuf *buf,
- struct bkey *end)
+ struct bkey *end,
+ keybuf_pred_fn *pred)
{
struct keybuf_key *ret;
@@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
break;
}
- bch_refill_keybuf(c, buf, end);
+ bch_refill_keybuf(c, buf, end, pred);
}
return ret;
}
-void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
+void bch_keybuf_init(struct keybuf *buf)
{
- buf->key_predicate = fn;
buf->last_scanned = MAX_KEY;
buf->keys = RB_ROOT;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af4a7092a28c..3333d3723633 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,7 +102,6 @@
#include "debug.h"
struct btree_write {
- struct closure *owner;
atomic_t *journal;
/* If btree_split() frees a btree node, it writes a new pointer to that
@@ -142,16 +141,12 @@ struct btree {
*/
struct bset_tree sets[MAX_BSETS];
- /* Used to refcount bio splits, also protects b->bio */
+ /* For outstanding btree writes, used as a lock - protects write_idx */
struct closure_with_waitlist io;
- /* Gets transferred to w->prio_blocked - see the comment there */
- int prio_blocked;
-
struct list_head list;
struct delayed_work work;
- uint64_t io_start_time;
struct btree_write writes[2];
struct bio *bio;
};
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
enum btree_flags {
- BTREE_NODE_read_done,
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
};
-BTREE_FLAG(read_done);
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
@@ -278,6 +271,13 @@ struct btree_op {
BKEY_PADDED(replace);
};
+enum {
+ BTREE_INSERT_STATUS_INSERT,
+ BTREE_INSERT_STATUS_BACK_MERGE,
+ BTREE_INSERT_STATUS_OVERWROTE,
+ BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
void bch_btree_op_init_stack(struct btree_op *);
static inline void rw_lock(bool w, struct btree *b, int level)
@@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
#ifdef CONFIG_BCACHE_EDEBUG
unsigned i;
- if (w &&
- b->key.ptr[0] &&
- btree_node_read_done(b))
+ if (w && b->key.ptr[0])
for (i = 0; i <= b->nsets; i++)
bch_check_key_order(b, b->sets[i].data);
#endif
@@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
> btree_blocks(b));
}
-void bch_btree_read_done(struct closure *);
-void bch_btree_read(struct btree *);
-void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
+void bch_btree_node_read(struct btree *);
+void bch_btree_node_write(struct btree *, struct closure *);
void bch_cannibalize_unlock(struct cache_set *, struct closure *);
void bch_btree_set_root(struct btree *);
@@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
int, struct btree_op *);
-bool bch_btree_insert_keys(struct btree *, struct btree_op *);
bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
struct bio *);
int bch_btree_insert(struct btree_op *, struct cache_set *);
@@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
int bch_btree_check(struct cache_set *, struct btree_op *);
uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
-void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
-void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
+ keybuf_pred_fn *);
bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
struct bkey *);
void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
-struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
- struct keybuf *, struct bkey *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
+ struct bkey *, keybuf_pred_fn *);
#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index bd05a9a8c7cf..9aba2017f0d1 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
} else {
struct closure *parent = cl->parent;
struct closure_waitlist *wait = closure_waitlist(cl);
+ closure_fn *destructor = cl->fn;
closure_debug_destroy(cl);
+ smp_mb();
atomic_set(&cl->remaining, -1);
if (wait)
closure_wake_up(wait);
- if (cl->fn)
- cl->fn(cl);
+ if (destructor)
+ destructor(cl);
if (parent)
closure_put(parent);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 89fd5204924e..88e6411eab4f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
return "";
}
-struct keyprint_hack bch_pkey(const struct bkey *k)
+int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
unsigned i = 0;
- struct keyprint_hack r;
- char *out = r.s, *end = r.s + KEYHACK_SIZE;
+ char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
@@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
if (KEY_CSUM(k))
p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
- return r;
+ return out - buf;
}
-struct keyprint_hack bch_pbtree(const struct btree *b)
+int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
{
- struct keyprint_hack r;
-
- snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
- b->level, b->c->root ? b->c->root->level : -1);
- return r;
+ return scnprintf(buf, size, "%zu level %i/%i",
+ PTR_BUCKET_NR(b->c, &b->key, 0),
+ b->level, b->c->root ? b->c->root->level : -1);
}
#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
@@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
{
struct bkey *k;
unsigned j;
+ char buf[80];
for (k = i->start; k < end(i); k = bkey_next(k)) {
+ bch_bkey_to_text(buf, sizeof(buf), k);
printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
- (uint64_t *) k - i->d, i->keys, pkey(k));
+ (uint64_t *) k - i->d, i->keys, buf);
for (j = 0; j < KEY_PTRS(k); j++) {
size_t n = PTR_BUCKET_NR(b->c, k, j);
@@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
v->written = 0;
v->level = b->level;
- bch_btree_read(v);
+ bch_btree_node_read(v);
closure_wait_event(&v->io.wait, &cl,
atomic_read(&b->io.cl.remaining) == -1);
@@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
if (!check)
return;
- if (bch_bio_alloc_pages(check, GFP_NOIO))
+ if (bio_alloc_pages(check, GFP_NOIO))
goto out_put;
check->bi_rw = READ_SYNC;
@@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
va_list args)
{
unsigned i;
+ char buf[80];
console_lock();
@@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
console_unlock();
- panic("at %s\n", pbtree(b));
+ bch_btree_to_text(buf, sizeof(buf), b);
+ panic("at %s\n", buf);
}
void bch_check_key_order_msg(struct btree *b, struct bset *i,
@@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
{
struct dump_iterator *i = file->private_data;
ssize_t ret = 0;
+ char kbuf[80];
while (size) {
struct keybuf_key *w;
@@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
if (i->bytes)
break;
- w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
+ w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
if (!w)
break;
- i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
+ bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
+ i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
bch_keybuf_del(&i->keys, w);
}
@@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
file->private_data = i;
i->c = c;
- bch_keybuf_init(&i->keys, dump_pred);
+ bch_keybuf_init(&i->keys);
i->keys.last_scanned = KEY(0, 0, 0);
return 0;
@@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)
#endif
-/* Fuzz tester has rotted: */
-#if 0
-
-static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
- const char *buffer, size_t size)
-{
- void dump(struct btree *b)
- {
- struct bset *i;
-
- for (i = b->sets[0].data;
- index(i, b) < btree_blocks(b) &&
- i->seq == b->sets[0].data->seq;
- i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
- dump_bset(b, i);
- }
-
- struct cache_sb *sb;
- struct cache_set *c;
- struct btree *all[3], *b, *fill, *orig;
- int j;
-
- struct btree_op op;
- bch_btree_op_init_stack(&op);
-
- sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
- if (!sb)
- return -ENOMEM;
-
- sb->bucket_size = 128;
- sb->block_size = 4;
-
- c = bch_cache_set_alloc(sb);
- if (!c)
- return -ENOMEM;
-
- for (j = 0; j < 3; j++) {
- BUG_ON(list_empty(&c->btree_cache));
- all[j] = list_first_entry(&c->btree_cache, struct btree, list);
- list_del_init(&all[j]->list);
-
- all[j]->key = KEY(0, 0, c->sb.bucket_size);
- bkey_copy_key(&all[j]->key, &MAX_KEY);
- }
-
- b = all[0];
- fill = all[1];
- orig = all[2];
-
- while (1) {
- for (j = 0; j < 3; j++)
- all[j]->written = all[j]->nsets = 0;
-
- bch_bset_init_next(b);
-
- while (1) {
- struct bset *i = write_block(b);
- struct bkey *k = op.keys.top;
- unsigned rand;
-
- bkey_init(k);
- rand = get_random_int();
-
- op.type = rand & 1
- ? BTREE_INSERT
- : BTREE_REPLACE;
- rand >>= 1;
-
- SET_KEY_SIZE(k, bucket_remainder(c, rand));
- rand >>= c->bucket_bits;
- rand &= 1024 * 512 - 1;
- rand += c->sb.bucket_size;
- SET_KEY_OFFSET(k, rand);
-#if 0
- SET_KEY_PTRS(k, 1);
-#endif
- bch_keylist_push(&op.keys);
- bch_btree_insert_keys(b, &op);
-
- if (should_split(b) ||
- set_blocks(i, b->c) !=
- __set_blocks(i, i->keys + 15, b->c)) {
- i->csum = csum_set(i);
-
- memcpy(write_block(fill),
- i, set_bytes(i));
-
- b->written += set_blocks(i, b->c);
- fill->written = b->written;
- if (b->written == btree_blocks(b))
- break;
-
- bch_btree_sort_lazy(b);
- bch_bset_init_next(b);
- }
- }
-
- memcpy(orig->sets[0].data,
- fill->sets[0].data,
- btree_bytes(c));
-
- bch_btree_sort(b);
- fill->written = 0;
- bch_btree_read_done(&fill->io.cl);
-
- if (b->sets[0].data->keys != fill->sets[0].data->keys ||
- memcmp(b->sets[0].data->start,
- fill->sets[0].data->start,
- b->sets[0].data->keys * sizeof(uint64_t))) {
- struct bset *i = b->sets[0].data;
- struct bkey *k, *l;
-
- for (k = i->start,
- l = fill->sets[0].data->start;
- k < end(i);
- k = bkey_next(k), l = bkey_next(l))
- if (bkey_cmp(k, l) ||
- KEY_SIZE(k) != KEY_SIZE(l))
- pr_err("key %zi differs: %s != %s",
- (uint64_t *) k - i->d,
- pkey(k), pkey(l));
-
- for (j = 0; j < 3; j++) {
- pr_err("**** Set %i ****", j);
- dump(all[j]);
- }
- panic("\n");
- }
-
- pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
- }
-}
-
-kobj_attribute_write(fuzz, btree_fuzz);
-#endif
-
void bch_debug_exit(void)
{
if (!IS_ERR_OR_NULL(debug))
@@ -554,11 +421,6 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
int ret = 0;
-#if 0
- ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
- if (ret)
- return ret;
-#endif
debug = debugfs_create_dir("bcache", NULL);
return ret;
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index f9378a218148..1c39b5a2489b 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -3,15 +3,8 @@
/* Btree/bkey debug printing */
-#define KEYHACK_SIZE 80
-struct keyprint_hack {
- char s[KEYHACK_SIZE];
-};
-
-struct keyprint_hack bch_pkey(const struct bkey *k);
-struct keyprint_hack bch_pbtree(const struct btree *b);
-#define pkey(k) (&bch_pkey(k).s[0])
-#define pbtree(b) (&bch_pbtree(b).s[0])
+int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
+int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
#ifdef CONFIG_BCACHE_EDEBUG
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 48efd4dea645..9056632995b1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -9,6 +9,8 @@
#include "bset.h"
#include "debug.h"
+#include <linux/blkdev.h>
+
static void bch_bi_idx_hack_endio(struct bio *bio, int error)
{
struct bio *p = bio->bi_private;
@@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
* The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
* bvec boundry; it is the caller's responsibility to ensure that @bio is not
* freed before the split.
- *
- * If bch_bio_split() is running under generic_make_request(), it's not safe to
- * allocate more than one bio from the same bio set. Therefore, if it is running
- * under generic_make_request() it masks out __GFP_WAIT when doing the
- * allocation. The caller must check for failure if there's any possibility of
- * it being called from under generic_make_request(); it is then the caller's
- * responsibility to retry from a safe context (by e.g. punting to workqueue).
*/
struct bio *bch_bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
@@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
- /*
- * If we're being called from underneath generic_make_request() and we
- * already allocated any bios from this bio set, we risk deadlock if we
- * use the mempool. So instead, we possibly fail and let the caller punt
- * to workqueue or somesuch and retry in a safe context.
- */
- if (current->bio_list)
- gfp &= ~__GFP_WAIT;
-
if (sectors >= bio_sectors(bio))
return bio;
if (bio->bi_rw & REQ_DISCARD) {
ret = bio_alloc_bioset(gfp, 1, bs);
+ if (!ret)
+ return NULL;
idx = 0;
goto out;
}
@@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
queue_max_segments(q));
- struct bio_vec *bv, *end = bio_iovec(bio) +
- min_t(int, bio_segments(bio), max_segments);
if (bio->bi_rw & REQ_DISCARD)
return min(ret, q->limits.max_discard_sectors);
if (bio_segments(bio) > max_segments ||
q->merge_bvec_fn) {
+ struct bio_vec *bv;
+ int i, seg = 0;
+
ret = 0;
- for (bv = bio_iovec(bio); bv < end; bv++) {
+ bio_for_each_segment(bv, bio, i) {
struct bvec_merge_data bvm = {
.bi_bdev = bio->bi_bdev,
.bi_sector = bio->bi_sector,
@@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
.bi_rw = bio->bi_rw,
};
+ if (seg == max_segments)
+ break;
+
if (q->merge_bvec_fn &&
q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
break;
+ seg++;
ret += bv->bv_len >> 9;
}
}
@@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
closure_put(cl);
}
-static void __bch_bio_submit_split(struct closure *cl)
-{
- struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
- struct bio *bio = s->bio, *n;
-
- do {
- n = bch_bio_split(bio, bch_bio_max_sectors(bio),
- GFP_NOIO, s->p->bio_split);
- if (!n)
- continue_at(cl, __bch_bio_submit_split, system_wq);
-
- n->bi_end_io = bch_bio_submit_split_endio;
- n->bi_private = cl;
-
- closure_get(cl);
- bch_generic_make_request_hack(n);
- } while (n != bio);
-
- continue_at(cl, bch_bio_submit_split_done, NULL);
-}
-
void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
{
struct bio_split_hook *s;
+ struct bio *n;
if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
goto submit;
@@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
goto submit;
s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
+ closure_init(&s->cl, NULL);
s->bio = bio;
s->p = p;
@@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
s->bi_private = bio->bi_private;
bio_get(bio);
- closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
- return;
+ do {
+ n = bch_bio_split(bio, bch_bio_max_sectors(bio),
+ GFP_NOIO, s->p->bio_split);
+
+ n->bi_end_io = bch_bio_submit_split_endio;
+ n->bi_private = &s->cl;
+
+ closure_get(&s->cl);
+ bch_generic_make_request_hack(n);
+ } while (n != bio);
+
+ continue_at(&s->cl, bch_bio_submit_split_done, NULL);
submit:
bch_generic_make_request_hack(bio);
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..ba95ab84b2be 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -9,6 +9,8 @@
#include "debug.h"
#include "request.h"
+#include <trace/events/bcache.h>
+
/*
* Journal replay/recovery:
*
@@ -182,9 +184,14 @@ bsearch:
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
+ seq = list_entry(list->prev, struct journal_replay,
+ list)->j.seq;
+
m = (l + r) >> 1;
+ read_bucket(m);
- if (read_bucket(m))
+ if (seq != list_entry(list->prev, struct journal_replay,
+ list)->j.seq)
l = m;
else
r = m;
@@ -300,7 +307,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
for (k = i->j.start;
k < end(&i->j);
k = bkey_next(k)) {
- pr_debug("%s", pkey(k));
+ trace_bcache_journal_replay_key(k);
+
bkey_copy(op->keys.top, k);
bch_keylist_push(&op->keys);
@@ -384,7 +392,7 @@ out:
return;
found:
if (btree_node_dirty(best))
- bch_btree_write(best, true, NULL);
+ bch_btree_node_write(best, NULL);
rw_unlock(true, best);
}
@@ -617,7 +625,7 @@ static void journal_write_unlocked(struct closure *cl)
bio_reset(bio);
bio->bi_sector = PTR_OFFSET(k, i);
bio->bi_bdev = ca->bdev;
- bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
+ bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
bio->bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
@@ -712,7 +720,8 @@ void bch_journal(struct closure *cl)
spin_lock(&c->journal.lock);
if (journal_full(&c->journal)) {
- /* XXX: tracepoint */
+ trace_bcache_journal_full(c);
+
closure_wait(&c->journal.wait, cl);
journal_reclaim(c);
@@ -728,13 +737,15 @@ void bch_journal(struct closure *cl)
if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
b > c->journal.blocks_free) {
- /* XXX: If we were inserting so many keys that they won't fit in
+ trace_bcache_journal_entry_full(c);
+
+ /*
+ * XXX: If we were inserting so many keys that they won't fit in
* an _empty_ journal write, we'll deadlock. For now, handle
* this in bch_keylist_realloc() - but something to think about.
*/
BUG_ON(!w->data->keys);
- /* XXX: tracepoint */
BUG_ON(!closure_wait(&w->wait, cl));
closure_flush(&c->journal.io);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 8589512c972e..1a3b4f4786c3 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -9,6 +9,8 @@
#include "debug.h"
#include "request.h"
+#include <trace/events/bcache.h>
+
struct moving_io {
struct keybuf_key *w;
struct search s;
@@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, s.cl);
struct bio *bio = &io->bio.bio;
- struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
+ struct bio_vec *bv;
+ int i;
- while (bv-- != bio->bi_io_vec)
+ bio_for_each_segment_all(bv, bio, i)
__free_page(bv->bv_page);
- pr_debug("%s %s", io->s.op.insert_collision
- ? "collision moving" : "moved",
- pkey(&io->w->key));
+ if (io->s.op.insert_collision)
+ trace_bcache_gc_copy_collision(&io->w->key);
bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
@@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
struct moving_io *io = container_of(s, struct moving_io, s);
if (!s->error) {
- trace_bcache_write_moving(&io->bio.bio);
-
moving_init(io);
io->bio.bio.bi_sector = KEY_START(&io->w->key);
@@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
struct moving_io *io = container_of(s, struct moving_io, s);
struct bio *bio = &io->bio.bio;
- trace_bcache_read_moving(bio);
bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
continue_at(cl, write_moving, bch_gc_wq);
@@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
/* XXX: if we error, background writeback could stall indefinitely */
while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
- w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
+ w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
+ &MAX_KEY, moving_pred);
if (!w)
break;
@@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
bio->bi_rw = READ;
bio->bi_end_io = read_moving_endio;
- if (bch_bio_alloc_pages(bio, GFP_KERNEL))
+ if (bio_alloc_pages(bio, GFP_KERNEL))
goto err;
- pr_debug("%s", pkey(&w->key));
+ trace_bcache_gc_copy(&w->key);
closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
@@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)
void bch_moving_init_cache_set(struct cache_set *c)
{
- bch_keybuf_init(&c->moving_gc_keys, moving_pred);
+ bch_keybuf_init(&c->moving_gc_keys);
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e5ff12e52d5b..786a1a4f74d8 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -10,6 +10,7 @@
#include "btree.h"
#include "debug.h"
#include "request.h"
+#include "writeback.h"
#include <linux/cgroup.h>
#include <linux/module.h>
@@ -21,8 +22,6 @@
#define CUTOFF_CACHE_ADD 95
#define CUTOFF_CACHE_READA 90
-#define CUTOFF_WRITEBACK 50
-#define CUTOFF_WRITEBACK_SYNC 75
struct kmem_cache *bch_search_cache;
@@ -489,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl)
bch_queue_gc(op->c);
}
+ /*
+ * Journal writes are marked REQ_FLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
+
do {
unsigned i;
struct bkey *k;
@@ -510,10 +515,6 @@ static void bch_insert_data_loop(struct closure *cl)
goto err;
n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
- if (!n) {
- __bkey_put(op->c, k);
- continue_at(cl, bch_insert_data_loop, bcache_wq);
- }
n->bi_end_io = bch_insert_data_endio;
n->bi_private = cl;
@@ -530,10 +531,9 @@ static void bch_insert_data_loop(struct closure *cl)
if (KEY_CSUM(k))
bio_csum(n, k);
- pr_debug("%s", pkey(k));
+ trace_bcache_cache_insert(k);
bch_keylist_push(&op->keys);
- trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
n->bi_rw |= REQ_WRITE;
bch_submit_bbio(n, op->c, k, 0);
} while (n != bio);
@@ -716,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
s->task = current;
s->orig_bio = bio;
s->write = (bio->bi_rw & REQ_WRITE) != 0;
- s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
+ s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
s->recoverable = 1;
s->start_time = jiffies;
@@ -784,11 +784,8 @@ static void request_read_error(struct closure *cl)
int i;
if (s->recoverable) {
- /* The cache read failed, but we can retry from the backing
- * device.
- */
- pr_debug("recovering at sector %llu",
- (uint64_t) s->orig_bio->bi_sector);
+ /* Retry from the backing device: */
+ trace_bcache_read_retry(s->orig_bio);
s->error = 0;
bv = s->bio.bio.bi_io_vec;
@@ -806,7 +803,6 @@ static void request_read_error(struct closure *cl)
/* XXX: invalidate cache */
- trace_bcache_read_retry(&s->bio.bio);
closure_bio_submit(&s->bio.bio, &s->cl, s->d);
}
@@ -827,53 +823,13 @@ static void request_read_done(struct closure *cl)
*/
if (s->op.cache_bio) {
- struct bio_vec *src, *dst;
- unsigned src_offset, dst_offset, bytes;
- void *dst_ptr;
-
bio_reset(s->op.cache_bio);
s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
bch_bio_map(s->op.cache_bio, NULL);
- src = bio_iovec(s->op.cache_bio);
- dst = bio_iovec(s->cache_miss);
- src_offset = src->bv_offset;
- dst_offset = dst->bv_offset;
- dst_ptr = kmap(dst->bv_page);
-
- while (1) {
- if (dst_offset == dst->bv_offset + dst->bv_len) {
- kunmap(dst->bv_page);
- dst++;
- if (dst == bio_iovec_idx(s->cache_miss,
- s->cache_miss->bi_vcnt))
- break;
-
- dst_offset = dst->bv_offset;
- dst_ptr = kmap(dst->bv_page);
- }
-
- if (src_offset == src->bv_offset + src->bv_len) {
- src++;
- if (src == bio_iovec_idx(s->op.cache_bio,
- s->op.cache_bio->bi_vcnt))
- BUG();
-
- src_offset = src->bv_offset;
- }
-
- bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
- src->bv_offset + src->bv_len - src_offset);
-
- memcpy(dst_ptr + dst_offset,
- page_address(src->bv_page) + src_offset,
- bytes);
-
- src_offset += bytes;
- dst_offset += bytes;
- }
+ bio_copy_data(s->cache_miss, s->op.cache_bio);
bio_put(s->cache_miss);
s->cache_miss = NULL;
@@ -899,6 +855,7 @@ static void request_read_done_bh(struct closure *cl)
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
+ trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
if (s->error)
continue_at_nobarrier(cl, request_read_error, bcache_wq);
@@ -917,9 +874,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
struct bio *miss;
miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
- if (!miss)
- return -EAGAIN;
-
if (miss == bio)
s->op.lookup_done = true;
@@ -938,8 +892,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
reada = min(dc->readahead >> 9,
sectors - bio_sectors(miss));
- if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
- reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
+ if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
+ reada = bdev_sectors(miss->bi_bdev) -
+ bio_end_sector(miss);
}
s->cache_bio_sectors = bio_sectors(miss) + reada;
@@ -963,13 +918,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
goto out_put;
bch_bio_map(s->op.cache_bio, NULL);
- if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
+ if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
goto out_put;
s->cache_miss = miss;
bio_get(s->op.cache_bio);
- trace_bcache_cache_miss(s->orig_bio);
closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
return ret;
@@ -1002,24 +956,13 @@ static void cached_dev_write_complete(struct closure *cl)
cached_dev_bio_complete(cl);
}
-static bool should_writeback(struct cached_dev *dc, struct bio *bio)
-{
- unsigned threshold = (bio->bi_rw & REQ_SYNC)
- ? CUTOFF_WRITEBACK_SYNC
- : CUTOFF_WRITEBACK;
-
- return !atomic_read(&dc->disk.detaching) &&
- cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
- dc->disk.c->gc_stats.in_use < threshold;
-}
-
static void request_write(struct cached_dev *dc, struct search *s)
{
struct closure *cl = &s->cl;
struct bio *bio = &s->bio.bio;
struct bkey start, end;
start = KEY(dc->disk.id, bio->bi_sector, 0);
- end = KEY(dc->disk.id, bio_end(bio), 0);
+ end = KEY(dc->disk.id, bio_end_sector(bio), 0);
bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
@@ -1034,22 +977,37 @@ static void request_write(struct cached_dev *dc, struct search *s)
if (bio->bi_rw & REQ_DISCARD)
goto skip;
+ if (should_writeback(dc, s->orig_bio,
+ cache_mode(dc, bio),
+ s->op.skip)) {
+ s->op.skip = false;
+ s->writeback = true;
+ }
+
if (s->op.skip)
goto skip;
- if (should_writeback(dc, s->orig_bio))
- s->writeback = true;
+ trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
if (!s->writeback) {
s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
dc->disk.bio_split);
- trace_bcache_writethrough(s->orig_bio);
closure_bio_submit(bio, cl, s->d);
} else {
- s->op.cache_bio = bio;
- trace_bcache_writeback(s->orig_bio);
- bch_writeback_add(dc, bio_sectors(bio));
+ bch_writeback_add(dc);
+
+ if (s->op.flush_journal) {
+ /* Also need to send a flush to the backing device */
+ s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
+ dc->disk.bio_split);
+
+ bio->bi_size = 0;
+ bio->bi_vcnt = 0;
+ closure_bio_submit(bio, cl, s->d);
+ } else {
+ s->op.cache_bio = bio;
+ }
}
out:
closure_call(&s->op.cl, bch_insert_data, NULL, cl);
@@ -1058,7 +1016,6 @@ skip:
s->op.skip = true;
s->op.cache_bio = s->orig_bio;
bio_get(s->op.cache_bio);
- trace_bcache_write_skip(s->orig_bio);
if ((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1088,9 +1045,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
/* Cached devices - read & write stuff */
-int bch_get_congested(struct cache_set *c)
+unsigned bch_get_congested(struct cache_set *c)
{
int i;
+ long rand;
if (!c->congested_read_threshold_us &&
!c->congested_write_threshold_us)
@@ -1106,7 +1064,13 @@ int bch_get_congested(struct cache_set *c)
i += CONGESTED_MAX;
- return i <= 0 ? 1 : fract_exp_two(i, 6);
+ if (i > 0)
+ i = fract_exp_two(i, 6);
+
+ rand = get_random_int();
+ i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+ return i > 0 ? i : 1;
}
static void add_sequential(struct task_struct *t)
@@ -1126,10 +1090,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
{
struct cache_set *c = s->op.c;
struct bio *bio = &s->bio.bio;
-
- long rand;
- int cutoff = bch_get_congested(c);
unsigned mode = cache_mode(dc, bio);
+ unsigned sectors, congested = bch_get_congested(c);
if (atomic_read(&dc->disk.detaching) ||
c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -1147,17 +1109,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
goto skip;
}
- if (!cutoff) {
- cutoff = dc->sequential_cutoff >> 9;
+ if (!congested && !dc->sequential_cutoff)
+ goto rescale;
- if (!cutoff)
- goto rescale;
-
- if (mode == CACHE_MODE_WRITEBACK &&
- (bio->bi_rw & REQ_WRITE) &&
- (bio->bi_rw & REQ_SYNC))
- goto rescale;
- }
+ if (!congested &&
+ mode == CACHE_MODE_WRITEBACK &&
+ (bio->bi_rw & REQ_WRITE) &&
+ (bio->bi_rw & REQ_SYNC))
+ goto rescale;
if (dc->sequential_merge) {
struct io *i;
@@ -1177,7 +1136,7 @@ found:
if (i->sequential + bio->bi_size > i->sequential)
i->sequential += bio->bi_size;
- i->last = bio_end(bio);
+ i->last = bio_end_sector(bio);
i->jiffies = jiffies + msecs_to_jiffies(5000);
s->task->sequential_io = i->sequential;
@@ -1192,12 +1151,19 @@ found:
add_sequential(s->task);
}
- rand = get_random_int();
- cutoff -= bitmap_weight(&rand, BITS_PER_LONG);
+ sectors = max(s->task->sequential_io,
+ s->task->sequential_io_avg) >> 9;
- if (cutoff <= (int) (max(s->task->sequential_io,
- s->task->sequential_io_avg) >> 9))
+ if (dc->sequential_cutoff &&
+ sectors >= dc->sequential_cutoff >> 9) {
+ trace_bcache_bypass_sequential(s->orig_bio);
goto skip;
+ }
+
+ if (congested && sectors >= congested) {
+ trace_bcache_bypass_congested(s->orig_bio);
+ goto skip;
+ }
rescale:
bch_rescale_priorities(c, bio_sectors(bio));
@@ -1288,30 +1254,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
static int flash_dev_cache_miss(struct btree *b, struct search *s,
struct bio *bio, unsigned sectors)
{
+ struct bio_vec *bv;
+ int i;
+
/* Zero fill bio */
- while (bio->bi_idx != bio->bi_vcnt) {
- struct bio_vec *bv = bio_iovec(bio);
+ bio_for_each_segment(bv, bio, i) {
unsigned j = min(bv->bv_len >> 9, sectors);
void *p = kmap(bv->bv_page);
memset(p + bv->bv_offset, 0, j << 9);
kunmap(bv->bv_page);
- bv->bv_len -= j << 9;
- bv->bv_offset += j << 9;
-
- if (bv->bv_len)
- return 0;
-
- bio->bi_sector += j;
- bio->bi_size -= j << 9;
-
- bio->bi_idx++;
- sectors -= j;
+ sectors -= j;
}
- s->op.lookup_done = true;
+ bio_advance(bio, min(sectors << 9, bio->bi_size));
+
+ if (!bio->bi_size)
+ s->op.lookup_done = true;
return 0;
}
@@ -1338,8 +1299,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
closure_call(&s->op.cl, btree_read_async, NULL, cl);
} else if (bio_has_data(bio) || s->op.skip) {
bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
- &KEY(d->id, bio->bi_sector, 0),
- &KEY(d->id, bio_end(bio), 0));
+ &KEY(d->id, bio->bi_sector, 0),
+ &KEY(d->id, bio_end_sector(bio), 0));
s->writeback = true;
s->op.cache_bio = bio;
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 254d9ab5707c..57dc4784f4f4 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -30,7 +30,7 @@ struct search {
};
void bch_cache_read_endio(struct bio *, int);
-int bch_get_congested(struct cache_set *);
+unsigned bch_get_congested(struct cache_set *);
void bch_insert_data(struct closure *cl);
void bch_btree_insert_async(struct closure *);
void bch_cache_read_endio(struct bio *, int);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f88e2b653a3f..547c4c57b052 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,10 +10,13 @@
#include "btree.h"
#include "debug.h"
#include "request.h"
+#include "writeback.h"
+#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
+#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
@@ -342,6 +345,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
struct closure *cl = &c->uuid_write.cl;
struct uuid_entry *u;
unsigned i;
+ char buf[80];
BUG_ON(!parent);
closure_lock(&c->uuid_write, parent);
@@ -362,8 +366,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
break;
}
- pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
- pkey(&c->uuid_bucket));
+ bch_bkey_to_text(buf, sizeof(buf), k);
+ pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
if (!bch_is_zero(u->uuid, 16))
@@ -543,7 +547,6 @@ void bch_prio_write(struct cache *ca)
pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
fifo_used(&ca->free_inc), fifo_used(&ca->unused));
- blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
long bucket;
@@ -704,7 +707,8 @@ static void bcache_device_detach(struct bcache_device *d)
atomic_set(&d->detaching, 0);
}
- bcache_device_unlink(d);
+ if (!d->flush_done)
+ bcache_device_unlink(d);
d->c->devices[d->id] = NULL;
closure_put(&d->c->caching);
@@ -743,13 +747,35 @@ static void bcache_device_free(struct bcache_device *d)
mempool_destroy(d->unaligned_bvec);
if (d->bio_split)
bioset_free(d->bio_split);
+ if (is_vmalloc_addr(d->stripe_sectors_dirty))
+ vfree(d->stripe_sectors_dirty);
+ else
+ kfree(d->stripe_sectors_dirty);
closure_debug_destroy(&d->cl);
}
-static int bcache_device_init(struct bcache_device *d, unsigned block_size)
+static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+ sector_t sectors)
{
struct request_queue *q;
+ size_t n;
+
+ if (!d->stripe_size_bits)
+ d->stripe_size_bits = 31;
+
+ d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
+ d->stripe_size_bits;
+
+ if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
+ return -ENOMEM;
+
+ n = d->nr_stripes * sizeof(atomic_t);
+ d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->stripe_sectors_dirty)
+ return -ENOMEM;
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
!(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
@@ -759,6 +785,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
!(q = blk_alloc_queue(GFP_KERNEL)))
return -ENOMEM;
+ set_capacity(d->disk, sectors);
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
d->disk->major = bcache_major;
@@ -781,6 +808,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+ blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
+
return 0;
}
@@ -800,6 +829,17 @@ static void calc_cached_dev_sectors(struct cache_set *c)
void bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
+ char buf[SB_LABEL_SIZE + 1];
+ char *env[] = {
+ "DRIVER=bcache",
+ kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
+ NULL,
+ NULL,
+ };
+
+ memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE] = '\0';
+ env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
if (atomic_xchg(&dc->running, 1))
return;
@@ -816,10 +856,12 @@ void bch_cached_dev_run(struct cached_dev *dc)
add_disk(d->disk);
bd_link_disk_holder(dc->bdev, dc->disk.disk);
-#if 0
- char *env[] = { "SYMLINK=label" , NULL };
+ /* won't show up in the uevent file, use udevadm monitor -e instead
+ * only class / kset properties are persistent */
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
-#endif
+ kfree(env[1]);
+ kfree(env[2]);
+
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
pr_debug("error creating sysfs link");
@@ -960,6 +1002,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
atomic_set(&dc->count, 1);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+ bch_sectors_dirty_init(dc);
atomic_set(&dc->has_dirty, 1);
atomic_inc(&dc->count);
bch_writeback_queue(dc);
@@ -1014,6 +1057,14 @@ static void cached_dev_flush(struct closure *cl)
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
+ mutex_lock(&bch_register_lock);
+ d->flush_done = 1;
+
+ if (d->c)
+ bcache_device_unlink(d);
+
+ mutex_unlock(&bch_register_lock);
+
bch_cache_accounting_destroy(&dc->accounting);
kobject_del(&d->kobj);
@@ -1045,7 +1096,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
}
- ret = bcache_device_init(&dc->disk, block_size);
+ ret = bcache_device_init(&dc->disk, block_size,
+ dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
if (ret)
return ret;
@@ -1144,11 +1196,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
kobject_init(&d->kobj, &bch_flash_dev_ktype);
- if (bcache_device_init(d, block_bytes(c)))
+ if (bcache_device_init(d, block_bytes(c), u->sectors))
goto err;
bcache_device_attach(d, c, u - c->uuids);
- set_capacity(d->disk, u->sectors);
bch_flash_dev_request_init(d);
add_disk(d->disk);
@@ -1255,9 +1306,10 @@ static void cache_set_free(struct closure *cl)
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
- kfree(c->fill_iter);
if (c->bio_split)
bioset_free(c->bio_split);
+ if (c->fill_iter)
+ mempool_destroy(c->fill_iter);
if (c->bio_meta)
mempool_destroy(c->bio_meta);
if (c->search)
@@ -1278,11 +1330,9 @@ static void cache_set_free(struct closure *cl)
static void cache_set_flush(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
+ struct cache *ca;
struct btree *b;
-
- /* Shut down allocator threads */
- set_bit(CACHE_SET_STOPPING_2, &c->flags);
- wake_up(&c->alloc_wait);
+ unsigned i;
bch_cache_accounting_destroy(&c->accounting);
@@ -1295,7 +1345,11 @@ static void cache_set_flush(struct closure *cl)
/* Should skip this if we're unregistering because of an error */
list_for_each_entry(b, &c->btree_cache, list)
if (btree_node_dirty(b))
- bch_btree_write(b, true, NULL);
+ bch_btree_node_write(b, NULL);
+
+ for_each_cache(ca, c, i)
+ if (ca->alloc_thread)
+ kthread_stop(ca->alloc_thread);
closure_return(cl);
}
@@ -1303,18 +1357,22 @@ static void cache_set_flush(struct closure *cl)
static void __cache_set_unregister(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
- struct cached_dev *dc, *t;
+ struct cached_dev *dc;
size_t i;
mutex_lock(&bch_register_lock);
- if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
- list_for_each_entry_safe(dc, t, &c->cached_devs, list)
- bch_cached_dev_detach(dc);
-
for (i = 0; i < c->nr_uuids; i++)
- if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
- bcache_device_stop(c->devices[i]);
+ if (c->devices[i]) {
+ if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+ dc = container_of(c->devices[i],
+ struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
+ } else {
+ bcache_device_stop(c->devices[i]);
+ }
+ }
mutex_unlock(&bch_register_lock);
@@ -1373,9 +1431,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
- init_waitqueue_head(&c->alloc_wait);
+ c->sort_crit_factor = int_sqrt(c->btree_pages);
+
mutex_init(&c->bucket_lock);
- mutex_init(&c->fill_lock);
mutex_init(&c->sort_lock);
spin_lock_init(&c->sort_time_lock);
closure_init_unlocked(&c->sb_write);
@@ -1401,8 +1459,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
!(c->bio_meta = mempool_create_kmalloc_pool(2,
sizeof(struct bbio) + sizeof(struct bio_vec) *
bucket_pages(c))) ||
+ !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
- !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
!(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
bch_journal_alloc(c) ||
@@ -1410,8 +1468,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bch_open_buckets_alloc(c))
goto err;
- c->fill_iter->size = sb->bucket_size / sb->block_size;
-
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
c->error_limit = 8 << IO_ERROR_SHIFT;
@@ -1496,9 +1552,10 @@ static void run_cache_set(struct cache_set *c)
*/
bch_journal_next(&c->journal);
+ err = "error starting allocator thread";
for_each_cache(ca, c, i)
- closure_call(&ca->alloc, bch_allocator_thread,
- system_wq, &c->cl);
+ if (bch_cache_allocator_start(ca))
+ goto err;
/*
* First place it's safe to allocate: btree_check() and
@@ -1531,17 +1588,16 @@ static void run_cache_set(struct cache_set *c)
bch_btree_gc_finish(c);
+ err = "error starting allocator thread";
for_each_cache(ca, c, i)
- closure_call(&ca->alloc, bch_allocator_thread,
- ca->alloc_workqueue, &c->cl);
+ if (bch_cache_allocator_start(ca))
+ goto err;
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
bch_prio_write(ca);
mutex_unlock(&c->bucket_lock);
- wake_up(&c->alloc_wait);
-
err = "cannot allocate new UUID bucket";
if (__uuid_write(c))
goto err_unlock_gc;
@@ -1552,7 +1608,7 @@ static void run_cache_set(struct cache_set *c)
goto err_unlock_gc;
bkey_copy_key(&c->root->key, &MAX_KEY);
- bch_btree_write(c->root, true, &op);
+ bch_btree_node_write(c->root, &op.cl);
bch_btree_set_root(c->root);
rw_unlock(true, c->root);
@@ -1673,9 +1729,6 @@ void bch_cache_release(struct kobject *kobj)
bio_split_pool_free(&ca->bio_split_hook);
- if (ca->alloc_workqueue)
- destroy_workqueue(ca->alloc_workqueue);
-
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
kfree(ca->prio_buckets);
vfree(ca->buckets);
@@ -1723,7 +1776,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
!(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
- !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
bio_split_pool_init(&ca->bio_split_hook))
return -ENOMEM;
@@ -1786,6 +1838,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
+/*
+ * Report whether @bdev is already held as a backing device: either by a
+ * cached_dev on some cache set's cached_devs list, or by one on the
+ * uncached_devices list.
+ */
+static bool bch_is_open_backing(struct block_device *bdev)
+{
+	struct cache_set *set, *next_set;
+	struct cached_dev *dev, *next_dev;
+
+	list_for_each_entry_safe(set, next_set, &bch_cache_sets, list) {
+		list_for_each_entry_safe(dev, next_dev, &set->cached_devs, list) {
+			if (bdev == dev->bdev)
+				return true;
+		}
+	}
+
+	list_for_each_entry_safe(dev, next_dev, &uncached_devices, list) {
+		if (bdev == dev->bdev)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Report whether @bdev is already in use as a cache device by any cache
+ * set on the bch_cache_sets list.
+ */
+static bool bch_is_open_cache(struct block_device *bdev)
+{
+	struct cache_set *set, *next_set;
+	struct cache *cache;
+	unsigned idx;
+
+	list_for_each_entry_safe(set, next_set, &bch_cache_sets, list) {
+		for_each_cache(cache, set, idx) {
+			if (bdev == cache->bdev)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/* True if @bdev is already held by bcache, as a cache or a backing device. */
+static bool bch_is_open(struct block_device *bdev)
+{
+	if (bch_is_open_cache(bdev))
+		return true;
+
+	return bch_is_open_backing(bdev);
+}
+
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
@@ -1810,8 +1892,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
sb);
if (IS_ERR(bdev)) {
- if (bdev == ERR_PTR(-EBUSY))
- err = "device busy";
+ if (bdev == ERR_PTR(-EBUSY)) {
+ bdev = lookup_bdev(strim(path));
+ if (!IS_ERR(bdev) && bch_is_open(bdev))
+ err = "device already registered";
+ else
+ err = "device busy";
+ }
goto err;
}
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4d9cca47e4c6..12a2c2846f99 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -9,7 +9,9 @@
#include "sysfs.h"
#include "btree.h"
#include "request.h"
+#include "writeback.h"
+#include <linux/blkdev.h>
#include <linux/sort.h>
static const char * const cache_replacement_policies[] = {
@@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
rw_attribute(writeback_rate_d_smooth);
read_attribute(writeback_rate_debug);
+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(discard);
@@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
char derivative[20];
char target[20];
bch_hprint(dirty,
- atomic_long_read(&dc->disk.sectors_dirty) << 9);
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(derivative, dc->writeback_rate_derivative << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
@@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
}
sysfs_hprint(dirty_data,
- atomic_long_read(&dc->disk.sectors_dirty) << 9);
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
+
+ sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9);
+ var_printf(partial_stripes_expensive, "%u");
var_printf(sequential_merge, "%i");
var_hprint(sequential_cutoff);
@@ -170,6 +178,7 @@ STORE(__cached_dev)
disk.kobj);
unsigned v = size;
struct cache_set *c;
+ struct kobj_uevent_env *env;
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
@@ -214,6 +223,7 @@ STORE(__cached_dev)
}
if (attr == &sysfs_label) {
+ /* note: endlines are preserved */
memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
bch_write_bdev_super(dc, NULL);
if (dc->disk.c) {
@@ -221,6 +231,15 @@ STORE(__cached_dev)
buf, SB_LABEL_SIZE);
bch_uuid_write(dc->disk.c);
}
+ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+ add_uevent_var(env, "DRIVER=bcache");
+ add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
+ add_uevent_var(env, "CACHED_LABEL=%s", buf);
+ kobject_uevent_env(
+ &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
+ kfree(env);
}
if (attr == &sysfs_attach) {
@@ -284,6 +303,8 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_rate_d_smooth,
&sysfs_writeback_rate_debug,
&sysfs_dirty_data,
+ &sysfs_stripe_size,
+ &sysfs_partial_stripes_expensive,
&sysfs_sequential_cutoff,
&sysfs_sequential_merge,
&sysfs_clear_stats,
@@ -665,12 +686,10 @@ SHOW(__bch_cache)
int cmp(const void *l, const void *r)
{ return *((uint16_t *) r) - *((uint16_t *) l); }
- /* Number of quantiles we compute */
- const unsigned nq = 31;
-
size_t n = ca->sb.nbuckets, i, unused, btree;
uint64_t sum = 0;
- uint16_t q[nq], *p, *cached;
+ /* Compute 31 quantiles */
+ uint16_t q[31], *p, *cached;
ssize_t ret;
cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
@@ -703,26 +722,29 @@ SHOW(__bch_cache)
if (n)
do_div(sum, n);
- for (i = 0; i < nq; i++)
- q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ q[i] = INITIAL_PRIO - cached[n * (i + 1) /
+ (ARRAY_SIZE(q) + 1)];
vfree(p);
- ret = snprintf(buf, PAGE_SIZE,
- "Unused: %zu%%\n"
- "Metadata: %zu%%\n"
- "Average: %llu\n"
- "Sectors per Q: %zu\n"
- "Quantiles: [",
- unused * 100 / (size_t) ca->sb.nbuckets,
- btree * 100 / (size_t) ca->sb.nbuckets, sum,
- n * ca->sb.bucket_size / (nq + 1));
-
- for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
- ret += snprintf(buf + ret, PAGE_SIZE - ret,
- i < nq - 1 ? "%u " : "%u]\n", q[i]);
-
- buf[PAGE_SIZE - 1] = '\0';
+ ret = scnprintf(buf, PAGE_SIZE,
+ "Unused: %zu%%\n"
+ "Metadata: %zu%%\n"
+ "Average: %llu\n"
+ "Sectors per Q: %zu\n"
+ "Quantiles: [",
+ unused * 100 / (size_t) ca->sb.nbuckets,
+ btree * 100 / (size_t) ca->sb.nbuckets, sum,
+ n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "%u ", q[i]);
+ ret--;
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
+
return ret;
}
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index 983f9bb411bc..f7b6c197f90f 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -2,6 +2,7 @@
#include "btree.h"
#include "request.h"
+#include <linux/blktrace_api.h>
#include <linux/module.h>
#define CREATE_TRACE_POINTS
@@ -9,18 +10,44 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..98eb81159a22 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
-int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
-{
- int i;
- struct bio_vec *bv;
-
- bio_for_each_segment(bv, bio, i) {
- bv->bv_page = alloc_page(gfp);
- if (!bv->bv_page) {
- while (bv-- != bio->bi_io_vec + bio->bi_idx)
- __free_page(bv->bv_page);
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
* use permitted, subject to terms of PostgreSQL license; see.)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..1ae2a73ad85f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,8 +15,6 @@
struct closure;
-#include <trace/events/bcache.h>
-
#ifdef CONFIG_BCACHE_EDEBUG
#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
@@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
return x;
}
-#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
-
void bch_bio_map(struct bio *bio, void *base);
-int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
-
static inline sector_t bdev_sectors(struct block_device *bdev)
{
return bdev->bd_inode->i_size >> 9;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..22cbff551628 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -9,6 +9,9 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
+#include "writeback.h"
+
+#include <trace/events/bcache.h>
static struct workqueue_struct *dirty_wq;
@@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
int change = 0;
int64_t error;
- int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
+ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t derivative = dirty - dc->disk.sectors_dirty_last;
dc->disk.sectors_dirty_last = dirty;
@@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
return KEY_DIRTY(k);
}
+/*
+ * Keybuf predicate: accept @k only if it is dirty and each stripe checked
+ * is completely dirty, i.e. its dirty-sector counter equals the stripe
+ * size.  Checking starts at the stripe containing KEY_START(k) and moves
+ * on one stripe for every stripe_size sectors of the key.
+ */
+static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
+{
+	uint64_t stripe;
+	unsigned nr_sectors = KEY_SIZE(k);
+	struct cached_dev *dc = container_of(buf, struct cached_dev,
+					     writeback_keys);
+	/* 1U: stripe_size_bits may be 31, and 1 << 31 overflows int */
+	unsigned stripe_size = 1U << dc->disk.stripe_size_bits;
+
+	if (!KEY_DIRTY(k))
+		return false;
+
+	stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
+	while (1) {
+		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
+		    stripe_size)
+			return false;
+
+		if (nr_sectors <= stripe_size)
+			return true;
+
+		nr_sectors -= stripe_size;
+		stripe++;
+	}
+}
+
static void dirty_init(struct keybuf_key *w)
{
struct dirty_io *io = w->private;
@@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
searched_from_start = true;
}
- bch_refill_keybuf(dc->disk.c, buf, &end);
+ if (dc->partial_stripes_expensive) {
+ uint64_t i;
+
+ for (i = 0; i < dc->disk.nr_stripes; i++)
+ if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
+ 1 << dc->disk.stripe_size_bits)
+ goto full_stripes;
+
+ goto normal_refill;
+full_stripes:
+ bch_refill_keybuf(dc->disk.c, buf, &end,
+ dirty_full_stripe_pred);
+ } else {
+normal_refill:
+ bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
+ }
if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
/* Searched the entire btree - delay awhile */
@@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
}
}
-void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
+void bch_writeback_add(struct cached_dev *dc)
{
- atomic_long_add(sectors, &dc->disk.sectors_dirty);
-
if (!atomic_read(&dc->has_dirty) &&
!atomic_xchg(&dc->has_dirty, 1)) {
atomic_inc(&dc->count);
@@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
}
}
+/*
+ * Adjust the per-stripe dirty sector counters of the device in slot
+ * @inode of @c for the @nr_sectors sectors starting at @offset.  A
+ * negative @nr_sectors subtracts from the counters.  Does nothing when
+ * the slot is empty, and stops at the last stripe of the device.
+ */
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+				  uint64_t offset, int nr_sectors)
+{
+	struct bcache_device *d = c->devices[inode];
+	unsigned stripe_size, stripe_offset;
+	uint64_t stripe;
+
+	if (!d)
+		return;
+
+	/* 1U: stripe_size_bits may be 31, and 1 << 31 overflows int */
+	stripe_size = 1U << d->stripe_size_bits;
+	stripe = offset >> d->stripe_size_bits;
+	stripe_offset = offset & (stripe_size - 1);
+
+	while (nr_sectors) {
+		int s = min_t(unsigned, abs(nr_sectors),
+			      stripe_size - stripe_offset);
+
+		/* stripe_sectors_dirty[] holds nr_stripes counters */
+		if (stripe >= d->nr_stripes)
+			return;
+
+		if (nr_sectors < 0)
+			s = -s;
+
+		atomic_add(s, d->stripe_sectors_dirty + stripe);
+		nr_sectors -= s;
+		stripe_offset = 0;
+		stripe++;
+	}
+}
+
/* Background writeback - IO loop */
static void dirty_io_destructor(struct closure *cl)
@@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
- struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
+ struct bio_vec *bv;
+ int i;
- while (bv-- != io->bio.bi_io_vec)
+ bio_for_each_segment_all(bv, &io->bio, i)
__free_page(bv->bv_page);
/* This is kind of a dumb way of signalling errors. */
@@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
for (i = 0; i < KEY_PTRS(&w->key); i++)
atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
- pr_debug("clearing %s", pkey(&w->key));
bch_btree_insert(&op, dc->disk.c);
closure_sync(&op.cl);
+ if (op.insert_collision)
+ trace_bcache_writeback_collision(&w->key);
+
atomic_long_inc(op.insert_collision
? &dc->disk.c->writeback_keys_failed
: &dc->disk.c->writeback_keys_done);
@@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
io->bio.bi_bdev = io->dc->bdev;
io->bio.bi_end_io = dirty_endio;
- trace_bcache_write_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
continue_at(cl, write_dirty_finish, dirty_wq);
@@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
- trace_bcache_read_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
continue_at(cl, write_dirty, dirty_wq);
@@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
io->bio.bi_rw = READ;
io->bio.bi_end_io = read_dirty_endio;
- if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
+ if (bio_alloc_pages(&io->bio, GFP_KERNEL))
goto err_free;
- pr_debug("%s", pkey(&w->key));
+ trace_bcache_writeback(&w->key);
closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
@@ -375,12 +445,49 @@ err:
refill_dirty(cl);
}
+/* Init */
+
+/*
+ * Iterate over the keys of node @b, starting from KEY(dc->disk.id, 0, 0)
+ * and skipping keys rejected by bch_ptr_bad.  On a leaf (level 0) every
+ * dirty key is added to the device's per-stripe dirty counters; on an
+ * interior node btree(sectors_dirty_init, ...) is invoked for each key
+ * (presumably descending into the child - confirm against the btree()
+ * macro).  Iteration stops at the first key whose inode is greater than
+ * dc->disk.id.  Always returns 0.
+ */
+static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
+					struct cached_dev *dc)
+{
+	struct btree_iter iter;
+	struct bkey *key;
+
+	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
+
+	for (;;) {
+		key = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+		if (!key)
+			break;
+
+		if (b->level) {
+			btree(sectors_dirty_init, key, b, op, dc);
+			if (KEY_INODE(key) > dc->disk.id)
+				break;
+
+			cond_resched();
+			continue;
+		}
+
+		/* leaf node: stop once past this device's keys */
+		if (KEY_INODE(key) > dc->disk.id)
+			break;
+
+		if (KEY_DIRTY(key))
+			bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
+						     KEY_START(key),
+						     KEY_SIZE(key));
+	}
+
+	return 0;
+}
+
+/*
+ * Run sectors_dirty_init for @dc through btree_root() on dc's cache set,
+ * using a btree_op set up with bch_btree_op_init_stack().
+ */
+void bch_sectors_dirty_init(struct cached_dev *dc)
+{
+	struct btree_op dirty_op;
+
+	bch_btree_op_init_stack(&dirty_op);
+	btree_root(sectors_dirty_init, dc->disk.c, &dirty_op, dc);
+}
+
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock);
- bch_keybuf_init(&dc->writeback_keys, dirty_pred);
+ bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
dc->writeback_running = true;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
new file mode 100644
index 000000000000..c91f61bb95b6
--- /dev/null
+++ b/drivers/md/bcache/writeback.h
@@ -0,0 +1,64 @@
+#ifndef _BCACHE_WRITEBACK_H
+#define _BCACHE_WRITEBACK_H
+
+#define CUTOFF_WRITEBACK 40
+#define CUTOFF_WRITEBACK_SYNC 70
+
+static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
+{
+ uint64_t i, ret = 0;
+
+ for (i = 0; i < d->nr_stripes; i++)
+ ret += atomic_read(d->stripe_sectors_dirty + i);
+
+ return ret;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
+ uint64_t offset,
+ unsigned nr_sectors)
+{
+ uint64_t stripe = offset >> d->stripe_size_bits;
+
+ while (1) {
+ if (atomic_read(d->stripe_sectors_dirty + stripe))
+ return true;
+
+ if (nr_sectors <= 1 << d->stripe_size_bits)
+ return false;
+
+ nr_sectors -= 1 << d->stripe_size_bits;
+ stripe++;
+ }
+}
+
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+ unsigned cache_mode, bool would_skip)
+{
+ unsigned in_use = dc->disk.c->gc_stats.in_use;
+
+ if (cache_mode != CACHE_MODE_WRITEBACK ||
+ atomic_read(&dc->disk.detaching) ||
+ in_use > CUTOFF_WRITEBACK_SYNC)
+ return false;
+
+ if (dc->partial_stripes_expensive &&
+ bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
+ bio_sectors(bio)))
+ return true;
+
+ if (would_skip)
+ return false;
+
+ return bio->bi_rw & REQ_SYNC ||
+ in_use <= CUTOFF_WRITEBACK;
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
+void bch_writeback_queue(struct cached_dev *);
+void bch_writeback_add(struct cached_dev *);
+
+void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+
+#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dddc87bcf64a..9f13e13506ef 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7716,20 +7716,6 @@ static int remove_and_add_spares(struct mddev *mddev,
continue;
rdev->recovery_offset = 0;
- if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
- spin_lock_irq(&mddev->write_lock);
- if (mddev->in_sync)
- /* OK, this device, which is in_sync,
- * will definitely be noticed before
- * the next write, so recovery isn't
- * needed.
- */
- rdev->recovery_offset = mddev->recovery_cp;
- spin_unlock_irq(&mddev->write_lock);
- }
- if (mddev->ro && rdev->recovery_offset != MaxSector)
- /* not safe to add this disk now */
- continue;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ec734588a1c6..d60412c7f995 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1849,6 +1849,36 @@ static int process_checks(struct r1bio *r1_bio)
int i;
int vcnt;
+ /* Fix variable parts of all bios */
+ vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ int j;
+ int size;
+ struct bio *b = r1_bio->bios[i];
+ if (b->bi_end_io != end_sync_read)
+ continue;
+ /* fixup the bio for reuse */
+ bio_reset(b);
+ b->bi_vcnt = vcnt;
+ b->bi_size = r1_bio->sectors << 9;
+ b->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ b->bi_bdev = conf->mirrors[i].rdev->bdev;
+ b->bi_end_io = end_sync_read;
+ b->bi_private = r1_bio;
+
+ size = b->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &b->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ }
+ }
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
@@ -1857,12 +1887,10 @@ static int process_checks(struct r1bio *r1_bio)
break;
}
r1_bio->read_disk = primary;
- vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- int size;
if (sbio->bi_end_io != end_sync_read)
continue;
@@ -1888,27 +1916,6 @@ static int process_checks(struct r1bio *r1_bio)
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
continue;
}
- /* fixup the bio for reuse */
- bio_reset(sbio);
- sbio->bi_vcnt = vcnt;
- sbio->bi_size = r1_bio->sectors << 9;
- sbio->bi_sector = r1_bio->sector +
- conf->mirrors[i].rdev->data_offset;
- sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- sbio->bi_end_io = end_sync_read;
- sbio->bi_private = r1_bio;
-
- size = sbio->bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &sbio->bi_io_vec[j];
- bi->bv_offset = 0;
- if (size > PAGE_SIZE)
- bi->bv_len = PAGE_SIZE;
- else
- bi->bv_len = size;
- size -= PAGE_SIZE;
- }
bio_copy_data(sbio, pbio);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index cd066b63bdaf..df7b0a06b0ea 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2097,11 +2097,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* both 'first' and 'i', so we just compare them.
* All vec entries are PAGE_SIZE;
*/
- for (j = 0; j < vcnt; j++)
+ int sectors = r10_bio->sectors;
+ for (j = 0; j < vcnt; j++) {
+ int len = PAGE_SIZE;
+ if (sectors < (len / 512))
+ len = sectors * 512;
if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
page_address(tbio->bi_io_vec[j].bv_page),
- fbio->bi_io_vec[j].bv_len))
+ len))
break;
+ sectors -= len/512;
+ }
if (j == vcnt)
continue;
atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
@@ -2284,12 +2290,18 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
wbio = r10_bio->devs[1].bio;
wbio2 = r10_bio->devs[1].repl_bio;
+ /* Need to test wbio2->bi_end_io before we call
+ * generic_make_request as if the former is NULL,
+ * the latter is free to free wbio2.
+ */
+ if (wbio2 && !wbio2->bi_end_io)
+ wbio2 = NULL;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
generic_make_request(wbio);
}
- if (wbio2 && wbio2->bi_end_io) {
+ if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
md_sync_acct(conf->mirrors[d].replacement->bdev,
bio_sectors(wbio2));
@@ -3407,6 +3419,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
generic_make_request(bio);
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2bf094a587cb..78ea44336e75 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3462,6 +3462,7 @@ static void handle_stripe(struct stripe_head *sh)
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
+ clear_bit(STRIPE_REPLACED, &sh->state);
}
spin_unlock(&sh->stripe_lock);
}
@@ -3607,19 +3608,23 @@ static void handle_stripe(struct stripe_head *sh)
handle_parity_checks5(conf, sh, &s, disks);
}
- if (s.replacing && s.locked == 0
- && !test_bit(STRIPE_INSYNC, &sh->state)) {
+ if ((s.replacing || s.syncing) && s.locked == 0
+ && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
+ && !test_bit(STRIPE_REPLACED, &sh->state)) {
/* Write out to replacement devices where possible */
for (i = 0; i < conf->raid_disks; i++)
- if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
- test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+ if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+ WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
set_bit(R5_WantReplace, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++;
}
- set_bit(STRIPE_INSYNC, &sh->state);
+ if (s.replacing)
+ set_bit(STRIPE_INSYNC, &sh->state);
+ set_bit(STRIPE_REPLACED, &sh->state);
}
if ((s.syncing || s.replacing) && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b0b663b119a8..70c49329ca9a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -306,6 +306,7 @@ enum {
STRIPE_SYNC_REQUESTED,
STRIPE_SYNCING,
STRIPE_INSYNC,
+ STRIPE_REPLACED,
STRIPE_PREREAD_ACTIVE,
STRIPE_DELAYED,
STRIPE_DEGRADED,