5 files changed, 235 insertions, 33 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c54b0e64c590..95053a903474 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,6 +58,15 @@
  */
 #define RBIO_CACHE_READY_BIT	3
 
+/*
+ * The bbio and raid_map are managed by the caller, so we must not free
+ * them here. In addition, rbios with this flag set must not be cached:
+ * the cache needs raid_map to check whether two rbios cover the same
+ * stripe, but the caller has very likely freed raid_map by then, so
+ * don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT	4
+
 #define RBIO_CACHE_SIZE 1024
 
 struct btrfs_raid_bio {
@@ -799,6 +808,21 @@ done_nolock:
 		remove_rbio_from_cache(rbio);
 }
 
+static inline void __free_bbio_and_raid_map(struct btrfs_bio *bbio,
+					    u64 *raid_map, int need)
+{
+	if (!need)	/* zero @need: leave both allocations untouched */
+		return;
+	kfree(raid_map);
+	kfree(bbio);
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+	if (!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))	/* not held */
+		__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, 1);
+}
+
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 {
 	int i;
@@ -817,8 +841,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 			rbio->stripe_pages[i] = NULL;
 		}
 	}
-	kfree(rbio->raid_map);
-	kfree(rbio->bbio);
+
+	free_bbio_and_raid_map(rbio);
+
 	kfree(rbio);
 }
 
@@ -933,11 +958,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 
 	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 			GFP_NOFS);
-	if (!rbio) {
-		kfree(raid_map);
-		kfree(bbio);
+	if (!rbio)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	bio_list_init(&rbio->bio_list);
 	INIT_LIST_HEAD(&rbio->plug_list);
@@ -1692,8 +1714,10 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	struct blk_plug_cb *cb;
 
 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio))
+	if (IS_ERR(rbio)) {
+		__free_bbio_and_raid_map(bbio, raid_map, 1);
 		return PTR_ERR(rbio);
+	}
 	bio_list_add(&rbio->bio_list, bio);
 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
 
@@ -1888,7 +1912,8 @@ cleanup:
 cleanup_io:
 
 	if (rbio->read_rebuild) {
-		if (err == 0)
+		if (err == 0 &&
+		    !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
 			cache_rbio_pages(rbio);
 		else
 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2038,15 +2063,19 @@ cleanup:
  */
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 			  struct btrfs_bio *bbio, u64 *raid_map,
-			  u64 stripe_len, int mirror_num)
+			  u64 stripe_len, int mirror_num, int hold_bbio)
 {
 	struct btrfs_raid_bio *rbio;
 	int ret;
 
 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio))
+	if (IS_ERR(rbio)) {
+		__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
 		return PTR_ERR(rbio);
+	}
 
+	if (hold_bbio)
+		set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
 	rbio->read_rebuild = 1;
 	bio_list_add(&rbio->bio_list, bio);
 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
@@ -2054,8 +2083,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 	rbio->faila = find_logical_bio_stripe(rbio, bio);
 	if (rbio->faila == -1) {
 		BUG();
-		kfree(raid_map);
-		kfree(bbio);
+		__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
 		kfree(rbio);
 		return -EIO;
 	}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..b310e8c830d1 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -41,7 +41,7 @@ static inline int nr_data_stripes(struct map_lookup *map)
 
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 				 struct btrfs_bio *bbio, u64 *raid_map,
-				 u64 stripe_len, int mirror_num);
+				 u64 stripe_len, int mirror_num, int hold_bbio);
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 			       struct btrfs_bio *bbio, u64 *raid_map,
 			       u64 stripe_len);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..ca4b9eb8b5da 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,6 +63,13 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 
+struct scrub_recover {
+	atomic_t		refs;		/* last put frees everything */
+	struct btrfs_bio	*bbio;		/* from btrfs_map_sblock() */
+	u64			*raid_map;	/* NULL when not RAID5/6 */
+	u64			map_length;	/* mapped length of @bbio */
+};
+
 struct scrub_page {
 	struct scrub_block	*sblock;
 	struct page		*page;
@@ -79,6 +86,8 @@ struct scrub_page {
 		unsigned int	io_error:1;
 	};
 	u8			csum[BTRFS_CSUM_SIZE];
+
+	struct scrub_recover	*recover;
 };
 
 struct scrub_bio {
@@ -196,7 +205,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
-				u16 csum_size);
+				u16 csum_size, int retry_failed_mirror);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
@@ -790,6 +799,20 @@ out:
 	scrub_pending_trans_workers_dec(sctx);
 }
 
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+	atomic_inc(&recover->refs);	/* dropped by scrub_put_recover() */
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+	if (!atomic_dec_and_test(&recover->refs))	/* still referenced */
+		return;
+	kfree(recover->bbio);
+	kfree(recover->raid_map);
+	kfree(recover);
+}
+
 /*
  * scrub_handle_errored_block gets called when either verification of the
  * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +929,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/* build and submit the bios for the failed mirror, check checksums */
 	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-			    csum, generation, sctx->csum_size);
+			    csum, generation, sctx->csum_size, 1);
 
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
@@ -1019,7 +1042,7 @@ nodatasum_case:
 		/* build and submit the bios, check checksums */
 		scrub_recheck_block(fs_info, sblock_other, is_metadata,
 				    have_csum, csum, generation,
-				    sctx->csum_size);
+				    sctx->csum_size, 0);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
@@ -1169,7 +1192,7 @@ nodatasum_case:
 			 */
 			scrub_recheck_block(fs_info, sblock_bad,
 					    is_metadata, have_csum, csum,
-					    generation, sctx->csum_size);
+					    generation, sctx->csum_size, 1);
 			if (!sblock_bad->header_error &&
 			    !sblock_bad->checksum_error &&
 			    sblock_bad->no_io_error_seen)
@@ -1201,11 +1224,18 @@ out:
 		     mirror_index++) {
 			struct scrub_block *sblock = sblocks_for_recheck +
 						     mirror_index;
+			struct scrub_recover *recover;
 			int page_index;
 
 			for (page_index = 0; page_index < sblock->page_count;
 			     page_index++) {
 				sblock->pagev[page_index]->sblock = NULL;
+				recover = sblock->pagev[page_index]->recover;
+				if (recover) {
+					scrub_put_recover(recover);
+					sblock->pagev[page_index]->recover =
+									NULL;
+				}
 				scrub_page_put(sblock->pagev[page_index]);
 			}
 		}
@@ -1215,14 +1245,63 @@ out:
 	return 0;
 }
 
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	/* without a raid_map every returned stripe counts as one mirror */
+	if (!raid_map)
+		return (int)bbio->num_stripes;
+
+	/* last raid_map slot is the Q stripe: 3 mirrors, otherwise 2 */
+	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+		return 3;
+	return 2;
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+						 u64 mapped_length,
+						 int nstripes, int mirror,
+						 int *stripe_index,
+						 u64 *stripe_offset)
+{
+	int idx = 0;
+
+	if (!raid_map) {
+		/* not RAID5/6: @mirror is used as the stripe index as-is */
+		*stripe_index = mirror;
+		*stripe_offset = 0;
+		return;
+	}
+
+	/* RAID5/6: find the non-P/Q stripe whose range holds @logical */
+	while (idx < nstripes) {
+		u64 start = raid_map[idx];
+
+		if (start != RAID6_Q_STRIPE && start != RAID5_P_STRIPE &&
+		    logical >= start && logical < start + mapped_length)
+			break;
+		idx++;
+	}
+	/* NOTE(review): idx == nstripes here if nothing matched - confirm */
+	*stripe_index = idx;
+	*stripe_offset = logical - raid_map[idx];
+}
+
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
 				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
+	struct scrub_recover *recover;
+	struct btrfs_bio *bbio;
+	u64 *raid_map;
+	u64 sublen;
+	u64 mapped_length;
+	u64 stripe_offset;
+	int stripe_index;
 	int page_index;
 	int mirror_index;
+	int nmirrors;
 	int ret;
 
 	/*
@@ -1233,23 +1312,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 
 	page_index = 0;
 	while (length > 0) {
-		u64 sublen = min_t(u64, length, PAGE_SIZE);
-		u64 mapped_length = sublen;
-		struct btrfs_bio *bbio = NULL;
+		sublen = min_t(u64, length, PAGE_SIZE);
+		mapped_length = sublen;
+		bbio = NULL;
+		raid_map = NULL;
 
 		/*
 		 * with a length of PAGE_SIZE, each returned stripe
 		 * represents one mirror
 		 */
-		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
-				      &mapped_length, &bbio, 0);
+		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+				       &mapped_length, &bbio, 0, &raid_map);
 		if (ret || !bbio || mapped_length < sublen) {
 			kfree(bbio);
+			kfree(raid_map);
 			return -EIO;
 		}
 
+		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+		if (!recover) {
+			kfree(bbio);
+			kfree(raid_map);
+			return -ENOMEM;
+		}
+
+		atomic_set(&recover->refs, 1);
+		recover->bbio = bbio;
+		recover->raid_map = raid_map;
+		recover->map_length = mapped_length;
+
 		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
-		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+
+		nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+		for (mirror_index = 0; mirror_index < nmirrors;
 		     mirror_index++) {
 			struct scrub_block *sblock;
 			struct scrub_page *page;
@@ -1265,26 +1360,38 @@ leave_nomem:
 				spin_lock(&sctx->stat_lock);
 				sctx->stat.malloc_errors++;
 				spin_unlock(&sctx->stat_lock);
-				kfree(bbio);
+				scrub_put_recover(recover);
 				return -ENOMEM;
 			}
 			scrub_page_get(page);
 			sblock->pagev[page_index] = page;
 			page->logical = logical;
-			page->physical = bbio->stripes[mirror_index].physical;
+
+			scrub_stripe_index_and_offset(logical, raid_map,
+						      mapped_length,
+						      bbio->num_stripes,
+						      mirror_index,
+						      &stripe_index,
+						      &stripe_offset);
+			page->physical = bbio->stripes[stripe_index].physical +
+					 stripe_offset;
+			page->dev = bbio->stripes[stripe_index].dev;
+
 			BUG_ON(page_index >= original_sblock->page_count);
 			page->physical_for_dev_replace =
 				original_sblock->pagev[page_index]->
 				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
-			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
 			sblock->page_count++;
 			page->page = alloc_page(GFP_NOFS);
 			if (!page->page)
 				goto leave_nomem;
+
+			scrub_get_recover(recover);
+			page->recover = recover;
 		}
-		kfree(bbio);
+		scrub_put_recover(recover);
 		length -= sublen;
 		logical += sublen;
 		page_index++;
@@ -1293,6 +1400,51 @@ leave_nomem:
 	return 0;
 }
 
+struct scrub_bio_ret {
+	struct completion event;	/* signalled from the bio endio */
+	int error;			/* error value the endio received */
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+	struct scrub_bio_ret *waiter = bio->bi_private;
+
+	waiter->error = error;		/* record status before waking */
+	complete(&waiter->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+	return page->recover != NULL && page->recover->raid_map != NULL;
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+					struct bio *bio,
+					struct scrub_page *page)
+{
+	struct scrub_recover *recover = page->recover;
+	struct scrub_bio_ret done;
+	int ret;
+
+	init_completion(&done.event);
+	done.error = 0;
+	bio->bi_iter.bi_sector = page->logical >> 9;
+	bio->bi_private = &done;
+	bio->bi_end_io = scrub_bio_wait_endio;
+
+	/* hold_bbio = 1: raid56 must not free recover's bbio/raid_map */
+	ret = raid56_parity_recover(fs_info->fs_root, bio, recover->bbio,
+				    recover->raid_map, recover->map_length,
+				    page->mirror_num, 1);
+	if (ret)
+		return ret;
+
+	/* done is on our stack: block until scrub_bio_wait_endio() ran */
+	wait_for_completion(&done.event);
+
+	return done.error ? -EIO : 0;
+}
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1455,7 @@ leave_nomem:
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
-				u16 csum_size)
+				u16 csum_size, int retry_failed_mirror)
 {
 	int page_num;
 
@@ -1329,11 +1481,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 		bio->bi_bdev = page->dev->bdev;
-		bio->bi_iter.bi_sector = page->physical >> 9;
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (btrfsic_submit_bio_wait(READ, bio))
-			sblock->no_io_error_seen = 0;
+		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+			if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+				sblock->no_io_error_seen = 0;
+		} else {
+			bio->bi_iter.bi_sector = page->physical >> 9;
+
+			if (btrfsic_submit_bio_wait(READ, bio))
+				sblock->no_io_error_seen = 0;
+		}
 
 		bio_put(bio);
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6f5b302a08cf..217c42ea90b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5161,7 +5161,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				BTRFS_BLOCK_GROUP_RAID6)) {
 		u64 tmp;
 
-		if (raid_map_ret && ((rw & REQ_WRITE) || mirror_num > 1)) {
+		if (raid_map_ret &&
+		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+		     mirror_num > 1)) {
 			int i, rot;
 
 			/* push stripe_nr back to the start of the full stripe */
@@ -5440,6 +5442,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 mirror_num, NULL);
 }
 
+/* For scrub/replace: same mapping as btrfs_map_block(), plus raid_map_ret */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     u64 **raid_map_ret)
+{
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+				 mirror_num, raid_map_ret);
+}
+
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len)
@@ -5809,7 +5821,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		} else {
 			ret = raid56_parity_recover(root, bio, bbio,
 						    raid_map, map_length,
-						    mirror_num);
+						    mirror_num, 0);
 		}
 		/*
 		 * FIXME, replace dosen't support raid56 yet, please fix
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa23039..01094bb804c7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -393,6 +393,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     u64 **raid_map_ret);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);