From 2b604351bc99b4e4504758cbac369b660b71de0b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 22 Jun 2007 15:45:27 -0700
Subject: ocfs2: simplify deallocation locking

Deallocation of suballocator blocks, most notably extent blocks, might
involve multiple suballocator inodes.

The locking for this can get extremely complicated, especially when the
suballocator inodes to delete from aren't known until deep within an
unrelated codepath.

Implement a simple scheme for recording the blocks to be unlinked so that
the actual deallocation can be done in a context which won't deadlock.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a081..01db0adc2150 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -63,6 +63,25 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 					 struct ocfs2_dinode *tl_copy);
 
+/*
+ * Process local structure which describes the block unlinks done
+ * during an operation. This is populated via
+ * ocfs2_cache_block_dealloc().
+ *
+ * ocfs2_run_deallocs() should be called after the potentially
+ * de-allocating routines. No journal handles should be open, and most
+ * locks should have been dropped.
+ */
+struct ocfs2_cached_dealloc_ctxt {
+	struct ocfs2_per_slot_free_list		*c_first_suballocator;
+};
+static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
+{
+	c->c_first_suballocator = NULL;
+}
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+		       struct ocfs2_cached_dealloc_ctxt *ctxt);
+
 struct ocfs2_truncate_context {
 	struct inode *tc_ext_alloc_inode;
 	struct buffer_head *tc_ext_alloc_bh;
-- 
cgit v1.2.3


From 59a5e416d1ab543a5248a2b34d83202c4d55d132 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 22 Jun 2007 15:52:36 -0700
Subject: ocfs2: plug truncate into cached dealloc routines

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c    | 102 ++++++++++++++--------------------------------------
 fs/ocfs2/alloc.h    |   3 +-
 fs/ocfs2/aops.c     |   1 +
 fs/ocfs2/suballoc.c |  13 -------
 fs/ocfs2/suballoc.h |   4 ---
 5 files changed, 29 insertions(+), 94 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 873bb99fc2ff..26e867087e95 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
 #include "buffer_head_io.h"
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb);
 
 /*
  * Structures which describe a path through a btree, and functions to
@@ -3161,6 +3163,15 @@ out:
 	return ret;
 }
 
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb)
+{
+	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
+					 le16_to_cpu(eb->h_suballoc_slot),
+					 le64_to_cpu(eb->h_blkno),
+					 le16_to_cpu(eb->h_suballoc_bit));
+}
+
 /* This function will figure out whether the currently last extent
  * block will be deleted, and if it will, what the new last extent
  * block will be so we can update his h_next_leaf_blk field, as well
@@ -3442,27 +3453,10 @@ delete:
 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
 
-			if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
-				/*
-				 * This code only understands how to
-				 * lock the suballocator in slot 0,
-				 * which is fine because allocation is
-				 * only ever done out of that
-				 * suballocator too. A future version
-				 * might change that however, so avoid
-				 * a free if we don't know how to
-				 * handle it. This way an fs incompat
-				 * bit will not be necessary.
-				 */
-				ret = ocfs2_free_extent_block(handle,
-							      tc->tc_ext_alloc_inode,
-							      tc->tc_ext_alloc_bh,
-							      eb);
-
-				/* An error here is not fatal. */
-				if (ret < 0)
-					mlog_errno(ret);
-			}
+			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
+			/* An error here is not fatal. */
+			if (ret < 0)
+				mlog_errno(ret);
 		} else {
 			deleted_eb = 0;
 		}
@@ -3965,6 +3959,8 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
+	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+
 	ocfs2_free_path(path);
 
 	/* This will drop the ext_alloc cluster lock for us */
@@ -3975,23 +3971,18 @@ bail:
 }
 
 /*
- * Expects the inode to already be locked. This will figure out which
- * inodes need to be locked and will put them on the returned truncate
- * context.
+ * Expects the inode to already be locked.
  */
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
 			   struct ocfs2_truncate_context **tc)
 {
-	int status, metadata_delete, i;
+	int status;
 	unsigned int new_i_clusters;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list *el;
 	struct buffer_head *last_eb_bh = NULL;
-	struct inode *ext_alloc_inode = NULL;
-	struct buffer_head *ext_alloc_bh = NULL;
 
 	mlog_entry_void();
 
@@ -4011,12 +4002,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
+	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
-	metadata_delete = 0;
 	if (fe->id2.i_list.l_tree_depth) {
-		/* If we have a tree, then the truncate may result in
-		 * metadata deletes. Figure this out from the
-		 * rightmost leaf block.*/
 		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
 					  &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (status < 0) {
@@ -4031,43 +4019,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			status = -EIO;
 			goto bail;
 		}
-		el = &(eb->h_list);
-
-		i = 0;
-		if (ocfs2_is_empty_extent(&el->l_recs[0]))
-			i = 1;
-		/*
-		 * XXX: Should we check that next_free_rec contains
-		 * the extent?
-		 */
-		if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
-			metadata_delete = 1;
 	}
 
 	(*tc)->tc_last_eb_bh = last_eb_bh;
 
-	if (metadata_delete) {
-		mlog(0, "Will have to delete metadata for this trunc. "
-		     "locking allocator.\n");
-		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
-		if (!ext_alloc_inode) {
-			status = -ENOMEM;
-			mlog_errno(status);
-			goto bail;
-		}
-
-		mutex_lock(&ext_alloc_inode->i_mutex);
-		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
-
-		status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
-		(*tc)->tc_ext_alloc_locked = 1;
-	}
-
 	status = 0;
 bail:
 	if (status < 0) {
@@ -4081,16 +4036,13 @@ bail:
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 {
-	if (tc->tc_ext_alloc_inode) {
-		if (tc->tc_ext_alloc_locked)
-			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
-
-		mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
-		iput(tc->tc_ext_alloc_inode);
-	}
-
-	if (tc->tc_ext_alloc_bh)
-		brelse(tc->tc_ext_alloc_bh);
+	/*
+	 * The caller is responsible for completing deallocation
+	 * before freeing the context.
+	 */
+	if (tc->tc_dealloc.c_first_suballocator != NULL)
+		mlog(ML_NOTICE,
+		     "Truncate completion has non-empty dealloc context\n");
 
 	if (tc->tc_last_eb_bh)
 		brelse(tc->tc_last_eb_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 01db0adc2150..cb02e53b593c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -83,8 +83,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);
 
 struct ocfs2_truncate_context {
-	struct inode *tc_ext_alloc_inode;
-	struct buffer_head *tc_ext_alloc_bh;
+	struct ocfs2_cached_dealloc_ctxt tc_dealloc;
 	int tc_ext_alloc_locked; /* is it cluster locked? */
 	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
 	struct buffer_head *tc_last_eb_bh;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e8d16ae12ef0..510bf84c9cf5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1498,6 +1498,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
+
 	ocfs2_free_write_ctxt(wc);
 
 	return copied;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6788f2f1a667..82bf12f887a6 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1708,19 +1708,6 @@ int ocfs2_free_dinode(handle_t *handle,
 					inode_alloc_bh, bit, bg_blkno, 1);
 }
 
-int ocfs2_free_extent_block(handle_t *handle,
-			    struct inode *eb_alloc_inode,
-			    struct buffer_head *eb_alloc_bh,
-			    struct ocfs2_extent_block *eb)
-{
-	u64 blk = le64_to_cpu(eb->h_blkno);
-	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
-	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
-
-	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
-					bit, bg_blkno, 1);
-}
-
 int ocfs2_free_clusters(handle_t *handle,
 		       struct inode *bitmap_inode,
 		       struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 7bc4819db4db..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -96,10 +96,6 @@ int ocfs2_free_dinode(handle_t *handle,
 		      struct inode *inode_alloc_inode,
 		      struct buffer_head *inode_alloc_bh,
 		      struct ocfs2_dinode *di);
-int ocfs2_free_extent_block(handle_t *handle,
-			    struct inode *eb_alloc_inode,
-			    struct buffer_head *eb_alloc_bh,
-			    struct ocfs2_extent_block *eb);
 int ocfs2_free_clusters(handle_t *handle,
 			struct inode *bitmap_inode,
 			struct buffer_head *bitmap_bh,
-- 
cgit v1.2.3


From 328d5752e1259dfb29b7e65f6c2d145fddbaa750 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 18 Jun 2007 10:48:04 -0700
Subject: ocfs2: btree changes for unwritten extents

Writes to a region marked as unwritten might result in a record split or
merge. We can support splits by making minor changes to the existing insert
code. Merges require left rotations which mostly re-use right rotation
support functions.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      | 2043 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/alloc.h      |    6 +
 fs/ocfs2/endian.h     |    5 +
 fs/ocfs2/extent_map.c |   31 -
 fs/ocfs2/ocfs2.h      |   13 +
 fs/ocfs2/ocfs2_fs.h   |    7 +-
 6 files changed, 1892 insertions(+), 213 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a02d026cb0e4..0db6a1f724e1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -118,6 +118,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
 	}
 }
 
+/*
+ * All the elements of src into dest. After this call, src could be freed
+ * without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+	int i;
+
+	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+	BUG_ON(path_root_el(dest) != path_root_el(src));
+
+	ocfs2_reinit_path(dest, 1);
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		dest->p_node[i].bh = src->p_node[i].bh;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		if (dest->p_node[i].bh)
+			get_bh(dest->p_node[i].bh);
+	}
+}
+
 /*
  * Make the *dest path the same as src and re-initialize src path to
  * have a root only.
@@ -214,10 +239,41 @@ out:
 	return ret;
 }
 
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+	int ret = -1;
+	int i;
+	struct ocfs2_extent_rec *rec;
+	u32 rec_end, rec_start, clusters;
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		rec_start = le32_to_cpu(rec->e_cpos);
+		clusters = ocfs2_rec_clusters(el, rec);
+
+		rec_end = rec_start + clusters;
+
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
+		}
+	}
+
+	return ret;
+}
+
 enum ocfs2_contig_type {
 	CONTIG_NONE = 0,
 	CONTIG_LEFT,
-	CONTIG_RIGHT
+	CONTIG_RIGHT,
+	CONTIG_LEFTRIGHT,
 };
 
 
@@ -255,6 +311,14 @@ static enum ocfs2_contig_type
 {
 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 
+	/*
+	 * Refuse to coalesce extent records with different flag
+	 * fields - we don't want to mix unwritten extents with user
+	 * data.
+	 */
+	if (ext->e_flags != insert_rec->e_flags)
+		return CONTIG_NONE;
+
 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
 			return CONTIG_RIGHT;
@@ -279,7 +343,14 @@ enum ocfs2_append_type {
 	APPEND_TAIL,
 };
 
+enum ocfs2_split_type {
+	SPLIT_NONE = 0,
+	SPLIT_LEFT,
+	SPLIT_RIGHT,
+};
+
 struct ocfs2_insert_type {
+	enum ocfs2_split_type	ins_split;
 	enum ocfs2_append_type	ins_appending;
 	enum ocfs2_contig_type	ins_contig;
 	int			ins_contig_index;
@@ -287,6 +358,13 @@ struct ocfs2_insert_type {
 	int			ins_tree_depth;
 };
 
+struct ocfs2_merge_ctxt {
+	enum ocfs2_contig_type	c_contig_type;
+	int			c_has_empty_extent;
+	int			c_split_covers_rec;
+	int			c_used_tail_recs;
+};
+
 /*
  * How many free extents have we got before we need more meta data?
  */
@@ -457,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    struct inode *inode,
 			    struct buffer_head *fe_bh,
 			    struct buffer_head *eb_bh,
-			    struct buffer_head *last_eb_bh,
+			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
 {
 	int status, new_blocks, i;
@@ -472,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	BUG_ON(!last_eb_bh);
+	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
@@ -503,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
 
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -564,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * journal_dirty erroring as it won't unless we've aborted the
 	 * handle (in which case we would never be here) so reserving
 	 * the write with journal_access is all we need to do. */
-	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -597,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * next_leaf on the previously last-extent-block. */
 	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
 
-	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 
-	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
 	status = ocfs2_journal_dirty(handle, fe_bh);
@@ -612,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 			mlog_errno(status);
 	}
 
+	/*
+	 * Some callers want to track the rightmost leaf so pass it
+	 * back here.
+	 */
+	brelse(*last_eb_bh);
+	get_bh(new_eb_bhs[0]);
+	*last_eb_bh = new_eb_bhs[0];
+
 	status = 0;
 bail:
 	if (new_eb_bhs) {
@@ -831,10 +917,12 @@ bail:
  * be considered invalid.
  *
  * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
  */
 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 			   struct buffer_head *di_bh, int *final_depth,
-			   struct buffer_head *last_eb_bh,
+			   struct buffer_head **last_eb_bh,
 			   struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, shift;
@@ -869,10 +957,21 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 			goto out;
 		}
 		depth++;
-		/* Special case: we have room now if we shifted from
-		 * tree_depth 0 */
-		if (depth == 1)
+		if (depth == 1) {
+			/*
+			 * Special case: we have room now if we shifted from
+			 * tree_depth 0, so no more work needs to be done.
+			 *
+			 * We won't be calling add_branch, so pass
+			 * back *last_eb_bh as the new leaf. At depth
+			 * zero, it should always be null so there's
+			 * no reason to brelse.
+			 */
+			BUG_ON(*last_eb_bh);
+			get_bh(bh);
+			*last_eb_bh = bh;
 			goto out;
+		}
 	}
 
 	/* call ocfs2_add_branch to add the final part of the tree with
@@ -998,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
 
 }
 
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+	BUG_ON(num_recs == 0);
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		num_recs--;
+		size = num_recs * sizeof(struct ocfs2_extent_rec);
+		memmove(&el->l_recs[0], &el->l_recs[1], size);
+		memset(&el->l_recs[num_recs], 0,
+		       sizeof(struct ocfs2_extent_rec));
+		el->l_next_free_rec = cpu_to_le16(num_recs);
+	}
+}
+
 /*
  * Create an empty extent record .
  *
@@ -1275,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+	}
 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
 
@@ -1595,10 +1714,16 @@ out:
 	return ret;
 }
 
+/*
+ * Extend the transaction by enough credits to complete the rotation,
+ * and still leave at least the original number of credits allocated
+ * to this transaction.
+ */
 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+					   int op_credits,
 					   struct ocfs2_path *path)
 {
-	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
 	if (handle->h_buffer_credits < credits)
 		return ocfs2_extend_trans(handle, credits);
@@ -1632,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
 	return 0;
 }
 
+static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
+{
+	int next_free = le16_to_cpu(el->l_next_free_rec);
+	unsigned int range;
+	struct ocfs2_extent_rec *rec;
+
+	if (next_free == 0)
+		return 0;
+
+	rec = &el->l_recs[0];
+	if (ocfs2_is_empty_extent(rec)) {
+		/* Empty list. */
+		if (next_free == 1)
+			return 0;
+		rec = &el->l_recs[1];
+	}
+
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+		return 1;
+	return 0;
+}
+
 /*
  * Rotate all the records in a btree right one record, starting at insert_cpos.
  *
@@ -1650,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
  */
 static int ocfs2_rotate_tree_right(struct inode *inode,
 				   handle_t *handle,
+				   enum ocfs2_split_type split,
 				   u32 insert_cpos,
 				   struct ocfs2_path *right_path,
 				   struct ocfs2_path **ret_left_path)
 {
-	int ret, start;
+	int ret, start, orig_credits = handle->h_buffer_credits;
 	u32 cpos;
 	struct ocfs2_path *left_path = NULL;
 
@@ -1721,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 				(unsigned long long)
 				path_leaf_bh(left_path)->b_blocknr);
 
-		if (ocfs2_rotate_requires_path_adjustment(left_path,
+		if (split == SPLIT_NONE &&
+		    ocfs2_rotate_requires_path_adjustment(left_path,
 							  insert_cpos)) {
-			mlog(0, "Path adjustment required\n");
 
 			/*
 			 * We've rotated the tree as much as we
@@ -1751,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 		     right_path->p_tree_depth);
 
 		ret = ocfs2_extend_rotate_transaction(handle, start,
-						      right_path);
+						      orig_credits, right_path);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1764,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 			goto out;
 		}
 
+		if (split != SPLIT_NONE &&
+		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
+						insert_cpos)) {
+			/*
+			 * A rotate moves the rightmost left leaf
+			 * record over to the leftmost right leaf
+			 * slot. If we're doing an extent split
+			 * instead of a real insert, then we have to
+			 * check that the extent to be split wasn't
+			 * just moved over. If it was, then we can
+			 * exit here, passing left_path back -
+			 * ocfs2_split_extent() is smart enough to
+			 * search both leaves.
+			 */
+			*ret_left_path = left_path;
+			goto out_ret_path;
+		}
+
 		/*
 		 * There is no need to re-read the next right path
 		 * as we know that it'll be our current left
@@ -1786,206 +1953,1248 @@ out_ret_path:
 	return ret;
 }
 
-/*
- * Do the final bits of extent record insertion at the target leaf
- * list. If this leaf is part of an allocation tree, it is assumed
- * that the tree above has been prepared.
- */
-static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
-				 struct ocfs2_extent_list *el,
-				 struct ocfs2_insert_type *insert,
-				 struct inode *inode)
+static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+				      struct ocfs2_path *path)
 {
-	int i = insert->ins_contig_index;
-	unsigned int range;
+	int i, idx;
 	struct ocfs2_extent_rec *rec;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	u32 range;
 
-	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+	/* Path should always be rightmost. */
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+	BUG_ON(eb->h_next_leaf_blk != 0ULL);
 
-	/*
-	 * Contiguous insert - either left or right.
-	 */
-	if (insert->ins_contig != CONTIG_NONE) {
-		rec = &el->l_recs[i];
-		if (insert->ins_contig == CONTIG_LEFT) {
-			rec->e_blkno = insert_rec->e_blkno;
-			rec->e_cpos = insert_rec->e_cpos;
-		}
-		le16_add_cpu(&rec->e_leaf_clusters,
-			     le16_to_cpu(insert_rec->e_leaf_clusters));
-		return;
-	}
+	el = &eb->h_list;
+	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+	idx = le16_to_cpu(el->l_next_free_rec) - 1;
+	rec = &el->l_recs[idx];
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
 
-	/*
-	 * Handle insert into an empty leaf.
-	 */
-	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
-	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
-	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
-		el->l_recs[0] = *insert_rec;
-		el->l_next_free_rec = cpu_to_le16(1);
-		return;
-	}
+	for (i = 0; i < path->p_tree_depth; i++) {
+		el = path->p_node[i].el;
+		idx = le16_to_cpu(el->l_next_free_rec) - 1;
+		rec = &el->l_recs[idx];
 
-	/*
-	 * Appending insert.
-	 */
-	if (insert->ins_appending == APPEND_TAIL) {
-		i = le16_to_cpu(el->l_next_free_rec) - 1;
-		rec = &el->l_recs[i];
-		range = le32_to_cpu(rec->e_cpos)
-			+ le16_to_cpu(rec->e_leaf_clusters);
-		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+		rec->e_int_clusters = cpu_to_le32(range);
+		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
 
-		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
-				le16_to_cpu(el->l_count),
-				"inode %lu, depth %u, count %u, next free %u, "
-				"rec.cpos %u, rec.clusters %u, "
-				"insert.cpos %u, insert.clusters %u\n",
-				inode->i_ino,
-				le16_to_cpu(el->l_tree_depth),
-				le16_to_cpu(el->l_count),
-				le16_to_cpu(el->l_next_free_rec),
-				le32_to_cpu(el->l_recs[i].e_cpos),
-				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
-				le32_to_cpu(insert_rec->e_cpos),
-				le16_to_cpu(insert_rec->e_leaf_clusters));
-		i++;
-		el->l_recs[i] = *insert_rec;
-		le16_add_cpu(&el->l_next_free_rec, 1);
-		return;
+		ocfs2_journal_dirty(handle, path->p_node[i].bh);
 	}
-
-	/*
-	 * Ok, we have to rotate.
-	 *
-	 * At this point, it is safe to assume that inserting into an
-	 * empty leaf and appending to a leaf have both been handled
-	 * above.
-	 *
-	 * This leaf needs to have space, either by the empty 1st
-	 * extent record, or by virtue of an l_next_rec < l_count.
-	 */
-	ocfs2_rotate_leaf(el, insert_rec);
 }
 
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
-						struct ocfs2_dinode *di,
-						u32 clusters)
+static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      struct ocfs2_path *path, int unlink_start)
 {
-	le32_add_cpu(&di->i_clusters, clusters);
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	int ret, i;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *bh;
+
+	for(i = unlink_start; i < path_num_items(path); i++) {
+		bh = path->p_node[i].bh;
+
+		eb = (struct ocfs2_extent_block *)bh->b_data;
+		/*
+		 * Not all nodes might have had their final count
+		 * decremented by the caller - handle this here.
+		 */
+		el = &eb->h_list;
+		if (le16_to_cpu(el->l_next_free_rec) > 1) {
+			mlog(ML_ERROR,
+			     "Inode %llu, attempted to remove extent block "
+			     "%llu with %u records\n",
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			     (unsigned long long)le64_to_cpu(eb->h_blkno),
+			     le16_to_cpu(el->l_next_free_rec));
+
+			ocfs2_journal_dirty(handle, bh);
+			ocfs2_remove_from_cache(inode, bh);
+			continue;
+		}
+
+		el->l_next_free_rec = 0;
+		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+		ocfs2_journal_dirty(handle, bh);
+
+		ret = ocfs2_cache_extent_block_free(dealloc, eb);
+		if (ret)
+			mlog_errno(ret);
+
+		ocfs2_remove_from_cache(inode, bh);
+	}
 }
 
-static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
-				    struct ocfs2_extent_rec *insert_rec,
-				    struct ocfs2_path *right_path,
-				    struct ocfs2_path **ret_left_path)
+static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+				 struct ocfs2_path *left_path,
+				 struct ocfs2_path *right_path,
+				 int subtree_index,
+				 struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
-	int ret, i, next_free;
-	struct buffer_head *bh;
+	int i;
+	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_block *eb;
 
-	*ret_left_path = NULL;
+	el = path_leaf_el(left_path);
 
-	/*
-	 * This shouldn't happen for non-trees. The extent rec cluster
-	 * count manipulation below only works for interior nodes.
-	 */
-	BUG_ON(right_path->p_tree_depth == 0);
+	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
 
-	/*
-	 * If our appending insert is at the leftmost edge of a leaf,
-	 * then we might need to update the rightmost records of the
-	 * neighboring path.
-	 */
-	el = path_leaf_el(right_path);
-	next_free = le16_to_cpu(el->l_next_free_rec);
-	if (next_free == 0 ||
-	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
-		u32 left_cpos;
+	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
+			break;
 
-		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
-						    &left_cpos);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
 
-		mlog(0, "Append may need a left path update. cpos: %u, "
-		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
-		     left_cpos);
+	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	le16_add_cpu(&root_el->l_next_free_rec, -1);
+
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+	eb->h_next_leaf_blk = 0;
+
+	ocfs2_journal_dirty(handle, root_bh);
+	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+	ocfs2_unlink_path(inode, handle, dealloc, right_path,
+			  subtree_index + 1);
+}
+
+static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+				     struct ocfs2_path *left_path,
+				     struct ocfs2_path *right_path,
+				     int subtree_index,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     int *deleted)
+{
+	int ret, i, del_right_subtree = 0, right_has_empty = 0;
+	struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
+	struct ocfs2_extent_block *eb;
+
+	*deleted = 0;
+
+	right_leaf_el = path_leaf_el(right_path);
+	left_leaf_el = path_leaf_el(left_path);
+	root_bh = left_path->p_node[subtree_index].bh;
+	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
+		return 0;
 
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
+	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
 		/*
-		 * No need to worry if the append is already in the
-		 * leftmost leaf.
+		 * It's legal for us to proceed if the right leaf is
+		 * the rightmost one and it has an empty extent. There
+		 * are two cases to handle - whether the leaf will be
+		 * empty after removal or not. If the leaf isn't empty
+		 * then just remove the empty extent up front. The
+		 * next block will handle empty leaves by flagging
+		 * them for unlink.
+		 *
+		 * Non rightmost leaves will throw -EAGAIN and the
+		 * caller can manually move the subtree and retry.
 		 */
-		if (left_cpos) {
-			left_path = ocfs2_new_path(path_root_bh(right_path),
-						   path_root_el(right_path));
-			if (!left_path) {
-				ret = -ENOMEM;
-				mlog_errno(ret);
-				goto out;
-			}
 
-			ret = ocfs2_find_path(inode, left_path, left_cpos);
+		if (eb->h_next_leaf_blk != 0ULL)
+			return -EAGAIN;
+
+		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
+			ret = ocfs2_journal_access(handle, inode,
+						   path_leaf_bh(right_path),
+						   OCFS2_JOURNAL_ACCESS_WRITE);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			/*
-			 * ocfs2_insert_path() will pass the left_path to the
-			 * journal for us.
-			 */
-		}
+			ocfs2_remove_empty_extent(right_leaf_el);
+		} else
+			right_has_empty = 1;
 	}
 
-	ret = ocfs2_journal_access_path(inode, handle, right_path);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	if (eb->h_next_leaf_blk == 0ULL &&
+	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
+		/*
+		 * We have to update i_last_eb_blk during the meta
+		 * data delete.
+		 */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		del_right_subtree = 1;
+	}
+
+	/*
+	 * Getting here with an empty extent in the right path implies
+	 * that it's the rightmost path and will be deleted.
+	 */
+	BUG_ON(right_has_empty && !del_right_subtree);
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+		ret = ocfs2_journal_access(handle, inode,
+					   right_path->p_node[i].bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_journal_access(handle, inode,
+					   left_path->p_node[i].bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!right_has_empty) {
+		/*
+		 * Only do this if we're moving a real
+		 * record. Otherwise, the action is delayed until
+		 * after removal of the right path in which case we
+		 * can do a simple shift to remove the empty extent.
+		 */
+		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
+		memset(&right_leaf_el->l_recs[0], 0,
+		       sizeof(struct ocfs2_extent_rec));
+	}
+	if (eb->h_next_leaf_blk == 0ULL) {
+		/*
+		 * Move recs over to get rid of empty extent, decrease
+		 * next_free. This is allowed to remove the last
+		 * extent in our leaf (setting l_next_free_rec to
+		 * zero) - the delete code below won't care.
+		 */
+		ocfs2_remove_empty_extent(right_leaf_el);
+	}
+
+	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+	if (ret)
+		mlog_errno(ret);
+	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+	if (ret)
+		mlog_errno(ret);
+
+	if (del_right_subtree) {
+		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+				     subtree_index, dealloc);
+		ocfs2_update_edge_lengths(inode, handle, left_path);
+
+		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+		di->i_last_eb_blk = eb->h_blkno;
+
+		/*
+		 * Removal of the extent in the left leaf was skipped
+		 * above so we could delete the right path
+		 * 1st.
+		 */
+		if (right_has_empty)
+			ocfs2_remove_empty_extent(left_leaf_el);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret)
+			mlog_errno(ret);
+
+		*deleted = 1;
+	} else
+		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+					   subtree_index);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the right of the current one.
+ *
+ * Will return zero if the path passed in is already the rightmost path.
+ *
+ * This looks similar, but is subtly different to
+ * ocfs2_find_cpos_for_left_leaf().
+ */
+static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+					  struct ocfs2_path *path, u32 *cpos)
+{
+	int i, j, ret = 0;
+	u64 blkno;
+	struct ocfs2_extent_list *el;
+
+	*cpos = 0;
+
+	if (path->p_tree_depth == 0)
+		return 0;
+
+	blkno = path_leaf_bh(path)->b_blocknr;
+
+	/* Start at the tree node just above the leaf and work our way up. */
+	i = path->p_tree_depth - 1;
+	while (i >= 0) {
+		int next_free;
+
+		el = path->p_node[i].el;
+
+		/*
+		 * Find the extent record just after the one in our
+		 * path.
+		 */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+				if (j == (next_free - 1)) {
+					if (i == 0) {
+						/*
+						 * We've determined that the
+						 * path specified is already
+						 * the rightmost one - return a
+						 * cpos of zero.
+						 */
+						goto out;
+					}
+					/*
+					 * The rightmost record points to our
+					 * leaf - we need to travel up the
+					 * tree one level.
+					 */
+					goto next_node;
+				}
+
+				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
+				goto out;
+			}
+		}
+
+		/*
+		 * If we got here, we never found a valid node where
+		 * the tree indicated one should be.
+		 */
+		ocfs2_error(sb,
+			    "Invalid extent tree at extent block %llu\n",
+			    (unsigned long long)blkno);
+		ret = -EROFS;
+		goto out;
+
+next_node:
+		blkno = path->p_node[i].bh->b_blocknr;
+		i--;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
+					    handle_t *handle,
+					    struct buffer_head *bh,
+					    struct ocfs2_extent_list *el)
+{
+	int ret;
+
+	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+		return 0;
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_empty_extent(el);
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int __ocfs2_rotate_tree_left(struct inode *inode,
+				    handle_t *handle, int orig_credits,
+				    struct ocfs2_path *path,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc,
+				    struct ocfs2_path **empty_extent_path)
+{
+	int ret, subtree_root, deleted;
+	u32 right_cpos;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_path *right_path = NULL;
+
+	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+
+	*empty_extent_path = NULL;
+
+	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
+					     &right_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	left_path = ocfs2_new_path(path_root_bh(path),
+				   path_root_el(path));
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_cp_path(left_path, path);
+
+	right_path = ocfs2_new_path(path_root_bh(path),
+				    path_root_el(path));
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (right_cpos) {
+		ret = ocfs2_find_path(inode, right_path, right_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		subtree_root = ocfs2_find_subtree_root(inode, left_path,
+						       right_path);
+
+		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+		     subtree_root,
+		     (unsigned long long)
+		     right_path->p_node[subtree_root].bh->b_blocknr,
+		     right_path->p_tree_depth);
+
+		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+						      orig_credits, left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+						right_path, subtree_root,
+						dealloc, &deleted);
+		if (ret == -EAGAIN) {
+			/*
+			 * The rotation has to temporarily stop due to
+			 * the right subtree having an empty
+			 * extent. Pass it back to the caller for a
+			 * fixup.
+			 */
+			*empty_extent_path = right_path;
+			right_path = NULL;
+			goto out;
+		}
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * The subtree rotate might have removed records on
+		 * the rightmost edge. If so, then rotation is
+		 * complete.
+		 */
+		if (deleted)
+			break;
+
+		ocfs2_mv_path(left_path, right_path);
+
+		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+						     &right_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(right_path);
+	ocfs2_free_path(left_path);
+
+	return ret;
+}
+
+static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
+				       struct ocfs2_path *path,
+				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, subtree_index;
+	u32 cpos;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	/*
+	 * XXX: This code assumes that the root is an inode, which is
+	 * true for now but may change as tree code gets generic.
+	 */
+	di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ret = -EIO;
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has invalid path root",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto out;
+	}
+
+	/*
+	 * There's two ways we handle this depending on
+	 * whether path is the only existing one.
+	 */
+	ret = ocfs2_extend_rotate_transaction(handle, 0,
+					      handle->h_buffer_credits,
+					      path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (cpos) {
+		/*
+		 * We have a path to the left of this one - it needs
+		 * an update too.
+		 */
+		left_path = ocfs2_new_path(path_root_bh(path),
+					   path_root_el(path));
+		if (!left_path) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_find_path(inode, left_path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_journal_access_path(inode, handle, left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+
+		ocfs2_unlink_subtree(inode, handle, left_path, path,
+				     subtree_index, dealloc);
+		ocfs2_update_edge_lengths(inode, handle, left_path);
+
+		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+		di->i_last_eb_blk = eb->h_blkno;
+	} else {
+		/*
+		 * 'path' is also the leftmost path which
+		 * means it must be the only one. This gets
+		 * handled differently because we want to
+		 * revert the inode back to having extents
+		 * in-line.
+		 */
+		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
+
+		el = &di->id2.i_list;
+		el->l_tree_depth = 0;
+		el->l_next_free_rec = 0;
+		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+		di->i_last_eb_blk = 0;
+	}
+
+	ocfs2_journal_dirty(handle, path_root_bh(path));
+
+out:
+	ocfs2_free_path(left_path);
+	return ret;
+}
+
+/*
+ * Left rotation of btree records.
+ *
+ * In many ways, this is (unsurprisingly) the opposite of right
+ * rotation. We start at some non-rightmost path containing an empty
+ * extent in the leaf block. The code works its way to the rightmost
+ * path by rotating records to the left in every subtree.
+ *
+ * This is used by any code which reduces the number of extent records
+ * in a leaf. After removal, an empty record should be placed in the
+ * leftmost list position.
+ *
+ * This won't handle a length update of the rightmost path records if
+ * the rightmost tree leaf record is removed so the caller is
+ * responsible for detecting and correcting that.
+ */
+static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
+				  struct ocfs2_path *path,
+				  struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, orig_credits = handle->h_buffer_credits;
+	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	el = path_leaf_el(path);
+	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+		return 0;
+
+	if (path->p_tree_depth == 0) {
+rightmost_no_delete:
+		/*
+		 * In-inode extents. This is trivially handled, so do
+		 * it up front.
+		 */
+		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
+						       path_leaf_bh(path),
+						       path_leaf_el(path));
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Handle rightmost branch now. There's several cases:
+	 *  1) simple rotation leaving records in there. That's trivial.
+	 *  2) rotation requiring a branch delete - there's no more
+	 *     records left. Two cases of this:
+	 *     a) There are branches to the left.
+	 *     b) This is also the leftmost (the only) branch.
+	 *
+	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
+	 *  2a) we need the left branch so that we can update it with the unlink
+	 *  2b) we need to bring the inode back to inline extents.
+	 */
+
+	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+	el = &eb->h_list;
+	if (eb->h_next_leaf_blk == 0) {
+		/*
+		 * This gets a bit tricky if we're going to delete the
+		 * rightmost path. Get the other cases out of the way
+		 * 1st.
+		 */
+		if (le16_to_cpu(el->l_next_free_rec) > 1)
+			goto rightmost_no_delete;
+
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ret = -EIO;
+			ocfs2_error(inode->i_sb,
+				    "Inode %llu has empty extent block at %llu",
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				    (unsigned long long)le64_to_cpu(eb->h_blkno));
+			goto out;
+		}
+
+		/*
+		 * XXX: The caller can not trust "path" any more after
+		 * this as it will have been deleted. What do we do?
+		 *
+		 * In theory the rotate-for-merge code will never get
+		 * here because it'll always ask for a rotate in a
+		 * nonempty list.
+		 */
+
+		ret = ocfs2_remove_rightmost_path(inode, handle, path,
+						  dealloc);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Now we can loop, remembering the path we get from -EAGAIN
+	 * and restarting from there.
+	 */
+try_rotate:
+	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
+				       dealloc, &restart_path);
+	if (ret && ret != -EAGAIN) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (ret == -EAGAIN) {
+		tmp_path = restart_path;
+		restart_path = NULL;
+
+		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
+					       tmp_path, dealloc,
+					       &restart_path);
+		if (ret && ret != -EAGAIN) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_free_path(tmp_path);
+		tmp_path = NULL;
+
+		if (ret == 0)
+			goto try_rotate;
+	}
+
+out:
+	ocfs2_free_path(tmp_path);
+	ocfs2_free_path(restart_path);
+	return ret;
+}
+
+static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
+				int index)
+{
+	struct ocfs2_extent_rec *rec = &el->l_recs[index];
+	unsigned int size;
+
+	if (rec->e_leaf_clusters == 0) {
+		/*
+		 * We consumed all of the merged-from record. An empty
+		 * extent cannot exist anywhere but the 1st array
+		 * position, so move things over if the merged-from
+		 * record doesn't occupy that position.
+		 *
+		 * This creates a new empty extent so the caller
+		 * should be smart enough to have removed any existing
+		 * ones.
+		 */
+		if (index > 0) {
+			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+			size = index * sizeof(struct ocfs2_extent_rec);
+			memmove(&el->l_recs[1], &el->l_recs[0], size);
+		}
+
+		/*
+		 * Always memset - the caller doesn't check whether it
+		 * created an empty extent, so there could be junk in
+		 * the other fields.
+		 */
+		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+	}
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the beginning of the record at index + 1.
+ */
+static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
+				handle_t *handle,
+				struct ocfs2_extent_rec *split_rec,
+				struct ocfs2_extent_list *el, int index)
+{
+	int ret;
+	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+	struct ocfs2_extent_rec *left_rec;
+	struct ocfs2_extent_rec *right_rec;
+
+	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
+
+	left_rec = &el->l_recs[index];
+	right_rec = &el->l_recs[index + 1];
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
+
+	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
+	le64_add_cpu(&right_rec->e_blkno,
+		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
+
+	ocfs2_cleanup_merge(el, index);
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the tail of the record at index - 1.
+ */
+static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+				handle_t *handle,
+				struct ocfs2_extent_rec *split_rec,
+				struct ocfs2_extent_list *el, int index)
+{
+	int ret, has_empty_extent = 0;
+	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+	struct ocfs2_extent_rec *left_rec;
+	struct ocfs2_extent_rec *right_rec;
+
+	BUG_ON(index <= 0);
+
+	left_rec = &el->l_recs[index - 1];
+	right_rec = &el->l_recs[index];
+	if (ocfs2_is_empty_extent(&el->l_recs[0]))
+		has_empty_extent = 1;
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (has_empty_extent && index == 1) {
+		/*
+		 * The easy case - we can just plop the record right in.
+		 */
+		*left_rec = *split_rec;
+
+		has_empty_extent = 0;
+	} else {
+		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
+	}
+
+	le32_add_cpu(&right_rec->e_cpos, split_clusters);
+	le64_add_cpu(&right_rec->e_blkno,
+		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
+
+	ocfs2_cleanup_merge(el, index);
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_try_to_merge_extent(struct inode *inode,
+				     handle_t *handle,
+				     struct ocfs2_path *left_path,
+				     int split_index,
+				     struct ocfs2_extent_rec *split_rec,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     struct ocfs2_merge_ctxt *ctxt)
+
+{
+	int ret = 0, delete_tail_recs = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(left_path);
+	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+
+	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
+
+	if (ctxt->c_split_covers_rec) {
+		delete_tail_recs++;
+
+		if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
+		    ctxt->c_has_empty_extent)
+			delete_tail_recs++;
+
+		if (ctxt->c_has_empty_extent) {
+			/*
+			 * The merge code will need to create an empty
+			 * extent to take the place of the newly
+			 * emptied slot. Remove any pre-existing empty
+			 * extents - having more than one in a leaf is
+			 * illegal.
+			 */
+			ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+						     dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			split_index--;
+			rec = &el->l_recs[split_index];
+		}
+	}
+
+	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
+		/*
+		 * Left-right contig implies this.
+		 */
+		BUG_ON(!ctxt->c_split_covers_rec);
+		BUG_ON(split_index == 0);
+
+		/*
+		 * Since the leftright insert always covers the entire
+		 * extent, this call will delete the insert record
+		 * entirely, resulting in an empty extent record added to
+		 * the extent block.
+		 *
+		 * Since the adding of an empty extent shifts
+		 * everything back to the right, there's no need to
+		 * update split_index here.
+		 */
+		ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
+					   handle, split_rec, el, split_index);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * We can only get this from logic error above.
+		 */
+		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+		/*
+		 * The left merge left us with an empty extent, remove
+		 * it.
+		 */
+		ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		split_index--;
+		rec = &el->l_recs[split_index];
+
+		/*
+		 * Note that we don't pass split_rec here on purpose -
+		 * we've merged it into the left side.
+		 */
+		ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
+					    handle, rec, el, split_index);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+		ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+					     dealloc);
+		/*
+		 * Error from this last rotate is not critical, so
+		 * print but don't bubble it up.
+		 */
+		if (ret)
+			mlog_errno(ret);
+		ret = 0;
+	} else {
+		/*
+		 * Merge a record to the left or right.
+		 *
+		 * 'contig_type' is relative to the existing record,
+		 * so for example, if we're "right contig", it's to
+		 * the record on the left (hence the left merge).
+		 */
+		if (ctxt->c_contig_type == CONTIG_RIGHT) {
+			ret = ocfs2_merge_rec_left(inode,
+						   path_leaf_bh(left_path),
+						   handle, split_rec, el,
+						   split_index);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else {
+			ret = ocfs2_merge_rec_right(inode,
+						    path_leaf_bh(left_path),
+						    handle, split_rec, el,
+						    split_index);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		if (ctxt->c_split_covers_rec) {
+			/*
+			 * The merge may have left an empty extent in
+			 * our leaf. Try to rotate it away.
+			 */
+			ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+						     dealloc);
+			if (ret)
+				mlog_errno(ret);
+			ret = 0;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static void ocfs2_subtract_from_rec(struct super_block *sb,
+				    enum ocfs2_split_type split,
+				    struct ocfs2_extent_rec *rec,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	u64 len_blocks;
+
+	len_blocks = ocfs2_clusters_to_blocks(sb,
+				le16_to_cpu(split_rec->e_leaf_clusters));
+
+	if (split == SPLIT_LEFT) {
+		/*
+		 * Region is on the left edge of the existing
+		 * record.
+		 */
+		le32_add_cpu(&rec->e_cpos,
+			     le16_to_cpu(split_rec->e_leaf_clusters));
+		le64_add_cpu(&rec->e_blkno, len_blocks);
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     -le16_to_cpu(split_rec->e_leaf_clusters));
+	} else {
+		/*
+		 * Region is on the right edge of the existing
+		 * record.
+		 */
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     -le16_to_cpu(split_rec->e_leaf_clusters));
+	}
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_insert_type *insert,
+				 struct inode *inode)
+{
+	int i = insert->ins_contig_index;
+	unsigned int range;
+	struct ocfs2_extent_rec *rec;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (insert->ins_split != SPLIT_NONE) {
+		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
+		BUG_ON(i == -1);
+		rec = &el->l_recs[i];
+		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
+					insert_rec);
+		goto rotate;
+	}
+
+	/*
+	 * Contiguous insert - either left or right.
+	 */
+	if (insert->ins_contig != CONTIG_NONE) {
+		rec = &el->l_recs[i];
+		if (insert->ins_contig == CONTIG_LEFT) {
+			rec->e_blkno = insert_rec->e_blkno;
+			rec->e_cpos = insert_rec->e_cpos;
+		}
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		return;
+	}
+
+	/*
+	 * Handle insert into an empty leaf.
+	 */
+	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		el->l_recs[0] = *insert_rec;
+		el->l_next_free_rec = cpu_to_le16(1);
+		return;
+	}
+
+	/*
+	 * Appending insert.
+	 */
+	if (insert->ins_appending == APPEND_TAIL) {
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		rec = &el->l_recs[i];
+		range = le32_to_cpu(rec->e_cpos)
+			+ le16_to_cpu(rec->e_leaf_clusters);
+		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+				le16_to_cpu(el->l_count),
+				"inode %lu, depth %u, count %u, next free %u, "
+				"rec.cpos %u, rec.clusters %u, "
+				"insert.cpos %u, insert.clusters %u\n",
+				inode->i_ino,
+				le16_to_cpu(el->l_tree_depth),
+				le16_to_cpu(el->l_count),
+				le16_to_cpu(el->l_next_free_rec),
+				le32_to_cpu(el->l_recs[i].e_cpos),
+				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+				le32_to_cpu(insert_rec->e_cpos),
+				le16_to_cpu(insert_rec->e_leaf_clusters));
+		i++;
+		el->l_recs[i] = *insert_rec;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+		return;
+	}
+
+rotate:
+	/*
+	 * Ok, we have to rotate.
+	 *
+	 * At this point, it is safe to assume that inserting into an
+	 * empty leaf and appending to a leaf have both been handled
+	 * above.
+	 *
+	 * This leaf needs to have space, either by the empty 1st
+	 * extent record, or by virtue of an l_next_rec < l_count.
+	 */
+	ocfs2_rotate_leaf(el, insert_rec);
+}
+
+static inline void ocfs2_update_dinode_clusters(struct inode *inode,
+						struct ocfs2_dinode *di,
+						u32 clusters)
+{
+	le32_add_cpu(&di->i_clusters, clusters);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+					   handle_t *handle,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec)
+{
+	int ret, i, next_free;
+	struct buffer_head *bh;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * Update everything except the leaf block.
+	 */
+	for (i = 0; i < path->p_tree_depth; i++) {
+		bh = path->p_node[i].bh;
+		el = path->p_node[i].el;
+
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %llu has a bad extent list",
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			ret = -EIO;
+			return;
+		}
+
+		rec = &el->l_recs[next_free - 1];
+
+		rec->e_int_clusters = insert_rec->e_cpos;
+		le32_add_cpu(&rec->e_int_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		le32_add_cpu(&rec->e_int_clusters,
+			     -le32_to_cpu(rec->e_cpos));
+
+		ret = ocfs2_journal_dirty(handle, bh);
+		if (ret)
+			mlog_errno(ret);
+
+	}
+}
+
+static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+				    struct ocfs2_extent_rec *insert_rec,
+				    struct ocfs2_path *right_path,
+				    struct ocfs2_path **ret_left_path)
+{
+	int ret, next_free;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	/*
+	 * This shouldn't happen for non-trees. The extent rec cluster
+	 * count manipulation below only works for interior nodes.
+	 */
+	BUG_ON(right_path->p_tree_depth == 0);
 
-	el = path_root_el(right_path);
-	bh = path_root_bh(right_path);
-	i = 0;
-	while (1) {
-		struct ocfs2_extent_rec *rec;
+	/*
+	 * If our appending insert is at the leftmost edge of a leaf,
+	 * then we might need to update the rightmost records of the
+	 * neighboring path.
+	 */
+	el = path_leaf_el(right_path);
+	next_free = le16_to_cpu(el->l_next_free_rec);
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		u32 left_cpos;
 
-		next_free = le16_to_cpu(el->l_next_free_rec);
-		if (next_free == 0) {
-			ocfs2_error(inode->i_sb,
-				    "Dinode %llu has a bad extent list",
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			ret = -EIO;
+		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+						    &left_cpos);
+		if (ret) {
+			mlog_errno(ret);
 			goto out;
 		}
 
-		rec = &el->l_recs[next_free - 1];
+		mlog(0, "Append may need a left path update. cpos: %u, "
+		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
+		     left_cpos);
 
-		rec->e_int_clusters = insert_rec->e_cpos;
-		le32_add_cpu(&rec->e_int_clusters,
-			     le16_to_cpu(insert_rec->e_leaf_clusters));
-		le32_add_cpu(&rec->e_int_clusters,
-			     -le32_to_cpu(rec->e_cpos));
+		/*
+		 * No need to worry if the append is already in the
+		 * leftmost leaf.
+		 */
+		if (left_cpos) {
+			left_path = ocfs2_new_path(path_root_bh(right_path),
+						   path_root_el(right_path));
+			if (!left_path) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
 
-		ret = ocfs2_journal_dirty(handle, bh);
-		if (ret)
-			mlog_errno(ret);
+			ret = ocfs2_find_path(inode, left_path, left_cpos);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
 
-		/* Don't touch the leaf node */
-		if (++i >= right_path->p_tree_depth)
-			break;
+			/*
+			 * ocfs2_insert_path() will pass the left_path to the
+			 * journal for us.
+			 */
+		}
+	}
 
-		bh = right_path->p_node[i].bh;
-		el = right_path->p_node[i].el;
+	ret = ocfs2_journal_access_path(inode, handle, right_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
+	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
+
 	*ret_left_path = left_path;
 	ret = 0;
 out:
@@ -1995,6 +3204,83 @@ out:
 	return ret;
 }
 
+static void ocfs2_split_record(struct inode *inode,
+			       struct ocfs2_path *left_path,
+			       struct ocfs2_path *right_path,
+			       struct ocfs2_extent_rec *split_rec,
+			       enum ocfs2_split_type split)
+{
+	int index;
+	u32 cpos = le32_to_cpu(split_rec->e_cpos);
+	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
+	struct ocfs2_extent_rec *rec, *tmprec;
+
+	right_el = path_leaf_el(right_path);;
+	if (left_path)
+		left_el = path_leaf_el(left_path);
+
+	el = right_el;
+	insert_el = right_el;
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index != -1) {
+		if (index == 0 && left_path) {
+			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+
+			/*
+			 * This typically means that the record
+			 * started in the left path but moved to the
+			 * right as a result of rotation. We either
+			 * move the existing record to the left, or we
+			 * do the later insert there.
+			 *
+			 * In this case, the left path should always
+			 * exist as the rotate code will have passed
+			 * it back for a post-insert update.
+			 */
+
+			if (split == SPLIT_LEFT) {
+				/*
+				 * It's a left split. Since we know
+				 * that the rotate code gave us an
+				 * empty extent in the left path, we
+				 * can just do the insert there.
+				 */
+				insert_el = left_el;
+			} else {
+				/*
+				 * Right split - we have to move the
+				 * existing record over to the left
+				 * leaf. The insert will be into the
+				 * newly created empty extent in the
+				 * right leaf.
+				 */
+				tmprec = &right_el->l_recs[index];
+				ocfs2_rotate_leaf(left_el, tmprec);
+				el = left_el;
+
+				memset(tmprec, 0, sizeof(*tmprec));
+				index = ocfs2_search_extent_list(left_el, cpos);
+				BUG_ON(index == -1);
+			}
+		}
+	} else {
+		BUG_ON(!left_path);
+		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
+		/*
+		 * Left path is easy - we can just allow the insert to
+		 * happen.
+		 */
+		el = left_el;
+		insert_el = left_el;
+		index = ocfs2_search_extent_list(el, cpos);
+		BUG_ON(index == -1);
+	}
+
+	rec = &el->l_recs[index];
+	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
+	ocfs2_rotate_leaf(insert_el, split_rec);
+}
+
 /*
  * This function only does inserts on an allocation b-tree. For dinode
  * lists, ocfs2_insert_at_leaf() is called directly.
@@ -2012,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
 {
 	int ret, subtree_index;
 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
-	struct ocfs2_extent_list *el;
 
 	/*
 	 * Pass both paths to the journal. The majority of inserts
@@ -2048,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
 		}
 	}
 
-	el = path_leaf_el(right_path);
+	if (insert->ins_split != SPLIT_NONE) {
+		/*
+		 * We could call ocfs2_insert_at_leaf() for some types
+		 * of splits, but it's easier to just let one seperate
+		 * function sort it all out.
+		 */
+		ocfs2_split_record(inode, left_path, right_path,
+				   insert_rec, insert->ins_split);
+	} else
+		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
+				     insert, inode);
 
-	ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
 	ret = ocfs2_journal_dirty(handle, leaf_bh);
 	if (ret)
 		mlog_errno(ret);
@@ -2139,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	 * can wind up skipping both of these two special cases...
 	 */
 	if (rotate) {
-		ret = ocfs2_rotate_tree_right(inode, handle,
+		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
 					      le32_to_cpu(insert_rec->e_cpos),
 					      right_path, &left_path);
 		if (ret) {
@@ -2164,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	}
 
 out_update_clusters:
-	ocfs2_update_dinode_clusters(inode, di,
-				     le16_to_cpu(insert_rec->e_leaf_clusters));
+	if (type->ins_split == SPLIT_NONE)
+		ocfs2_update_dinode_clusters(inode, di,
+					     le16_to_cpu(insert_rec->e_leaf_clusters));
 
 	ret = ocfs2_journal_dirty(handle, di_bh);
 	if (ret)
@@ -2178,6 +3473,44 @@ out:
 	return ret;
 }
 
+static enum ocfs2_contig_type
+ocfs2_figure_merge_contig_type(struct inode *inode,
+			       struct ocfs2_extent_list *el, int index,
+			       struct ocfs2_extent_rec *split_rec)
+{
+	struct ocfs2_extent_rec *rec;
+	enum ocfs2_contig_type ret = CONTIG_NONE;
+
+	/*
+	 * We're careful to check for an empty extent record here -
+	 * the merge code will know what to do if it sees one.
+	 */
+
+	if (index > 0) {
+		rec = &el->l_recs[index - 1];
+		if (index == 1 && ocfs2_is_empty_extent(rec)) {
+			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+				ret = CONTIG_RIGHT;
+		} else {
+			ret = ocfs2_extent_contig(inode, rec, split_rec);
+		}
+	}
+
+	if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+		enum ocfs2_contig_type contig_type;
+
+		rec = &el->l_recs[index + 1];
+		contig_type = ocfs2_extent_contig(inode, rec, split_rec);
+
+		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+			ret = CONTIG_LEFTRIGHT;
+		else if (ret == CONTIG_NONE)
+			ret = contig_type;
+	}
+
+	return ret;
+}
+
 static void ocfs2_figure_contig_type(struct inode *inode,
 				     struct ocfs2_insert_type *insert,
 				     struct ocfs2_extent_list *el,
@@ -2269,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	struct ocfs2_path *path = NULL;
 	struct buffer_head *bh = NULL;
 
+	insert->ins_split = SPLIT_NONE;
+
 	el = &di->id2.i_list;
 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
 
@@ -2430,7 +3765,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 
 	if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
 		status = ocfs2_grow_tree(inode, handle, fe_bh,
-					 &insert.ins_tree_depth, last_eb_bh,
+					 &insert.ins_tree_depth, &last_eb_bh,
 					 meta_ac);
 		if (status) {
 			mlog_errno(status);
@@ -2456,6 +3791,352 @@ bail:
 	return status;
 }
 
+static void ocfs2_make_right_split_rec(struct super_block *sb,
+				       struct ocfs2_extent_rec *split_rec,
+				       u32 cpos,
+				       struct ocfs2_extent_rec *rec)
+{
+	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
+	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
+
+	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
+
+	split_rec->e_cpos = cpu_to_le32(cpos);
+	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
+
+	split_rec->e_blkno = rec->e_blkno;
+	le64_add_cpu(&split_rec->e_blkno,
+		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
+
+	split_rec->e_flags = rec->e_flags;
+}
+
+static int ocfs2_split_and_insert(struct inode *inode,
+				  handle_t *handle,
+				  struct ocfs2_path *path,
+				  struct buffer_head *di_bh,
+				  struct buffer_head **last_eb_bh,
+				  int split_index,
+				  struct ocfs2_extent_rec *orig_split_rec,
+				  struct ocfs2_alloc_context *meta_ac)
+{
+	int ret = 0, depth;
+	unsigned int insert_range, rec_range, do_leftright = 0;
+	struct ocfs2_extent_rec tmprec;
+	struct ocfs2_extent_list *rightmost_el;
+	struct ocfs2_extent_rec rec;
+	struct ocfs2_extent_rec split_rec = *orig_split_rec;
+	struct ocfs2_insert_type insert;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_dinode *di;
+
+leftright:
+	/*
+	 * Store a copy of the record on the stack - it might move
+	 * around as the tree is manipulated below.
+	 */
+	rec = path_leaf_el(path)->l_recs[split_index];
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	rightmost_el = &di->id2.i_list;
+
+	depth = le16_to_cpu(rightmost_el->l_tree_depth);
+	if (depth) {
+		BUG_ON(!(*last_eb_bh));
+		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
+		rightmost_el = &eb->h_list;
+	}
+
+	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+	    le16_to_cpu(rightmost_el->l_count)) {
+		int old_depth = depth;
+
+		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
+				      meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (old_depth != depth) {
+			eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+			rightmost_el = &eb->h_list;
+		}
+	}
+
+	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+	insert.ins_appending = APPEND_NONE;
+	insert.ins_contig = CONTIG_NONE;
+	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
+		- le16_to_cpu(rightmost_el->l_next_free_rec);
+	insert.ins_tree_depth = depth;
+
+	insert_range = le32_to_cpu(split_rec.e_cpos) +
+		le16_to_cpu(split_rec.e_leaf_clusters);
+	rec_range = le32_to_cpu(rec.e_cpos) +
+		le16_to_cpu(rec.e_leaf_clusters);
+
+	if (split_rec.e_cpos == rec.e_cpos) {
+		insert.ins_split = SPLIT_LEFT;
+	} else if (insert_range == rec_range) {
+		insert.ins_split = SPLIT_RIGHT;
+	} else {
+		/*
+		 * Left/right split. We fake this as a right split
+		 * first and then make a second pass as a left split.
+		 */
+		insert.ins_split = SPLIT_RIGHT;
+
+		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
+					   &rec);
+
+		split_rec = tmprec;
+
+		BUG_ON(do_leftright);
+		do_leftright = 1;
+	}
+
+	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
+				     &insert);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (do_leftright == 1) {
+		u32 cpos;
+		struct ocfs2_extent_list *el;
+
+		do_leftright++;
+		split_rec = *orig_split_rec;
+
+		ocfs2_reinit_path(path, 1);
+
+		cpos = le32_to_cpu(split_rec.e_cpos);
+		ret = ocfs2_find_path(inode, path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		el = path_leaf_el(path);
+		split_index = ocfs2_search_extent_list(el, cpos);
+		goto leftright;
+	}
+out:
+
+	return ret;
+}
+
+/*
+ * Mark part or all of the extent record at split_index in the leaf
+ * pointed to by path as written. This removes the unwritten
+ * extent flag.
+ *
+ * Care is taken to handle contiguousness so as to not grow the tree.
+ *
+ * meta_ac is not strictly necessary - we only truly need it if growth
+ * of the tree is required. All other cases will degrade into a less
+ * optimal tree layout.
+ *
+ * last_eb_bh should be the rightmost leaf block for any inode with a
+ * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ *
+ * This code is optimized for readability - several passes might be
+ * made over certain portions of the tree. All of those blocks will
+ * have been brought into cache (and pinned via the journal), so the
+ * extra overhead is not expressed in terms of disk reads.
+ */
+static int __ocfs2_mark_extent_written(struct inode *inode,
+				       struct buffer_head *di_bh,
+				       handle_t *handle,
+				       struct ocfs2_path *path,
+				       int split_index,
+				       struct ocfs2_extent_rec *split_rec,
+				       struct ocfs2_alloc_context *meta_ac,
+				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(path);
+	struct buffer_head *eb_bh, *last_eb_bh = NULL;
+	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+	struct ocfs2_merge_ctxt ctxt;
+	struct ocfs2_extent_list *rightmost_el;
+
+	if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
+	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
+	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb_bh = path_leaf_bh(path);
+	ret = ocfs2_journal_access(handle, inode, eb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+							    split_index,
+							    split_rec);
+
+	/*
+	 * The core merge / split code wants to know how much room is
+	 * left in this inodes allocation tree, so we pass the
+	 * rightmost extent list.
+	 */
+	if (path->p_tree_depth) {
+		struct ocfs2_extent_block *eb;
+		struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(di->i_last_eb_blk),
+				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_exit(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			ret = -EROFS;
+			goto out;
+		}
+
+		rightmost_el = &eb->h_list;
+	} else
+		rightmost_el = path_root_el(path);
+
+	ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
+	if (ctxt.c_used_tail_recs > 0 &&
+	    ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
+		ctxt.c_used_tail_recs--;
+
+	if (rec->e_cpos == split_rec->e_cpos &&
+	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
+		ctxt.c_split_covers_rec = 1;
+	else
+		ctxt.c_split_covers_rec = 0;
+
+	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+	mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
+	     "has_empty: %u, split_covers: %u\n", split_index,
+	     ctxt.c_contig_type, ctxt.c_used_tail_recs,
+	     ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
+
+	if (ctxt.c_contig_type == CONTIG_NONE) {
+		if (ctxt.c_split_covers_rec)
+			el->l_recs[split_index] = *split_rec;
+		else
+			ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+						     &last_eb_bh, split_index,
+						     split_rec, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+	} else {
+		ret = ocfs2_try_to_merge_extent(inode, handle, path,
+						split_index, split_rec,
+						dealloc, &ctxt);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+	ocfs2_journal_dirty(handle, eb_bh);
+
+out:
+	brelse(last_eb_bh);
+	return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+	struct ocfs2_extent_rec split_rec;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *el;
+
+	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
+	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
+
+	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+			    "that are being written to, but the feature bit "
+			    "is not set in the super block.",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * XXX: This should be fixed up so that we just re-insert the
+	 * next extent records.
+	 */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	left_path = ocfs2_new_inode_path(di_bh);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(inode, left_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	el = path_leaf_el(left_path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
+	split_rec.e_cpos = cpu_to_le32(cpos);
+	split_rec.e_leaf_clusters = cpu_to_le16(len);
+	split_rec.e_blkno = cpu_to_le64(start_blkno);
+	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
+	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
+
+	ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
+					  index, &split_rec, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(left_path);
+	return ret;
+}
+
 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cb02e53b593c..d3acf45225c2 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -35,6 +35,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u64 start_blk,
 			u32 new_clusters,
 			struct ocfs2_alloc_context *meta_ac);
+struct ocfs2_cached_dealloc_ctxt;
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct ocfs2_dinode *fe);
@@ -102,6 +107,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 
 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
 		    u32 cpos, struct buffer_head **leaf_bh);
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
 /*
  * Helper function to look at the # of clusters in an extent record.
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
 	*var = cpu_to_le32(le32_to_cpu(*var) + val);
 }
 
+static inline void le64_add_cpu(__le64 *var, u64 val)
+{
+	*var = cpu_to_le64(le64_to_cpu(*var) + val);
+}
+
 static inline void le32_and_cpu(__le32 *var, u32 val)
 {
 	*var = cpu_to_le32(le32_to_cpu(*var) & val);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index e23e416ca74c..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -373,37 +373,6 @@ out:
 	return ret;
 }
 
-/*
- * Return the index of the extent record which contains cluster #v_cluster.
- * -1 is returned if it was not found.
- *
- * Should work fine on interior and exterior nodes.
- */
-static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
-				    u32 v_cluster)
-{
-	int ret = -1;
-	int i;
-	struct ocfs2_extent_rec *rec;
-	u32 rec_end, rec_start, clusters;
-
-	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-		rec = &el->l_recs[i];
-
-		rec_start = le32_to_cpu(rec->e_cpos);
-		clusters = ocfs2_rec_clusters(el, rec);
-
-		rec_end = rec_start + clusters;
-
-		if (v_cluster >= rec_start && v_cluster < rec_end) {
-			ret = i;
-			break;
-		}
-	}
-
-	return ret;
-}
-
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 648ef8e45eaa..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -306,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
+{
+	/*
+	 * Support for sparse files is a pre-requisite
+	 */
+	if (!ocfs2_sparse_alloc(osb))
+		return 0;
+
+	if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..c20a74b99d87 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
 #define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
 #define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP	0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
@@ -116,6 +116,11 @@
  */
 #define OCFS2_FEATURE_COMPAT_BACKUP_SB		0x0001
 
+/*
+ * Unwritten extents support.
+ */
+#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
+
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  */
-- 
cgit v1.2.3


From 2ae99a60374f360ba07037ebbf33d19b89ac43a6 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Mar 2007 16:43:28 -0800
Subject: ocfs2: Support creation of unwritten extents

This can now be trivially supported with re-use of our existing extend code.

ocfs2_allocate_unwritten_extents() takes a start offset and a byte length
and iterates over the inode, adding extents (marked as unwritten) until len
is reached. Existing extents are skipped over.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c |   2 +
 fs/ocfs2/alloc.h |   1 +
 fs/ocfs2/aops.c  |   2 +-
 fs/ocfs2/dir.c   |   2 +-
 fs/ocfs2/file.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++-----------
 fs/ocfs2/file.h  |   5 ++-
 fs/ocfs2/namei.c |   2 +-
 7 files changed, 105 insertions(+), 28 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0db6a1f724e1..2e3898316468 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3726,6 +3726,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
+			u8 flags,
 			struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
@@ -3749,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+	rec.e_flags = flags;
 
 	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
 					  &insert);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index d3acf45225c2..e3284f3eb6b9 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,6 +34,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
+			u8 flags,
 			struct ocfs2_alloc_context *meta_ac);
 struct ocfs2_cached_dealloc_ctxt;
 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8af923316d22..ec8b606b30e1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1136,7 +1136,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		 */
 		tmp_pos = cpos;
 		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, wc->w_di_bh,
+						 &tmp_pos, 1, 0, wc->w_di_bh,
 						 wc->w_handle, data_ac,
 						 meta_ac, NULL);
 		/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
 		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-						    1, parent_fe_bh, handle,
+						    1, 0, parent_fe_bh, handle,
 						    data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6745086da6fd..3e21ad9a6dde 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -425,6 +425,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
 			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
@@ -437,9 +438,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
 	u64 block;
+	u8 flags = 0;
 
 	BUG_ON(!clusters_to_add);
 
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
 	free_extents = ocfs2_num_free_extents(osb, inode, fe);
 	if (free_extents < 0) {
 		status = free_extents;
@@ -489,7 +494,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     meta_ac);
+				     flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -522,9 +527,11 @@ leave:
  * For a given allocation, determine which allocators will need to be
  * accessed, and lock them, reserving the appropriate number of bits.
  *
- * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_write() for file systems which
- * understand sparse inodes.
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 			  u32 clusters_to_add, u32 extents_to_split,
@@ -595,14 +602,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_extend_allocation(struct inode *inode,
-				   u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				     u32 clusters_to_add, int mark_unwritten)
 {
 	int status = 0;
 	int restart_func = 0;
-	int drop_alloc_sem = 0;
 	int credits;
-	u32 prev_clusters, logical_start;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@@ -617,7 +623,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
 	 * This function only exists for file systems which don't
 	 * support holes.
 	 */
-	BUG_ON(ocfs2_sparse_alloc(osb));
+	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
@@ -633,18 +639,9 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	logical_start = OCFS2_I(inode)->ip_clusters;
-
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	/* blocks peope in read/write from reading our allocation
-	 * until we're done changing it. We depend on i_mutex to block
-	 * other extend/truncate calls while we're here. Ordering wrt
-	 * start_trans is important here -- always do it before! */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
 	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
@@ -678,6 +675,7 @@ restarted_transaction:
 					    inode,
 					    &logical_start,
 					    clusters_to_add,
+					    mark_unwritten,
 					    bh,
 					    handle,
 					    data_ac,
@@ -730,10 +728,6 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
 leave:
-	if (drop_alloc_sem) {
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		drop_alloc_sem = 0;
-	}
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -759,6 +753,25 @@ leave:
 	return status;
 }
 
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				   u32 clusters_to_add, int mark_unwritten)
+{
+	int ret;
+
+	/*
+	 * The alloc sem blocks peope in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
+					mark_unwritten);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->prepare_write() and
@@ -900,7 +913,9 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		ret = ocfs2_extend_allocation(inode,
+					      OCFS2_I(inode)->ip_clusters,
+					      clusters_to_add, 0);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
@@ -1176,6 +1191,64 @@ out:
 	return ret;
 }
 
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+					    u64 start, u64 len)
+{
+	int ret;
+	u32 cpos, phys_cpos, clusters, alloc_size;
+
+	/*
+	 * We consider both start and len to be inclusive.
+	 */
+	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+	clusters -= cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Hole or existing extent len can be arbitrary, so
+		 * cap it to our own allocation request.
+		 */
+		if (alloc_size > clusters)
+			alloc_size = clusters;
+
+		if (phys_cpos) {
+			/*
+			 * We already have an allocation at this
+			 * region so we can safely skip it.
+			 */
+			goto next;
+		}
+
+		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+next:
+		cpos += alloc_size;
+		clusters -= alloc_size;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 54df3c4bd2fd..79115c92dc30 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,13 +39,14 @@ enum ocfs2_alloc_restarted {
 };
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
-			       u32 *cluster_start,
+			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason);
+			       enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
 						    new_fe_bh,
 						    handle, data_ac, NULL,
 						    NULL);
-- 
cgit v1.2.3


From 35edec1d52c075975991471d624b33b9336226f2 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 6 Jul 2007 14:41:18 -0700
Subject: ocfs2: update truncate handling of partial clusters

The partial cluster zeroing code used during truncate usually assumes that
the rightmost byte in the range to be zeroed lies on a cluster boundary.
This makes sense for truncate, but punching holes might require zeroing on
non-aligned rightmost boundaries.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 72 +++++++++++++++++++++++---------------------------------
 fs/ocfs2/alloc.h |  4 ++--
 fs/ocfs2/file.c  |  5 +++-
 3 files changed, 35 insertions(+), 46 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fac1adb3880a..df186d2e8248 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5668,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
 	return ocfs2_journal_dirty_data(handle, bh);
 }
 
-static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
-				     struct page **pages, int numpages,
-				     u64 phys, handle_t *handle)
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
+				     loff_t end, struct page **pages,
+				     int numpages, u64 phys, handle_t *handle)
 {
 	int i, ret, partial = 0;
 	void *kaddr;
@@ -5683,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
 	if (numpages == 0)
 		goto out;
 
-	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
-	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
-		/*
-		 * Since 'from' has been capped to a value below page
-		 * size, this calculation won't be able to overflow
-		 * 'to'
-		 */
-		to = ocfs2_align_bytes_to_clusters(sb, from);
-
-		/*
-		 * The truncate tail in this case should never contain
-		 * more than one page at maximum. The loop below also
-		 * assumes this.
-		 */
-		BUG_ON(numpages != 1);
-	}
-
+	to = PAGE_CACHE_SIZE;
 	for(i = 0; i < numpages; i++) {
 		page = pages[i];
 
+		from = start & (PAGE_CACHE_SIZE - 1);
+		if ((end >> PAGE_CACHE_SHIFT) == page->index)
+			to = end & (PAGE_CACHE_SIZE - 1);
+
 		BUG_ON(from > PAGE_CACHE_SIZE);
 		BUG_ON(to > PAGE_CACHE_SIZE);
 
@@ -5739,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
 
 		flush_dcache_page(page);
 
-		/*
-		 * Every page after the 1st one should be completely zero'd.
-		 */
-		from = 0;
+		start = (page->index + 1) << PAGE_CACHE_SHIFT;
 	}
 out:
 	if (pages) {
@@ -5755,24 +5740,26 @@ out:
 	}
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
-				int *num, u64 *phys)
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+				struct page **pages, int *num, u64 *phys)
 {
 	int i, numpages = 0, ret = 0;
-	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
 	unsigned int ext_flags;
 	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
-	u64 next_cluster_bytes;
+	loff_t last_page_bytes;
 
 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+	BUG_ON(start > end);
 
-	/* Cluster boundary, so we don't need to grab any pages. */
-	if ((isize & (csize - 1)) == 0)
+	if (start == end)
 		goto out;
 
-	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+	ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
 					  phys, NULL, &ext_flags);
 	if (ret) {
 		mlog_errno(ret);
@@ -5788,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 	if (ext_flags & OCFS2_EXT_UNWRITTEN)
 		goto out;
 
-	next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
-	index = isize >> PAGE_CACHE_SHIFT;
+	last_page_bytes = PAGE_ALIGN(end);
+	index = start >> PAGE_CACHE_SHIFT;
 	do {
 		pages[numpages] = grab_cache_page(mapping, index);
 		if (!pages[numpages]) {
@@ -5800,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
 
 		numpages++;
 		index++;
-	} while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
+	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
 
 out:
 	if (ret != 0) {
@@ -5829,11 +5816,10 @@ out:
  * otherwise block_write_full_page() will skip writeout of pages past
  * i_size. The new_i_size parameter is passed for this reason.
  */
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
-				 u64 new_i_size)
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+				  u64 range_start, u64 range_end)
 {
 	int ret, numpages;
-	loff_t endbyte;
 	struct page **pages = NULL;
 	u64 phys;
 
@@ -5852,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+				   &numpages, &phys);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -5861,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
 	if (numpages == 0)
 		goto out;
 
-	ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
-				 handle);
+	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
+				 numpages, phys, handle);
 
 	/*
 	 * Initiate writeout of the pages we zero'd here. We don't
 	 * wait on them - the truncate_inode_pages() call later will
 	 * do that for us.
 	 */
-	endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
-	ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
-				    endbyte - 1, SYNC_FILE_RANGE_WRITE);
+	ret = do_sync_mapping_range(inode->i_mapping, range_start,
+				    range_end - 1, SYNC_FILE_RANGE_WRITE);
 	if (ret)
 		mlog_errno(ret);
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index e3284f3eb6b9..752ef860873d 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -95,8 +95,8 @@ struct ocfs2_truncate_context {
 	struct buffer_head *tc_last_eb_bh;
 };
 
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
-				 u64 new_i_size);
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+				  u64 range_start, u64 range_end);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3e21ad9a6dde..f0a6b1330a6e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	int status;
 	handle_t *handle;
 	struct ocfs2_dinode *di;
+	u64 cluster_bytes;
 
 	mlog_entry_void();
 
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	/*
 	 * Do this before setting i_size.
 	 */
-	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+					       cluster_bytes);
 	if (status) {
 		mlog_errno(status);
 		goto out_commit;
-- 
cgit v1.2.3


From 063c4561f52a74de686fe0ff2f96f4f54c9fecd2 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 3 Jul 2007 13:34:11 -0700
Subject: ocfs2: support for removing file regions

Provide an internal interface for the removal of arbitrary file regions.

ocfs2_remove_inode_range() takes a byte range within a file and will remove
existing extents within that range. Partial clusters will be zeroed so that
any read from within the region will return zeros.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c   |  20 ++---
 fs/ocfs2/alloc.h   |  10 +++
 fs/ocfs2/file.c    | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/journal.h |   2 +
 4 files changed, 262 insertions(+), 12 deletions(-)

(limited to 'fs/ocfs2/alloc.h')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index df186d2e8248..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4373,10 +4373,10 @@ out:
 	return ret;
 }
 
-static int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
-			       u32 cpos, u32 len, handle_t *handle,
-			       struct ocfs2_alloc_context *meta_ac,
-			       struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+			u32 cpos, u32 len, handle_t *handle,
+			struct ocfs2_alloc_context *meta_ac,
+			struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
@@ -4506,7 +4506,7 @@ out:
 	return ret;
 }
 
-static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
 	struct ocfs2_dinode *di;
@@ -4539,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
 	return current_tail == new_start;
 }
 
-static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
-				     handle_t *handle,
-				     u64 start_blk,
-				     unsigned int num_clusters)
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+			      handle_t *handle,
+			      u64 start_blk,
+			      unsigned int num_clusters)
 {
 	int status, index;
 	unsigned int start_cluster, tl_count;
@@ -4698,7 +4698,7 @@ bail:
 }
 
 /* Expects you to already be holding tl_inode->i_mutex */
-static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 {
 	int status;
 	unsigned int num_to_flush;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 752ef860873d..990df48ae8d3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -41,6 +41,10 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+			u32 cpos, u32 len, handle_t *handle,
+			struct ocfs2_alloc_context *meta_ac,
+			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct ocfs2_dinode *fe);
@@ -68,6 +72,12 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 				      struct ocfs2_dinode **tl_copy);
 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 					 struct ocfs2_dinode *tl_copy);
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+			      handle_t *handle,
+			      u64 start_blk,
+			      unsigned int num_clusters);
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
 
 /*
  * Process local structure which describes the block unlinks done
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f0a6b1330a6e..11f7cf9f2511 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -541,12 +541,15 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
 {
-	int ret, num_free_extents;
+	int ret = 0, num_free_extents;
 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	*meta_ac = NULL;
-	*data_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
 	     "clusters_to_add = %u, extents_to_split = %u\n",
@@ -583,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 		}
 	}
 
+	if (clusters_to_add == 0)
+		goto out;
+
 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -1252,6 +1258,238 @@ out:
 	return ret;
 }
 
+static int __ocfs2_remove_inode_range(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(inode)->ip_clusters -= len;
+	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+					 u64 byte_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t start, end;
+	struct address_space *mapping = inode->i_mapping;
+
+	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+	end = byte_start + byte_len;
+	end = end & ~(osb->s_clustersize - 1);
+
+	if (start < end) {
+		unmap_mapping_range(mapping, start, end - start, 0);
+		truncate_inode_pages_range(mapping, start, end - 1);
+	}
+}
+
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+				       u64 start, u64 len)
+{
+	int ret = 0;
+	u64 tmpend, end = start + len;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int csize = osb->s_clustersize;
+	handle_t *handle;
+
+	/*
+	 * The "start" and "end" values are NOT necessarily part of
+	 * the range whose allocation is being deleted. Rather, this
+	 * is what the user passed in with the request. We must zero
+	 * partial clusters here. There's no need to worry about
+	 * physical allocation - the zeroing code knows to skip holes.
+	 */
+	mlog(0, "byte start: %llu, end: %llu\n",
+	     (unsigned long long)start, (unsigned long long)end);
+
+	/*
+	 * If both edges are on a cluster boundary then there's no
+	 * zeroing required as the region is part of the allocation to
+	 * be truncated.
+	 */
+	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We want to get the byte offset of the end of the 1st cluster.
+	 */
+	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
+	if (tmpend > end)
+		tmpend = end;
+
+	mlog(0, "1st range: start: %llu, tmpend: %llu\n",
+	     (unsigned long long)start, (unsigned long long)tmpend);
+
+	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
+	if (ret)
+		mlog_errno(ret);
+
+	if (tmpend < end) {
+		/*
+		 * This may make start and end equal, but the zeroing
+		 * code will skip any work in that case so there's no
+		 * need to catch it up here.
+		 */
+		start = end & ~(osb->s_clustersize - 1);
+
+		mlog(0, "2nd range: start: %llu, end: %llu\n",
+		     (unsigned long long)start, (unsigned long long)end);
+
+		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+static int ocfs2_remove_inode_range(struct inode *inode,
+				    struct buffer_head *di_bh, u64 byte_start,
+				    u64 byte_len)
+{
+	int ret = 0;
+	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (byte_len == 0)
+		return 0;
+
+	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
+	if (trunc_len >= trunc_start)
+		trunc_len -= trunc_start;
+	else
+		trunc_len = 0;
+
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)byte_start,
+	     (unsigned long long)byte_len, trunc_start, trunc_len);
+
+	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cpos = trunc_start;
+	while (trunc_len) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)
+			alloc_size = trunc_len;
+
+		/* Only do work for non-holes */
+		if (phys_cpos != 0) {
+			ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
+							 phys_cpos, alloc_size,
+							 &dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
+
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506da..ce60aab013aa 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
 					 + OCFS2_TRUNCATE_LOG_UPDATE)
 
+#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
  * bitmap block for the new bit) */
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
-- 
cgit v1.2.3