From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- fs/ext3/Makefile | 12 + fs/ext3/acl.c | 547 ++++++++ fs/ext3/acl.h | 84 ++ fs/ext3/balloc.c | 1600 +++++++++++++++++++++++ fs/ext3/bitmap.c | 26 + fs/ext3/dir.c | 519 ++++++++ fs/ext3/file.c | 131 ++ fs/ext3/fsync.c | 88 ++ fs/ext3/hash.c | 152 +++ fs/ext3/ialloc.c | 794 ++++++++++++ fs/ext3/inode.c | 3132 ++++++++++++++++++++++++++++++++++++++++++++++ fs/ext3/ioctl.c | 243 ++++ fs/ext3/namei.c | 2378 +++++++++++++++++++++++++++++++++++ fs/ext3/resize.c | 996 +++++++++++++++ fs/ext3/super.c | 2539 +++++++++++++++++++++++++++++++++++++ fs/ext3/symlink.c | 54 + fs/ext3/xattr.c | 1320 +++++++++++++++++++ fs/ext3/xattr.h | 135 ++ fs/ext3/xattr_security.c | 55 + fs/ext3/xattr_trusted.c | 65 + fs/ext3/xattr_user.c | 79 ++ 21 files changed, 14949 insertions(+) create mode 100644 fs/ext3/Makefile create mode 100644 fs/ext3/acl.c create mode 100644 fs/ext3/acl.h create mode 100644 fs/ext3/balloc.c create mode 100644 fs/ext3/bitmap.c create mode 100644 fs/ext3/dir.c create mode 100644 fs/ext3/file.c create mode 100644 fs/ext3/fsync.c create mode 100644 fs/ext3/hash.c create mode 100644 fs/ext3/ialloc.c create mode 100644 fs/ext3/inode.c create mode 100644 fs/ext3/ioctl.c create mode 100644 fs/ext3/namei.c create mode 100644 fs/ext3/resize.c create mode 100644 fs/ext3/super.c create mode 100644 fs/ext3/symlink.c create mode 100644 fs/ext3/xattr.c create mode 100644 fs/ext3/xattr.h create mode 100644 fs/ext3/xattr_security.c create mode 100644 fs/ext3/xattr_trusted.c create mode 100644 fs/ext3/xattr_user.c (limited to 'fs/ext3') diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile new file mode 100644 index 000000000000..704cd44a40c2 --- /dev/null +++ b/fs/ext3/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the linux ext3-filesystem routines. +# + +obj-$(CONFIG_EXT3_FS) += ext3.o + +ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o + +ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c new file mode 100644 index 000000000000..328592c3a956 --- /dev/null +++ b/fs/ext3/acl.c @@ -0,0 +1,547 @@ +/* + * linux/fs/ext3/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext3_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext3_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext3_acl_header *)value)->a_version != + cpu_to_le32(EXT3_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext3_acl_header); + count = ext3_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext3_acl_entry *entry = + (ext3_acl_entry *)value; + if ((char *)value + sizeof(ext3_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext3_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext3_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext3_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext3_acl_size(acl->a_count); + ext_acl = (ext3_acl_header *)kmalloc(sizeof(ext3_acl_header) + + acl->a_count * sizeof(ext3_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext3_acl_header); + for (n=0; n < acl->a_count; n++) { + ext3_acl_entry *entry = (ext3_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext3_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext3_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +static inline struct posix_acl * +ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) +{ + struct posix_acl *acl = EXT3_ACL_NOT_CACHED; + + spin_lock(&inode->i_lock); + if (*i_acl != EXT3_ACL_NOT_CACHED) + acl = posix_acl_dup(*i_acl); + spin_unlock(&inode->i_lock); + + return acl; +} + +static inline void +ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*i_acl != EXT3_ACL_NOT_CACHED) + posix_acl_release(*i_acl); + *i_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_sem: don't care + */ +static struct posix_acl * +ext3_get_acl(struct inode *inode, int type) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + switch(type) { + case ACL_TYPE_ACCESS: + acl = ext3_iget_acl(inode, &ei->i_acl); + if (acl != EXT3_ACL_NOT_CACHED) + return acl; + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + acl = ext3_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT3_ACL_NOT_CACHED) + return acl; + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + + default: + return ERR_PTR(-EINVAL); + } + retval = ext3_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext3_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext3_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) { + switch(type) { + case ACL_TYPE_ACCESS: + ext3_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext3_iset_acl(inode, &ei->i_default_acl, acl); + break; + } + } + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_sem: down unless called from ext3_new_inode + */ +static int +ext3_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + int name_index; + void *value = NULL; + size_t size; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + ext3_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext3_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext3_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + if (!error) { + switch(type) { + case ACL_TYPE_ACCESS: + ext3_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext3_iset_acl(inode, &ei->i_default_acl, acl); + break; + } + } + return error; +} + +static int +ext3_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int +ext3_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + return generic_permission(inode, mask, ext3_check_acl); +} + +/* + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_sem: down + * inode->i_sem: up (access to inode is still exclusive) + */ +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext3_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext3_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_sem: down + */ +int +ext3_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext3_std_error(inode->i_sb, error); + goto out; + } + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + const size_t size = sizeof(XATTR_NAME_ACL_ACCESS); + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, XATTR_NAME_ACL_ACCESS, size); + return size; +} + +static size_t +ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT); + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, XATTR_NAME_ACL_DEFAULT, size); + return size; +} + +static int +ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext3_get_acl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext3_xattr_get_acl_access(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int +ext3_xattr_get_acl_default(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int +ext3_xattr_set_acl(struct inode *inode, int type, const void *value, + size_t size) +{ + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext3_set_acl(handle, inode, type, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +static int +ext3_xattr_set_acl_access(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int +ext3_xattr_set_acl_default(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +struct xattr_handler ext3_xattr_acl_access_handler = { + .prefix = XATTR_NAME_ACL_ACCESS, + .list = ext3_xattr_list_acl_access, + .get = ext3_xattr_get_acl_access, + .set = ext3_xattr_set_acl_access, +}; + +struct xattr_handler ext3_xattr_acl_default_handler = { + .prefix = XATTR_NAME_ACL_DEFAULT, + .list = ext3_xattr_list_acl_default, + .get = ext3_xattr_get_acl_default, + .set = ext3_xattr_set_acl_default, +}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h new file mode 100644 index 000000000000..98af0c0d0ba9 --- /dev/null +++ b/fs/ext3/acl.h @@ -0,0 +1,84 @@ +/* + File: fs/ext3/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT3_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext3_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext3_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext3_acl_header; + +static inline size_t ext3_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext3_acl_header) + + count * sizeof(ext3_acl_entry_short); + } else { + return sizeof(ext3_acl_header) + + 4 * sizeof(ext3_acl_entry_short) + + (count - 4) * sizeof(ext3_acl_entry); + } +} + +static inline int ext3_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext3_acl_header); + s = size - 4 * sizeof(ext3_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext3_acl_entry_short)) + return -1; + return size / sizeof(ext3_acl_entry_short); + } else { + if (s % sizeof(ext3_acl_entry)) + return -1; + return s / sizeof(ext3_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT3_FS_POSIX_ACL + +/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl + if the ACL has not been cached */ +#define EXT3_ACL_NOT_CACHED ((void *)-1) + +/* acl.c */ +extern int ext3_permission (struct inode *, int, struct nameidata *); +extern int ext3_acl_chmod (struct inode *); +extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); + +extern int init_ext3_acl(void); +extern void exit_ext3_acl(void); + +#else /* CONFIG_EXT3_FS_POSIX_ACL */ +#include +#define ext3_permission NULL + +static inline int +ext3_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT3_FS_POSIX_ACL */ + diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c new file mode 100644 index 000000000000..ccd632fcc6d8 --- /dev/null +++ b/fs/ext3/balloc.c @@ -0,0 +1,1600 @@ +/* + * linux/fs/ext3/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_read_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext3_group_desc * desc; + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + smp_rmb(); + + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +/* + * Read the bitmap for a given block_group, reading into the specified + * slot in the superblock's bitmap cache. + * + * Return buffer_head on success or NULL in case of failure. + */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext3_group_desc * desc; + struct buffer_head * bh = NULL; + + desc = ext3_get_group_desc (sb, block_group, NULL); + if (!desc) + goto error_out; + bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap)); + if (!bh) + ext3_error (sb, "read_block_bitmap", + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); +error_out: + return bh; +} +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use sorted double linked list for the per-filesystem reservation + * window list. (like in vm_region). + * + * Initially, we keep those small operations in the abstract functions, + * so later if we need a better searching tree than double linked-list, + * we could easily switch to that without changing too much + * code. + */ +#if 0 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext3_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %d, end: %d\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + if (bad) + BUG(); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __FUNCTION__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +static int +goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal, + unsigned int group, struct super_block * sb) +{ + unsigned long group_first_block, group_last_block; + + group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + + group * EXT3_BLOCKS_PER_GROUP(sb); + group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start) + || (goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/* + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext3_reserve_window_node * +search_reserve_window(struct rb_root *root, unsigned long goal) +{ + struct rb_node *n = root->rb_node; + struct ext3_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); + } + return rsv; +} + +void ext3_rsv_window_add(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + unsigned int start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext3_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +static void rsv_window_remove(struct super_block *sb, + struct ext3_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); +} + +static inline int rsv_is_empty(struct ext3_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED); +} +void ext3_init_block_alloc_info(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +void ext3_discard_reservation(struct inode *inode) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/* Free given blocks, update quota and i_blocks field */ +void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, + unsigned long block, unsigned long count, + int *pdquot_freed_blocks) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + unsigned long block_group; + unsigned long bit; + unsigned long i; + unsigned long overflow; + struct ext3_group_desc * desc; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int err = 0, ret; + unsigned group_freed; + + *pdquot_freed_blocks = 0; + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + jbd_lock_bh_state(bitmap_bh); + + for (i = 0, group_freed = 0; i < count; i++) { + /* + * An HJ special. This is expensive... + */ +#ifdef CONFIG_JBD_DEBUG + jbd_unlock_bh_state(bitmap_bh); + { + struct buffer_head *debug_bh; + debug_bh = sb_find_get_block(sb, block + i); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No commited data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); +#endif + if (need_resched()) { + jbd_unlock_bh_state(bitmap_bh); + cond_resched(); + jbd_lock_bh_state(bitmap_bh); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, + bh2jh(bitmap_bh)->b_committed_data); + + /* + * We clear the bit in the bitmap after setting the committed + * data bit, because this is the reverse order to that which + * the allocator uses. + */ + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + jbd_unlock_bh_state(bitmap_bh); + ext3_error(sb, __FUNCTION__, + "bit already cleared for block %lu", block + i); + jbd_lock_bh_state(bitmap_bh); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + group_freed++; + } + } + jbd_unlock_bh_state(bitmap_bh); + + spin_lock(sb_bgl_lock(sbi, block_group)); + desc->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + + group_freed); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + *pdquot_freed_blocks += group_freed; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, err); + return; +} + +/* Free given blocks, update quota and i_blocks field */ +void ext3_free_blocks(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count) +{ + struct super_block * sb; + int dquot_freed_blocks; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); + if (dquot_freed_blocks) + DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); + return; +} + +/* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ +static int ext3_test_allocatable(int nr, struct buffer_head *bh) +{ + int ret; + struct journal_head *jh = bh2jh(bh); + + if (ext3_test_bit(nr, bh->b_data)) + return 0; + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) + ret = 1; + else + ret = !ext3_test_bit(nr, jh->b_committed_data); + jbd_unlock_bh_state(bh); + return ret; +} + +static int +bitmap_search_next_usable_block(int start, struct buffer_head *bh, + int maxblocks) +{ + int next; + struct journal_head *jh = bh2jh(bh); + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + while (start < maxblocks) { + next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + jbd_lock_bh_state(bh); + if (jh->b_committed_data) + start = ext3_find_next_zero_bit(jh->b_committed_data, + maxblocks, next); + jbd_unlock_bh_state(bh); + } + return -1; +} + +/* + * Find an allocatable block in a bitmap. We honour both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. + */ +static int +find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) +{ + int here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + int end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3_test_allocatable(here, bh)) + return here; + ext3_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = ((char *)bh->b_data) + (here >> 3); + r = memscan(p, 0, (maxblocks - here + 7) >> 3); + next = (r - ((char *)bh->b_data)) << 3; + + if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) + return next; + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/* + * We think we can allocate this block in this bitmap. Try to set the bit. + * If that succeeds then check that nobody has allocated and then freed the + * block since we saw that is was not marked in b_committed_data. If it _was_ + * allocated and freed then clear the bit in the bitmap again and return + * zero (failure). + */ +static inline int +claim_block(spinlock_t *lock, int block, struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + int ret; + + if (ext3_set_bit_atomic(lock, block, bh->b_data)) + return 0; + jbd_lock_bh_state(bh); + if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) { + ext3_clear_bit_atomic(lock, block, bh->b_data); + ret = 0; + } else { + ret = 1; + } + jbd_unlock_bh_state(bh); + return ret; +} + +/* + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. In that case we must release write access to the old one via + * ext3_journal_release_buffer(), else we'll run out of credits. + */ +static int +ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, + struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv) +{ + int group_first_block, start, end; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + + group * EXT3_BLOCKS_PER_GROUP(sb); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT3_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT3_BLOCKS_PER_GROUP(sb); + if ((start <= goal) && (goal < end)) + start = goal; + else + goal = -1; + } else { + if (goal > 0) + start = goal; + else + start = 0; + end = EXT3_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); + +repeat: + if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) { + goal = find_next_usable_block(start, bitmap_bh, end); + if (goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && goal > start && + ext3_test_allocatable(goal - 1, + bitmap_bh); + i++, goal--) + ; + } + } + start = goal; + + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + return goal; +fail_access: + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable space within the given range. + * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @size: the target new reservation window size + * + * @group_first_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + * on succeed, it returns the reservation window to be appended to. + * failed, return NULL. + */ +static struct ext3_reserve_window_node *find_next_reservable_window( + struct ext3_reserve_window_node *search_head, + unsigned long size, int *start_block, + int last_block) +{ + struct rb_node *next; + struct ext3_reserve_window_node *rsv, *prev; + int cur; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = *start_block; + rsv = search_head; + if (!rsv) + return NULL; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return NULL; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + *start_block = cur; + return prev; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @rsv: the reservation + * + * @goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a goal(goal >0 ), then start from there, + * no goal(goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + */ +static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, + int goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext3_reserve_window_node *search_head; + int group_first_block, group_end_block, start_block; + int first_free_block; + int reservable_space_start; + struct ext3_reserve_window_node *prev_rsv; + struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; + unsigned long size; + + group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + + group * EXT3_BLOCKS_PER_GROUP(sb); + group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; + + if (goal < 0) + start_block = group_first_block; + else + start_block = goal + group_first_block; + + size = my_rsv->rsv_goal_size; + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. + */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if we previously allocation hit ration is greater than half + * we double the size of reservation window next time + * otherwise keep the same + */ + size = size * 2; + if (size > EXT3_MAX_RESERVE_BLOCKS) + size = EXT3_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + prev_rsv = find_next_reservable_window(search_head, size, + &start_block, group_end_block); + if (prev_rsv == NULL) + goto failed; + reservable_space_start = start_block; + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * searching the first free bit on the block bitmap and copy of + * last committed bitmap alternatively, until we found a allocatable + * block. Search start from the start block of the reservable space + * we just found. + */ + first_free_block = bitmap_search_next_usable_block( + reservable_space_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + goto failed; + } + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just found + */ + if ((start_block >= reservable_space_start) && + (start_block < reservable_space_start + size)) + goto found_rsv_window; + /* + * if the first free bit we found is out of the reservable space + * this means there is no free block on the reservable space + * we should continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = prev_rsv; + goto retry; + +found_rsv_window: + /* + * great! the reservable space contains some free blocks. + * if the search returns that we should add the new + * window just next to where the old window, we don't + * need to remove the old window first then add it to the + * same place, just update the new start and new end. + */ + if (my_rsv != prev_rsv) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + } + my_rsv->rsv_start = reservable_space_start; + my_rsv->rsv_end = my_rsv->rsv_start + size - 1; + my_rsv->rsv_alloc_hit = 0; + if (my_rsv != prev_rsv) { + ext3_rsv_window_add(sb, my_rsv); + } + return 0; /* succeed */ +failed: + /* + * failed to find a new reservation window in the current + * group, remove the current(stale) reservation window + * if there is any + */ + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + return -1; /* failed */ +} + +/* + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when someboday is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a sorted double linked list for the per-filesystem reservation list. + * The insert, remove and find a free space(non-reserved) operations for the + * sorted double linked list should be fast. + * + */ +static int +ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, + unsigned int group, struct buffer_head *bitmap_bh, + int goal, struct ext3_reserve_window_node * my_rsv, + int *errp) +{ + spinlock_t *rsv_lock; + unsigned long group_first_block; + int ret = 0; + int fatal; + + *errp = 0; + + /* + * Make sure we use undo access for the bitmap, because it is critical + * that we do the frozen_data COW on bitmap buffers in all cases even + * if the buffer is in BJ_Forget state in the committing transaction. + */ + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + fatal = ext3_journal_get_undo_access(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL ) { + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); + goto out; + } + rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; + /* + * goal is a group relative block number (if there is a goal) + * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + + group * EXT3_BLOCKS_PER_GROUP(sb); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. + */ + while (1) { + struct ext3_reserve_window rsv_copy; + + rsv_copy._rsv_start = my_rsv->rsv_start; + rsv_copy._rsv_end = my_rsv->rsv_end; + + if (rsv_is_empty(&rsv_copy) || (ret < 0) || + !goal_in_my_reservation(&rsv_copy, goal, group, sb)) { + spin_lock(rsv_lock); + ret = alloc_new_reservation(my_rsv, goal, sb, + group, bitmap_bh); + rsv_copy._rsv_start = my_rsv->rsv_start; + rsv_copy._rsv_end = my_rsv->rsv_end; + spin_unlock(rsv_lock); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&rsv_copy, goal, group, sb)) + goal = -1; + } + if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) + || (rsv_copy._rsv_end < group_first_block)) + BUG(); + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, + &rsv_copy); + if (ret >= 0) { + my_rsv->rsv_alloc_hit++; + break; /* succeed */ + } + } +out: + if (ret >= 0) { + BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " + "bitmap block"); + fatal = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + return ret; + } + + BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); + ext3_journal_release_buffer(handle, bitmap_bh); + return ret; +} + +static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +{ + int free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/* + * ext3_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or commiting transaction to complete, and then + * return TRUE. + */ +int ext3_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return journal_force_commit_nested(EXT3_SB(sb)->s_journal); +} + +/* + * ext3_new_block uses a goal block to assist allocation. If the goal is + * free, or there is a free block within 32 blocks of the goal, that block + * is allocated. Otherwise a forward search is made for a free block; within + * each block group the search first looks for an entire free byte in the block + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +int ext3_new_block(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + int ret_block; + int bgi; /* blockgroup iteration index */ + int target_block; + int fatal = 0, err; + int performed_allocation = 0; + int free_blocks; + struct super_block *sb; + struct ext3_group_desc *gdp; + struct ext3_super_block *es; + struct ext3_sb_info *sbi; + struct ext3_reserve_window_node *my_rsv = NULL; + struct ext3_block_alloc_info *block_i; + unsigned short windowsz = 0; +#ifdef EXT3FS_DEBUG + static int goal_hits, goal_attempts; +#endif + unsigned long ngroups; + + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk("ext3_new_block: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block. + */ + if (DQUOT_ALLOC_BLOCK(inode, 1)) { + *errp = -EDQUOT; + return 0; + } + + sbi = EXT3_SB(sb); + es = EXT3_SB(sb)->s_es; + ext3_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation(default,-o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (One could use ioctl + * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off + * reservation on that particular file) + */ + block_i = EXT3_I(inode)->i_block_alloc_info; + if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) + my_rsv = &block_i->rsv_window_node; + + if (!ext3_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + goal_group = group_no; +retry: + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there is not enough free blocks to make a new resevation + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no, + bitmap_bh, ret_block, my_rsv, &fatal); + if (fatal) + goto out; + if (ret_block >= 0) + goto allocated; + } + + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * i and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) { + *errp = -EIO; + goto out; + } + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (free_blocks <= (windowsz/2)) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no, + bitmap_bh, -1, my_rsv, &fatal); + if (fatal) + goto out; + if (ret_block >= 0) + goto allocated; + } + /* + * We may end up a bogus ealier ENOSPC error due to + * filesystem is "full" of reservations, but + * there maybe indeed free blocks avaliable on disk + * In this case, we just forget about the reservations + * just do block allocation as without reservations. + */ + if (my_rsv) { + my_rsv = NULL; + group_no = goal_group; + goto retry; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext3_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + BUFFER_TRACE(gdp_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, gdp_bh); + if (fatal) + goto out; + + target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); + + if (target_block == le32_to_cpu(gdp->bg_block_bitmap) || + target_block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(target_block, le32_to_cpu(gdp->bg_inode_table), + EXT3_SB(sb)->s_itb_per_group)) + ext3_error(sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", target_block); + + performed_allocation = 1; + +#ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = sb_find_get_block(sb, target_block); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); + brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); + spin_lock(sb_bgl_lock(sbi, group_no)); + if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { + if (ext3_test_bit(ret_block, + bh2jh(bitmap_bh)->b_committed_data)) { + printk("%s: block was unexpectedly set in " + "b_committed_data\n", __FUNCTION__); + } + } + ext3_debug("found bit %d\n", ret_block); + spin_unlock(sb_bgl_lock(sbi, group_no)); + jbd_unlock_bh_state(bitmap_bh); +#endif + + /* ret_block was blockgroup-relative. Now it becomes fs-relative */ + ret_block = target_block; + + if (ret_block >= le32_to_cpu(es->s_blocks_count)) { + ext3_error(sb, "ext3_new_block", + "block(%d) >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + ext3_debug("allocating block %d. Goal hits %d of %d.\n", + ret_block, goal_hits, goal_attempts); + + spin_lock(sb_bgl_lock(sbi, group_no)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_mod(&sbi->s_freeblocks_counter, -1); + + BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); + err = ext3_journal_dirty_metadata(handle, gdp_bh); + if (!fatal) + fatal = err; + + sb->s_dirt = 1; + if (fatal) + goto out; + + *errp = 0; + brelse(bitmap_bh); + return ret_block; + +io_error: + *errp = -EIO; +out: + if (fatal) { + *errp = fatal; + ext3_std_error(sb, fatal); + } + /* + * Undo the block allocation + */ + if (!performed_allocation) + DQUOT_FREE_BLOCK(inode, 1); + brelse(bitmap_bh); + return 0; +} + +unsigned long ext3_count_free_blocks(struct super_block *sb) +{ + unsigned long desc_count; + struct ext3_group_desc *gdp; + int i; + unsigned long ngroups; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + lock_super(sb); + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext3_count_free(bitmap_bh, sb->s_blocksize); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); + unlock_super(sb); + return bitmap_count; +#else + desc_count = 0; + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + } + + return desc_count; +#endif +} + +static inline int +block_in_use(unsigned long block, struct super_block *sb, unsigned char *map) +{ + return ext3_test_bit ((block - + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb), map); +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext3_group_sparse(int group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext3_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext3_bg_has_super(struct super_block *sb, int group) +{ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext3_group_sparse(group)) + return 0; + return 1; +} + +/** + * ext3_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) +{ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext3_group_sparse(group)) + return 0; + return EXT3_SB(sb)->s_gdb_count; +} + +#ifdef CONFIG_EXT3_CHECK +/* Called at mount-time, super-block is locked */ +void ext3_check_blocks_bitmap (struct super_block * sb) +{ + struct ext3_super_block *es; + unsigned long desc_count, bitmap_count, x, j; + unsigned long desc_blocks; + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc *gdp; + int i; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + if (ext3_bg_has_super(sb, i) && + !ext3_test_bit(0, bitmap_bh->b_data)) + ext3_error(sb, __FUNCTION__, + "Superblock in group %d is marked free", i); + + desc_blocks = ext3_bg_num_gdb(sb, i); + for (j = 0; j < desc_blocks; j++) + if (!ext3_test_bit(j + 1, bitmap_bh->b_data)) + ext3_error(sb, __FUNCTION__, + "Descriptor block #%ld in group " + "%d is marked free", j, i); + + if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap), + sb, bitmap_bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Block bitmap for group %d is marked free", + i); + + if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap), + sb, bitmap_bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Inode bitmap for group %d is marked free", + i); + + for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++) + if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, + sb, bitmap_bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Block #%d of the inode table in " + "group %d is marked free", j, i); + + x = ext3_count_free(bitmap_bh, sb->s_blocksize); + if (le16_to_cpu(gdp->bg_free_blocks_count) != x) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Wrong free blocks count for group %d, " + "stored = %d, counted = %lu", i, + le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Wrong free blocks count in super block, " + "stored = %lu, counted = %lu", + (unsigned long)le32_to_cpu(es->s_free_blocks_count), + bitmap_count); +} +#endif diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c new file mode 100644 index 000000000000..6c419b9ab0e8 --- /dev/null +++ b/fs/ext3/bitmap.c @@ -0,0 +1,26 @@ +/* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include + + +static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c new file mode 100644 index 000000000000..832867aef3dc --- /dev/null +++ b/fs/ext3/dir.c @@ -0,0 +1,519 @@ +/* + * linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include +#include +#include +#include + +static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext3_readdir(struct file *, void *, filldir_t); +static int ext3_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir); +static int ext3_release_dir (struct inode * inode, + struct file * filp); + +struct file_operations ext3_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, /* we take BKL. needed?*/ + .ioctl = ext3_ioctl, /* BKL held */ + .fsync = ext3_sync_file, /* BKL held */ +#ifdef CONFIG_EXT3_INDEX + .release = ext3_release_dir, +#endif +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3_FT_MAX)) + return DT_UNKNOWN; + + return (ext3_filetype_table[filetype]); +} + + +int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char * error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (rlen < EXT3_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) + ext3_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; +} + +static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset, blk; + int i, num, stored; + struct buffer_head * bh, * tmp, * bha[16]; + struct ext3_dir_entry_2 * de; + struct super_block * sb; + int err; + struct inode *inode = filp->f_dentry->d_inode; + int ret = 0; + + sb = inode->i_sb; + +#ifdef CONFIG_EXT3_INDEX + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext3_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; + } +#endif + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); + bh = ext3_bread(NULL, inode, blk, 0, &err); + if (!bh) { + ext3_error (sb, "ext3_readdir", + "directory #%lu contains a hole at offset %lu", + inode->i_ino, (unsigned long)filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + /* + * Do the readahead + */ + if (!offset) { + for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; + i > 0; i--) { + tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); + if (tmp && !buffer_uptodate(tmp) && + !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse (tmp); + } + if (num) { + ll_rw_block (READA, num, bha); + for (i = 0; i < num; i++) + brelse (bha[i]); + } + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + EXT3_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); + if (!ext3_check_dir_entry ("ext3_readdir", inode, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + ret = stored; + goto out; + } + offset += le16_to_cpu(de->rec_len); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse (bh); + } +out: + return ret; +} + +#ifdef CONFIG_EXT3_INDEX +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if ((n)->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = n->rb_parent; + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname * old = fname; + fname = fname->next; + kfree (old); + } + if (!parent) + root->rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + root->rb_node = NULL; +} + + +static struct dir_private_info *create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->root.rb_node = NULL; + p->curr_node = NULL; + p->extra_fname = NULL; + p->last_pos = 0; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + p->next_hash = 0; + return p; +} + +void ext3_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname * fname, *new_fn; + struct dir_private_info *info; + int len; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kmalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + memset(new_fn, 0, len); + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext3_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file * filp, void * dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block * sb; + int error; + + sb = inode->i_sb; + + if (!fname) { + printk("call_filldir: called with null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname->next; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext3_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT3_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname && + call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + + if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. + */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext3_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT3_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext3_release_dir (struct inode * inode, struct file * filp) +{ + if (filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} + +#endif diff --git a/fs/ext3/file.c b/fs/ext3/file.c new file mode 100644 index 000000000000..5ad8cf0292df --- /dev/null +++ b/fs/ext3/file.c @@ -0,0 +1,131 @@ +/* + * linux/fs/ext3/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext3_release_file (struct inode * inode, struct file * filp) +{ + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + ext3_discard_reservation(inode); + if (is_dx(inode) && filp->private_data) + ext3_htree_free_dir_info(filp->private_data); + + return 0; +} + +static ssize_t +ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode; + ssize_t ret; + int err; + + ret = generic_file_aio_write(iocb, buf, count, pos); + + /* + * Skip flushing if there was an error, or if nothing was written. + */ + if (ret <= 0) + return ret; + + /* + * If the inode is IS_SYNC, or is O_SYNC and we are doing data + * journalling then we need to make sure that we force the transaction + * to disk to keep all metadata uptodate synchronously. + */ + if (file->f_flags & O_SYNC) { + /* + * If we are non-data-journaled, then the dirty data has + * already been flushed to backing store by generic_osync_inode, + * and the inode has been flushed too if there have been any + * modifications other than mere timestamp updates. + * + * Open question --- do we care about flushing timestamps too + * if the inode is IS_SYNC? + */ + if (!ext3_should_journal_data(inode)) + return ret; + + goto force_commit; + } + + /* + * So we know that there has been no forced data flush. If the inode + * is marked IS_SYNC, we need to force one ourselves. + */ + if (!IS_SYNC(inode)) + return ret; + + /* + * Open question #2 --- should we force data to disk here too? If we + * don't, the only impact is that data=writeback filesystems won't + * flush data to disk automatically on IS_SYNC, only metadata (but + * historically, that is what ext2 has done.) + */ + +force_commit: + err = ext3_force_commit(inode->i_sb); + if (err) + return err; + return ret; +} + +struct file_operations ext3_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext3_file_write, + .readv = generic_file_readv, + .writev = generic_file_writev, + .ioctl = ext3_ioctl, + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = ext3_release_file, + .fsync = ext3_sync_file, + .sendfile = generic_file_sendfile, +}; + +struct inode_operations ext3_file_inode_operations = { + .truncate = ext3_truncate, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext3_permission, +}; + diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c new file mode 100644 index 000000000000..49382a208e05 --- /dev/null +++ b/fs/ext3/fsync.c @@ -0,0 +1,88 @@ +/* + * linux/fs/ext3/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * akpm: A new design for ext3_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + */ + +int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int ret = 0; + + J_ASSERT(ext3_journal_current_handle() == 0); + + /* + * data=writeback: + * The caller's filemap_fdatawrite()/wait will sync the data. + * sync_inode() will sync the metadata + * + * data=ordered: + * The caller's filemap_fdatawrite() will write the data and + * sync_inode() will write the inode if it is dirty. Then the caller's + * filemap_fdatawait() will wait on the pages. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext3_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext3_should_journal_data(inode)) { + ret = ext3_force_commit(inode->i_sb); + goto out; + } + + /* + * The VFS has written the file data. If the inode is unaltered + * then we need not start a commit. + */ + if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* sys_fsync did this */ + }; + ret = sync_inode(inode, &wbc); + } +out: + return ret; +} diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c new file mode 100644 index 000000000000..5a2d1235ead0 --- /dev/null +++ b/fs/ext3/hash.c @@ -0,0 +1,152 @@ +/* + * linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include +#include +#include +#include +#include + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash (const char *name, int len) +{ + __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + while (len--) { + __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + + if (hash & 0x80000000) hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return (hash0 << 1); +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. + */ +int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i=0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY: + hash = dx_hack_hash(name, len); + break; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA: + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT3_HTREE_EOF << 1)) + hash = (EXT3_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c new file mode 100644 index 000000000000..1e6f3ea28713 --- /dev/null +++ b/fs/ext3/ialloc.c @@ -0,0 +1,794 @@ +/* + * linux/fs/ext3/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext3_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext3_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext3_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext3_free_inode (handle_t *handle, struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + unsigned long block_group; + unsigned long bit; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + struct ext3_sb_info *sbi; + int fatal = 0, err; + + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3_free_inode: inode on nonexistent device\n"); + return; + } + sbi = EXT3_SB(sb); + + ino = inode->i_ino; + ext3_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); + ext3_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + clear_inode (inode); + + es = EXT3_SB(sb)->s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit, bitmap_bh->b_data)) + ext3_error (sb, "ext3_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + if (gdp) { + spin_lock(sb_bgl_lock(sbi, block_group)); + gdp->bg_free_inodes_count = cpu_to_le16( + le16_to_cpu(gdp->bg_free_inodes_count) + 1); + if (is_directory) + gdp->bg_used_dirs_count = cpu_to_le16( + le16_to_cpu(gdp->bg_used_dirs_count) - 1); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + + } + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT3_SB(sb)->s_groups_count; + int freei, avefreei; + struct ext3_group_desc *desc, *best_desc = NULL; + struct buffer_head *bh; + int group, best_group = -1; + + freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext3_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is prefered, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. + */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT3_INODES_PER_GROUP(sb); + int freei, avefreei; + int freeb, avefreeb; + int blocks_per_dir, ndirs; + int max_debt, max_dirs, min_blocks, min_inodes; + int group = -1, i; + struct ext3_group_desc *desc; + struct buffer_head *bh; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { + int best_ndir = inodes_per_group; + int best_group = -1; + + get_random_bytes(&group, sizeof(group)); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + } + if (best_group >= 0) + return best_group; + goto fallback; + } + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + return group; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + return group; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3_I(parent)->i_block_group; + int ngroups = EXT3_SB(sb)->s_groups_count; + struct ext3_group_desc *desc; + struct buffer_head *bh; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext3_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext3_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext3_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + return group; + } + + return -1; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group; + unsigned long ino = 0; + struct inode * inode; + struct ext3_group_desc * gdp = NULL; + struct ext3_super_block * es; + struct ext3_inode_info *ei; + struct ext3_sb_info *sbi; + int err = 0; + struct inode *ret; + int i; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3_I(inode); + + sbi = EXT3_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + err = -ENOSPC; + if (group == -1) + goto out; + + for (i = 0; i < sbi->s_groups_count; i++) { + err = -EIO; + + gdp = ext3_get_group_desc(sb, group, &bh2); + if (!gdp) + goto fail; + + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = 0; + +repeat_in_this_group: + ino = ext3_find_next_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); + if (ino < EXT3_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) + goto fail; + + if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + goto got; + } + /* we lost it */ + journal_release_buffer(handle, bitmap_bh); + + if (++ino < EXT3_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == sbi->s_groups_count) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + ino += group * EXT3_INODES_PER_GROUP(sb) + 1; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d, inode=%lu", group, ino); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + spin_lock(sb_bgl_lock(sbi, group)); + gdp->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + if (S_ISDIR(mode)) { + gdp->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + } + spin_unlock(sb_bgl_lock(sbi, group)); + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + sb->s_dirt = 1; + + inode->i_uid = current->fsuid; + if (test_opt (sb, GRPID)) + inode->i_gid = dir->i_gid; + else if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + + inode->i_ino = ino; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); + /* dirsync only applies to directories */ + if (!S_ISDIR(mode)) + ei->i_flags &= ~EXT3_DIRSYNC_FL; +#ifdef EXT3_FRAGMENTS + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; +#endif + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + + ext3_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state = EXT3_STATE_NEW; + ei->i_extra_isize = + (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? + sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; + + ret = inode; + if(DQUOT_ALLOC_INODE(inode)) { + DQUOT_DROP(inode); + err = -EDQUOT; + goto fail2; + } + err = ext3_init_acl(handle, inode, dir); + if (err) { + DQUOT_FREE_INODE(inode); + goto fail2; + } + err = ext3_mark_inode_dirty(handle, inode); + if (err) { + ext3_std_error(sb, err); + DQUOT_FREE_INODE(inode); + goto fail2; + } + + ext3_debug("allocating inode %lu\n", inode->i_ino); + goto really_out; +fail: + ext3_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(bitmap_bh); + return ret; + +fail2: + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + brelse(bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + struct buffer_head *bitmap_bh = NULL; + struct inode *inode = NULL; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan ino %lu! e2fsck was run?\n", ino); + goto out; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext3_warning(sb, __FUNCTION__, + "inode bitmap error for orphan %lu\n", ino); + goto out; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bitmap_bh->b_data) || + !(inode = iget(sb, ino)) || is_bad_inode(inode) || + NEXT_ORPHAN(inode) > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan inode %lu! e2fsck was run?\n", ino); + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext3_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + } + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode && inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + inode = NULL; + } +out: + brelse(bitmap_bh); + return inode; +} + +unsigned long ext3_count_free_inodes (struct super_block * sb) +{ + unsigned long desc_count; + struct ext3_group_desc *gdp; + int i; +#ifdef EXT3FS_DEBUG + struct ext3_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + lock_super (sb); + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + unlock_super(sb); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext3_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + +#ifdef CONFIG_EXT3_CHECK +/* Called at mount-time, super-block is locked */ +void ext3_check_inodes_bitmap (struct super_block * sb) +{ + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + struct ext3_group_desc * gdp; + int i; + + es = EXT3_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); + if (le16_to_cpu(gdp->bg_free_inodes_count) != x) + ext3_error (sb, "ext3_check_inodes_bitmap", + "Wrong free inodes count in group %d, " + "stored = %d, counted = %lu", i, + le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count) + ext3_error (sb, "ext3_check_inodes_bitmap", + "Wrong free inodes count in super block, " + "stored = %lu, counted = %lu", + (unsigned long)le32_to_cpu(es->s_free_inodes_count), + bitmap_count); +} +#endif diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c new file mode 100644 index 000000000000..040eb288bb1c --- /dev/null +++ b/fs/ext3/inode.c @@ -0,0 +1,3132 @@ +/* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +static int ext3_writepage_trans_blocks(struct inode *inode); + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ext3_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT3_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && + inode->i_blocks - ea_blocks == 0); +} + +/* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ + +int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + return ext3_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3_journal_revoke"); + err = ext3_journal_revoke(handle, blocknr, bh); + if (err) + ext3_abort(inode->i_sb, __FUNCTION__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to progress with the next chunk of a + * truncate transaction. + */ + +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext3 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + + return EXT3_DATA_TRANS_BLOCKS + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ + +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +{ + jbd_debug(2, "restarting handle %p\n", handle); + return ext3_journal_restart(handle, blocks_for_truncate(inode)); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext3_delete_inode (struct inode * inode) +{ + handle_t *handle; + + if (is_bad_inode(inode)) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* If we're going to skip the normal cleanup, we still + * need to make sure that the in-core orphan linked list + * is properly cleaned up. */ + ext3_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record which ext3_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); + EXT3_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext3_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext3_free_inode(handle, inode); + ext3_journal_stop(handle); + return; +no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +static int ext3_alloc_block (handle_t *handle, + struct inode * inode, unsigned long goal, int *err) +{ + unsigned long result; + + result = ext3_new_block(handle, inode, goal, err); + return result; +} + + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext3_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); + } + if (boundary) + *boundary = (i_block & (ptrs - 1)) == (final - 1); + return n; +} + +/** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the prefered place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ + +static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + unsigned long bg_start; + unsigned long colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) + if (*p) + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be refered from inode itself? OK, just put it into + * the same cylinder group then. + */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext3_find_goal - find a prefered place for allocation. + * @inode: owner + * @block: block we want + * @chain: chain of indirect blocks + * @partial: pointer to the last triple within a chain + * @goal: place to store the result. + * + * Normally this function find the prefered place for block allocation, + * stores it in *@goal and returns zero. If the branch had been changed + * under us we return -EAGAIN. + */ + +static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], + Indirect *partial, unsigned long *goal) +{ + struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + *goal = block_i->last_alloc_physical_block + 1; + return 0; + } + + if (verify_chain(chain, partial)) { + *goal = ext3_find_near(inode, partial); + return 0; + } + return -EAGAIN; +} + +/** + * ext3_alloc_branch - allocate and set up a chain of blocks. + * @inode: owner + * @num: depth of the chain (number of blocks to allocate) + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates @num blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext3_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext3_get_block(), excpet that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + +static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int num, + unsigned long goal, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0, keys = 0; + int err = 0; + int i; + int parent = ext3_alloc_block(handle, inode, goal, &err); + + branch[0].key = cpu_to_le32(parent); + if (parent) { + for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = ext3_alloc_block(handle, inode, parent, &err); + if (!nr) + break; + branch[n].key = cpu_to_le32(nr); + keys = n+1; + + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, parent); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + break; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + break; + + parent = nr; + } + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < keys; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + return err; +} + +/** + * ext3_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext3_alloc_branch) + * @where: location of missing link + * @num: number of blocks we are adding + * + * This function verifies that chain (up to the missing link) had not + * changed, fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. Otherwise (== chain had been changed) + * we free the new blocks (forgetting their buffer_heads, indeed) and + * return -EAGAIN. + */ + +static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, + Indirect chain[4], Indirect *where, int num) +{ + int i; + int err = 0; + struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* Verify that place we are splicing to is still there and vacant */ + + if (!verify_chain(chain, where-1) || *where->p) + /* Writer: end */ + goto changed; + + /* That's it */ + + *where->p = where->key; + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block; + block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * akpm: If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +changed: + /* + * AKPM: if where[i].bh isn't part of the current updating + * transaction then we explode nastily. Test this code path. + */ + jbd_debug(1, "the chain changed: try again\n"); + err = -EAGAIN; + +err_out: + for (i = 1; i < num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + } + /* For the normal collision cleanup case, we free up the blocks. + * On genuine filesystem errors we don't even think about doing + * that. */ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, + le32_to_cpu(where[i].key), 1); + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * akpm: `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + */ + +static int +ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, int extend_disksize) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + unsigned long goal; + int left; + int boundary = 0; + int depth = ext3_block_to_path(inode, iblock, offsets, &boundary); + struct ext3_inode_info *ei = EXT3_I(inode); + + J_ASSERT(handle != NULL || create == 0); + + if (depth == 0) + goto out; + +reread: + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + clear_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (boundary) + set_buffer_boundary(bh_result); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + goal = 0; + down(&ei->truncate_sem); + + /* lazy initialize the block allocation info here if necessary */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) { + ext3_init_block_alloc_info(inode); + } + + if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) { + up(&ei->truncate_sem); + goto changed; + } + + left = (chain + depth) - partial; + + /* + * Block out ext3_truncate while we alter the tree + */ + err = ext3_alloc_branch(handle, inode, left, goal, + offsets+(partial-chain), partial); + + /* The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, chain, + partial, left); + /* i_disksize growing is protected by truncate_sem + * don't forget to protect it if you're about to implement + * concurrent ext3_get_block() -bzzz */ + if (!err && extend_disksize && inode->i_size > ei->i_disksize) + ei->i_disksize = inode->i_size; + up(&ei->truncate_sem); + if (err == -EAGAIN) + goto changed; + if (err) + goto cleanup; + + set_buffer_new(bh_result); + goto got_it; + +changed: + while (partial > chain) { + jbd_debug(1, "buffer chain changed, retrying\n"); + BUFFER_TRACE(partial->bh, "brelsing"); + brelse(partial->bh); + partial--; + } + goto reread; +} + +static int ext3_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = NULL; + int ret; + + if (create) { + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } + ret = ext3_get_block_handle(handle, inode, iblock, + bh_result, create, 1); + return ret; +} + +#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) + +static int +ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create) +{ + handle_t *handle = journal_current_handle(); + int ret = 0; + + if (!handle) + goto get_block; /* A read */ + + if (handle->h_transaction->t_state == T_LOCKED) { + /* + * Huge direct-io writes can hold off commits for long + * periods of time. Let this commit run. + */ + ext3_journal_stop(handle); + handle = ext3_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + goto get_block; + } + + if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) { + /* + * Getting low on buffer credits... + */ + ret = ext3_journal_extend(handle, DIO_CREDITS); + if (ret > 0) { + /* + * Couldn't extend the transaction. Start a new one. + */ + ret = ext3_journal_restart(handle, DIO_CREDITS); + } + } + +get_block: + if (ret == 0) + ret = ext3_get_block_handle(handle, inode, iblock, + bh_result, create, 0); + bh_result->b_size = (1 << inode->i_blkbits); + return ret; +} + +static int ext3_writepages_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, + long block, int create, int * errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != 0); + + /* Now that we do not always journal data, we + should keep in mind whether this should + always journal the new buffer as metadata. + For now, regular file writes use + ext3_get_block instead, so it's not a + problem. */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } + return NULL; +} + +struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext3_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ + +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + return ext3_journal_get_write_access(handle, bh); +} + +static int ext3_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + handle_t *handle; + int retries = 0; + +retry: + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + if (test_opt(inode->i_sb, NOBH)) + ret = nobh_prepare_write(page, from, to, ext3_get_block); + else + ret = block_prepare_write(page, from, to, ext3_get_block); + if (ret) + goto prepare_write_failed; + + if (ext3_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } +prepare_write_failed: + if (ret) + ext3_journal_stop(handle); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +int +ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, + bh, handle,err); + return err; +} + +/* For commit_write() in data=journal mode */ +static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext3_journal_dirty_metadata(handle, bh); +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext3 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ + +static int ext3_ordered_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, ext3_journal_dirty_data); + + if (ret == 0) { + /* + * generic_commit_write() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + loff_t new_i_size; + + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > EXT3_I(inode)->i_disksize) + EXT3_I(inode)->i_disksize = new_i_size; + ret = generic_commit_write(file, page, from, to); + } + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + return ret; +} + +static int ext3_writeback_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + loff_t new_i_size; + + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > EXT3_I(inode)->i_disksize) + EXT3_I(inode)->i_disksize = new_i_size; + + if (test_opt(inode->i_sb, NOBH)) + ret = nobh_commit_write(file, page, from, to); + else + ret = generic_commit_write(file, page, from, to); + + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + return ret; +} + +static int ext3_journalled_commit_write(struct file *file, + struct page *page, unsigned from, unsigned to) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + int partial = 0; + loff_t pos; + + /* + * Here we duplicate the generic_commit_write() functionality + */ + pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, commit_write_fn); + if (!partial) + SetPageUptodate(page); + if (pos > inode->i_size) + i_size_write(inode, pos); + EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + if (inode->i_size > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + ret2 = ext3_journal_stop(handle); + if (!ret) + ret = ret2; + return ret; +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext3 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext3_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT3_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + journal = EXT3_JOURNAL(inode); + journal_lock_updates(journal); + err = journal_flush(journal); + journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping,block,ext3_get_block); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + if (buffer_mapped(bh)) + return ext3_journal_dirty_data(handle, bh); + return 0; +} + +/* + * Note that we always start a transaction even if we're not journalling + * data. This is to preserve ordering: any hole instantiation within + * __block_write_full_page -> ext3_get_block() should be journalled + * along with the data so we don't crash and then get metadata which + * refers to old data. + * + * In all journalling modes block_write_full_page() will start the I/O. + * + * Problem: + * + * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext3_writepage() + * + * Similar for: + * + * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * + * Same applies to ext3_get_block(). We will deadlock on various things like + * lock_journal and i_truncate_sem. + * + * Setting PF_MEMALLOC here doesn't work - too many internal memory + * allocations fail. + * + * 16May01: If we're reentered then journal_current_handle() will be + * non-zero. We simply *return*. + * + * 1 July 2001: @@@ FIXME: + * In journalled data mode, a data buffer may be metadata against the + * current transaction. But the same file is part of a shared mapping + * and someone does a writepage() on it. + * + * We will move the buffer onto the async_data list, but *after* it has + * been dirtied. So there's a small window where we have dirty data on + * BJ_Metadata. + * + * Note that this only applies to the last partial page in the file. The + * bit which block_write_full_page() uses prepare/commit for. (That's + * broken code anyway: it's wrong for msync()). + * + * It's a rare case: affects the final partial page, for journalled data + * where the file is subject to bith write() and writepage() in the same + * transction. To fix it we'll need a custom block_write_full_page(). + * We'll probably need that anyway for journalling writepage() output. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + * + * AKPM2: if all the page's buffers are mapped to disk and !data=journal, + * we don't need to open a transaction here. + */ +static int ext3_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (ext3_journal_current_handle()) + goto out_fail; + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + + ret = block_write_full_page(page, ext3_get_block, wbc); + + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_bufs is + * safe due to elevated refcount. + */ + + /* + * And attach them to the current transaction. But only if + * block_write_full_page() succeeded. Otherwise they are unmapped, + * and generally junk. + */ + if (ret == 0) { + err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, journal_dirty_data_fn); + if (!ret) + ret = err; + } + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int +ext3_writeback_writepage_helper(struct page *page, + struct writeback_control *wbc) +{ + return block_write_full_page(page, ext3_get_block, wbc); +} + +static int +ext3_writeback_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + handle_t *handle = NULL; + int err, ret = 0; + + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return ret; + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + + ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_block, + ext3_writeback_writepage_helper); + + /* + * Need to reaquire the handle since ext3_writepages_get_block() + * can restart the handle + */ + handle = journal_current_handle(); + + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + if (ext3_journal_current_handle()) + goto out_fail; + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + if (test_opt(inode->i_sb, NOBH)) + ret = nobh_writepage(page, ext3_get_block, wbc); + else + ret = block_write_full_page(page, ext3_get_block, wbc); + + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } + + if (!page_has_buffers(page) || PageChecked(page)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); + if (ret != 0) + goto out_unlock; + ret = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, commit_write_fn); + if (ret == 0) + ret = err; + EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + unlock_page(page); + } else { + /* + * It may be a page full of checkpoint-mode buffers. We don't + * really know unless we go poke around in the buffer_heads. + * But block_write_full_page will do the right thing. + */ + ret = block_write_full_page(page, ext3_get_block, wbc); + } + err = ext3_journal_stop(handle); + if (!ret) + ret = err; +out: + return ret; + +no_write: + redirty_page_for_writepage(wbc, page); +out_unlock: + unlock_page(page); + goto out; +} + +static int ext3_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext3_get_block); +} + +static int +ext3_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); +} + +static int ext3_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + return journal_invalidatepage(journal, page, offset); +} + +static int ext3_releasepage(struct page *page, int wait) +{ + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + return journal_try_to_free_buffers(journal, page, wait); +} + +/* + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. + */ +static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); + handle_t *handle = NULL; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + + if (rw == WRITE) { + loff_t final_size = offset + count; + + handle = ext3_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + if (final_size > inode->i_size) { + ret = ext3_orphan_add(handle, inode); + if (ret) + goto out_stop; + orphan = 1; + ei->i_disksize = inode->i_size; + } + } + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext3_direct_io_get_blocks, NULL); + + /* + * Reacquire the handle: ext3_direct_io_get_block() can restart the + * transaction + */ + handle = journal_current_handle(); + +out_stop: + if (handle) { + int err; + + if (orphan && inode->i_nlink) + ext3_orphan_del(handle, inode); + if (orphan && ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext3_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext3_mark_inode_dirty(handle, inode); + } + } + err = ext3_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext3's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext3_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static struct address_space_operations ext3_ordered_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .sync_page = block_sync_page, + .prepare_write = ext3_prepare_write, + .commit_write = ext3_ordered_commit_write, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, +}; + +static struct address_space_operations ext3_writeback_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .writepages = ext3_writeback_writepages, + .sync_page = block_sync_page, + .prepare_write = ext3_prepare_write, + .commit_write = ext3_writeback_commit_write, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, +}; + +static struct address_space_operations ext3_journalled_aops = { + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .sync_page = block_sync_page, + .prepare_write = ext3_prepare_write, + .commit_write = ext3_journalled_commit_write, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, +}; + +void ext3_set_aops(struct inode *inode) +{ + if (ext3_should_order_data(inode)) + inode->i_mapping->a_ops = &ext3_ordered_aops; + else if (ext3_should_writeback_data(inode)) + inode->i_mapping->a_ops = &ext3_writeback_aops; + else + inode->i_mapping->a_ops = &ext3_journalled_aops; +} + +/* + * ext3_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct buffer_head *bh; + int err = 0; + void *kaddr; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + /* + * For "nobh" option, we can only work if we don't need to + * read-in the page - otherwise we create buffers to do the IO. + */ + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { + if (PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + goto unlock; + } + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext3_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext3_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3_should_journal_data(inode)) { + err = ext3_journal_dirty_metadata(handle, bh); + } else { + if (ext3_should_order_data(inode)) + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext3_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is refered + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext3_find_shared(struct inode *inode, + int depth, + int offsets[4], + Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offest + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void +ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, + unsigned long block_to_free, unsigned long count, + __le32 *first, __le32 *last) +{ + __le32 *p; + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + } + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ext3_journal_get_write_access(handle, bh); + } + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! + */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = sb_find_get_block(inode->i_sb, nr); + ext3_forget(handle, 0, inode, bh, nr); + } + } + + ext3_free_blocks(handle, inode, block_to_free, count); +} + +/** + * ext3_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks refered from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext3_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + unsigned long block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + unsigned long nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, this_bh); + } +} + +/** + * ext3_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks refered from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext3_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + unsigned long nr; + __le32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3_error(inode->i_sb, "ext3_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext3_free_branches(handle, inode, bh, + (__le32*)bh->b_data, + (__le32*)bh->b_data + addr_per_block, + depth); + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. + * + * If this block has already been committed to the + * journal, a revoke record will be written. And + * revoke records must be emitted *before* clearing + * this block's bit in the bitmaps. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + } + + ext3_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3_free_data(handle, inode, parent_bh, first, last); + } +} + +/* + * ext3_truncate() + * + * We block out ext3_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ + +void ext3_truncate(struct inode * inode) +{ + handle_t *handle; + struct ext3_inode_info *ei = EXT3_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long last_block; + unsigned blocksize = inode->i_sb->s_blocksize; + struct page *page; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext3_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + /* + * We have to lock the EOF page here, because lock_page() nests + * outside journal_start(). + */ + if ((inode->i_size & (blocksize - 1)) == 0) { + /* Block boundary? Nothing to do */ + page = NULL; + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } + return; /* AKPM: return what? */ + } + + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + if (page) + ext3_block_truncate_page(handle, page, mapping, inode->i_size); + + n = ext3_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + down(&ei->truncate_sem); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + + ext3_discard_reservation(inode); + + up(&ei->truncate_sem); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle); +} + +static unsigned long ext3_get_inode_block(struct super_block *sb, + unsigned long ino, struct ext3_iloc *iloc) +{ + unsigned long desc, group_desc, block_group; + unsigned long offset, block; + struct buffer_head *bh; + struct ext3_group_desc * gdp; + + + if ((ino != EXT3_ROOT_INO && + ino != EXT3_JOURNAL_INO && + ino != EXT3_RESIZE_INO && + ino < EXT3_FIRST_INO(sb)) || + ino > le32_to_cpu( + EXT3_SB(sb)->s_es->s_inodes_count)) { + ext3_error (sb, "ext3_get_inode_block", + "bad inode number: %lu", ino); + return 0; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + if (block_group >= EXT3_SB(sb)->s_groups_count) { + ext3_error (sb, "ext3_get_inode_block", + "group >= groups count"); + return 0; + } + smp_rmb(); + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); + desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); + bh = EXT3_SB(sb)->s_group_desc[group_desc]; + if (!bh) { + ext3_error (sb, "ext3_get_inode_block", + "Descriptor not loaded"); + return 0; + } + + gdp = (struct ext3_group_desc *) bh->b_data; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * + EXT3_INODE_SIZE(sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); + + iloc->block_group = block_group; + iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); + return block; +} + +/* + * ext3_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext3_get_inode_loc(struct inode *inode, + struct ext3_iloc *iloc, int in_mem) +{ + unsigned long block; + struct buffer_head *bh; + + block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); + if (!block) + return -EIO; + + bh = sb_getblk(inode->i_sb, block); + if (!bh) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + struct ext3_group_desc *desc; + int inodes_per_buffer; + int inode_offset, i; + int block_group; + int start; + + block_group = (inode->i_ino - 1) / + EXT3_INODES_PER_GROUP(inode->i_sb); + inodes_per_buffer = bh->b_size / + EXT3_INODE_SIZE(inode->i_sb); + inode_offset = ((inode->i_ino - 1) % + EXT3_INODES_PER_GROUP(inode->i_sb)); + start = inode_offset & ~(inodes_per_buffer - 1); + + /* Is the inode bitmap in cache? */ + desc = ext3_get_group_desc(inode->i_sb, + block_group, NULL); + if (!desc) + goto make_io; + + bitmap_bh = sb_getblk(inode->i_sb, + le32_to_cpu(desc->bg_inode_bitmap)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_buffer; i++) { + if (i == inode_offset) + continue; + if (ext3_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_buffer) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext3_error(inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block=%lu", + inode->i_ino, block); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext3_get_inode_loc(inode, iloc, + !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); +} + +void ext3_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT3_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT3_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +void ext3_read_inode(struct inode * inode) +{ + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh; + int block; + +#ifdef CONFIG_EXT3_FS_POSIX_ACL + ei->i_acl = EXT3_ACL_NOT_CACHED; + ei->i_default_acl = EXT3_ACL_NOT_CACHED; +#endif + ei->i_block_alloc_info = NULL; + + if (__ext3_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; + bh = iloc.bh; + raw_inode = ext3_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; + + ei->i_state = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size + * (for stat), not the fs block + * size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); +#ifdef EXT3_FRAGMENTS + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; +#endif + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && + EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { + /* + * When mke2fs creates big inodes it does not zero out + * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, + * so ignore those first few inodes. + */ + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT3_INODE_SIZE(inode->i_sb)) + goto bad_inode; + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + ei->i_extra_isize = sizeof(struct ext3_inode) - + EXT3_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT3_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) + ei->i_state |= EXT3_STATE_XATTR; + } + } else + ei->i_extra_isize = 0; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + } + } else { + inode->i_op = &ext3_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (iloc.bh); + ext3_set_inode_flags(inode); + return; + +bad_inode: + make_bad_inode(inode); + return; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) +{ + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ei->i_state & EXT3_STATE_NEW) + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); +#ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; +#endif + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(ei->i_disksize >> 32); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + EXT3_SB(sb)->s_sbh); + } + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + ei->i_state &= ~EXT3_STATE_NEW; + +out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext3_write_inode(struct inode *inode, int wait) +{ + if (current->flags & PF_MEMALLOC) + return 0; + + if (ext3_journal_current_handle()) { + jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (!wait) + return 0; + + return ext3_force_commit(inode->i_sb); +} + +/* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Called with inode->sem down. + */ +int ext3_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + if (error) { + ext3_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3_orphan_add(handle, inode); + EXT3_I(inode)->i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext3_journal_stop(handle); + } + + rc = inode_setattr(inode, attr); + + /* If inode_setattr's call to ext3_truncate failed to get a + * transaction handle at all, we need to clean up the in-core + * orphan list manually. */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = ext3_acl_chmod(inode); + +err_out: + ext3_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + + +/* + * akpm: how many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files + * + * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + +static int ext3_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + 2; + +#ifdef CONFIG_QUOTA + /* We know that structure was already allocated during DQUOT_INIT so + * we will be updating only the data blocks + inodes */ + ret += 2*EXT3_QUOTA_TRANS_BLOCKS; +#endif + + return ret; +} + +/* + * The caller must have previously called ext3_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext3_iloc *iloc) +{ + int err = 0; + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext3_do_update_inode() does journal_dirty_metadata */ + err = ext3_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * akpm: What we do here is to mark the in-core inode as clean + * with respect to inode dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + int err; + + might_sleep(); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext3_dirty_inode(struct inode *inode) +{ + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG "%s: transactions do not match!\n", + __FUNCTION__); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle); +out: + return; +} + +#ifdef AKPM +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static inline int +ext3_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext3_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal) || IS_RDONLY(inode)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; + else + EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; + ext3_set_aops(inode); + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle); + ext3_std_error(inode->i_sb, err); + + return err; +} diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c new file mode 100644 index 000000000000..706d68608381 --- /dev/null +++ b/fs/ext3/ioctl.c @@ -0,0 +1,243 @@ +/* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include + + +int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + flags = ei->i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT3_DIRSYNC_FL; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + } + + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext3_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext3_journal_stop(handle); + if (err) + return err; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT3_IOC_SETVERSION: + case EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (get_user(generation, (int __user *) arg)) + return -EFAULT; + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + } + ext3_journal_stop(handle); + return err; + } +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. + */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT3_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT3_IOC_SETRSVSZ: { + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + + if (get_user(rsv_window_size, (int __user *)arg)) + return -EFAULT; + + if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + down(&ei->truncate_sem); + if (!ei->i_block_alloc_info) + ext3_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + up(&ei->truncate_sem); + return 0; + } + case EXT3_IOC_GROUP_EXTEND: { + unsigned long n_blocks_count; + struct super_block *sb = inode->i_sb; + int err; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) + return -EFAULT; + + err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + + return err; + } + case EXT3_IOC_GROUP_ADD: { + struct ext3_new_group_data input; + struct super_block *sb = inode->i_sb; + int err; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + err = ext3_group_add(sb, &input); + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + + return err; + } + + + default: + return -ENOTTY; + } +} diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c new file mode 100644 index 000000000000..79742d824a0a --- /dev/null +++ b/fs/ext3/namei.c @@ -0,0 +1,2378 @@ +/* + * linux/fs/ext3/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static struct buffer_head *ext3_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3_bread(handle, inode, *block, 1, err))) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_journal_get_write_access(handle,bh); + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifndef swap +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u32 offs; +}; + +#ifdef CONFIG_EXT3_INDEX +static inline unsigned dx_get_block (struct dx_entry *entry); +static void dx_set_block (struct dx_entry *entry, unsigned value); +static inline unsigned dx_get_hash (struct dx_entry *entry); +static void dx_set_hash (struct dx_entry *entry, unsigned value); +static unsigned dx_get_count (struct dx_entry *entries); +static unsigned dx_get_limit (struct dx_entry *entries); +static void dx_set_count (struct dx_entry *entries, unsigned value); +static void dx_set_limit (struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +static unsigned dx_node_limit (struct inode *dir); +static struct dx_frame *dx_probe(struct dentry *dentry, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release (struct dx_frame *frames); +static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); +static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); +static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err); +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline unsigned dx_get_block (struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block (struct dx_entry *entry, unsigned value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash (struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - + EXT3_DIR_REC_LEN(2) - infosize; + return 0? 20: entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit (struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); + return 0? 22: entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index (char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk("%s index ", label); + for (i = 0; i < n; i++) + { + printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext3fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT3_DIR_REC_LEN(de->name_len); + names++; + } + de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse (bh); + } + if (bcount) + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(struct dentry *dentry, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (dentry) + dir = dentry->d_parent->d_inode; + if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; + if (dentry) + ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext3_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + assert(dx_get_limit(entries) == dx_root_limit(dir, + root->info.info_length)); + dxtrace (printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + assert (count && count <= dx_get_limit(entries)); + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext3_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) +{ + return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); +} + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +static int htree_dirblock_to_tree(struct file *dir_file, + struct inode *dir, int block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash) +{ + struct buffer_head *bh; + struct ext3_dir_entry_2 *de, *top; + int err, count = 0; + + dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); + if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) + return err; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + top = (struct ext3_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - + EXT3_DIR_REC_LEN(0)); + for (; de < top; de = ext3_next_entry(de)) { + ext3fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if ((err = ext3_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de)) != 0) { + brelse(bh); + return err; + } + count++; + } + brelse(bh); + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. + */ +int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext3_dir_entry_2 *de; + struct dx_frame frames[2], *frame; + struct inode *dir; + int block, err; + int count = 0; + int ret; + __u32 hashval; + + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_dentry->d_inode; + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err); + if (!frame) + return err; + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; + de = ext3_next_entry(de); + if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +static int dx_make_map (struct ext3_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + size) + { + if (de->name_len && de->inode) { + ext3fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u32) ((char *) de - base); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); + } + return count; +} + +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) + { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) + { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} +#endif + + +static void ext3_update_dx_flag(struct inode *inode) +{ + if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX)) + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; +} + +/* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * + * `len <= EXT3_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ +static inline int ext3_match (int len, const char * const name, + struct ext3_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct dentry *dentry, + unsigned long offset, + struct ext3_dir_entry_2 ** res_dir) +{ + struct ext3_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext3_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) +{ + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; + int namelen; + const u8 *name; + unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; + blocksize = sb->s_blocksize; + namelen = dentry->d_name.len; + name = dentry->d_name.name; + if (namelen > EXT3_NAME_LEN) + return NULL; +#ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + bh = ext3_dx_find_entry(dentry, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); + } +#endif + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + ext3_error(sb, __FUNCTION__, "reading directory #%lu " + "offset %lu", dir->i_ino, block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; +} + +#ifdef CONFIG_EXT3_INDEX +static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, + struct ext3_dir_entry_2 **res_dir, int *err) +{ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; + struct dx_frame frames[2], *frame; + struct ext3_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; + int retval; + int namelen = dentry->d_name.len; + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; + if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + return NULL; + hash = hinfo.hash; + do { + block = dx_get_block(frame->at); + if (!(bh = ext3_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3_dir_entry_2 *) bh->b_data; + top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - + EXT3_DIR_REC_LEN(0)); + for (; de < top; de = ext3_next_entry(de)) + if (ext3_match (namelen, name, de)) { + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, + (block<b_data))) { + brelse (bh); + goto errout; + } + *res_dir = de; + dx_release (frames); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ + retval = ext3_htree_next_block(dir, hash, frame, + frames, NULL); + if (retval < 0) { + ext3_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk("%s not found\n", name)); + dx_release (frames); + return NULL; +} +#endif + +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + inode = iget(dir->i_sb, ino); + + if (!inode) + return ERR_PTR(-EACCES); + } + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + return NULL; +} + + +struct dentry *ext3_get_parent(struct dentry *child) +{ + unsigned long ino; + struct dentry *parent; + struct inode *inode; + struct dentry dotdot; + struct ext3_dir_entry_2 * de; + struct buffer_head *bh; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + dotdot.d_parent = child; /* confusing, isn't it! */ + + bh = ext3_find_entry(&dotdot, &de); + inode = NULL; + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + inode = iget(child->d_inode->i_sb, ino); + + if (!inode) + return ERR_PTR(-EACCES); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + return parent; +} + +#define S_SHIFT 12 +static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK, +}; + +static inline void ext3_set_de_type(struct super_block *sb, + struct ext3_dir_entry_2 *de, + umode_t mode) { + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +#ifdef CONFIG_EXT3_INDEX +static struct ext3_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); + rec_len = EXT3_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3_dir_entry_2 *) to)->rec_len = + cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext3_dir_entry_2 *) (to - rec_len); +} + +static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) +{ + struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + size) { + next = (struct ext3_dir_entry_2 *) ((char *) de + + le16_to_cpu(de->rec_len)); + if (de->inode && de->name_len) { + rec_len = EXT3_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = cpu_to_le16(rec_len); + prev = to; + to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + u32 newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split; + struct ext3_dir_entry_2 *de = NULL, *de2; + int err; + + bh2 = ext3_append (handle, dir, &newblock, error); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, *bh); + if (err) { + journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext3_std_error(dir->i_sb, err); + goto errout; + } + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map ((struct ext3_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + split = count/2; // need to adjust to actual middle + dx_sort_map (map, count); + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block (frame, hash2 + continued, newblock); + err = ext3_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; + err = ext3_journal_dirty_metadata (handle, frame->bh); + if (err) + goto journal_error; + brelse (bh2); + dxtrace(dx_show_index ("frame", frame->entries)); +errout: + return de; +} +#endif + + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext3_dir_entry_2 *de, + struct buffer_head * bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT3_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext3_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext3_check_dir_entry("ext3_add_entry", dir, de, + bh, offset)) { + brelse (bh); + return -EIO; + } + if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; + } + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext3_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) { + ext3_std_error(dir->i_sb, err); + brelse(bh); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { + struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + dir->i_version++; + ext3_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + ext3_std_error(dir->i_sb, err); + brelse(bh); + return 0; +} + +#ifdef CONFIG_EXT3_INDEX +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext3_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3_journal_get_write_access(handle, bh); + if (retval) { + ext3_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + bh2 = ext3_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top) + de = de2; + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext3_dir_entry_2 *) (&root->dotdot); + de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + dx_release (frames); + if (!(de)) + return retval; + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} +#endif + +/* + * ext3_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; +#ifdef CONFIG_EXT3_INDEX + int dx_fallback=0; +#endif + unsigned blocksize; + unsigned nlen, rlen; + u32 block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; +#ifdef CONFIG_EXT3_INDEX + if (is_dx(dir)) { + retval = ext3_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; + dx_fallback++; + ext3_mark_inode_dirty(handle, dir); + } +#endif + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + return retval; + +#ifdef CONFIG_EXT3_INDEX + if (blocks == 1 && !dx_fallback && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); +#endif + brelse(bh); + } + bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(rlen = blocksize); + nlen = 0; + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +#ifdef CONFIG_EXT3_INDEX +/* + * Returns 0 for success, or a negative error value + */ +static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3_dir_entry_2 *de; + int err; + + frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) { + bh = NULL; + goto cleanup; + } + + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + u32 newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3_warning(sb, __FUNCTION__, + "Directory index full!\n"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext3_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); + node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext3_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block (frames + 0, hash2, newblock); + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext3_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + ext3_journal_dirty_metadata(handle, frames[0].bh); + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + bh = NULL; + goto cleanup; + +journal_error: + ext3_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} +#endif + +/* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) +{ + struct ext3_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + return 0; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + return -ENOENT; +} + +/* + * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we + * do not perform it in these functions. We perform it at the call site, + * if it is needed. + */ +static inline void ext3_inc_count(handle_t *handle, struct inode *inode) +{ + inode->i_nlink++; +} + +static inline void ext3_dec_count(handle_t *handle, struct inode *inode) +{ + inode->i_nlink--; +} + +static int ext3_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + return 0; + } + ext3_dec_count(handle, inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode * inode; + int err, retries = 0; + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mknod (struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT3_FS_XATTR + inode->i_op = &ext3_special_inode_operations; +#endif + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block; + struct ext3_dir_entry_2 * de; + int err, retries = 0; + + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + BUFFER_TRACE(dir_block, "get_write_access"); + ext3_journal_get_write_access(handle, dir_block); + de = (struct ext3_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len)); + strcpy (de->name, "."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { + inode->i_nlink = 0; + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + dir->i_nlink++; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +out_stop: + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir (struct inode * inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || + !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + if (err) + ext3_error(inode->i_sb, __FUNCTION__, + "error %d reading directory #%lu offset 0", + err, inode->i_ino); + else + ext3_warning(inode->i_sb, __FUNCTION__, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de1 = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de1 + le16_to_cpu(de1->rec_len)); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + err = 0; + brelse (bh); + bh = ext3_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + if (err) + ext3_error(sb, __FUNCTION__, + "error %d reading directory" + " #%lu offset %lu", + err, inode->i_ino, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) { + de = (struct ext3_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse (bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + brelse (bh); + return 1; +} + +/* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ +int ext3_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; + + lock_super(sb); + if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on lock_super()), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + unlock_super(sb); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext3_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + unsigned long ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); + if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %lu\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); + err = ext3_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext3_std_error(inode->i_sb, err); +out: + unlock_super(inode->i_sb); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext3_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + DQUOT_INIT(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3_warning (inode->i_sb, "ext3_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + +end_rmdir: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_unlink(struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + DQUOT_INIT(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext3_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) +{ + handle_t *handle; + struct inode * inode; + int l, err, retries = 0; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + + 2*EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > sizeof (EXT3_I(inode)->i_data)) { + inode->i_op = &ext3_symlink_inode_operations; + ext3_set_aops(inode); + /* + * page_symlink() calls into ext3_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). + */ + err = page_symlink(inode, symname, l); + if (err) { + ext3_dec_count(handle, inode); + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } + EXT3_I(inode)->i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); +out_stop: + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +retry: + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME_SEC; + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + + err = ext3_add_nondir(handle, dentry, inode); + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer) \ + ((struct ext3_dir_entry_2 *) ((char *) buffer + \ + le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir,struct dentry *new_dentry) +{ + handle_t *handle; + struct inode * old_inode, * new_inode; + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + DQUOT_INIT(new_dentry->d_inode); + handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext3_find_entry (old_dentry, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext3_find_entry (new_dentry, &new_de); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir (new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= EXT3_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext3_add_entry (handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + ext3_journal_get_write_access(handle, new_bh); + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT3_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, new_bh); + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext3_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3_dir_entry_2 *old_de2; + + old_bh2 = ext3_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext3_warning(old_dir->i_sb, "ext3_rename", + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME_SEC; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_bh); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; + ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } + ext3_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext3_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext3_orphan_add(handle, new_inode); + } + retval = 0; + +end_rename: + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); + ext3_journal_stop(handle); + return retval; +} + +/* + * directories can handle most operations... + */ +struct inode_operations ext3_dir_inode_operations = { + .create = ext3_create, + .lookup = ext3_lookup, + .link = ext3_link, + .unlink = ext3_unlink, + .symlink = ext3_symlink, + .mkdir = ext3_mkdir, + .rmdir = ext3_rmdir, + .mknod = ext3_mknod, + .rename = ext3_rename, + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext3_permission, +}; + +struct inode_operations ext3_special_inode_operations = { + .setattr = ext3_setattr, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext3_permission, +}; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c new file mode 100644 index 000000000000..2c9f81278d5d --- /dev/null +++ b/fs/ext3/resize.c @@ -0,0 +1,996 @@ +/* + * linux/fs/ext3/resize.c + * + * Support for resizing an ext3 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + +#include + +#define EXT3FS_DEBUG + +#include +#include +#include + +#include +#include + + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + unsigned start = le32_to_cpu(es->s_blocks_count); + unsigned end = start + input->blocks_count; + unsigned group = input->group; + unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group; + unsigned overhead = ext3_bg_has_super(sb, group) ? + (1 + ext3_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + unsigned metaend = start + overhead; + struct buffer_head *bh = NULL; + int free_blocks_count; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext3_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + if (group != sbi->s_groups_count) + ext3_warning(sb, __FUNCTION__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if ((start - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)) + ext3_warning(sb, __FUNCTION__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext3_warning(sb, __FUNCTION__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext3_warning(sb, __FUNCTION__, + "Block bitmap not in group (block %u)", + input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext3_warning(sb, __FUNCTION__, + "Inode bitmap not in group (block %u)", + input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext3_warning(sb, __FUNCTION__, + "Inode table not in group (blocks %u-%u)", + input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext3_warning(sb, __FUNCTION__, + "Block bitmap same as inode bitmap (%u)", + input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext3_warning(sb, __FUNCTION__, + "Block bitmap (%u) in inode table (%u-%u)", + input->block_bitmap, input->inode_table, itend-1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext3_warning(sb, __FUNCTION__, + "Inode bitmap (%u) in inode table (%u-%u)", + input->inode_bitmap, input->inode_table, itend-1); + else if (inside(input->block_bitmap, start, metaend)) + ext3_warning(sb, __FUNCTION__, + "Block bitmap (%u) in GDT table (%u-%u)", + input->block_bitmap, start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext3_warning(sb, __FUNCTION__, + "Inode bitmap (%u) in GDT table (%u-%u)", + input->inode_bitmap, start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext3_warning(sb, __FUNCTION__, + "Inode table (%u-%u) overlaps GDT table (%u-%u)", + input->inode_table, itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + unsigned long blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext3_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long start = input->group * sbi->s_blocks_per_group + + le32_to_cpu(sbi->s_es->s_first_data_block); + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + unsigned long block; + int bit; + int i; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, reserved_gdb + gdblocks + + 2 + sbi->s_itb_per_group); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + lock_super(sb); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext3_bg_has_super(sb, input->group)) { + ext3_debug("mark backup superblock %#04lx (+0)\n", start); + ext3_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("update backup group %#04lx (+%d)\n", block, bit); + + gdb = sb_getblk(sb, block); + if ((err = ext3_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(bh); + memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(bh); + ext3_journal_dirty_metadata(handle, gdb); + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(bh); + goto exit_bh; + } + ext3_journal_dirty_metadata(handle, gdb); + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, + input->block_bitmap - start); + ext3_set_bit(input->block_bitmap - start, bh->b_data); + ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext3_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext3_debug("clear inode block %#04x (+%ld)\n", block, bit); + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + ext3_journal_dirty_metadata(handle, it); + brelse(it); + ext3_set_bit(bit, bh->b_data); + } + mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + + /* Mark unused entries in inode bitmap used */ + ext3_debug("clear inode bitmap %#04x (+%ld)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + ext3_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + unlock_super(sb); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const unsigned long blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __u32 *p = (__u32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __FUNCTION__, + "reserved GDT %ld missing grp %d (%ld)\n", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; + } + if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext3_iloc iloc; + __u32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT3_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { + ext3_warning(sb, __FUNCTION__, + "won't resize using backup superblock at %llu\n", + (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__u32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext3_warning(sb, __FUNCTION__, + "new group %u GDT block %lu not reserved\n", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext3_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext3_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext3_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), GFP_KERNEL); + if (!n_group_desc) { + err = -ENOMEM; + ext3_warning (sb, __FUNCTION__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; + ext3_journal_dirty_metadata(handle, dind); + brelse(dind); + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + memset((*primary)->b_data, 0, sb->s_blocksize); + ext3_journal_dirty_metadata(handle, *primary); + + o_group_desc = EXT3_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT3_SB(sb)->s_group_desc = n_group_desc; + EXT3_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + es->s_reserved_gdt_blocks = + cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + + return 0; + +exit_inode: + //ext3_journal_release_buffer(handle, iloc.bh); + brelse(iloc.bh); +exit_dindj: + //ext3_journal_release_buffer(handle, dind); +exit_primary: + //ext3_journal_release_buffer(handle, *primary); +exit_sbh: + //ext3_journal_release_buffer(handle, *primary); +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext3_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext3_iloc iloc; + unsigned long blk; + __u32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); + if (!primary) + return -ENOMEM; + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; + data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; + end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext3_warning(sb, __FUNCTION__, + "reserved block %lu not at offset %ld\n", + blk, (long)(data - (__u32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__u32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext3_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext3_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__u32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext3_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext3 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need lock_super() for this, because these blocks are not + * otherwise touched by the filesystem code when it is mounted. We don't + * need to worry about last changing from sbi->s_groups_count, because the + * worst that can happen is that we do not copy the full number of backups + * at this time. The resize which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && + (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + ext3_debug(sb, __FUNCTION__, "update metadata backup %#04lx\n", + bh->b_blocknr); + if ((err = ext3_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + } + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext3_warning(sb, __FUNCTION__, + "can't update backup for group %d (err %d), " + "forcing fsck on next reboot\n", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext3_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext3_warning(sb, __FUNCTION__, + "Can't resize non-sparse filesystem further\n"); + return -EPERM; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT3_HAS_COMPAT_FEATURE(sb, + EXT3_FEATURE_COMPAT_RESIZE_INODE)){ + ext3_warning(sb, __FUNCTION__, + "No reserved GDT blocks, can't resize\n"); + return -EPERM; + } + inode = iget(sb, EXT3_RESIZE_INO); + if (!inode || is_bad_inode(inode)) { + ext3_warning(sb, __FUNCTION__, + "Error opening resize inode\n"); + iput(inode); + return -ENOENT; + } + } + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext3_journal_start_sb(sb, + ext3_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + lock_super(sb); + if (input->group != EXT3_SB(sb)->s_groups_count) { + ext3_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!\n"); + goto exit_journal; + } + + if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext3_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * Current kernels don't lock all allocations via lock_super(), + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is EXT3_SB(sb)->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + /* Update group descriptor block for new group */ + gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; + + gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); + gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); + gdp->bg_inode_table = cpu_to_le32(input->inode_table); + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) + + input->blocks_count); + es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + + EXT3_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold lock_super + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold lock_super() over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + EXT3_SB(sb)->s_groups_count++; + + ext3_journal_dirty_metadata(handle, primary); + + /* Update the reserved block counts only once the new group is + * active. */ + es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) + + input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_mod(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_mod(&sbi->s_freeinodes_counter, + EXT3_INODES_PER_GROUP(sb)); + + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + sb->s_dirt = 1; + +exit_journal: + unlock_super(sb); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext3_group_add */ + +/* Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext3_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, + unsigned long n_blocks_count) +{ + unsigned long o_blocks_count; + unsigned long o_groups_count; + unsigned long last; + int add; + struct buffer_head * bh; + handle_t *handle; + int err, freed_blocks; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking lock_super() below. */ + o_blocks_count = le32_to_cpu(es->s_blocks_count); + o_groups_count = EXT3_SB(sb)->s_groups_count; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + + if (last == 0) { + ext3_warning(sb, __FUNCTION__, + "need to use ext2online to resize further\n"); + return -EPERM; + } + + add = EXT3_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext3_warning(sb, __FUNCTION__, + "will only finish group (%lu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add -1); + if (!bh) { + ext3_warning(sb, __FUNCTION__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext3_free_blocks(). + */ + handle = ext3_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext3_warning(sb, __FUNCTION__, "error %d on journal start",err); + goto exit_put; + } + + lock_super(sb); + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!\n"); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh))) { + ext3_warning(sb, __FUNCTION__, + "error %d on journal write access", err); + unlock_super(sb); + ext3_journal_stop(handle); + goto exit_put; + } + es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + sb->s_dirt = 1; + unlock_super(sb); + ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count, + o_blocks_count + add); + ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + ext3_debug("freed blocks %ld through %ld\n", o_blocks_count, + o_blocks_count + add); + if ((err = ext3_journal_stop(handle))) + goto exit_put; + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", + le32_to_cpu(es->s_blocks_count)); + update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); +exit_put: + return err; +} /* ext3_group_extend */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c new file mode 100644 index 000000000000..545b440a2d2f --- /dev/null +++ b/fs/ext3/super.c @@ -0,0 +1,2539 @@ +/* + * linux/fs/ext3/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +static int ext3_load_journal(struct super_block *, struct ext3_super_block *); +static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); +static void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync); +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); +static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); +static int ext3_sync_fs(struct super_block *sb, int wait); +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]); +static int ext3_remount (struct super_block * sb, int * flags, char * data); +static int ext3_statfs (struct super_block * sb, struct kstatfs * buf); +static void ext3_unlockfs(struct super_block *sb); +static void ext3_write_super (struct super_block * sb); +static void ext3_write_super_lockfs(struct super_block *sb); + +/* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ + journal = EXT3_SB(sb)->s_journal; + if (is_journal_aborted(journal)) { + ext3_abort(sb, __FUNCTION__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + + return journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext3_journal_stop(const char *where, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext3_std_error(sb, where, err); + return err; +} + +void ext3_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext3_decode_error(NULL, err, nbuf); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "%s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); + + journal_abort_handle(handle); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext3, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will compain about + * that error until we've noted it down and cleared it. + */ + +static void ext3_handle_error(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (test_opt (sb, ERRORS_RO)) { + printk (KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } else { + journal_t *journal = EXT3_SB(sb)->s_journal; + + EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + if (journal) + journal_abort(journal, -EIO); + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs (device %s): panic forced after error\n", + sb->s_id); + ext3_commit_super(sb, es, 1); +} + +void ext3_error (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext3_handle_error(sb); +} + +static const char *ext3_decode_error(struct super_block * sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext3_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext3_std_error (struct super_block * sb, const char * function, + int errno) +{ + char nbuf[16]; + const char *errstr = ext3_decode_error(sb, errno, nbuf); + + printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", + sb->s_id, function, errstr); + + ext3_handle_error(sb); +} + +/* + * ext3_abort is a much stronger failure handler than ext3_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void ext3_abort (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + va_list args; + + printk (KERN_CRIT "ext3_abort called.\n"); + + va_start(args, fmt); + printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3-fs panic from previous error\n"); + + if (sb->s_flags & MS_RDONLY) + return; + + printk(KERN_CRIT "Remounting filesystem read-only\n"); + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + sb->s_flags |= MS_RDONLY; + EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + journal_abort(EXT3_SB(sb)->s_journal, -EIO); +} + +void ext3_warning (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +void ext3_update_dynamic_rev(struct super_block *sb) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) + return; + + ext3_warning(sb, __FUNCTION__, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT3_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext3_blkdev_get(dev_t dev) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static int ext3_blkdev_put(struct block_device *bdev) +{ + bd_release(bdev); + return blkdev_put(bdev); +} + +static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext3_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) +{ + struct list_head *l; + + printk(KERN_ERR "sb orphan head is %d\n", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%ld at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext3_put_super (struct super_block * sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int i; + + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev, 0); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev, 0); + ext3_blkdev_remove(sbi); + } + sb->s_fs_info = NULL; + kfree(sbi); + return; +} + +static kmem_cache_t *ext3_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext3_alloc_inode(struct super_block *sb) +{ + struct ext3_inode_info *ei; + + ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS); + if (!ei) + return NULL; +#ifdef CONFIG_EXT3_FS_POSIX_ACL + ei->i_acl = EXT3_ACL_NOT_CACHED; + ei->i_default_acl = EXT3_ACL_NOT_CACHED; +#endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ext3_destroy_inode(struct inode *inode) +{ + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT3_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + init_MUTEX(&ei->truncate_sem); + inode_init_once(&ei->vfs_inode); + } +} + +static int init_inodecache(void) +{ + ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", + sizeof(struct ext3_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (ext3_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + if (kmem_cache_destroy(ext3_inode_cachep)) + printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n"); +} + +static void ext3_clear_inode(struct inode *inode) +{ + struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; +#ifdef CONFIG_EXT3_FS_POSIX_ACL + if (EXT3_I(inode)->i_acl && + EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) { + posix_acl_release(EXT3_I(inode)->i_acl); + EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED; + } + if (EXT3_I(inode)->i_default_acl && + EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) { + posix_acl_release(EXT3_I(inode)->i_default_acl); + EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED; + } +#endif + ext3_discard_reservation(inode); + EXT3_I(inode)->i_block_alloc_info = NULL; + kfree(rsv); +} + +#ifdef CONFIG_QUOTA + +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") +#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext3_dquot_initialize(struct inode *inode, int type); +static int ext3_dquot_drop(struct inode *inode); +static int ext3_write_dquot(struct dquot *dquot); +static int ext3_acquire_dquot(struct dquot *dquot); +static int ext3_release_dquot(struct dquot *dquot); +static int ext3_mark_dquot_dirty(struct dquot *dquot); +static int ext3_write_info(struct super_block *sb, int type); +static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext3_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static struct dquot_operations ext3_quota_operations = { + .initialize = ext3_dquot_initialize, + .drop = ext3_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = ext3_write_dquot, + .acquire_dquot = ext3_acquire_dquot, + .release_dquot = ext3_release_dquot, + .mark_dirty = ext3_mark_dquot_dirty, + .write_info = ext3_write_info +}; + +static struct quotactl_ops ext3_qctl_operations = { + .quota_on = ext3_quota_on, + .quota_off = vfs_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk +}; +#endif + +static struct super_operations ext3_sops = { + .alloc_inode = ext3_alloc_inode, + .destroy_inode = ext3_destroy_inode, + .read_inode = ext3_read_inode, + .write_inode = ext3_write_inode, + .dirty_inode = ext3_dirty_inode, + .delete_inode = ext3_delete_inode, + .put_super = ext3_put_super, + .write_super = ext3_write_super, + .sync_fs = ext3_sync_fs, + .write_super_lockfs = ext3_write_super_lockfs, + .unlockfs = ext3_unlockfs, + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .clear_inode = ext3_clear_inode, +#ifdef CONFIG_QUOTA + .quota_read = ext3_quota_read, + .quota_write = ext3_quota_write, +#endif +}; + +struct dentry *ext3_get_parent(struct dentry *child); +static struct export_operations ext3_export_ops = { + .get_parent = ext3_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, + Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, +}; + +static match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "nocheck"}, + {Opt_nocheck, "check=none"}, + {Opt_check, "check"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_noload, "noload"}, + {Opt_nobh, "nobh"}, + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_ignore, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +}; + +static unsigned long get_sb_block(void **data) +{ + unsigned long sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk("EXT3-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +static int parse_options (char * options, struct super_block *sb, + unsigned long * inum, unsigned long *n_blocks_count, int is_remount) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qtype; + char *qname; +#endif + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_check: +#ifdef CONFIG_EXT3_CHECK + set_opt (sbi->s_mount_opt, CHECK); +#else + printk(KERN_ERR + "EXT3 Check option not supported\n"); +#endif + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; +#ifdef CONFIG_EXT3_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + printk("EXT3 (no)user_xattr options not supported\n"); + break; +#endif +#ifdef CONFIG_EXT3_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + printk("EXT3 (no)acl options not supported\n"); + break; +#endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. */ + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + break; + case Opt_journal_inum: + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *inum = option; + break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_data_journal: + data_opt = EXT3_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT3_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT3_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) + != data_opt) { + printk(KERN_ERR + "EXT3-fs: cannot change data " + "mode on remount\n"); + return 0; + } + } else { + sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; + sbi->s_mount_opt |= data_opt; + } + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + qtype = USRQUOTA; + goto set_qf_name; + case Opt_grpjquota: + qtype = GRPQUOTA; +set_qf_name: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR + "EXT3-fs: Cannot change journalled " + "quota options when quota turned on.\n"); + return 0; + } + qname = match_strdup(&args[0]); + if (!qname) { + printk(KERN_ERR + "EXT3-fs: not enough memory for " + "storing quotafile name.\n"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + printk(KERN_ERR + "EXT3-fs: %s quota file already " + "specified.\n", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + printk(KERN_ERR + "EXT3-fs: quotafile must be on " + "filesystem root.\n"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + break; + case Opt_offusrjquota: + qtype = USRQUOTA; + goto clear_qf_name; + case Opt_offgrpjquota: + qtype = GRPQUOTA; +clear_qf_name: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR "EXT3-fs: Cannot change " + "journalled quota options when " + "quota turned on.\n"); + return 0; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; +#else + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + printk(KERN_ERR + "EXT3-fs: journalled quota options not " + "supported.\n"); + break; +#endif + case Opt_abort: + set_opt(sbi->s_mount_opt, ABORT); + break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!n_blocks_count) { + printk("EXT3-fs: resize option only available " + "for remount\n"); + return 0; + } + match_int(&args[0], &option); + *n_blocks_count = option; + break; + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; + default: + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (!sbi->s_jquota_fmt && (sbi->s_qf_names[USRQUOTA] || + sbi->s_qf_names[GRPQUOTA])) { + printk(KERN_ERR + "EXT3-fs: journalled quota format not specified.\n"); + return 0; + } +#endif + + return 1; +} + +static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, + int read_only) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { + printk (KERN_ERR "EXT3-fs warning: revision level too high, " + "forcing read-only mode\n"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3_VALID_FS)) + printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); + else if ((sbi->s_mount_state & EXT3_ERROR_FS)) + printk (KERN_WARNING + "EXT3-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + printk (KERN_WARNING + "EXT3-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + printk (KERN_WARNING + "EXT3-fs warning: checktime reached, " + "running e2fsck is recommended\n"); +#if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! :) */ + es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); +#endif + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); + es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + + ext3_commit_super(sb, es, 1); + if (test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + + printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { + char b[BDEVNAME_SIZE]; + + printk("external journal on %s\n", + bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); + } else { + printk("internal journal\n"); + } +#ifdef CONFIG_EXT3_CHECK + if (test_opt (sb, CHECK)) { + ext3_check_blocks_bitmap (sb); + ext3_check_inodes_bitmap (sb); + } +#endif + return res; +} + +/* Called at mount-time, super-block is locked */ +static int ext3_check_descriptors (struct super_block * sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); + struct ext3_group_desc * gdp = NULL; + int desc_block = 0; + int i; + + ext3_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) + { + if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) + gdp = (struct ext3_group_desc *) + sbi->s_group_desc[desc_block++]->b_data; + if (le32_to_cpu(gdp->bg_block_bitmap) < block || + le32_to_cpu(gdp->bg_block_bitmap) >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < block || + le32_to_cpu(gdp->bg_inode_bitmap) >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + block += EXT3_BLOCKS_PER_GROUP(sb); + gdp++; + } + + sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); + return 1; +} + + +/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", + sb->s_id); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT3_SB(sb)->s_qf_names[i]) { + int ret = ext3_quota_on_mount(sb, i); + if (ret < 0) + printk(KERN_ERR + "EXT3-fs: Cannot turn on journalled " + "quota: error %d\n", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + if (!(inode = + ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + DQUOT_INIT(inode); + if (inode->i_nlink) { + printk(KERN_DEBUG + "%s: truncating inode %ld to %Ld bytes\n", + __FUNCTION__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %ld to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG + "%s: deleting unreferenced inode %ld\n", + __FUNCTION__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %ld\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", + sb->s_id, PLURAL(nr_orphans)); + if (nr_truncates) + printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", + sb->s_id, PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + vfs_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +#define log2(n) ffz(~(n)) + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. + */ +static loff_t ext3_max_size(int bits) +{ + loff_t res = EXT3_NDIR_BLOCKS; + /* This constant is calculated to be the largest file size for a + * dense, 4k-blocksize file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32. */ + const loff_t upper_limit = 0x1ff7fffd000LL; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + return res; +} + +static unsigned long descriptor_loc(struct super_block *sb, + unsigned long logic_sb_block, + int nr) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long bg, first_data_block, first_meta_bg; + int has_super = 0; + + first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block); + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext3_bg_has_super(sb, bg)) + has_super = 1; + return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); +} + + +static int ext3_fill_super (struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext3_super_block *es = NULL; + struct ext3_sb_info *sbi; + unsigned long block; + unsigned long sb_block = get_sb_block(&data); + unsigned long logic_sb_block; + unsigned long offset = 0; + unsigned long journal_inum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + __le32 features; + + sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + memset(sbi, 0, sizeof(*sbi)); + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; + + unlock_kernel(); + + blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); + goto out_fail; + } + + /* + * The ext3 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT3_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3 macro-instructions depend on its value + */ + es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3_SUPER_MAGIC) + goto cantfind_ext3; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT3_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT3_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT3_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); + if (def_mount_opts & EXT3_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); + if (def_mount_opts & EXT3_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); + if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) + sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) + sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; + else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) + sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO) + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && + (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) + printk(KERN_WARNING + "EXT3-fs warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended\n"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); + if (features) { + printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " + "unsupported optional features (%x).\n", + sb->s_id, le32_to_cpu(features)); + goto failed_mount; + } + features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " + "unsupported optional features (%x).\n", + sb->s_id, le32_to_cpu(features)); + goto failed_mount; + } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT3_MIN_BLOCK_SIZE || + blocksize > EXT3_MAX_BLOCK_SIZE) { + printk(KERN_ERR + "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", + blocksize, sb->s_id); + goto failed_mount; + } + + hblock = bdev_hardsect_size(sb->s_bdev); + if (sb->s_blocksize != blocksize) { + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. + */ + if (blocksize < hblock) { + printk(KERN_ERR "EXT3-fs: blocksize %d too small for " + "device blocksize %d.\n", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + sb_set_blocksize(sb, blocksize); + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if (!bh) { + printk(KERN_ERR + "EXT3-fs: Can't read superblock on 2nd try.\n"); + goto failed_mount; + } + es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { + printk (KERN_ERR + "EXT3-fs: Magic mismatch, very weird !\n"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || + (sbi->s_inode_size & (sbi->s_inode_size - 1)) || + (sbi->s_inode_size > blocksize)) { + printk (KERN_ERR + "EXT3-fs: unsupported inode size: %d\n", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + printk(KERN_ERR + "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT3_INODE_SIZE(sb) == 0) + goto cantfind_ext3; + sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext3; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #fragments per group too big: %lu\n", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); + goto failed_mount; + } + + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3; + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / + EXT3_BLOCKS_PER_GROUP(sb); + db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / + EXT3_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + printk (KERN_ERR "EXT3-fs: not enough memory\n"); + goto failed_mount; + } + + percpu_counter_init(&sbi->s_freeblocks_counter); + percpu_counter_init(&sbi->s_freeinodes_counter); + percpu_counter_init(&sbi->s_dirs_counter); + bgl_lock_init(&sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + printk (KERN_ERR "EXT3-fs: " + "can't read group descriptor %d\n", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3_check_descriptors (sb)) { + printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. */ + sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext3_rsv_window_add(sb, &sbi->s_rsv_window_head); + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3_sops; + sb->s_export_op = &ext3_export_ops; + sb->s_xattr = ext3_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext3_qctl_operations; + sb->dq_op = &ext3_quota_operations; +#endif + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3_load_journal(sb, es)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) + goto failed_mount2; + } else { + if (!silent) + printk (KERN_ERR + "ext3: No journal on filesystem on %s\n", + sb->s_id); + goto failed_mount2; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3_MOUNT_ORDERED_DATA: + case EXT3_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + printk(KERN_ERR "EXT3-fs: Journal does not support " + "requested data journaling mode\n"); + goto failed_mount3; + } + default: + break; + } + + if (test_opt(sb, NOBH)) { + if (sb->s_blocksize_bits != PAGE_CACHE_SHIFT) { + printk(KERN_WARNING "EXT3-fs: Ignoring nobh option " + "since filesystem blocksize doesn't match " + "pagesize\n"); + clear_opt(sbi->s_mount_opt, NOBH); + } + if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { + printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - " + "its supported only with writeback mode\n"); + clear_opt(sbi->s_mount_opt, NOBH); + } + } + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = iget(sb, EXT3_ROOT_INO); + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + printk(KERN_ERR "EXT3-fs: get root inode failed\n"); + iput(root); + goto failed_mount3; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + dput(sb->s_root); + sb->s_root = NULL; + printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); + goto failed_mount3; + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock + * in numerous places. Here we just pop the lock - it's relatively + * harmless, because we are now ready to accept write_super() requests, + * and aviro says that's the only reason for hanging onto the + * superblock lock. + */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + ext3_orphan_cleanup(sb, es); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + if (needs_recovery) + printk (KERN_INFO "EXT3-fs: recovery complete.\n"); + ext3_mark_recovery_complete(sb, es); + printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + percpu_counter_mod(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + percpu_counter_mod(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + + lock_kernel(); + return 0; + +cantfind_ext3: + if (!silent) + printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n", + sb->s_id); + goto failed_mount; + +failed_mount3: + journal_destroy(sbi->s_journal); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext3_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi); + lock_kernel(); + return -EINVAL; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sbi->s_commit_interval) + journal->j_commit_interval = sbi->s_commit_interval; + /* We could also set up an ext3-specific default for the commit + * interval here, but for now we'll just fall back to the jbd + * default. */ + + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} + +static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = iget(sb, journal_inum); + if (!journal_inode) { + printk(KERN_ERR "EXT3-fs: no journal found.\n"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { + printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) { + printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext3_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext3_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head * bh; + journal_t *journal; + int start; + int len; + int hblock, blocksize; + unsigned long sb_block; + unsigned long offset; + struct ext3_super_block * es; + struct block_device *bdev; + + bdev = ext3_blkdev_get(j_dev); + if (bdev == NULL) + return NULL; + + if (bd_claim(bdev, sb)) { + printk(KERN_ERR + "EXT3: failed to claim external journal device.\n"); + blkdev_put(bdev); + return NULL; + } + + blocksize = sb->s_blocksize; + hblock = bdev_hardsect_size(bdev); + if (blocksize < hblock) { + printk(KERN_ERR + "EXT3-fs: blocksize too small for journal device.\n"); + goto out_bdev; + } + + sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; + offset = EXT3_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + printk(KERN_ERR "EXT3-fs: couldn't read superblock of " + "external journal\n"); + goto out_bdev; + } + + es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { + printk(KERN_ERR "EXT3-fs: external journal has " + "bad superblock\n"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + printk(KERN_ERR "EXT3-fs: External journal has more than one " + "user (unsupported) - %d\n", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; + ext3_init_journal_params(sb, journal); + return journal; +out_journal: + journal_destroy(journal); +out_bdev: + ext3_blkdev_put(bdev); + return NULL; +} + +static int ext3_load_journal(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + int err = 0; + int really_read_only; + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: INFO: recovery " + "required on readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "EXT3-fs: write access " + "unavailable, cannot proceed.\n"); + return -EROFS; + } + printk (KERN_INFO "EXT3-fs: write access will " + "be enabled during recovery.\n"); + } + } + + if (journal_inum && journal_dev) { + printk(KERN_ERR "EXT3-fs: filesystem has both journal " + "and inode journals!\n"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + printk(KERN_ERR "EXT3-fs: error updating journal.\n"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) + err = journal_wipe(journal, !really_read_only); + if (!err) + err = journal_load(journal); + + if (err) { + printk(KERN_ERR "EXT3-fs: error loading journal.\n"); + journal_destroy(journal); + return err; + } + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); + return 0; +} + +static int ext3_create_journal(struct super_block * sb, + struct ext3_super_block * es, + int journal_inum) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " + "create journal.\n"); + return -EROFS; + } + + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + + printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", + journal_inum); + + if (journal_create(journal)) { + printk(KERN_ERR "EXT3-fs: error creating journal.\n"); + journal_destroy(journal); + return -EIO; + } + + EXT3_SB(sb)->s_journal = journal; + + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + + return 0; +} + +static void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync) +{ + struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + + if (!sbh) + return; + es->s_wtime = cpu_to_le32(get_seconds()); + es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) + sync_dirty_buffer(sbh); +} + + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + journal_lock_updates(journal); + journal_flush(journal); + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; + ext3_commit_super(sb, es, 1); + } + journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3_error() or ext3_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext3_decode_error(sb, j_errno, nbuf); + ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + "filesystem check."); + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + + journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext3_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3_SB(sb)->s_journal; + sb->s_dirt = 0; + ret = ext3_journal_force_commit(journal); + return ret; +} + +/* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. Just start an async writeback to get the buffers on their way + * to the disk. + * + * This implicitly triggers the writebehind on sync(). + */ + +static void ext3_write_super (struct super_block * sb) +{ + if (down_trylock(&sb->s_lock) == 0) + BUG(); + sb->s_dirt = 0; +} + +static int ext3_sync_fs(struct super_block *sb, int wait) +{ + tid_t target; + + sb->s_dirt = 0; + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } + return 0; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ +static void ext3_write_super_lockfs(struct super_block *sb) +{ + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + journal_t *journal = EXT3_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + journal_flush(journal); + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + } +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static void ext3_unlockfs(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) { + lock_super(sb); + /* Reser the needs_recovery flag before the fs is unlocked. */ + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + unlock_super(sb); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } +} + +static int ext3_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long tmp; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + ext3_init_journal_params(sb, sbi->s_journal); + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > le32_to_cpu(es->s_blocks_count)) { + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + return -EROFS; + + if (*flags & MS_RDONLY) { + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && + (sbi->s_mount_state & EXT3_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3_mark_recovery_complete(sb, es); + } else { + __le32 ret; + if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3_FEATURE_RO_COMPAT_SUPP))) { + printk(KERN_WARNING "EXT3-fs: %s: couldn't " + "remount RDWR because of unsupported " + "optional features (%x).\n", + sb->s_id, le32_to_cpu(ret)); + return -EROFS; + } + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + ext3_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((ret = ext3_group_extend(sb, es, n_blocks_count))) + return ret; + if (!ext3_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + } + } + return 0; +} + +static int ext3_statfs (struct super_block * sb, struct kstatfs * buf) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long overhead; + int i; + + if (test_opt (sb, MINIX_DF)) + overhead = 0; + else { + unsigned long ngroups; + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Compute the overhead (FS structures) + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group)); + } + + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_bfree = ext3_count_free_blocks (sb); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = ext3_count_free_inodes (sb); + buf->f_namelen = EXT3_NAME_LEN; + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction before quota file + * is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext3_create() quota_sync() + * journal_start() write_dquot() + * DQUOT_INIT() down(dqio_sem) + * down(dqio_sem) journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext3_dquot_initialize(struct inode *inode, int type) +{ + handle_t *handle; + int ret, err; + + /* We may create quota structure so we need to reserve enough blocks */ + handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_initialize(inode, type); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_dquot_drop(struct inode *inode) +{ + handle_t *handle; + int ret, err; + + /* We may delete quota structure so we need to reserve enough blocks */ + handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_drop(inode); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext3_journal_start(inode, + EXT3_QUOTA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3_journal_start(dquot_to_inode(dquot), + EXT3_QUOTA_INIT_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_release(dquot); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journalling quotas? */ + if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext3_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext3_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext3_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext3_quota_on_mount(struct super_block *sb, int type) +{ + int err; + struct dentry *dentry; + struct qstr name = { .name = EXT3_SB(sb)->s_qf_names[type], + .hash = 0, + .len = strlen(EXT3_SB(sb)->s_qf_names[type])}; + + dentry = lookup_hash(&name, sb->s_root); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry); + /* Now invalidate and put the dentry - quota got its own reference + * to inode and dentry has at least wrong hash so we had better + * throw it away */ + d_invalidate(dentry); + dput(dentry); + return err; +} + +/* + * Standard function to be called on quota_on + */ +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + char *path) +{ + int err; + struct nameidata nd; + + /* Not journalling quota? */ + if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] && + !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) + return vfs_quota_on(sb, type, format_id, path); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + return err; + /* Quotafile not on the same filesystem? */ + if (nd.mnt->mnt_sb != sb) { + path_release(&nd); + return -EXDEV; + } + /* Quotafile not of fs root? */ + if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3-fs: Quota file not on filesystem root. " + "Journalled quota will not work.\n"); + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and noone else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext3_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext3_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + down(&inode->i_sem); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); + if (err) + goto out; + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off+len-towrite) { + i_size_write(inode, off+len-towrite); + EXT3_I(inode)->i_disksize = inode->i_size; + } + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + up(&inode->i_sem); + return len - towrite; +} + +#endif + +static struct super_block *ext3_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super); +} + +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .get_sb = ext3_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ext3_fs(void) +{ + int err = init_ext3_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext3_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext3_xattr(); + return err; +} + +static void __exit exit_ext3_fs(void) +{ + unregister_filesystem(&ext3_fs_type); + destroy_inodecache(); + exit_ext3_xattr(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +MODULE_LICENSE("GPL"); +module_init(init_ext3_fs) +module_exit(exit_ext3_fs) diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c new file mode 100644 index 000000000000..8c3e72818fb0 --- /dev/null +++ b/fs/ext3/symlink.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext3/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 symlink handling code + */ + +#include +#include +#include +#include +#include "xattr.h" + +static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); + nd_set_link(nd, (char*)ei->i_data); + return 0; +} + +struct inode_operations ext3_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +struct inode_operations ext3_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext3_follow_link, +#ifdef CONFIG_EXT3_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c new file mode 100644 index 000000000000..4cbc6d0212d3 --- /dev/null +++ b/fs/ext3/xattr.c @@ -0,0 +1,1320 @@ +/* + * linux/fs/ext3/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext3 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define IHDR(inode, raw_inode) \ + ((struct ext3_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT3_GOOD_OLD_INODE_SIZE + \ + EXT3_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1)) + +#ifdef EXT3_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%ld: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext3_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext3_xattr_cache_find(struct inode *, + struct ext3_xattr_header *, + struct mb_cache_entry **); +static void ext3_xattr_rehash(struct ext3_xattr_header *, + struct ext3_xattr_entry *); + +static struct mb_cache *ext3_xattr_cache; + +static struct xattr_handler *ext3_xattr_handler_map[] = { + [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, +#endif + [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_SECURITY + [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler, +#endif +}; + +struct xattr_handler *ext3_xattr_handlers[] = { + &ext3_xattr_user_handler, + &ext3_xattr_trusted_handler, +#ifdef CONFIG_EXT3_FS_POSIX_ACL + &ext3_xattr_acl_access_handler, + &ext3_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT3_FS_SECURITY + &ext3_xattr_security_handler, +#endif + NULL +}; + +static inline struct xattr_handler * +ext3_xattr_handler(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) + handler = ext3_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_sem: don't care + */ +ssize_t +ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext3_xattr_list(dentry->d_inode, buffer, size); +} + +static int +ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext3_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext3_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext3_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +int +ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext3_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { +bad_block: ext3_error(inode->i_sb, __FUNCTION__, + "inode %ld: bad block %d", inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_entry *entry; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + size_t size; + void *end; + int error; + + if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + return -ENODATA; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext3_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT3_I(inode)->xattr_sem); + error = ext3_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext3_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT3_I(inode)->xattr_sem); + return error; +} + +static int +ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { + struct xattr_handler *handler = + ext3_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(inode, buffer, rest, + entry->e_name, + entry->e_name_len); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +int +ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3_xattr_check_block(bh)) { + ext3_error(inode->i_sb, __FUNCTION__, + "inode %ld: bad block %d", inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3_xattr_cache_insert(bh); + error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + struct ext3_iloc iloc; + void *end; + int error; + + if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + return 0; + error = ext3_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + error = ext3_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext3_xattr_list_entries(inode, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + int i_error, b_error; + + down_read(&EXT3_I(inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size); + if (i_error < 0) { + b_error = 0; + } else { + if (buffer) { + buffer += i_error; + buffer_size -= i_error; + } + b_error = ext3_xattr_block_list(inode, buffer, buffer_size); + if (b_error < 0) + i_error = 0; + } + up_read(&EXT3_I(inode)->xattr_sem); + return i_error + b_error; +} + +/* + * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext3_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) + return; + + lock_super(sb); + if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { + EXT3_SB(sb)->s_es->s_feature_compat |= + cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); + sb->s_dirt = 1; + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } + unlock_super(sb); +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext3_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + + ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + ext3_free_blocks(handle, inode, bh->b_blocknr, 1); + get_bh(bh); + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { + if (ext3_journal_get_write_access(handle, bh) == 0) { + lock_buffer(bh); + BHDR(bh)->h_refcount = cpu_to_le32( + le32_to_cpu(BHDR(bh)->h_refcount) - 1); + ext3_journal_dirty_metadata(handle, bh); + if (IS_SYNC(inode)) + handle->h_sync = 1; + DQUOT_FREE_BLOCK(inode, 1); + unlock_buffer(bh); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } + if (ce) + mb_cache_entry_release(ce); + } +} + +struct ext3_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext3_xattr_search { + struct ext3_xattr_entry *first; + void *base; + void *end; + struct ext3_xattr_entry *here; + int not_found; +}; + +static int +ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) +{ + struct ext3_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT3_XATTR_SIZE(size); + } + free += EXT3_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT3_XATTR_SIZE(i->value_len) || + free < EXT3_XATTR_LEN(name_len) + + EXT3_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. */ + size_t size = EXT3_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT3_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT3_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT3_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT3_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT3_XATTR_PAD, 0, + EXT3_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext3_xattr_block_find { + struct ext3_xattr_search s; + struct buffer_head *bh; +}; + +int +ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT3_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext3_xattr_check_block(bs->bh)) { + ext3_error(sb, __FUNCTION__, + "inode %ld: bad block %d", inode->i_ino, + EXT3_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext3_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext3_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext3_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error; + +#define header(x) ((struct ext3_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext3_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + error = ext3_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), + s->here); + ext3_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext3_journal_dirty_metadata(handle, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kmalloc(sb->s_blocksize, GFP_KERNEL); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memset(s->base, 0, sb->s_blocksize); + header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext3_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext3_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. */ + error = -EDQUOT; + if (DQUOT_ALLOC_BLOCK(inode, 1)) + goto cleanup; + error = ext3_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + BHDR(new_bh)->h_refcount = cpu_to_le32(1 + + le32_to_cpu(BHDR(new_bh)->h_refcount)); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext3_journal_dirty_metadata(handle, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + int goal = le32_to_cpu( + EXT3_SB(sb)->s_es->s_first_data_block) + + EXT3_I(inode)->i_block_group * + EXT3_BLOCKS_PER_GROUP(sb); + int block = ext3_new_block(handle, inode, goal, &error); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext3_free_blocks(handle, inode, block, 1); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext3_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext3_xattr_cache_insert(new_bh); + error = ext3_journal_dirty_metadata(handle, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext3_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + DQUOT_FREE_BLOCK(inode, 1); + goto cleanup; + +bad_block: + ext3_error(inode->i_sb, __FUNCTION__, + "inode %ld: bad block %d", inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext3_xattr_ibody_find { + struct ext3_xattr_search s; + struct ext3_iloc iloc; +}; + +int +ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_inode *raw_inode; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext3_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; + if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { + error = ext3_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. */ + error = ext3_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext3_xattr_info *i, + struct ext3_xattr_ibody_find *is) +{ + struct ext3_xattr_ibody_header *header; + struct ext3_xattr_search *s = &is->s; + int error; + + if (EXT3_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext3_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext3_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); + EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; + } else { + header->h_magic = cpu_to_le32(0); + EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; + } + return 0; +} + +/* + * ext3_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext3_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext3_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext3_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + int error; + + if (IS_RDONLY(inode)) + return -EROFS; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT3_I(inode)->xattr_sem); + error = ext3_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { + struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); + EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; + } + + error = ext3_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (!value) { + if (!is.s.not_found) + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext3_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + error = ext3_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext3_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext3_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = CURRENT_TIME_SEC; + error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext3_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + up_write(&EXT3_I(inode)->xattr_sem); + return error; +} + +/* + * ext3_xattr_set() + * + * Like ext3_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext3_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * ext3_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT3_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + if (!bh) { + ext3_error(inode->i_sb, __FUNCTION__, + "inode %ld: block %d read error", inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + ext3_error(inode->i_sb, __FUNCTION__, + "inode %ld: bad block %d", inode->i_ino, + EXT3_I(inode)->i_file_acl); + goto cleanup; + } + ext3_xattr_release_block(handle, inode, bh); + EXT3_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext3_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext3_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(ext3_xattr_cache, sb->s_bdev); +} + +/* + * ext3_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext3_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext3_xattr_cache); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext3_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext3_xattr_cmp(struct ext3_xattr_header *header1, + struct ext3_xattr_header *header2) +{ + struct ext3_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT3_XATTR_NEXT(entry1); + entry2 = EXT3_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext3_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, + inode->i_sb->s_bdev, hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext3_error(inode->i_sb, __FUNCTION__, + "inode %ld: block %ld read error", + inode->i_ino, (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT3_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT3_XATTR_REFCOUNT_MAX); + } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext3_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext3_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext3_xattr_rehash(struct ext3_xattr_header *header, + struct ext3_xattr_entry *entry) +{ + struct ext3_xattr_entry *here; + __u32 hash = 0; + + ext3_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT3_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext3_xattr(void) +{ + ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, + sizeof(struct mb_cache_entry) + + sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6); + if (!ext3_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext3_xattr(void) +{ + if (ext3_xattr_cache) + mb_cache_destroy(ext3_xattr_cache); + ext3_xattr_cache = NULL; +} diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h new file mode 100644 index 000000000000..eb31a69e82dc --- /dev/null +++ b/fs/ext3/xattr.h @@ -0,0 +1,135 @@ +/* + File: fs/ext3/xattr.h + + On-disk format of extended attributes for the ext3 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include +#include + +/* Magic value in attribute blocks */ +#define EXT3_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT3_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT3_XATTR_INDEX_USER 1 +#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT3_XATTR_INDEX_TRUSTED 4 +#define EXT3_XATTR_INDEX_LUSTRE 5 +#define EXT3_XATTR_INDEX_SECURITY 6 + +struct ext3_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext3_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext3_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT3_XATTR_PAD_BITS 2 +#define EXT3_XATTR_PAD (1<e_name_len)) ) +#define EXT3_XATTR_SIZE(size) \ + (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) + +# ifdef CONFIG_EXT3_FS_XATTR + +extern struct xattr_handler ext3_xattr_user_handler; +extern struct xattr_handler ext3_xattr_trusted_handler; +extern struct xattr_handler ext3_xattr_acl_access_handler; +extern struct xattr_handler ext3_xattr_acl_default_handler; +extern struct xattr_handler ext3_xattr_security_handler; + +extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); + +extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext3_xattr_list(struct inode *, char *, size_t); +extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext3_xattr_delete_inode(handle_t *, struct inode *); +extern void ext3_xattr_put_super(struct super_block *); + +extern int init_ext3_xattr(void); +extern void exit_ext3_xattr(void); + +extern struct xattr_handler *ext3_xattr_handlers[]; + +# else /* CONFIG_EXT3_FS_XATTR */ + +static inline int +ext3_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_list(struct inode *inode, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext3_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext3_xattr(void) +{ + return 0; +} + +static inline void +exit_ext3_xattr(void) +{ +} + +#define ext3_xattr_handlers NULL + +# endif /* CONFIG_EXT3_FS_XATTR */ diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c new file mode 100644 index 000000000000..ddc1c41750e1 --- /dev/null +++ b/fs/ext3/xattr_security.c @@ -0,0 +1,55 @@ +/* + * linux/fs/ext3/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int +ext3_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name, + value, size, flags); +} + +struct xattr_handler ext3_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext3_xattr_security_list, + .get = ext3_xattr_security_get, + .set = ext3_xattr_security_set, +}; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c new file mode 100644 index 000000000000..f68bfd1cf519 --- /dev/null +++ b/fs/ext3/xattr_trusted.c @@ -0,0 +1,65 @@ +/* + * linux/fs/ext3/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +#define XATTR_TRUSTED_PREFIX "trusted." + +static size_t +ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int +ext3_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +struct xattr_handler ext3_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext3_xattr_trusted_list, + .get = ext3_xattr_trusted_get, + .set = ext3_xattr_trusted_set, +}; diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c new file mode 100644 index 000000000000..e907cae7a07c --- /dev/null +++ b/fs/ext3/xattr_user.c @@ -0,0 +1,79 @@ +/* + * linux/fs/ext3/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +#define XATTR_USER_PREFIX "user." + +static size_t +ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(inode->i_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + int error; + + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + error = permission(inode, MAY_READ, NULL); + if (error) + return error; + + return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); +} + +static int +ext3_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int error; + + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + if ( !S_ISREG(inode->i_mode) && + (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + + return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, + value, size, flags); +} + +struct xattr_handler ext3_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext3_xattr_user_list, + .get = ext3_xattr_user_get, + .set = ext3_xattr_user_set, +}; -- cgit v1.2.3